diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,67370 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 9620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 37.1042463148092, + "learning_rate": 3.4602076124567476e-08, + "loss": 3.1037, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 25.207250934654958, + "learning_rate": 6.920415224913495e-08, + "loss": 2.5937, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 34.4866252238902, + "learning_rate": 1.0380622837370243e-07, + "loss": 3.1256, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 37.287480505182415, + "learning_rate": 1.384083044982699e-07, + "loss": 3.208, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 31.298895147565904, + "learning_rate": 1.730103806228374e-07, + "loss": 3.1655, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 28.5099030032688, + "learning_rate": 2.0761245674740486e-07, + "loss": 2.9876, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 30.91252751714465, + "learning_rate": 2.422145328719723e-07, + "loss": 3.0633, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 34.07859194615419, + "learning_rate": 2.768166089965398e-07, + "loss": 3.0316, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 25.74310170641338, + "learning_rate": 3.114186851211073e-07, + "loss": 2.5585, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 29.18649162666028, + "learning_rate": 3.460207612456748e-07, + "loss": 2.9646, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 35.07862998050571, + "learning_rate": 3.8062283737024223e-07, + "loss": 2.9688, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 28.233677390485585, + "learning_rate": 4.152249134948097e-07, + "loss": 2.545, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 31.86301568731232, + "learning_rate": 4.498269896193772e-07, + "loss": 3.1589, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 31.297767276970973, + "learning_rate": 4.844290657439446e-07, + "loss": 2.9288, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 31.327661562085037, + "learning_rate": 5.190311418685121e-07, + "loss": 3.187, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 32.09228220848974, + "learning_rate": 5.536332179930796e-07, + "loss": 3.2806, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 28.693100834543692, + "learning_rate": 5.882352941176471e-07, + "loss": 2.8149, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 24.493063660463303, + "learning_rate": 6.228373702422146e-07, + "loss": 2.6128, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 22.455923474790353, + "learning_rate": 6.57439446366782e-07, + "loss": 2.473, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 29.804700164829907, + "learning_rate": 6.920415224913496e-07, + "loss": 2.8412, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 26.638497790578334, + "learning_rate": 7.266435986159171e-07, + "loss": 2.5866, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 21.04505500067805, + "learning_rate": 7.612456747404845e-07, + "loss": 2.3629, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 20.49490636193544, + "learning_rate": 7.958477508650519e-07, + "loss": 2.0194, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 19.347203073629032, + "learning_rate": 8.304498269896194e-07, + "loss": 1.9241, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 18.464360513749167, + "learning_rate": 8.650519031141868e-07, + "loss": 1.9576, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 15.683836407692967, + "learning_rate": 8.996539792387544e-07, + "loss": 1.8351, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 15.676400385932228, + "learning_rate": 9.342560553633219e-07, + "loss": 1.8349, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 13.392235449843433, + "learning_rate": 9.688581314878893e-07, + "loss": 1.6501, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 16.527415581012864, + "learning_rate": 1.0034602076124569e-06, + "loss": 1.8315, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 13.694825750097737, + "learning_rate": 1.0380622837370243e-06, + "loss": 1.5652, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 18.763414643263694, + "learning_rate": 1.0726643598615919e-06, + "loss": 1.6854, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 20.33865629455925, + "learning_rate": 1.1072664359861592e-06, + "loss": 1.5039, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 12.763246152388538, + "learning_rate": 1.1418685121107268e-06, + "loss": 1.3142, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 9.263903811995936, + "learning_rate": 1.1764705882352942e-06, + "loss": 1.2849, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 6.439565470509954, + "learning_rate": 1.2110726643598616e-06, + "loss": 1.1053, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 7.952049399538764, + "learning_rate": 1.2456747404844292e-06, + "loss": 1.1444, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 8.01097935846218, + "learning_rate": 1.2802768166089966e-06, + "loss": 1.143, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 9.871471103375363, + "learning_rate": 1.314878892733564e-06, + "loss": 1.1158, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 8.728062914617196, + "learning_rate": 1.3494809688581318e-06, + "loss": 1.1664, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 7.892186888010498, + "learning_rate": 1.3840830449826992e-06, + "loss": 1.0177, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 8.409090504197081, + "learning_rate": 1.4186851211072665e-06, + "loss": 0.971, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 5.601033155317786, + "learning_rate": 1.4532871972318341e-06, + "loss": 0.8399, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 4.431069512628087, + "learning_rate": 1.4878892733564015e-06, + "loss": 0.8619, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 4.253547266802608, + "learning_rate": 1.522491349480969e-06, + "loss": 0.8158, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 3.451235861424146, + "learning_rate": 1.5570934256055365e-06, + "loss": 0.8109, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 3.3351939398703987, + "learning_rate": 1.5916955017301039e-06, + "loss": 0.7258, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 2.942797536420241, + "learning_rate": 1.6262975778546713e-06, + "loss": 0.6719, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 2.7074094109335363, + "learning_rate": 1.6608996539792389e-06, + "loss": 0.6014, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 2.9317496409320096, + "learning_rate": 1.6955017301038063e-06, + "loss": 0.6469, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 2.5351542200072026, + "learning_rate": 1.7301038062283736e-06, + "loss": 0.8268, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 2.393778283674769, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.6724, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 2.274905557406526, + "learning_rate": 1.7993079584775088e-06, + "loss": 0.6593, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 2.8743291218680316, + "learning_rate": 1.8339100346020764e-06, + "loss": 0.7329, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 2.5017134187558137, + "learning_rate": 1.8685121107266438e-06, + "loss": 0.7654, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 1.9875397548331608, + "learning_rate": 1.9031141868512112e-06, + "loss": 0.7153, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 2.3596053776617127, + "learning_rate": 1.9377162629757786e-06, + "loss": 0.7149, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 2.166401176661029, + "learning_rate": 1.972318339100346e-06, + "loss": 0.6804, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 2.0951820308221527, + "learning_rate": 2.0069204152249138e-06, + "loss": 0.6972, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 2.4657081002289796, + "learning_rate": 2.041522491349481e-06, + "loss": 0.6038, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 2.760273057855939, + "learning_rate": 2.0761245674740485e-06, + "loss": 0.7049, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.111128237734746, + "learning_rate": 2.110726643598616e-06, + "loss": 0.7256, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 2.1541359357914462, + "learning_rate": 2.1453287197231837e-06, + "loss": 0.7687, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 2.169472440876441, + "learning_rate": 2.1799307958477513e-06, + "loss": 0.6799, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 2.02142080617264, + "learning_rate": 2.2145328719723185e-06, + "loss": 0.5648, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 2.4144204497805015, + "learning_rate": 2.249134948096886e-06, + "loss": 0.7278, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 2.4257419352048184, + "learning_rate": 2.2837370242214537e-06, + "loss": 0.691, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 2.3058060047031423, + "learning_rate": 2.318339100346021e-06, + "loss": 0.7196, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 1.963077781184842, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.5515, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 2.312593059971846, + "learning_rate": 2.387543252595156e-06, + "loss": 0.7084, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 2.2385262578581604, + "learning_rate": 2.4221453287197232e-06, + "loss": 0.7047, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 2.1074433866202082, + "learning_rate": 2.456747404844291e-06, + "loss": 0.6752, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 2.1381474039570216, + "learning_rate": 2.4913494809688584e-06, + "loss": 0.7712, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 2.0078823847937706, + "learning_rate": 2.5259515570934256e-06, + "loss": 0.655, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 2.28321088070362, + "learning_rate": 2.560553633217993e-06, + "loss": 0.7052, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 2.310153415920439, + "learning_rate": 2.5951557093425604e-06, + "loss": 0.5941, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 2.165070016989384, + "learning_rate": 2.629757785467128e-06, + "loss": 0.6693, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 2.0797734502814063, + "learning_rate": 2.6643598615916955e-06, + "loss": 0.6087, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 2.2813786170710113, + "learning_rate": 2.6989619377162636e-06, + "loss": 0.7298, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 2.3968138762934, + "learning_rate": 2.7335640138408307e-06, + "loss": 0.769, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.130423945439428, + "learning_rate": 2.7681660899653983e-06, + "loss": 0.6529, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 2.3282052971504674, + "learning_rate": 2.802768166089966e-06, + "loss": 0.7926, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.282966381673334, + "learning_rate": 2.837370242214533e-06, + "loss": 0.6593, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.160804388058384, + "learning_rate": 2.8719723183391007e-06, + "loss": 0.7173, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 2.072121784879253, + "learning_rate": 2.9065743944636683e-06, + "loss": 0.6375, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 2.1103173238484403, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.5965, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 2.3387135179333542, + "learning_rate": 2.975778546712803e-06, + "loss": 0.6975, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 2.3469544991445765, + "learning_rate": 3.0103806228373707e-06, + "loss": 0.6548, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 2.4499716727822958, + "learning_rate": 3.044982698961938e-06, + "loss": 0.7056, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 2.395912867646452, + "learning_rate": 3.0795847750865054e-06, + "loss": 0.6031, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.124375425140296, + "learning_rate": 3.114186851211073e-06, + "loss": 0.6674, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 2.4152158278140674, + "learning_rate": 3.14878892733564e-06, + "loss": 0.6561, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 2.645106503355264, + "learning_rate": 3.1833910034602078e-06, + "loss": 0.6203, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 2.170849019708104, + "learning_rate": 3.2179930795847754e-06, + "loss": 0.5372, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 2.085614854916659, + "learning_rate": 3.2525951557093425e-06, + "loss": 0.6429, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 2.1899737577611824, + "learning_rate": 3.28719723183391e-06, + "loss": 0.663, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 2.0251727260233188, + "learning_rate": 3.3217993079584777e-06, + "loss": 0.6597, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 2.3645519837714564, + "learning_rate": 3.356401384083045e-06, + "loss": 0.6406, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 2.4034145704451593, + "learning_rate": 3.3910034602076125e-06, + "loss": 0.7707, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 2.322115990647384, + "learning_rate": 3.42560553633218e-06, + "loss": 0.7373, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 2.326687603326004, + "learning_rate": 3.4602076124567473e-06, + "loss": 0.584, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 2.3114109061072936, + "learning_rate": 3.4948096885813153e-06, + "loss": 0.6832, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 2.195809015277355, + "learning_rate": 3.529411764705883e-06, + "loss": 0.609, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 2.1867017908477737, + "learning_rate": 3.5640138408304505e-06, + "loss": 0.5833, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 2.0914140112949577, + "learning_rate": 3.5986159169550177e-06, + "loss": 0.6508, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 2.3076122744482297, + "learning_rate": 3.6332179930795853e-06, + "loss": 0.6108, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 2.1765304437298867, + "learning_rate": 3.667820069204153e-06, + "loss": 0.6641, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 2.6803492350052185, + "learning_rate": 3.70242214532872e-06, + "loss": 0.6669, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 2.199931218934597, + "learning_rate": 3.7370242214532876e-06, + "loss": 0.705, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.9358294457907268, + "learning_rate": 3.7716262975778552e-06, + "loss": 0.733, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 2.3313246015906355, + "learning_rate": 3.8062283737024224e-06, + "loss": 0.6512, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.7685678589026836, + "learning_rate": 3.84083044982699e-06, + "loss": 0.6651, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 2.207787027694181, + "learning_rate": 3.875432525951557e-06, + "loss": 0.6167, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.343240858434686, + "learning_rate": 3.910034602076125e-06, + "loss": 0.7268, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 2.0089194459476643, + "learning_rate": 3.944636678200692e-06, + "loss": 0.5841, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 2.012684446361227, + "learning_rate": 3.9792387543252595e-06, + "loss": 0.5901, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 2.396831390118752, + "learning_rate": 4.0138408304498275e-06, + "loss": 0.706, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 2.2855393661408883, + "learning_rate": 4.048442906574395e-06, + "loss": 0.8205, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.015682700463381, + "learning_rate": 4.083044982698962e-06, + "loss": 0.5668, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 2.181291526395132, + "learning_rate": 4.11764705882353e-06, + "loss": 0.6465, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 2.2248428690994806, + "learning_rate": 4.152249134948097e-06, + "loss": 0.5714, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 2.202853322518201, + "learning_rate": 4.186851211072664e-06, + "loss": 0.6137, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.9078978560651283, + "learning_rate": 4.221453287197232e-06, + "loss": 0.5781, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 2.283481882986506, + "learning_rate": 4.2560553633218e-06, + "loss": 0.7298, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 2.2045034916450135, + "learning_rate": 4.2906574394463675e-06, + "loss": 0.6633, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 2.064486144309271, + "learning_rate": 4.325259515570935e-06, + "loss": 0.6187, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 2.191299849944456, + "learning_rate": 4.359861591695503e-06, + "loss": 0.6644, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 2.256115550781429, + "learning_rate": 4.39446366782007e-06, + "loss": 0.7002, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 2.4171539822945998, + "learning_rate": 4.429065743944637e-06, + "loss": 0.642, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.9676280086993452, + "learning_rate": 4.463667820069205e-06, + "loss": 0.6169, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 2.315131249667166, + "learning_rate": 4.498269896193772e-06, + "loss": 0.6301, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 2.016187038955443, + "learning_rate": 4.532871972318339e-06, + "loss": 0.6258, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 2.1690571819984803, + "learning_rate": 4.567474048442907e-06, + "loss": 0.6108, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 2.228343012890392, + "learning_rate": 4.6020761245674745e-06, + "loss": 0.7412, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 2.3230648080604444, + "learning_rate": 4.636678200692042e-06, + "loss": 0.7101, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 2.2702336427350787, + "learning_rate": 4.67128027681661e-06, + "loss": 0.6535, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 1.9953352017886066, + "learning_rate": 4.705882352941177e-06, + "loss": 0.5654, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 2.1335665240811483, + "learning_rate": 4.740484429065744e-06, + "loss": 0.7365, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 2.0081551952295067, + "learning_rate": 4.775086505190312e-06, + "loss": 0.5654, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 2.608564793222911, + "learning_rate": 4.809688581314879e-06, + "loss": 0.6156, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 2.2903018042810865, + "learning_rate": 4.8442906574394464e-06, + "loss": 0.5923, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 2.606584727809493, + "learning_rate": 4.8788927335640145e-06, + "loss": 0.6669, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 2.2676586545153983, + "learning_rate": 4.913494809688582e-06, + "loss": 0.5231, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 2.08733797395526, + "learning_rate": 4.948096885813149e-06, + "loss": 0.6447, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 2.280353041745458, + "learning_rate": 4.982698961937717e-06, + "loss": 0.7165, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 2.2993638112411756, + "learning_rate": 5.017301038062284e-06, + "loss": 0.6171, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 2.1155138048920445, + "learning_rate": 5.051903114186851e-06, + "loss": 0.678, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 2.092000766414222, + "learning_rate": 5.086505190311419e-06, + "loss": 0.5561, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 2.4110387919086103, + "learning_rate": 5.121107266435986e-06, + "loss": 0.6236, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 2.4295417190634003, + "learning_rate": 5.155709342560554e-06, + "loss": 0.7297, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 2.1821328342305524, + "learning_rate": 5.190311418685121e-06, + "loss": 0.8451, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 1.9821707599109635, + "learning_rate": 5.224913494809689e-06, + "loss": 0.5909, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 1.935677871558701, + "learning_rate": 5.259515570934256e-06, + "loss": 0.6249, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 2.2167343078418633, + "learning_rate": 5.294117647058824e-06, + "loss": 0.6286, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 2.177970005050448, + "learning_rate": 5.328719723183391e-06, + "loss": 0.631, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 2.2333962515280295, + "learning_rate": 5.363321799307959e-06, + "loss": 0.6793, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 2.1950449323297736, + "learning_rate": 5.397923875432527e-06, + "loss": 0.6113, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 2.172906333435821, + "learning_rate": 5.4325259515570934e-06, + "loss": 0.6419, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 2.219433419070003, + "learning_rate": 5.4671280276816615e-06, + "loss": 0.6086, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 2.1272302168820723, + "learning_rate": 5.501730103806229e-06, + "loss": 0.6471, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 2.2725105069940783, + "learning_rate": 5.536332179930797e-06, + "loss": 0.5728, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 2.3320622478808186, + "learning_rate": 5.570934256055364e-06, + "loss": 0.6638, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 2.4359930813276236, + "learning_rate": 5.605536332179932e-06, + "loss": 0.6696, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 2.225861962560492, + "learning_rate": 5.640138408304498e-06, + "loss": 0.6584, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 2.2669038485616375, + "learning_rate": 5.674740484429066e-06, + "loss": 0.6774, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 2.3265153532039795, + "learning_rate": 5.709342560553633e-06, + "loss": 0.6741, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 2.419471267005024, + "learning_rate": 5.743944636678201e-06, + "loss": 0.6057, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 2.276964154368608, + "learning_rate": 5.7785467128027686e-06, + "loss": 0.5884, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 2.410507009541092, + "learning_rate": 5.8131487889273366e-06, + "loss": 0.6878, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 2.083503032279742, + "learning_rate": 5.847750865051903e-06, + "loss": 0.6552, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 2.2509896017989117, + "learning_rate": 5.882352941176471e-06, + "loss": 0.6079, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 2.1859289065791256, + "learning_rate": 5.916955017301038e-06, + "loss": 0.621, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 2.147850941536798, + "learning_rate": 5.951557093425606e-06, + "loss": 0.5711, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 2.1062480954749314, + "learning_rate": 5.986159169550173e-06, + "loss": 0.6168, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 2.025096030131394, + "learning_rate": 6.020761245674741e-06, + "loss": 0.6314, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 2.1106251684861093, + "learning_rate": 6.055363321799308e-06, + "loss": 0.6165, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 1.9386258712780828, + "learning_rate": 6.089965397923876e-06, + "loss": 0.5944, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 2.439704310973515, + "learning_rate": 6.124567474048443e-06, + "loss": 0.7301, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 2.193053790974262, + "learning_rate": 6.159169550173011e-06, + "loss": 0.5184, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 2.1972037628573364, + "learning_rate": 6.193771626297579e-06, + "loss": 0.6819, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 2.108660851248478, + "learning_rate": 6.228373702422146e-06, + "loss": 0.6435, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 2.36479259661094, + "learning_rate": 6.262975778546714e-06, + "loss": 0.5749, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 2.0278184423083347, + "learning_rate": 6.29757785467128e-06, + "loss": 0.5742, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 2.3224464327459984, + "learning_rate": 6.332179930795848e-06, + "loss": 0.6336, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 2.3133190199193168, + "learning_rate": 6.3667820069204156e-06, + "loss": 0.6914, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 2.188171505795143, + "learning_rate": 6.401384083044984e-06, + "loss": 0.7221, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 2.58564757756167, + "learning_rate": 6.435986159169551e-06, + "loss": 0.6711, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 2.122268004241992, + "learning_rate": 6.470588235294119e-06, + "loss": 0.6429, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 1.9690551649287422, + "learning_rate": 6.505190311418685e-06, + "loss": 0.6549, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 2.3770166505133887, + "learning_rate": 6.539792387543253e-06, + "loss": 0.7386, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 2.332845067178479, + "learning_rate": 6.57439446366782e-06, + "loss": 0.7431, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 2.292052198271124, + "learning_rate": 6.608996539792388e-06, + "loss": 0.569, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 2.2147634719443836, + "learning_rate": 6.6435986159169555e-06, + "loss": 0.7045, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 2.328089977091099, + "learning_rate": 6.6782006920415235e-06, + "loss": 0.6364, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 2.274054083618043, + "learning_rate": 6.71280276816609e-06, + "loss": 0.6056, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 2.3784158616015643, + "learning_rate": 6.747404844290658e-06, + "loss": 0.6709, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 2.0892411869210346, + "learning_rate": 6.782006920415225e-06, + "loss": 0.562, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 2.1091681111983496, + "learning_rate": 6.816608996539793e-06, + "loss": 0.5607, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 1.8902408651238167, + "learning_rate": 6.85121107266436e-06, + "loss": 0.707, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 2.561780143261129, + "learning_rate": 6.885813148788928e-06, + "loss": 0.6728, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 2.0776566394374365, + "learning_rate": 6.9204152249134946e-06, + "loss": 0.6455, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 2.215431897046199, + "learning_rate": 6.9550173010380626e-06, + "loss": 0.5443, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 2.0873081020830857, + "learning_rate": 6.989619377162631e-06, + "loss": 0.6858, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 2.1124011856789098, + "learning_rate": 7.024221453287198e-06, + "loss": 0.5993, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 2.4320988871766493, + "learning_rate": 7.058823529411766e-06, + "loss": 0.6647, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 2.035002393310255, + "learning_rate": 7.093425605536333e-06, + "loss": 0.619, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 2.0914719636443317, + "learning_rate": 7.128027681660901e-06, + "loss": 0.7546, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 2.008874075320383, + "learning_rate": 7.162629757785467e-06, + "loss": 0.6114, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 2.193799041028231, + "learning_rate": 7.197231833910035e-06, + "loss": 0.7129, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 2.197495559283392, + "learning_rate": 7.2318339100346025e-06, + "loss": 0.7395, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 1.8616873688955218, + "learning_rate": 7.2664359861591705e-06, + "loss": 0.5019, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 2.432719510249827, + "learning_rate": 7.301038062283738e-06, + "loss": 0.7087, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 2.3669190124323127, + "learning_rate": 7.335640138408306e-06, + "loss": 0.6296, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 2.1872694291480785, + "learning_rate": 7.370242214532872e-06, + "loss": 0.6005, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 1.7828744622515829, + "learning_rate": 7.40484429065744e-06, + "loss": 0.6305, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 2.0040108138250723, + "learning_rate": 7.439446366782007e-06, + "loss": 0.6873, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 2.0796961366745226, + "learning_rate": 7.474048442906575e-06, + "loss": 0.6407, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 2.1967248607492937, + "learning_rate": 7.508650519031142e-06, + "loss": 0.6707, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 1.9175115859089473, + "learning_rate": 7.5432525951557104e-06, + "loss": 0.6526, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 2.0514016284982213, + "learning_rate": 7.577854671280277e-06, + "loss": 0.6123, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 2.5117956633223484, + "learning_rate": 7.612456747404845e-06, + "loss": 0.6014, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 2.2156322226471796, + "learning_rate": 7.647058823529411e-06, + "loss": 0.6571, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 2.2926948007219115, + "learning_rate": 7.68166089965398e-06, + "loss": 0.774, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 1.9721980908588659, + "learning_rate": 7.716262975778547e-06, + "loss": 0.6152, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 2.111098976593361, + "learning_rate": 7.750865051903114e-06, + "loss": 0.6419, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 2.300877354238802, + "learning_rate": 7.785467128027683e-06, + "loss": 0.7069, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 2.2866412650431434, + "learning_rate": 7.82006920415225e-06, + "loss": 0.6085, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 2.0848079985085226, + "learning_rate": 7.854671280276818e-06, + "loss": 0.5879, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 2.201530251445851, + "learning_rate": 7.889273356401385e-06, + "loss": 0.7731, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 2.0570052426900394, + "learning_rate": 7.923875432525952e-06, + "loss": 0.5881, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 2.1400020806991757, + "learning_rate": 7.958477508650519e-06, + "loss": 0.694, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 2.1227819097860645, + "learning_rate": 7.993079584775088e-06, + "loss": 0.7255, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 2.137417177560603, + "learning_rate": 8.027681660899655e-06, + "loss": 0.5723, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 2.1108437635031136, + "learning_rate": 8.062283737024222e-06, + "loss": 0.6994, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 2.124414466611215, + "learning_rate": 8.09688581314879e-06, + "loss": 0.6547, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 2.0520237300562854, + "learning_rate": 8.131487889273357e-06, + "loss": 0.6525, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 1.9714185147383696, + "learning_rate": 8.166089965397924e-06, + "loss": 0.6038, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 1.939078039152394, + "learning_rate": 8.200692041522493e-06, + "loss": 0.711, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 2.0670237054998393, + "learning_rate": 8.23529411764706e-06, + "loss": 0.614, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 2.242190530112608, + "learning_rate": 8.269896193771627e-06, + "loss": 0.6014, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 2.1485542120755152, + "learning_rate": 8.304498269896194e-06, + "loss": 0.6358, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 1.8482103878471823, + "learning_rate": 8.339100346020761e-06, + "loss": 0.5711, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 1.9737201478872424, + "learning_rate": 8.373702422145328e-06, + "loss": 0.6359, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 2.4163559055070887, + "learning_rate": 8.408304498269897e-06, + "loss": 0.7363, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 2.244452576938016, + "learning_rate": 8.442906574394465e-06, + "loss": 0.6451, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 2.086004638946662, + "learning_rate": 8.477508650519032e-06, + "loss": 0.7345, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 2.128590628871988, + "learning_rate": 8.5121107266436e-06, + "loss": 0.7326, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 2.235562449068478, + "learning_rate": 8.546712802768166e-06, + "loss": 0.7656, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 2.2509200985615063, + "learning_rate": 8.581314878892735e-06, + "loss": 0.6271, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 2.295331842170622, + "learning_rate": 8.615916955017302e-06, + "loss": 0.6678, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 2.391429221795401, + "learning_rate": 8.65051903114187e-06, + "loss": 0.7425, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 1.9529324621840602, + "learning_rate": 8.685121107266436e-06, + "loss": 0.5745, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 2.246043588101345, + "learning_rate": 8.719723183391005e-06, + "loss": 0.6345, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 2.4904984344511076, + "learning_rate": 8.75432525951557e-06, + "loss": 0.6939, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 2.1642783247420363, + "learning_rate": 8.78892733564014e-06, + "loss": 0.6032, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 2.1940931042243257, + "learning_rate": 8.823529411764707e-06, + "loss": 0.7277, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 1.8948954672045033, + "learning_rate": 8.858131487889274e-06, + "loss": 0.6893, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 2.4582031636820525, + "learning_rate": 8.892733564013841e-06, + "loss": 0.7363, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 2.3058632907633654, + "learning_rate": 8.92733564013841e-06, + "loss": 0.7746, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 1.8228869043171054, + "learning_rate": 8.961937716262975e-06, + "loss": 0.487, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 2.219005308745244, + "learning_rate": 8.996539792387544e-06, + "loss": 0.7147, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 1.9983314588691001, + "learning_rate": 9.031141868512112e-06, + "loss": 0.6375, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 2.5810389232454507, + "learning_rate": 9.065743944636679e-06, + "loss": 0.8198, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 2.319366870420018, + "learning_rate": 9.100346020761246e-06, + "loss": 0.7102, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 2.2528281665453656, + "learning_rate": 9.134948096885815e-06, + "loss": 0.6178, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 2.268866438407876, + "learning_rate": 9.16955017301038e-06, + "loss": 0.649, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 2.1190340028998707, + "learning_rate": 9.204152249134949e-06, + "loss": 0.6279, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 2.0714620681045663, + "learning_rate": 9.238754325259516e-06, + "loss": 0.6938, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 2.325221307942943, + "learning_rate": 9.273356401384083e-06, + "loss": 0.5867, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 2.2470942028995986, + "learning_rate": 9.307958477508652e-06, + "loss": 0.6852, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 2.007572350176665, + "learning_rate": 9.34256055363322e-06, + "loss": 0.5481, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 2.0257482560883275, + "learning_rate": 9.377162629757787e-06, + "loss": 0.6386, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 2.0485510426599878, + "learning_rate": 9.411764705882354e-06, + "loss": 0.7489, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 1.9623446612734317, + "learning_rate": 9.446366782006921e-06, + "loss": 0.5764, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 2.127396095227505, + "learning_rate": 9.480968858131488e-06, + "loss": 0.6338, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 2.215498491322338, + "learning_rate": 9.515570934256057e-06, + "loss": 0.758, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 2.08023339836091, + "learning_rate": 9.550173010380624e-06, + "loss": 0.6621, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 1.9378232010147092, + "learning_rate": 9.584775086505191e-06, + "loss": 0.6002, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 2.1412793080715358, + "learning_rate": 9.619377162629759e-06, + "loss": 0.6288, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 2.095326824165883, + "learning_rate": 9.653979238754326e-06, + "loss": 0.6168, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 2.093548208464301, + "learning_rate": 9.688581314878893e-06, + "loss": 0.6185, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 2.0683217073155142, + "learning_rate": 9.723183391003462e-06, + "loss": 0.6579, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 2.2050921361510256, + "learning_rate": 9.757785467128029e-06, + "loss": 0.6564, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 2.0666754378724512, + "learning_rate": 9.792387543252596e-06, + "loss": 0.6938, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 2.0255579239228676, + "learning_rate": 9.826989619377163e-06, + "loss": 0.5832, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 2.2059232192312477, + "learning_rate": 9.86159169550173e-06, + "loss": 0.6408, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 2.227961737598211, + "learning_rate": 9.896193771626298e-06, + "loss": 0.6834, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 2.384395993502413, + "learning_rate": 9.930795847750866e-06, + "loss": 0.6507, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 2.2259501520536786, + "learning_rate": 9.965397923875434e-06, + "loss": 0.8218, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 2.2153438165060004, + "learning_rate": 1e-05, + "loss": 0.7089, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 2.090933521636028, + "learning_rate": 9.99999971661075e-06, + "loss": 0.6669, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 2.2188852773516707, + "learning_rate": 9.999998866443035e-06, + "loss": 0.8329, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 1.9643905919582283, + "learning_rate": 9.999997449496947e-06, + "loss": 0.604, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 1.7302965204706007, + "learning_rate": 9.999995465772652e-06, + "loss": 0.5917, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 1.836246496796778, + "learning_rate": 9.99999291527037e-06, + "loss": 0.5272, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 2.2820920836002587, + "learning_rate": 9.999989797990391e-06, + "loss": 0.7376, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 2.2425304223811446, + "learning_rate": 9.999986113933071e-06, + "loss": 0.6333, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 2.021448989644418, + "learning_rate": 9.999981863098825e-06, + "loss": 0.5886, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 2.1689623016635897, + "learning_rate": 9.999977045488135e-06, + "loss": 0.6014, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 2.3336741309421347, + "learning_rate": 9.999971661101551e-06, + "loss": 0.7371, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 2.1629679103619783, + "learning_rate": 9.999965709939679e-06, + "loss": 0.6577, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 1.8935808966516685, + "learning_rate": 9.999959192003194e-06, + "loss": 0.5906, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 2.1444391808967738, + "learning_rate": 9.999952107292836e-06, + "loss": 0.7357, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 2.022296007147585, + "learning_rate": 9.999944455809408e-06, + "loss": 0.574, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 2.063507571811703, + "learning_rate": 9.999936237553777e-06, + "loss": 0.6103, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 2.1313448786683913, + "learning_rate": 9.999927452526877e-06, + "loss": 0.6104, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 2.112051884258629, + "learning_rate": 9.999918100729698e-06, + "loss": 0.657, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 2.2604850689998757, + "learning_rate": 9.999908182163306e-06, + "loss": 0.7052, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 2.1654923977851617, + "learning_rate": 9.999897696828822e-06, + "loss": 0.6801, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 1.996921654859269, + "learning_rate": 9.999886644727436e-06, + "loss": 0.6856, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 2.145453425352305, + "learning_rate": 9.999875025860401e-06, + "loss": 0.6513, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 2.234415993232513, + "learning_rate": 9.999862840229033e-06, + "loss": 0.6971, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 2.0645659727143943, + "learning_rate": 9.999850087834715e-06, + "loss": 0.7306, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 1.8744568621278508, + "learning_rate": 9.99983676867889e-06, + "loss": 0.638, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 2.3222295206374604, + "learning_rate": 9.999822882763068e-06, + "loss": 0.6553, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 2.099484652851686, + "learning_rate": 9.999808430088826e-06, + "loss": 0.7374, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 2.228145088882417, + "learning_rate": 9.999793410657802e-06, + "loss": 0.8169, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 2.1291708934516986, + "learning_rate": 9.999777824471694e-06, + "loss": 0.631, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 2.080530025890949, + "learning_rate": 9.999761671532273e-06, + "loss": 0.6407, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 2.119009430136004, + "learning_rate": 9.999744951841367e-06, + "loss": 0.6849, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 1.8955703890753102, + "learning_rate": 9.999727665400876e-06, + "loss": 0.7005, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 2.1038990841547496, + "learning_rate": 9.999709812212756e-06, + "loss": 0.6685, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 2.2589247469878213, + "learning_rate": 9.99969139227903e-06, + "loss": 0.7754, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 2.1825354390943943, + "learning_rate": 9.99967240560179e-06, + "loss": 0.7121, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 1.916129116293965, + "learning_rate": 9.999652852183184e-06, + "loss": 0.5697, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 2.0643197115454033, + "learning_rate": 9.999632732025428e-06, + "loss": 0.6152, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 2.0612939818120957, + "learning_rate": 9.99961204513081e-06, + "loss": 0.6136, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 2.216817383090462, + "learning_rate": 9.999590791501665e-06, + "loss": 0.5763, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 2.0495658313065555, + "learning_rate": 9.999568971140409e-06, + "loss": 0.6226, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 2.0958367292073277, + "learning_rate": 9.999546584049513e-06, + "loss": 0.7229, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 2.0926501956699592, + "learning_rate": 9.999523630231516e-06, + "loss": 0.7376, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 1.9068405784141171, + "learning_rate": 9.999500109689018e-06, + "loss": 0.5987, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 2.0006081400254963, + "learning_rate": 9.999476022424688e-06, + "loss": 0.6085, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.9085783103448943, + "learning_rate": 9.999451368441254e-06, + "loss": 0.564, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 2.1115680818865235, + "learning_rate": 9.999426147741512e-06, + "loss": 0.6871, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 2.1237154103762186, + "learning_rate": 9.999400360328318e-06, + "loss": 0.6901, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 1.929206892415265, + "learning_rate": 9.999374006204601e-06, + "loss": 0.6018, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 2.0204061701835094, + "learning_rate": 9.999347085373343e-06, + "loss": 0.645, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 1.9226738874188818, + "learning_rate": 9.999319597837599e-06, + "loss": 0.6821, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 2.107623603422912, + "learning_rate": 9.99929154360048e-06, + "loss": 0.6378, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 1.8743119821525225, + "learning_rate": 9.999262922665172e-06, + "loss": 0.59, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 1.8550368642600081, + "learning_rate": 9.999233735034916e-06, + "loss": 0.6298, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 2.079258305023673, + "learning_rate": 9.999203980713023e-06, + "loss": 0.7754, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 2.16644850862517, + "learning_rate": 9.999173659702863e-06, + "loss": 0.5724, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 1.7680175701749203, + "learning_rate": 9.999142772007875e-06, + "loss": 0.6038, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 1.9697469124412608, + "learning_rate": 9.999111317631559e-06, + "loss": 0.6786, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 2.0214076901251707, + "learning_rate": 9.999079296577482e-06, + "loss": 0.7071, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 2.1220150741444526, + "learning_rate": 9.99904670884927e-06, + "loss": 0.7243, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 2.023388262077159, + "learning_rate": 9.999013554450624e-06, + "loss": 0.8155, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 2.047321068044065, + "learning_rate": 9.998979833385296e-06, + "loss": 0.6606, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 2.090748901325982, + "learning_rate": 9.998945545657113e-06, + "loss": 0.6928, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 2.052795459098353, + "learning_rate": 9.998910691269957e-06, + "loss": 0.8328, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 2.402064265656104, + "learning_rate": 9.998875270227781e-06, + "loss": 0.672, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 2.1481436238474636, + "learning_rate": 9.998839282534602e-06, + "loss": 0.6665, + "step": 353 + }, + { + "epoch": 0.04, + "grad_norm": 2.2395539693277526, + "learning_rate": 9.998802728194496e-06, + "loss": 0.7083, + "step": 354 + }, + { + "epoch": 0.04, + "grad_norm": 1.9776107871723176, + "learning_rate": 9.998765607211612e-06, + "loss": 0.6582, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 2.147090458916406, + "learning_rate": 9.99872791959015e-06, + "loss": 0.6026, + "step": 356 + }, + { + "epoch": 0.04, + "grad_norm": 2.034674606820822, + "learning_rate": 9.99868966533439e-06, + "loss": 0.6911, + "step": 357 + }, + { + "epoch": 0.04, + "grad_norm": 2.0951350122740298, + "learning_rate": 9.998650844448663e-06, + "loss": 0.6416, + "step": 358 + }, + { + "epoch": 0.04, + "grad_norm": 2.089497478746895, + "learning_rate": 9.998611456937373e-06, + "loss": 0.6531, + "step": 359 + }, + { + "epoch": 0.04, + "grad_norm": 2.0664793585495898, + "learning_rate": 9.998571502804982e-06, + "loss": 0.6725, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 1.8814458336505542, + "learning_rate": 9.998530982056021e-06, + "loss": 0.6481, + "step": 361 + }, + { + "epoch": 0.04, + "grad_norm": 1.774755021585096, + "learning_rate": 9.99848989469508e-06, + "loss": 0.6176, + "step": 362 + }, + { + "epoch": 0.04, + "grad_norm": 2.1620141766232686, + "learning_rate": 9.998448240726822e-06, + "loss": 0.6294, + "step": 363 + }, + { + "epoch": 0.04, + "grad_norm": 2.288599249402456, + "learning_rate": 9.998406020155964e-06, + "loss": 0.6562, + "step": 364 + }, + { + "epoch": 0.04, + "grad_norm": 1.9908199779664653, + "learning_rate": 9.998363232987294e-06, + "loss": 0.687, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 2.139573273038414, + "learning_rate": 9.998319879225662e-06, + "loss": 0.591, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 1.951025573732339, + "learning_rate": 9.998275958875983e-06, + "loss": 0.7088, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 2.185154704796897, + "learning_rate": 9.998231471943234e-06, + "loss": 0.7013, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 2.093786252997598, + "learning_rate": 9.998186418432459e-06, + "loss": 0.7835, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 2.2585179169164817, + "learning_rate": 9.998140798348766e-06, + "loss": 0.7529, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 2.086435056903612, + "learning_rate": 9.998094611697322e-06, + "loss": 0.783, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 2.076639780170476, + "learning_rate": 9.998047858483369e-06, + "loss": 0.7017, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 2.1634960145020017, + "learning_rate": 9.9980005387122e-06, + "loss": 0.7516, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 2.359129077276345, + "learning_rate": 9.997952652389184e-06, + "loss": 0.7177, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 2.69710045388187, + "learning_rate": 9.997904199519748e-06, + "loss": 0.7553, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 1.932081602102441, + "learning_rate": 9.997855180109383e-06, + "loss": 0.7331, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 2.155197890696538, + "learning_rate": 9.997805594163646e-06, + "loss": 0.7416, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 1.9314889268578335, + "learning_rate": 9.997755441688159e-06, + "loss": 0.7268, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 2.0768241365813447, + "learning_rate": 9.997704722688607e-06, + "loss": 0.696, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 2.1302481157387874, + "learning_rate": 9.997653437170739e-06, + "loss": 0.8132, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 1.8926932428974472, + "learning_rate": 9.997601585140367e-06, + "loss": 0.5856, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 2.1087789610541, + "learning_rate": 9.99754916660337e-06, + "loss": 0.7283, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 2.0973004274036833, + "learning_rate": 9.997496181565691e-06, + "loss": 0.615, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 1.9798386111356991, + "learning_rate": 9.997442630033333e-06, + "loss": 0.6571, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 1.9119680312757281, + "learning_rate": 9.997388512012371e-06, + "loss": 0.6829, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 2.132931602798204, + "learning_rate": 9.997333827508936e-06, + "loss": 0.8136, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 1.926068179217276, + "learning_rate": 9.997278576529228e-06, + "loss": 0.6462, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 1.8398296773499356, + "learning_rate": 9.997222759079507e-06, + "loss": 0.6632, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 2.2236470134644355, + "learning_rate": 9.997166375166107e-06, + "loss": 0.7079, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 2.3302822862993993, + "learning_rate": 9.997109424795415e-06, + "loss": 0.6415, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 2.0538248803181034, + "learning_rate": 9.997051907973885e-06, + "loss": 0.6829, + "step": 391 + }, + { + "epoch": 0.04, + "grad_norm": 2.039125915190051, + "learning_rate": 9.99699382470804e-06, + "loss": 0.6388, + "step": 392 + }, + { + "epoch": 0.04, + "grad_norm": 1.9562271717913728, + "learning_rate": 9.996935175004462e-06, + "loss": 0.5704, + "step": 393 + }, + { + "epoch": 0.04, + "grad_norm": 2.4363968316295357, + "learning_rate": 9.996875958869803e-06, + "loss": 0.5985, + "step": 394 + }, + { + "epoch": 0.04, + "grad_norm": 1.8544403613485045, + "learning_rate": 9.996816176310771e-06, + "loss": 0.5727, + "step": 395 + }, + { + "epoch": 0.04, + "grad_norm": 1.8820693987504735, + "learning_rate": 9.996755827334145e-06, + "loss": 0.5675, + "step": 396 + }, + { + "epoch": 0.04, + "grad_norm": 2.008444476428537, + "learning_rate": 9.996694911946765e-06, + "loss": 0.8005, + "step": 397 + }, + { + "epoch": 0.04, + "grad_norm": 2.3530880181617486, + "learning_rate": 9.996633430155537e-06, + "loss": 0.7435, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 4.325936114579622, + "learning_rate": 9.99657138196743e-06, + "loss": 0.6764, + "step": 399 + }, + { + "epoch": 0.04, + "grad_norm": 2.780809596195899, + "learning_rate": 9.996508767389477e-06, + "loss": 0.7005, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 1.915152329293021, + "learning_rate": 9.996445586428776e-06, + "loss": 0.6623, + "step": 401 + }, + { + "epoch": 0.04, + "grad_norm": 6.956582503534672, + "learning_rate": 9.996381839092489e-06, + "loss": 0.7644, + "step": 402 + }, + { + "epoch": 0.04, + "grad_norm": 5.432085078434551, + "learning_rate": 9.996317525387841e-06, + "loss": 0.7633, + "step": 403 + }, + { + "epoch": 0.04, + "grad_norm": 2.2033123567972406, + "learning_rate": 9.996252645322124e-06, + "loss": 0.6479, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 1.8373000696608504, + "learning_rate": 9.996187198902693e-06, + "loss": 0.605, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 2.115896017831422, + "learning_rate": 9.996121186136964e-06, + "loss": 0.7126, + "step": 406 + }, + { + "epoch": 0.04, + "grad_norm": 1.9436621726914547, + "learning_rate": 9.996054607032425e-06, + "loss": 0.5966, + "step": 407 + }, + { + "epoch": 0.04, + "grad_norm": 3.2683255229608346, + "learning_rate": 9.995987461596617e-06, + "loss": 0.7094, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 2.1334958544798432, + "learning_rate": 9.995919749837154e-06, + "loss": 0.7025, + "step": 409 + }, + { + "epoch": 0.04, + "grad_norm": 3.3033075412526345, + "learning_rate": 9.995851471761711e-06, + "loss": 0.7209, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 2.069310070603776, + "learning_rate": 9.99578262737803e-06, + "loss": 0.7111, + "step": 411 + }, + { + "epoch": 0.04, + "grad_norm": 3.7124534446102473, + "learning_rate": 9.995713216693913e-06, + "loss": 0.7023, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 2.116142709972029, + "learning_rate": 9.995643239717228e-06, + "loss": 0.6724, + "step": 413 + }, + { + "epoch": 0.04, + "grad_norm": 2.04036069618067, + "learning_rate": 9.995572696455907e-06, + "loss": 0.7658, + "step": 414 + }, + { + "epoch": 0.04, + "grad_norm": 2.080715602390722, + "learning_rate": 9.995501586917949e-06, + "loss": 0.67, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 2.104421238433803, + "learning_rate": 9.99542991111141e-06, + "loss": 0.6032, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 2.2301607740265124, + "learning_rate": 9.995357669044418e-06, + "loss": 0.616, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 2.0462075925593584, + "learning_rate": 9.995284860725162e-06, + "loss": 0.6749, + "step": 418 + }, + { + "epoch": 0.04, + "grad_norm": 2.100344727358706, + "learning_rate": 9.995211486161896e-06, + "loss": 0.6815, + "step": 419 + }, + { + "epoch": 0.04, + "grad_norm": 2.3266601471436794, + "learning_rate": 9.995137545362937e-06, + "loss": 0.8017, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 1.9367072875868272, + "learning_rate": 9.995063038336663e-06, + "loss": 0.6028, + "step": 421 + }, + { + "epoch": 0.04, + "grad_norm": 2.2292281685756214, + "learning_rate": 9.994987965091525e-06, + "loss": 0.6752, + "step": 422 + }, + { + "epoch": 0.04, + "grad_norm": 2.129402276054949, + "learning_rate": 9.994912325636029e-06, + "loss": 0.6371, + "step": 423 + }, + { + "epoch": 0.04, + "grad_norm": 2.070140533823007, + "learning_rate": 9.99483611997875e-06, + "loss": 0.8579, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 3.418866897472175, + "learning_rate": 9.994759348128331e-06, + "loss": 0.6827, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 1.8482776838557704, + "learning_rate": 9.994682010093468e-06, + "loss": 0.6993, + "step": 426 + }, + { + "epoch": 0.04, + "grad_norm": 2.2212930148340524, + "learning_rate": 9.99460410588293e-06, + "loss": 0.6308, + "step": 427 + }, + { + "epoch": 0.04, + "grad_norm": 1.872863356848253, + "learning_rate": 9.99452563550555e-06, + "loss": 0.6843, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 2.1550669400337763, + "learning_rate": 9.99444659897022e-06, + "loss": 0.6675, + "step": 429 + }, + { + "epoch": 0.04, + "grad_norm": 2.3214088233689716, + "learning_rate": 9.994366996285903e-06, + "loss": 0.5922, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 1.9772960079757542, + "learning_rate": 9.994286827461616e-06, + "loss": 0.5801, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 2.0146780220081135, + "learning_rate": 9.994206092506455e-06, + "loss": 0.6058, + "step": 432 + }, + { + "epoch": 0.05, + "grad_norm": 1.985534965943388, + "learning_rate": 9.994124791429565e-06, + "loss": 0.6692, + "step": 433 + }, + { + "epoch": 0.05, + "grad_norm": 2.010878212245351, + "learning_rate": 9.994042924240164e-06, + "loss": 0.6293, + "step": 434 + }, + { + "epoch": 0.05, + "grad_norm": 2.2715551445991986, + "learning_rate": 9.993960490947533e-06, + "loss": 0.6041, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 2.2311954165914183, + "learning_rate": 9.993877491561015e-06, + "loss": 0.7262, + "step": 436 + }, + { + "epoch": 0.05, + "grad_norm": 2.2721565123934253, + "learning_rate": 9.99379392609002e-06, + "loss": 0.5909, + "step": 437 + }, + { + "epoch": 0.05, + "grad_norm": 2.0679330089748333, + "learning_rate": 9.993709794544022e-06, + "loss": 0.6332, + "step": 438 + }, + { + "epoch": 0.05, + "grad_norm": 2.018072655643991, + "learning_rate": 9.993625096932552e-06, + "loss": 0.7004, + "step": 439 + }, + { + "epoch": 0.05, + "grad_norm": 2.1394905686491663, + "learning_rate": 9.993539833265216e-06, + "loss": 0.6536, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 2.040527633692036, + "learning_rate": 9.993454003551676e-06, + "loss": 0.6955, + "step": 441 + }, + { + "epoch": 0.05, + "grad_norm": 1.890590599198691, + "learning_rate": 9.993367607801666e-06, + "loss": 0.5941, + "step": 442 + }, + { + "epoch": 0.05, + "grad_norm": 1.9929644547312386, + "learning_rate": 9.993280646024975e-06, + "loss": 0.6812, + "step": 443 + }, + { + "epoch": 0.05, + "grad_norm": 1.961053725767758, + "learning_rate": 9.993193118231463e-06, + "loss": 0.6022, + "step": 444 + }, + { + "epoch": 0.05, + "grad_norm": 1.9181967448391344, + "learning_rate": 9.993105024431049e-06, + "loss": 0.6078, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 2.968002625002131, + "learning_rate": 9.99301636463372e-06, + "loss": 0.6773, + "step": 446 + }, + { + "epoch": 0.05, + "grad_norm": 2.4130336710151465, + "learning_rate": 9.99292713884953e-06, + "loss": 0.6979, + "step": 447 + }, + { + "epoch": 0.05, + "grad_norm": 2.1316184739110313, + "learning_rate": 9.992837347088589e-06, + "loss": 0.6288, + "step": 448 + }, + { + "epoch": 0.05, + "grad_norm": 1.892151196066065, + "learning_rate": 9.992746989361075e-06, + "loss": 0.6294, + "step": 449 + }, + { + "epoch": 0.05, + "grad_norm": 1.9575225798059654, + "learning_rate": 9.992656065677234e-06, + "loss": 0.7004, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 2.2829375163660637, + "learning_rate": 9.99256457604737e-06, + "loss": 0.7471, + "step": 451 + }, + { + "epoch": 0.05, + "grad_norm": 1.9971984927642012, + "learning_rate": 9.992472520481852e-06, + "loss": 0.6576, + "step": 452 + }, + { + "epoch": 0.05, + "grad_norm": 2.2277902080354606, + "learning_rate": 9.99237989899112e-06, + "loss": 0.7368, + "step": 453 + }, + { + "epoch": 0.05, + "grad_norm": 1.9400694676938235, + "learning_rate": 9.992286711585673e-06, + "loss": 0.7523, + "step": 454 + }, + { + "epoch": 0.05, + "grad_norm": 2.088769530783493, + "learning_rate": 9.992192958276068e-06, + "loss": 0.7252, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 1.932950291206959, + "learning_rate": 9.99209863907294e-06, + "loss": 0.7604, + "step": 456 + }, + { + "epoch": 0.05, + "grad_norm": 2.024874686283672, + "learning_rate": 9.992003753986976e-06, + "loss": 0.6524, + "step": 457 + }, + { + "epoch": 0.05, + "grad_norm": 1.87618503750641, + "learning_rate": 9.991908303028932e-06, + "loss": 0.6223, + "step": 458 + }, + { + "epoch": 0.05, + "grad_norm": 2.036129483108596, + "learning_rate": 9.99181228620963e-06, + "loss": 0.7189, + "step": 459 + }, + { + "epoch": 0.05, + "grad_norm": 1.785804649378539, + "learning_rate": 9.991715703539952e-06, + "loss": 0.6044, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 2.090553085564057, + "learning_rate": 9.991618555030848e-06, + "loss": 0.6913, + "step": 461 + }, + { + "epoch": 0.05, + "grad_norm": 1.9098029143505888, + "learning_rate": 9.991520840693331e-06, + "loss": 0.7355, + "step": 462 + }, + { + "epoch": 0.05, + "grad_norm": 2.1358723717085457, + "learning_rate": 9.991422560538475e-06, + "loss": 0.6728, + "step": 463 + }, + { + "epoch": 0.05, + "grad_norm": 1.7182227416827212, + "learning_rate": 9.991323714577421e-06, + "loss": 0.6789, + "step": 464 + }, + { + "epoch": 0.05, + "grad_norm": 1.9285440404571363, + "learning_rate": 9.991224302821374e-06, + "loss": 0.5603, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 2.317719534583608, + "learning_rate": 9.991124325281603e-06, + "loss": 0.8379, + "step": 466 + }, + { + "epoch": 0.05, + "grad_norm": 1.8938461805189346, + "learning_rate": 9.991023781969442e-06, + "loss": 0.5413, + "step": 467 + }, + { + "epoch": 0.05, + "grad_norm": 1.843694757502963, + "learning_rate": 9.990922672896288e-06, + "loss": 0.6094, + "step": 468 + }, + { + "epoch": 0.05, + "grad_norm": 2.073935582799042, + "learning_rate": 9.9908209980736e-06, + "loss": 0.5987, + "step": 469 + }, + { + "epoch": 0.05, + "grad_norm": 2.2169101162154323, + "learning_rate": 9.990718757512906e-06, + "loss": 0.6508, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 2.149787600797134, + "learning_rate": 9.990615951225797e-06, + "loss": 0.7176, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 1.9433589280184018, + "learning_rate": 9.990512579223921e-06, + "loss": 0.7356, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 2.0519977838803003, + "learning_rate": 9.990408641519e-06, + "loss": 0.6478, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 1.9732250611772912, + "learning_rate": 9.990304138122818e-06, + "loss": 0.7158, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 2.0862447871994005, + "learning_rate": 9.990199069047216e-06, + "loss": 0.7862, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 1.9627513971284538, + "learning_rate": 9.990093434304104e-06, + "loss": 0.7475, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 2.5440687784424054, + "learning_rate": 9.989987233905462e-06, + "loss": 0.6919, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 2.011594577111942, + "learning_rate": 9.989880467863323e-06, + "loss": 0.6319, + "step": 478 + }, + { + "epoch": 0.05, + "grad_norm": 1.8685240663183704, + "learning_rate": 9.989773136189793e-06, + "loss": 0.6188, + "step": 479 + }, + { + "epoch": 0.05, + "grad_norm": 2.031643677974096, + "learning_rate": 9.989665238897036e-06, + "loss": 0.7435, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 1.9163910387696532, + "learning_rate": 9.989556775997284e-06, + "loss": 0.6668, + "step": 481 + }, + { + "epoch": 0.05, + "grad_norm": 2.017766119421384, + "learning_rate": 9.989447747502834e-06, + "loss": 0.6212, + "step": 482 + }, + { + "epoch": 0.05, + "grad_norm": 2.145062820353988, + "learning_rate": 9.98933815342604e-06, + "loss": 0.8157, + "step": 483 + }, + { + "epoch": 0.05, + "grad_norm": 1.9832549864486289, + "learning_rate": 9.989227993779332e-06, + "loss": 0.6679, + "step": 484 + }, + { + "epoch": 0.05, + "grad_norm": 1.8769152114042982, + "learning_rate": 9.98911726857519e-06, + "loss": 0.6053, + "step": 485 + }, + { + "epoch": 0.05, + "grad_norm": 2.135665021547759, + "learning_rate": 9.98900597782617e-06, + "loss": 0.7071, + "step": 486 + }, + { + "epoch": 0.05, + "grad_norm": 2.1442490339858016, + "learning_rate": 9.988894121544885e-06, + "loss": 0.671, + "step": 487 + }, + { + "epoch": 0.05, + "grad_norm": 2.3109603092567896, + "learning_rate": 9.988781699744016e-06, + "loss": 0.6207, + "step": 488 + }, + { + "epoch": 0.05, + "grad_norm": 5.03188736806228, + "learning_rate": 9.988668712436306e-06, + "loss": 0.6313, + "step": 489 + }, + { + "epoch": 0.05, + "grad_norm": 2.019550529610631, + "learning_rate": 9.988555159634563e-06, + "loss": 0.6206, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 2.0704169777481303, + "learning_rate": 9.98844104135166e-06, + "loss": 0.6363, + "step": 491 + }, + { + "epoch": 0.05, + "grad_norm": 1.9685873032572212, + "learning_rate": 9.98832635760053e-06, + "loss": 0.6772, + "step": 492 + }, + { + "epoch": 0.05, + "grad_norm": 1.8690658409387129, + "learning_rate": 9.988211108394177e-06, + "loss": 0.6678, + "step": 493 + }, + { + "epoch": 0.05, + "grad_norm": 2.192593402844829, + "learning_rate": 9.98809529374566e-06, + "loss": 0.5719, + "step": 494 + }, + { + "epoch": 0.05, + "grad_norm": 2.064493939887632, + "learning_rate": 9.987978913668112e-06, + "loss": 0.6047, + "step": 495 + }, + { + "epoch": 0.05, + "grad_norm": 1.9756919384468792, + "learning_rate": 9.987861968174723e-06, + "loss": 0.6073, + "step": 496 + }, + { + "epoch": 0.05, + "grad_norm": 1.9737506247163636, + "learning_rate": 9.987744457278753e-06, + "loss": 0.6947, + "step": 497 + }, + { + "epoch": 0.05, + "grad_norm": 1.904287026030052, + "learning_rate": 9.987626380993516e-06, + "loss": 0.7352, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 2.0224472723257634, + "learning_rate": 9.987507739332401e-06, + "loss": 0.614, + "step": 499 + }, + { + "epoch": 0.05, + "grad_norm": 2.0295872447401706, + "learning_rate": 9.987388532308858e-06, + "loss": 0.6972, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 2.0149908366267906, + "learning_rate": 9.987268759936396e-06, + "loss": 0.7121, + "step": 501 + }, + { + "epoch": 0.05, + "grad_norm": 2.0962870953861183, + "learning_rate": 9.987148422228591e-06, + "loss": 0.6515, + "step": 502 + }, + { + "epoch": 0.05, + "grad_norm": 1.984269566815456, + "learning_rate": 9.98702751919909e-06, + "loss": 0.7571, + "step": 503 + }, + { + "epoch": 0.05, + "grad_norm": 2.1123697955385983, + "learning_rate": 9.986906050861595e-06, + "loss": 0.6053, + "step": 504 + }, + { + "epoch": 0.05, + "grad_norm": 2.3344562773460167, + "learning_rate": 9.986784017229873e-06, + "loss": 0.7462, + "step": 505 + }, + { + "epoch": 0.05, + "grad_norm": 1.9590396488402444, + "learning_rate": 9.986661418317759e-06, + "loss": 0.6522, + "step": 506 + }, + { + "epoch": 0.05, + "grad_norm": 2.2031954129578954, + "learning_rate": 9.986538254139151e-06, + "loss": 0.742, + "step": 507 + }, + { + "epoch": 0.05, + "grad_norm": 1.8872521125527797, + "learning_rate": 9.98641452470801e-06, + "loss": 0.5652, + "step": 508 + }, + { + "epoch": 0.05, + "grad_norm": 2.0562007032310485, + "learning_rate": 9.986290230038359e-06, + "loss": 0.6684, + "step": 509 + }, + { + "epoch": 0.05, + "grad_norm": 1.9331683041832113, + "learning_rate": 9.986165370144291e-06, + "loss": 0.5762, + "step": 510 + }, + { + "epoch": 0.05, + "grad_norm": 1.9644374337511354, + "learning_rate": 9.986039945039959e-06, + "loss": 0.6082, + "step": 511 + }, + { + "epoch": 0.05, + "grad_norm": 1.8950957493260308, + "learning_rate": 9.985913954739577e-06, + "loss": 0.5627, + "step": 512 + }, + { + "epoch": 0.05, + "grad_norm": 1.897071876769113, + "learning_rate": 9.985787399257431e-06, + "loss": 0.6015, + "step": 513 + }, + { + "epoch": 0.05, + "grad_norm": 2.2177366478263383, + "learning_rate": 9.985660278607865e-06, + "loss": 0.7003, + "step": 514 + }, + { + "epoch": 0.05, + "grad_norm": 2.229245926344614, + "learning_rate": 9.985532592805289e-06, + "loss": 0.6129, + "step": 515 + }, + { + "epoch": 0.05, + "grad_norm": 2.312963580341358, + "learning_rate": 9.985404341864178e-06, + "loss": 0.849, + "step": 516 + }, + { + "epoch": 0.05, + "grad_norm": 2.121250005175201, + "learning_rate": 9.985275525799069e-06, + "loss": 0.6587, + "step": 517 + }, + { + "epoch": 0.05, + "grad_norm": 1.999789191521043, + "learning_rate": 9.985146144624563e-06, + "loss": 0.7018, + "step": 518 + }, + { + "epoch": 0.05, + "grad_norm": 1.6613885225525393, + "learning_rate": 9.985016198355328e-06, + "loss": 0.5701, + "step": 519 + }, + { + "epoch": 0.05, + "grad_norm": 2.048330879977503, + "learning_rate": 9.984885687006093e-06, + "loss": 0.7511, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 2.1663530153067065, + "learning_rate": 9.98475461059165e-06, + "loss": 0.5997, + "step": 521 + }, + { + "epoch": 0.05, + "grad_norm": 2.1390771494454555, + "learning_rate": 9.984622969126864e-06, + "loss": 0.7191, + "step": 522 + }, + { + "epoch": 0.05, + "grad_norm": 1.9328096420880079, + "learning_rate": 9.984490762626651e-06, + "loss": 0.6176, + "step": 523 + }, + { + "epoch": 0.05, + "grad_norm": 1.850042905202259, + "learning_rate": 9.984357991105999e-06, + "loss": 0.7516, + "step": 524 + }, + { + "epoch": 0.05, + "grad_norm": 2.125264947491382, + "learning_rate": 9.984224654579959e-06, + "loss": 0.6857, + "step": 525 + }, + { + "epoch": 0.05, + "grad_norm": 2.34692785182022, + "learning_rate": 9.984090753063647e-06, + "loss": 0.6944, + "step": 526 + }, + { + "epoch": 0.05, + "grad_norm": 2.0665039132353433, + "learning_rate": 9.983956286572238e-06, + "loss": 0.7515, + "step": 527 + }, + { + "epoch": 0.05, + "grad_norm": 2.085772307929629, + "learning_rate": 9.983821255120977e-06, + "loss": 0.6996, + "step": 528 + }, + { + "epoch": 0.05, + "grad_norm": 1.9510182753018945, + "learning_rate": 9.98368565872517e-06, + "loss": 0.7206, + "step": 529 + }, + { + "epoch": 0.06, + "grad_norm": 2.357416665846521, + "learning_rate": 9.983549497400187e-06, + "loss": 0.5892, + "step": 530 + }, + { + "epoch": 0.06, + "grad_norm": 2.108482481371756, + "learning_rate": 9.983412771161463e-06, + "loss": 0.605, + "step": 531 + }, + { + "epoch": 0.06, + "grad_norm": 2.118696322223107, + "learning_rate": 9.983275480024498e-06, + "loss": 0.5796, + "step": 532 + }, + { + "epoch": 0.06, + "grad_norm": 1.9269095508935703, + "learning_rate": 9.983137624004851e-06, + "loss": 0.5309, + "step": 533 + }, + { + "epoch": 0.06, + "grad_norm": 1.9783461770372632, + "learning_rate": 9.982999203118153e-06, + "loss": 0.68, + "step": 534 + }, + { + "epoch": 0.06, + "grad_norm": 2.1354909936380326, + "learning_rate": 9.982860217380096e-06, + "loss": 0.6303, + "step": 535 + }, + { + "epoch": 0.06, + "grad_norm": 2.1276687172019457, + "learning_rate": 9.982720666806427e-06, + "loss": 0.6457, + "step": 536 + }, + { + "epoch": 0.06, + "grad_norm": 2.5725435712780516, + "learning_rate": 9.982580551412972e-06, + "loss": 0.6247, + "step": 537 + }, + { + "epoch": 0.06, + "grad_norm": 2.0827610945764903, + "learning_rate": 9.982439871215612e-06, + "loss": 0.7351, + "step": 538 + }, + { + "epoch": 0.06, + "grad_norm": 1.928191390389645, + "learning_rate": 9.982298626230295e-06, + "loss": 0.6179, + "step": 539 + }, + { + "epoch": 0.06, + "grad_norm": 2.0571831064927184, + "learning_rate": 9.982156816473029e-06, + "loss": 0.5946, + "step": 540 + }, + { + "epoch": 0.06, + "grad_norm": 1.9831697955213374, + "learning_rate": 9.982014441959891e-06, + "loss": 0.7186, + "step": 541 + }, + { + "epoch": 0.06, + "grad_norm": 1.8500259652367927, + "learning_rate": 9.98187150270702e-06, + "loss": 0.6823, + "step": 542 + }, + { + "epoch": 0.06, + "grad_norm": 2.026854564673327, + "learning_rate": 9.981727998730616e-06, + "loss": 0.7699, + "step": 543 + }, + { + "epoch": 0.06, + "grad_norm": 1.9807028683174117, + "learning_rate": 9.98158393004695e-06, + "loss": 0.6984, + "step": 544 + }, + { + "epoch": 0.06, + "grad_norm": 2.589963467561242, + "learning_rate": 9.981439296672352e-06, + "loss": 0.822, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 1.8283692667191949, + "learning_rate": 9.981294098623215e-06, + "loss": 0.668, + "step": 546 + }, + { + "epoch": 0.06, + "grad_norm": 2.0929282423372433, + "learning_rate": 9.981148335916e-06, + "loss": 0.6257, + "step": 547 + }, + { + "epoch": 0.06, + "grad_norm": 1.9273830056696597, + "learning_rate": 9.98100200856723e-06, + "loss": 0.6686, + "step": 548 + }, + { + "epoch": 0.06, + "grad_norm": 2.1577089469497137, + "learning_rate": 9.980855116593494e-06, + "loss": 0.6959, + "step": 549 + }, + { + "epoch": 0.06, + "grad_norm": 1.8461204532541409, + "learning_rate": 9.980707660011437e-06, + "loss": 0.6695, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 2.0891244440911216, + "learning_rate": 9.980559638837778e-06, + "loss": 0.7155, + "step": 551 + }, + { + "epoch": 0.06, + "grad_norm": 4.60684984565562, + "learning_rate": 9.980411053089298e-06, + "loss": 0.7105, + "step": 552 + }, + { + "epoch": 0.06, + "grad_norm": 2.274419285519808, + "learning_rate": 9.980261902782835e-06, + "loss": 0.6461, + "step": 553 + }, + { + "epoch": 0.06, + "grad_norm": 2.1995811698396714, + "learning_rate": 9.9801121879353e-06, + "loss": 0.6659, + "step": 554 + }, + { + "epoch": 0.06, + "grad_norm": 2.24104794740926, + "learning_rate": 9.979961908563663e-06, + "loss": 0.6843, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 2.027896058717004, + "learning_rate": 9.97981106468496e-06, + "loss": 0.6571, + "step": 556 + }, + { + "epoch": 0.06, + "grad_norm": 1.8618556456848303, + "learning_rate": 9.979659656316288e-06, + "loss": 0.6097, + "step": 557 + }, + { + "epoch": 0.06, + "grad_norm": 2.011473620144725, + "learning_rate": 9.97950768347481e-06, + "loss": 0.5543, + "step": 558 + }, + { + "epoch": 0.06, + "grad_norm": 1.8198767395429643, + "learning_rate": 9.979355146177754e-06, + "loss": 0.5747, + "step": 559 + }, + { + "epoch": 0.06, + "grad_norm": 2.1523169045634907, + "learning_rate": 9.97920204444241e-06, + "loss": 0.5956, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 2.1313458275799584, + "learning_rate": 9.979048378286134e-06, + "loss": 0.6804, + "step": 561 + }, + { + "epoch": 0.06, + "grad_norm": 1.972092650478314, + "learning_rate": 9.978894147726346e-06, + "loss": 0.685, + "step": 562 + }, + { + "epoch": 0.06, + "grad_norm": 2.0429159532350414, + "learning_rate": 9.978739352780528e-06, + "loss": 0.5422, + "step": 563 + }, + { + "epoch": 0.06, + "grad_norm": 2.4276858350172383, + "learning_rate": 9.978583993466224e-06, + "loss": 0.6217, + "step": 564 + }, + { + "epoch": 0.06, + "grad_norm": 2.117733238279458, + "learning_rate": 9.97842806980105e-06, + "loss": 0.8119, + "step": 565 + }, + { + "epoch": 0.06, + "grad_norm": 2.0507740651286666, + "learning_rate": 9.978271581802677e-06, + "loss": 0.6132, + "step": 566 + }, + { + "epoch": 0.06, + "grad_norm": 1.8381248701512136, + "learning_rate": 9.978114529488845e-06, + "loss": 0.6151, + "step": 567 + }, + { + "epoch": 0.06, + "grad_norm": 2.251649730917416, + "learning_rate": 9.977956912877356e-06, + "loss": 0.6969, + "step": 568 + }, + { + "epoch": 0.06, + "grad_norm": 2.085417466229146, + "learning_rate": 9.977798731986079e-06, + "loss": 0.6001, + "step": 569 + }, + { + "epoch": 0.06, + "grad_norm": 2.1417707930582948, + "learning_rate": 9.977639986832943e-06, + "loss": 0.6477, + "step": 570 + }, + { + "epoch": 0.06, + "grad_norm": 2.002691296527279, + "learning_rate": 9.977480677435942e-06, + "loss": 0.6204, + "step": 571 + }, + { + "epoch": 0.06, + "grad_norm": 2.3573137310537087, + "learning_rate": 9.977320803813137e-06, + "loss": 0.666, + "step": 572 + }, + { + "epoch": 0.06, + "grad_norm": 2.13178658494689, + "learning_rate": 9.977160365982647e-06, + "loss": 0.6276, + "step": 573 + }, + { + "epoch": 0.06, + "grad_norm": 1.9299279931740343, + "learning_rate": 9.976999363962663e-06, + "loss": 0.6463, + "step": 574 + }, + { + "epoch": 0.06, + "grad_norm": 2.1014604248299285, + "learning_rate": 9.97683779777143e-06, + "loss": 0.6793, + "step": 575 + }, + { + "epoch": 0.06, + "grad_norm": 1.8767137873590285, + "learning_rate": 9.976675667427268e-06, + "loss": 0.5687, + "step": 576 + }, + { + "epoch": 0.06, + "grad_norm": 1.9640671342301481, + "learning_rate": 9.976512972948553e-06, + "loss": 0.6154, + "step": 577 + }, + { + "epoch": 0.06, + "grad_norm": 1.7884320005984804, + "learning_rate": 9.976349714353729e-06, + "loss": 0.6065, + "step": 578 + }, + { + "epoch": 0.06, + "grad_norm": 1.950622961463605, + "learning_rate": 9.976185891661296e-06, + "loss": 0.7064, + "step": 579 + }, + { + "epoch": 0.06, + "grad_norm": 2.3083440495765513, + "learning_rate": 9.976021504889833e-06, + "loss": 0.6901, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 1.963846800423321, + "learning_rate": 9.975856554057968e-06, + "loss": 0.5613, + "step": 581 + }, + { + "epoch": 0.06, + "grad_norm": 2.0211788630851535, + "learning_rate": 9.9756910391844e-06, + "loss": 0.6375, + "step": 582 + }, + { + "epoch": 0.06, + "grad_norm": 1.8268256773710685, + "learning_rate": 9.975524960287895e-06, + "loss": 0.671, + "step": 583 + }, + { + "epoch": 0.06, + "grad_norm": 1.7600000040813513, + "learning_rate": 9.975358317387277e-06, + "loss": 0.5318, + "step": 584 + }, + { + "epoch": 0.06, + "grad_norm": 2.049565771650243, + "learning_rate": 9.975191110501432e-06, + "loss": 0.6675, + "step": 585 + }, + { + "epoch": 0.06, + "grad_norm": 2.314117076668117, + "learning_rate": 9.975023339649317e-06, + "loss": 0.6742, + "step": 586 + }, + { + "epoch": 0.06, + "grad_norm": 2.0701957129127018, + "learning_rate": 9.974855004849952e-06, + "loss": 0.7539, + "step": 587 + }, + { + "epoch": 0.06, + "grad_norm": 1.90648641172633, + "learning_rate": 9.974686106122415e-06, + "loss": 0.6037, + "step": 588 + }, + { + "epoch": 0.06, + "grad_norm": 1.9875373685123054, + "learning_rate": 9.974516643485852e-06, + "loss": 0.7258, + "step": 589 + }, + { + "epoch": 0.06, + "grad_norm": 2.2446293325637647, + "learning_rate": 9.974346616959476e-06, + "loss": 0.7915, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 2.106574799745109, + "learning_rate": 9.974176026562558e-06, + "loss": 0.6764, + "step": 591 + }, + { + "epoch": 0.06, + "grad_norm": 2.062901024543719, + "learning_rate": 9.974004872314435e-06, + "loss": 0.6992, + "step": 592 + }, + { + "epoch": 0.06, + "grad_norm": 2.0645404013113877, + "learning_rate": 9.97383315423451e-06, + "loss": 0.7663, + "step": 593 + }, + { + "epoch": 0.06, + "grad_norm": 2.1783453708050393, + "learning_rate": 9.973660872342244e-06, + "loss": 0.7534, + "step": 594 + }, + { + "epoch": 0.06, + "grad_norm": 2.040114380019622, + "learning_rate": 9.973488026657171e-06, + "loss": 0.7122, + "step": 595 + }, + { + "epoch": 0.06, + "grad_norm": 2.063422864247258, + "learning_rate": 9.973314617198881e-06, + "loss": 0.6104, + "step": 596 + }, + { + "epoch": 0.06, + "grad_norm": 1.8912025270720374, + "learning_rate": 9.973140643987034e-06, + "loss": 0.611, + "step": 597 + }, + { + "epoch": 0.06, + "grad_norm": 2.0682555441292996, + "learning_rate": 9.972966107041349e-06, + "loss": 0.748, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 2.0628034068097745, + "learning_rate": 9.97279100638161e-06, + "loss": 0.7246, + "step": 599 + }, + { + "epoch": 0.06, + "grad_norm": 2.084889035523409, + "learning_rate": 9.972615342027667e-06, + "loss": 0.7109, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 1.8611852818863357, + "learning_rate": 9.972439113999431e-06, + "loss": 0.6385, + "step": 601 + }, + { + "epoch": 0.06, + "grad_norm": 2.0416032338007635, + "learning_rate": 9.97226232231688e-06, + "loss": 0.7019, + "step": 602 + }, + { + "epoch": 0.06, + "grad_norm": 1.9483099939451032, + "learning_rate": 9.972084967000055e-06, + "loss": 0.7654, + "step": 603 + }, + { + "epoch": 0.06, + "grad_norm": 1.7570415982383194, + "learning_rate": 9.971907048069058e-06, + "loss": 0.5443, + "step": 604 + }, + { + "epoch": 0.06, + "grad_norm": 2.3140913687038647, + "learning_rate": 9.97172856554406e-06, + "loss": 0.6249, + "step": 605 + }, + { + "epoch": 0.06, + "grad_norm": 1.943667180111752, + "learning_rate": 9.971549519445288e-06, + "loss": 0.7652, + "step": 606 + }, + { + "epoch": 0.06, + "grad_norm": 2.2683476874649537, + "learning_rate": 9.971369909793043e-06, + "loss": 0.5836, + "step": 607 + }, + { + "epoch": 0.06, + "grad_norm": 1.9552541335662836, + "learning_rate": 9.971189736607681e-06, + "loss": 0.6996, + "step": 608 + }, + { + "epoch": 0.06, + "grad_norm": 2.054084324981065, + "learning_rate": 9.97100899990963e-06, + "loss": 0.7218, + "step": 609 + }, + { + "epoch": 0.06, + "grad_norm": 2.1292215277853184, + "learning_rate": 9.970827699719372e-06, + "loss": 0.7322, + "step": 610 + }, + { + "epoch": 0.06, + "grad_norm": 2.2273055903437506, + "learning_rate": 9.970645836057464e-06, + "loss": 0.7003, + "step": 611 + }, + { + "epoch": 0.06, + "grad_norm": 1.9357280807082045, + "learning_rate": 9.97046340894452e-06, + "loss": 0.7023, + "step": 612 + }, + { + "epoch": 0.06, + "grad_norm": 2.197029414583496, + "learning_rate": 9.970280418401215e-06, + "loss": 0.68, + "step": 613 + }, + { + "epoch": 0.06, + "grad_norm": 2.0053051191734634, + "learning_rate": 9.970096864448296e-06, + "loss": 0.7367, + "step": 614 + }, + { + "epoch": 0.06, + "grad_norm": 2.1301040443505563, + "learning_rate": 9.96991274710657e-06, + "loss": 0.6358, + "step": 615 + }, + { + "epoch": 0.06, + "grad_norm": 2.419659091613593, + "learning_rate": 9.969728066396904e-06, + "loss": 0.7072, + "step": 616 + }, + { + "epoch": 0.06, + "grad_norm": 1.9261589500687413, + "learning_rate": 9.969542822340238e-06, + "loss": 0.6603, + "step": 617 + }, + { + "epoch": 0.06, + "grad_norm": 2.1022673917783963, + "learning_rate": 9.969357014957564e-06, + "loss": 0.7254, + "step": 618 + }, + { + "epoch": 0.06, + "grad_norm": 2.182516114339237, + "learning_rate": 9.96917064426995e-06, + "loss": 0.6666, + "step": 619 + }, + { + "epoch": 0.06, + "grad_norm": 2.3098245453776904, + "learning_rate": 9.968983710298522e-06, + "loss": 0.7824, + "step": 620 + }, + { + "epoch": 0.06, + "grad_norm": 1.9539441765468966, + "learning_rate": 9.968796213064466e-06, + "loss": 0.7051, + "step": 621 + }, + { + "epoch": 0.06, + "grad_norm": 2.0138948888168176, + "learning_rate": 9.968608152589038e-06, + "loss": 0.7378, + "step": 622 + }, + { + "epoch": 0.06, + "grad_norm": 2.175463284078723, + "learning_rate": 9.968419528893555e-06, + "loss": 0.6006, + "step": 623 + }, + { + "epoch": 0.06, + "grad_norm": 2.3105811086307466, + "learning_rate": 9.968230341999403e-06, + "loss": 0.763, + "step": 624 + }, + { + "epoch": 0.06, + "grad_norm": 2.08502012437374, + "learning_rate": 9.96804059192802e-06, + "loss": 0.6397, + "step": 625 + }, + { + "epoch": 0.07, + "grad_norm": 2.5120934211323087, + "learning_rate": 9.96785027870092e-06, + "loss": 0.6508, + "step": 626 + }, + { + "epoch": 0.07, + "grad_norm": 2.131440836614907, + "learning_rate": 9.967659402339677e-06, + "loss": 0.677, + "step": 627 + }, + { + "epoch": 0.07, + "grad_norm": 2.2797304046639217, + "learning_rate": 9.967467962865925e-06, + "loss": 0.8344, + "step": 628 + }, + { + "epoch": 0.07, + "grad_norm": 1.8568138944212114, + "learning_rate": 9.967275960301364e-06, + "loss": 0.6212, + "step": 629 + }, + { + "epoch": 0.07, + "grad_norm": 1.9700358484439096, + "learning_rate": 9.967083394667763e-06, + "loss": 0.7429, + "step": 630 + }, + { + "epoch": 0.07, + "grad_norm": 1.9885571768773675, + "learning_rate": 9.966890265986947e-06, + "loss": 0.6928, + "step": 631 + }, + { + "epoch": 0.07, + "grad_norm": 1.8696680843723692, + "learning_rate": 9.966696574280808e-06, + "loss": 0.639, + "step": 632 + }, + { + "epoch": 0.07, + "grad_norm": 1.9946364083173456, + "learning_rate": 9.966502319571303e-06, + "loss": 0.7326, + "step": 633 + }, + { + "epoch": 0.07, + "grad_norm": 1.9618197771145713, + "learning_rate": 9.966307501880452e-06, + "loss": 0.6483, + "step": 634 + }, + { + "epoch": 0.07, + "grad_norm": 1.9669889529206006, + "learning_rate": 9.966112121230341e-06, + "loss": 0.6443, + "step": 635 + }, + { + "epoch": 0.07, + "grad_norm": 2.0924747286776855, + "learning_rate": 9.965916177643112e-06, + "loss": 0.738, + "step": 636 + }, + { + "epoch": 0.07, + "grad_norm": 1.935928912743703, + "learning_rate": 9.965719671140981e-06, + "loss": 0.776, + "step": 637 + }, + { + "epoch": 0.07, + "grad_norm": 2.047670358994524, + "learning_rate": 9.965522601746222e-06, + "loss": 0.767, + "step": 638 + }, + { + "epoch": 0.07, + "grad_norm": 1.9296679711225158, + "learning_rate": 9.965324969481172e-06, + "loss": 0.6554, + "step": 639 + }, + { + "epoch": 0.07, + "grad_norm": 2.1303563423399687, + "learning_rate": 9.965126774368237e-06, + "loss": 0.6197, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 2.074249841183784, + "learning_rate": 9.964928016429883e-06, + "loss": 0.5803, + "step": 641 + }, + { + "epoch": 0.07, + "grad_norm": 1.9354294943036128, + "learning_rate": 9.964728695688635e-06, + "loss": 0.5228, + "step": 642 + }, + { + "epoch": 0.07, + "grad_norm": 1.8052649810413786, + "learning_rate": 9.964528812167095e-06, + "loss": 0.7194, + "step": 643 + }, + { + "epoch": 0.07, + "grad_norm": 4.177670415548599, + "learning_rate": 9.964328365887917e-06, + "loss": 0.7194, + "step": 644 + }, + { + "epoch": 0.07, + "grad_norm": 1.9997756821719956, + "learning_rate": 9.964127356873821e-06, + "loss": 0.6131, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 1.8801154696599298, + "learning_rate": 9.963925785147595e-06, + "loss": 0.6044, + "step": 646 + }, + { + "epoch": 0.07, + "grad_norm": 2.0091370827752275, + "learning_rate": 9.96372365073209e-06, + "loss": 0.7101, + "step": 647 + }, + { + "epoch": 0.07, + "grad_norm": 1.839460149960612, + "learning_rate": 9.963520953650214e-06, + "loss": 0.6756, + "step": 648 + }, + { + "epoch": 0.07, + "grad_norm": 2.036076135546113, + "learning_rate": 9.963317693924947e-06, + "loss": 0.6115, + "step": 649 + }, + { + "epoch": 0.07, + "grad_norm": 2.545887352696099, + "learning_rate": 9.963113871579332e-06, + "loss": 0.7111, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 2.0933171239683985, + "learning_rate": 9.96290948663647e-06, + "loss": 0.6289, + "step": 651 + }, + { + "epoch": 0.07, + "grad_norm": 1.9363189028685541, + "learning_rate": 9.962704539119528e-06, + "loss": 0.6259, + "step": 652 + }, + { + "epoch": 0.07, + "grad_norm": 2.064128197173807, + "learning_rate": 9.962499029051742e-06, + "loss": 0.6223, + "step": 653 + }, + { + "epoch": 0.07, + "grad_norm": 1.8337003404299514, + "learning_rate": 9.962292956456405e-06, + "loss": 0.6341, + "step": 654 + }, + { + "epoch": 0.07, + "grad_norm": 2.094853295361747, + "learning_rate": 9.962086321356878e-06, + "loss": 0.6245, + "step": 655 + }, + { + "epoch": 0.07, + "grad_norm": 1.977672637587837, + "learning_rate": 9.961879123776584e-06, + "loss": 0.6979, + "step": 656 + }, + { + "epoch": 0.07, + "grad_norm": 2.1712110101678403, + "learning_rate": 9.961671363739008e-06, + "loss": 0.6226, + "step": 657 + }, + { + "epoch": 0.07, + "grad_norm": 1.951254338928478, + "learning_rate": 9.961463041267703e-06, + "loss": 0.6741, + "step": 658 + }, + { + "epoch": 0.07, + "grad_norm": 2.170459383653736, + "learning_rate": 9.961254156386282e-06, + "loss": 0.6541, + "step": 659 + }, + { + "epoch": 0.07, + "grad_norm": 2.1749433390231916, + "learning_rate": 9.961044709118425e-06, + "loss": 0.7235, + "step": 660 + }, + { + "epoch": 0.07, + "grad_norm": 2.3401410783814383, + "learning_rate": 9.960834699487873e-06, + "loss": 0.6685, + "step": 661 + }, + { + "epoch": 0.07, + "grad_norm": 1.9667835737612456, + "learning_rate": 9.960624127518432e-06, + "loss": 0.6615, + "step": 662 + }, + { + "epoch": 0.07, + "grad_norm": 1.928408867792227, + "learning_rate": 9.960412993233973e-06, + "loss": 0.6487, + "step": 663 + }, + { + "epoch": 0.07, + "grad_norm": 2.1721273990729517, + "learning_rate": 9.960201296658425e-06, + "loss": 0.6058, + "step": 664 + }, + { + "epoch": 0.07, + "grad_norm": 2.391564010754368, + "learning_rate": 9.959989037815789e-06, + "loss": 0.5665, + "step": 665 + }, + { + "epoch": 0.07, + "grad_norm": 1.944892387838344, + "learning_rate": 9.959776216730125e-06, + "loss": 0.6924, + "step": 666 + }, + { + "epoch": 0.07, + "grad_norm": 2.236898508390765, + "learning_rate": 9.959562833425557e-06, + "loss": 0.6553, + "step": 667 + }, + { + "epoch": 0.07, + "grad_norm": 1.9932574533230005, + "learning_rate": 9.959348887926274e-06, + "loss": 0.7432, + "step": 668 + }, + { + "epoch": 0.07, + "grad_norm": 2.027491358248483, + "learning_rate": 9.959134380256525e-06, + "loss": 0.6426, + "step": 669 + }, + { + "epoch": 0.07, + "grad_norm": 2.0001475575651027, + "learning_rate": 9.95891931044063e-06, + "loss": 0.6808, + "step": 670 + }, + { + "epoch": 0.07, + "grad_norm": 1.820682926080947, + "learning_rate": 9.958703678502966e-06, + "loss": 0.6167, + "step": 671 + }, + { + "epoch": 0.07, + "grad_norm": 2.018955263359039, + "learning_rate": 9.958487484467976e-06, + "loss": 0.6538, + "step": 672 + }, + { + "epoch": 0.07, + "grad_norm": 2.040371696124914, + "learning_rate": 9.958270728360166e-06, + "loss": 0.6865, + "step": 673 + }, + { + "epoch": 0.07, + "grad_norm": 2.1337195894507404, + "learning_rate": 9.95805341020411e-06, + "loss": 0.6423, + "step": 674 + }, + { + "epoch": 0.07, + "grad_norm": 2.01030394805898, + "learning_rate": 9.957835530024438e-06, + "loss": 0.6973, + "step": 675 + }, + { + "epoch": 0.07, + "grad_norm": 2.220551541879092, + "learning_rate": 9.95761708784585e-06, + "loss": 0.6665, + "step": 676 + }, + { + "epoch": 0.07, + "grad_norm": 2.0803421961018715, + "learning_rate": 9.95739808369311e-06, + "loss": 0.711, + "step": 677 + }, + { + "epoch": 0.07, + "grad_norm": 2.2906932549637853, + "learning_rate": 9.95717851759104e-06, + "loss": 0.7801, + "step": 678 + }, + { + "epoch": 0.07, + "grad_norm": 2.13204944489717, + "learning_rate": 9.956958389564528e-06, + "loss": 0.6244, + "step": 679 + }, + { + "epoch": 0.07, + "grad_norm": 2.0425735599412174, + "learning_rate": 9.95673769963853e-06, + "loss": 0.7015, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 1.9345199013727183, + "learning_rate": 9.956516447838063e-06, + "loss": 0.6477, + "step": 681 + }, + { + "epoch": 0.07, + "grad_norm": 1.7219188066839115, + "learning_rate": 9.956294634188204e-06, + "loss": 0.5635, + "step": 682 + }, + { + "epoch": 0.07, + "grad_norm": 2.2382443419248705, + "learning_rate": 9.956072258714097e-06, + "loss": 0.7586, + "step": 683 + }, + { + "epoch": 0.07, + "grad_norm": 1.9477024443776676, + "learning_rate": 9.955849321440953e-06, + "loss": 0.728, + "step": 684 + }, + { + "epoch": 0.07, + "grad_norm": 1.9813355258791023, + "learning_rate": 9.95562582239404e-06, + "loss": 0.6703, + "step": 685 + }, + { + "epoch": 0.07, + "grad_norm": 2.1476591448584776, + "learning_rate": 9.955401761598693e-06, + "loss": 0.6842, + "step": 686 + }, + { + "epoch": 0.07, + "grad_norm": 2.0908641901740936, + "learning_rate": 9.955177139080312e-06, + "loss": 0.7313, + "step": 687 + }, + { + "epoch": 0.07, + "grad_norm": 2.084194905379439, + "learning_rate": 9.954951954864361e-06, + "loss": 0.7477, + "step": 688 + }, + { + "epoch": 0.07, + "grad_norm": 1.9635843996926026, + "learning_rate": 9.954726208976361e-06, + "loss": 0.6205, + "step": 689 + }, + { + "epoch": 0.07, + "grad_norm": 1.9130869685081529, + "learning_rate": 9.954499901441905e-06, + "loss": 0.6619, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 2.037636079788371, + "learning_rate": 9.954273032286646e-06, + "loss": 0.6274, + "step": 691 + }, + { + "epoch": 0.07, + "grad_norm": 1.9224440773396205, + "learning_rate": 9.9540456015363e-06, + "loss": 0.6808, + "step": 692 + }, + { + "epoch": 0.07, + "grad_norm": 2.134253881991058, + "learning_rate": 9.953817609216647e-06, + "loss": 0.7179, + "step": 693 + }, + { + "epoch": 0.07, + "grad_norm": 2.120477071210494, + "learning_rate": 9.953589055353534e-06, + "loss": 0.6869, + "step": 694 + }, + { + "epoch": 0.07, + "grad_norm": 2.05650737737351, + "learning_rate": 9.953359939972866e-06, + "loss": 0.6046, + "step": 695 + }, + { + "epoch": 0.07, + "grad_norm": 1.886773870673862, + "learning_rate": 9.953130263100615e-06, + "loss": 0.7154, + "step": 696 + }, + { + "epoch": 0.07, + "grad_norm": 2.036217797024567, + "learning_rate": 9.952900024762818e-06, + "loss": 0.761, + "step": 697 + }, + { + "epoch": 0.07, + "grad_norm": 2.2783625596602746, + "learning_rate": 9.952669224985572e-06, + "loss": 0.6307, + "step": 698 + }, + { + "epoch": 0.07, + "grad_norm": 1.9435212040471868, + "learning_rate": 9.95243786379504e-06, + "loss": 0.7085, + "step": 699 + }, + { + "epoch": 0.07, + "grad_norm": 1.804310202195145, + "learning_rate": 9.952205941217449e-06, + "loss": 0.6781, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 2.06225387355469, + "learning_rate": 9.951973457279087e-06, + "loss": 0.7211, + "step": 701 + }, + { + "epoch": 0.07, + "grad_norm": 2.076961953947264, + "learning_rate": 9.951740412006308e-06, + "loss": 0.6094, + "step": 702 + }, + { + "epoch": 0.07, + "grad_norm": 1.985672552340446, + "learning_rate": 9.951506805425531e-06, + "loss": 0.7098, + "step": 703 + }, + { + "epoch": 0.07, + "grad_norm": 1.8527325941283108, + "learning_rate": 9.951272637563233e-06, + "loss": 0.5818, + "step": 704 + }, + { + "epoch": 0.07, + "grad_norm": 1.9109812540751534, + "learning_rate": 9.951037908445961e-06, + "loss": 0.6898, + "step": 705 + }, + { + "epoch": 0.07, + "grad_norm": 2.020324053220724, + "learning_rate": 9.950802618100323e-06, + "loss": 0.7053, + "step": 706 + }, + { + "epoch": 0.07, + "grad_norm": 2.1678914704220067, + "learning_rate": 9.950566766552989e-06, + "loss": 0.6887, + "step": 707 + }, + { + "epoch": 0.07, + "grad_norm": 2.0424556490788475, + "learning_rate": 9.950330353830694e-06, + "loss": 0.7281, + "step": 708 + }, + { + "epoch": 0.07, + "grad_norm": 1.9996565401479152, + "learning_rate": 9.950093379960238e-06, + "loss": 0.6922, + "step": 709 + }, + { + "epoch": 0.07, + "grad_norm": 1.9744601447827723, + "learning_rate": 9.949855844968484e-06, + "loss": 0.6319, + "step": 710 + }, + { + "epoch": 0.07, + "grad_norm": 1.9385415848049423, + "learning_rate": 9.949617748882354e-06, + "loss": 0.6659, + "step": 711 + }, + { + "epoch": 0.07, + "grad_norm": 2.078954845276994, + "learning_rate": 9.949379091728843e-06, + "loss": 0.6764, + "step": 712 + }, + { + "epoch": 0.07, + "grad_norm": 1.9193955270459089, + "learning_rate": 9.949139873535e-06, + "loss": 0.6107, + "step": 713 + }, + { + "epoch": 0.07, + "grad_norm": 1.8955909321628803, + "learning_rate": 9.948900094327943e-06, + "loss": 0.6845, + "step": 714 + }, + { + "epoch": 0.07, + "grad_norm": 1.932089420380341, + "learning_rate": 9.948659754134852e-06, + "loss": 0.6588, + "step": 715 + }, + { + "epoch": 0.07, + "grad_norm": 1.8948454482363528, + "learning_rate": 9.948418852982973e-06, + "loss": 0.5758, + "step": 716 + }, + { + "epoch": 0.07, + "grad_norm": 1.8286956041099895, + "learning_rate": 9.948177390899611e-06, + "loss": 0.6549, + "step": 717 + }, + { + "epoch": 0.07, + "grad_norm": 1.894187924245138, + "learning_rate": 9.94793536791214e-06, + "loss": 0.5799, + "step": 718 + }, + { + "epoch": 0.07, + "grad_norm": 2.0176210263591576, + "learning_rate": 9.94769278404799e-06, + "loss": 0.6644, + "step": 719 + }, + { + "epoch": 0.07, + "grad_norm": 1.8158957435685823, + "learning_rate": 9.947449639334663e-06, + "loss": 0.7072, + "step": 720 + }, + { + "epoch": 0.07, + "grad_norm": 1.8892811407727792, + "learning_rate": 9.94720593379972e-06, + "loss": 0.6623, + "step": 721 + }, + { + "epoch": 0.08, + "grad_norm": 2.201408847040572, + "learning_rate": 9.946961667470787e-06, + "loss": 0.6785, + "step": 722 + }, + { + "epoch": 0.08, + "grad_norm": 1.8596998671238845, + "learning_rate": 9.946716840375552e-06, + "loss": 0.6325, + "step": 723 + }, + { + "epoch": 0.08, + "grad_norm": 2.1929689551861618, + "learning_rate": 9.946471452541768e-06, + "loss": 0.7123, + "step": 724 + }, + { + "epoch": 0.08, + "grad_norm": 1.8739563959458077, + "learning_rate": 9.94622550399725e-06, + "loss": 0.7399, + "step": 725 + }, + { + "epoch": 0.08, + "grad_norm": 1.9897122706062023, + "learning_rate": 9.945978994769878e-06, + "loss": 0.7497, + "step": 726 + }, + { + "epoch": 0.08, + "grad_norm": 1.9516686408380788, + "learning_rate": 9.945731924887598e-06, + "loss": 0.6577, + "step": 727 + }, + { + "epoch": 0.08, + "grad_norm": 1.7720438709863602, + "learning_rate": 9.945484294378413e-06, + "loss": 0.6154, + "step": 728 + }, + { + "epoch": 0.08, + "grad_norm": 2.249334522149298, + "learning_rate": 9.945236103270395e-06, + "loss": 0.7274, + "step": 729 + }, + { + "epoch": 0.08, + "grad_norm": 1.8705068155172135, + "learning_rate": 9.944987351591677e-06, + "loss": 0.5798, + "step": 730 + }, + { + "epoch": 0.08, + "grad_norm": 2.8901122235028085, + "learning_rate": 9.944738039370458e-06, + "loss": 0.7301, + "step": 731 + }, + { + "epoch": 0.08, + "grad_norm": 2.0901188573895166, + "learning_rate": 9.944488166635e-06, + "loss": 0.6661, + "step": 732 + }, + { + "epoch": 0.08, + "grad_norm": 2.0716119906288104, + "learning_rate": 9.944237733413623e-06, + "loss": 0.7104, + "step": 733 + }, + { + "epoch": 0.08, + "grad_norm": 2.0084060471211003, + "learning_rate": 9.943986739734718e-06, + "loss": 0.7807, + "step": 734 + }, + { + "epoch": 0.08, + "grad_norm": 2.126162332371512, + "learning_rate": 9.943735185626739e-06, + "loss": 0.7289, + "step": 735 + }, + { + "epoch": 0.08, + "grad_norm": 2.048734123772361, + "learning_rate": 9.943483071118197e-06, + "loss": 0.7006, + "step": 736 + }, + { + "epoch": 0.08, + "grad_norm": 2.232634046448412, + "learning_rate": 9.94323039623767e-06, + "loss": 0.7472, + "step": 737 + }, + { + "epoch": 0.08, + "grad_norm": 2.0219318375835824, + "learning_rate": 9.942977161013802e-06, + "loss": 0.6657, + "step": 738 + }, + { + "epoch": 0.08, + "grad_norm": 1.9114763004154225, + "learning_rate": 9.942723365475301e-06, + "loss": 0.7154, + "step": 739 + }, + { + "epoch": 0.08, + "grad_norm": 2.2527373356434115, + "learning_rate": 9.942469009650933e-06, + "loss": 0.732, + "step": 740 + }, + { + "epoch": 0.08, + "grad_norm": 2.2324878692245385, + "learning_rate": 9.942214093569534e-06, + "loss": 0.7158, + "step": 741 + }, + { + "epoch": 0.08, + "grad_norm": 2.1365648202427594, + "learning_rate": 9.941958617259994e-06, + "loss": 0.722, + "step": 742 + }, + { + "epoch": 0.08, + "grad_norm": 2.1565366949992963, + "learning_rate": 9.941702580751278e-06, + "loss": 0.7789, + "step": 743 + }, + { + "epoch": 0.08, + "grad_norm": 1.9027629563938848, + "learning_rate": 9.941445984072408e-06, + "loss": 0.6342, + "step": 744 + }, + { + "epoch": 0.08, + "grad_norm": 2.1215899854013403, + "learning_rate": 9.941188827252471e-06, + "loss": 0.6236, + "step": 745 + }, + { + "epoch": 0.08, + "grad_norm": 2.124700681765373, + "learning_rate": 9.940931110320615e-06, + "loss": 0.6907, + "step": 746 + }, + { + "epoch": 0.08, + "grad_norm": 1.894826319876532, + "learning_rate": 9.940672833306056e-06, + "loss": 0.648, + "step": 747 + }, + { + "epoch": 0.08, + "grad_norm": 2.1683138583935646, + "learning_rate": 9.940413996238071e-06, + "loss": 0.6152, + "step": 748 + }, + { + "epoch": 0.08, + "grad_norm": 2.304827301613214, + "learning_rate": 9.940154599145998e-06, + "loss": 0.6609, + "step": 749 + }, + { + "epoch": 0.08, + "grad_norm": 2.1667628822195653, + "learning_rate": 9.939894642059248e-06, + "loss": 0.773, + "step": 750 + }, + { + "epoch": 0.08, + "grad_norm": 2.2670965613851357, + "learning_rate": 9.939634125007279e-06, + "loss": 0.749, + "step": 751 + }, + { + "epoch": 0.08, + "grad_norm": 1.9105308819933733, + "learning_rate": 9.939373048019629e-06, + "loss": 0.5988, + "step": 752 + }, + { + "epoch": 0.08, + "grad_norm": 2.001684787938153, + "learning_rate": 9.93911141112589e-06, + "loss": 0.756, + "step": 753 + }, + { + "epoch": 0.08, + "grad_norm": 1.9897912439886951, + "learning_rate": 9.938849214355722e-06, + "loss": 0.5951, + "step": 754 + }, + { + "epoch": 0.08, + "grad_norm": 1.8260079718444404, + "learning_rate": 9.938586457738844e-06, + "loss": 0.6922, + "step": 755 + }, + { + "epoch": 0.08, + "grad_norm": 2.0208943552103538, + "learning_rate": 9.938323141305042e-06, + "loss": 0.6881, + "step": 756 + }, + { + "epoch": 0.08, + "grad_norm": 1.8892014681961153, + "learning_rate": 9.938059265084163e-06, + "loss": 0.6852, + "step": 757 + }, + { + "epoch": 0.08, + "grad_norm": 2.132937380951946, + "learning_rate": 9.937794829106122e-06, + "loss": 0.7, + "step": 758 + }, + { + "epoch": 0.08, + "grad_norm": 2.0088322043327507, + "learning_rate": 9.937529833400892e-06, + "loss": 0.6808, + "step": 759 + }, + { + "epoch": 0.08, + "grad_norm": 2.041185119184499, + "learning_rate": 9.937264277998513e-06, + "loss": 0.5472, + "step": 760 + }, + { + "epoch": 0.08, + "grad_norm": 1.8536310070000221, + "learning_rate": 9.936998162929086e-06, + "loss": 0.5643, + "step": 761 + }, + { + "epoch": 0.08, + "grad_norm": 2.001083354207652, + "learning_rate": 9.936731488222776e-06, + "loss": 0.6498, + "step": 762 + }, + { + "epoch": 0.08, + "grad_norm": 2.0706938113208717, + "learning_rate": 9.936464253909817e-06, + "loss": 0.6487, + "step": 763 + }, + { + "epoch": 0.08, + "grad_norm": 2.0794572945560916, + "learning_rate": 9.936196460020496e-06, + "loss": 0.6505, + "step": 764 + }, + { + "epoch": 0.08, + "grad_norm": 2.1561768544194146, + "learning_rate": 9.93592810658517e-06, + "loss": 0.6906, + "step": 765 + }, + { + "epoch": 0.08, + "grad_norm": 2.008123268630148, + "learning_rate": 9.935659193634261e-06, + "loss": 0.6367, + "step": 766 + }, + { + "epoch": 0.08, + "grad_norm": 2.25399791489673, + "learning_rate": 9.935389721198249e-06, + "loss": 0.7417, + "step": 767 + }, + { + "epoch": 0.08, + "grad_norm": 1.8352434635664747, + "learning_rate": 9.935119689307682e-06, + "loss": 0.6074, + "step": 768 + }, + { + "epoch": 0.08, + "grad_norm": 2.326860474330298, + "learning_rate": 9.934849097993168e-06, + "loss": 0.7976, + "step": 769 + }, + { + "epoch": 0.08, + "grad_norm": 1.8668608391500925, + "learning_rate": 9.934577947285382e-06, + "loss": 0.7362, + "step": 770 + }, + { + "epoch": 0.08, + "grad_norm": 1.877597080458074, + "learning_rate": 9.934306237215057e-06, + "loss": 0.5724, + "step": 771 + }, + { + "epoch": 0.08, + "grad_norm": 1.8322003762305377, + "learning_rate": 9.934033967812998e-06, + "loss": 0.6854, + "step": 772 + }, + { + "epoch": 0.08, + "grad_norm": 1.7621008758677785, + "learning_rate": 9.933761139110065e-06, + "loss": 0.6506, + "step": 773 + }, + { + "epoch": 0.08, + "grad_norm": 2.0299854137963975, + "learning_rate": 9.933487751137185e-06, + "loss": 0.6442, + "step": 774 + }, + { + "epoch": 0.08, + "grad_norm": 1.8777959177763561, + "learning_rate": 9.93321380392535e-06, + "loss": 0.7335, + "step": 775 + }, + { + "epoch": 0.08, + "grad_norm": 2.3950292493331333, + "learning_rate": 9.932939297505611e-06, + "loss": 0.7287, + "step": 776 + }, + { + "epoch": 0.08, + "grad_norm": 2.148775632603258, + "learning_rate": 9.932664231909087e-06, + "loss": 0.6965, + "step": 777 + }, + { + "epoch": 0.08, + "grad_norm": 2.3030378873226254, + "learning_rate": 9.932388607166954e-06, + "loss": 0.5862, + "step": 778 + }, + { + "epoch": 0.08, + "grad_norm": 1.8487026804028814, + "learning_rate": 9.93211242331046e-06, + "loss": 0.5834, + "step": 779 + }, + { + "epoch": 0.08, + "grad_norm": 1.6659735801019722, + "learning_rate": 9.931835680370912e-06, + "loss": 0.6148, + "step": 780 + }, + { + "epoch": 0.08, + "grad_norm": 2.0126658622092233, + "learning_rate": 9.931558378379677e-06, + "loss": 0.7435, + "step": 781 + }, + { + "epoch": 0.08, + "grad_norm": 2.067686271560952, + "learning_rate": 9.931280517368193e-06, + "loss": 0.7257, + "step": 782 + }, + { + "epoch": 0.08, + "grad_norm": 2.1745154175639487, + "learning_rate": 9.931002097367954e-06, + "loss": 0.6712, + "step": 783 + }, + { + "epoch": 0.08, + "grad_norm": 2.045784496263562, + "learning_rate": 9.930723118410521e-06, + "loss": 0.6423, + "step": 784 + }, + { + "epoch": 0.08, + "grad_norm": 2.270602416768308, + "learning_rate": 9.93044358052752e-06, + "loss": 0.6607, + "step": 785 + }, + { + "epoch": 0.08, + "grad_norm": 1.845961952430096, + "learning_rate": 9.930163483750636e-06, + "loss": 0.6405, + "step": 786 + }, + { + "epoch": 0.08, + "grad_norm": 1.681941760116097, + "learning_rate": 9.929882828111619e-06, + "loss": 0.6073, + "step": 787 + }, + { + "epoch": 0.08, + "grad_norm": 2.1011423694291844, + "learning_rate": 9.929601613642285e-06, + "loss": 0.715, + "step": 788 + }, + { + "epoch": 0.08, + "grad_norm": 1.7766205903261256, + "learning_rate": 9.92931984037451e-06, + "loss": 0.5952, + "step": 789 + }, + { + "epoch": 0.08, + "grad_norm": 1.897836028769625, + "learning_rate": 9.929037508340234e-06, + "loss": 0.7663, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 1.85405318753218, + "learning_rate": 9.928754617571464e-06, + "loss": 0.7752, + "step": 791 + }, + { + "epoch": 0.08, + "grad_norm": 2.0696560918392723, + "learning_rate": 9.928471168100264e-06, + "loss": 0.5531, + "step": 792 + }, + { + "epoch": 0.08, + "grad_norm": 2.2262392903553256, + "learning_rate": 9.928187159958764e-06, + "loss": 0.7426, + "step": 793 + }, + { + "epoch": 0.08, + "grad_norm": 2.1714035118411035, + "learning_rate": 9.927902593179163e-06, + "loss": 0.6794, + "step": 794 + }, + { + "epoch": 0.08, + "grad_norm": 2.1031096079567733, + "learning_rate": 9.927617467793713e-06, + "loss": 0.7313, + "step": 795 + }, + { + "epoch": 0.08, + "grad_norm": 2.178237030703268, + "learning_rate": 9.927331783834737e-06, + "loss": 0.6139, + "step": 796 + }, + { + "epoch": 0.08, + "grad_norm": 2.006402940126805, + "learning_rate": 9.927045541334618e-06, + "loss": 0.7167, + "step": 797 + }, + { + "epoch": 0.08, + "grad_norm": 1.7349004505955714, + "learning_rate": 9.926758740325803e-06, + "loss": 0.5405, + "step": 798 + }, + { + "epoch": 0.08, + "grad_norm": 1.7514974917690225, + "learning_rate": 9.926471380840805e-06, + "loss": 0.6142, + "step": 799 + }, + { + "epoch": 0.08, + "grad_norm": 2.002352977374873, + "learning_rate": 9.926183462912196e-06, + "loss": 0.673, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 1.820390271795529, + "learning_rate": 9.92589498657261e-06, + "loss": 0.5349, + "step": 801 + }, + { + "epoch": 0.08, + "grad_norm": 2.061683813396355, + "learning_rate": 9.925605951854754e-06, + "loss": 0.7079, + "step": 802 + }, + { + "epoch": 0.08, + "grad_norm": 1.9960408210755969, + "learning_rate": 9.925316358791388e-06, + "loss": 0.6418, + "step": 803 + }, + { + "epoch": 0.08, + "grad_norm": 2.014159080687005, + "learning_rate": 9.925026207415338e-06, + "loss": 0.5767, + "step": 804 + }, + { + "epoch": 0.08, + "grad_norm": 1.7464740551614473, + "learning_rate": 9.924735497759497e-06, + "loss": 0.5434, + "step": 805 + }, + { + "epoch": 0.08, + "grad_norm": 1.9006992994964862, + "learning_rate": 9.924444229856817e-06, + "loss": 0.6631, + "step": 806 + }, + { + "epoch": 0.08, + "grad_norm": 1.8335712721819453, + "learning_rate": 9.924152403740315e-06, + "loss": 0.6265, + "step": 807 + }, + { + "epoch": 0.08, + "grad_norm": 1.9521569470070748, + "learning_rate": 9.92386001944307e-06, + "loss": 0.6922, + "step": 808 + }, + { + "epoch": 0.08, + "grad_norm": 2.102488618677563, + "learning_rate": 9.923567076998228e-06, + "loss": 0.6094, + "step": 809 + }, + { + "epoch": 0.08, + "grad_norm": 2.150033884300534, + "learning_rate": 9.923273576438994e-06, + "loss": 0.5725, + "step": 810 + }, + { + "epoch": 0.08, + "grad_norm": 2.030112095841831, + "learning_rate": 9.92297951779864e-06, + "loss": 0.6741, + "step": 811 + }, + { + "epoch": 0.08, + "grad_norm": 1.9791624948612083, + "learning_rate": 9.922684901110496e-06, + "loss": 0.6707, + "step": 812 + }, + { + "epoch": 0.08, + "grad_norm": 2.2930407423495707, + "learning_rate": 9.92238972640796e-06, + "loss": 0.6664, + "step": 813 + }, + { + "epoch": 0.08, + "grad_norm": 1.8906542205262489, + "learning_rate": 9.922093993724492e-06, + "loss": 0.6267, + "step": 814 + }, + { + "epoch": 0.08, + "grad_norm": 1.9261005637706718, + "learning_rate": 9.921797703093614e-06, + "loss": 0.6355, + "step": 815 + }, + { + "epoch": 0.08, + "grad_norm": 1.8862503970508953, + "learning_rate": 9.921500854548916e-06, + "loss": 0.6027, + "step": 816 + }, + { + "epoch": 0.08, + "grad_norm": 2.1505482544915844, + "learning_rate": 9.921203448124042e-06, + "loss": 0.6235, + "step": 817 + }, + { + "epoch": 0.09, + "grad_norm": 1.918340051848613, + "learning_rate": 9.920905483852708e-06, + "loss": 0.7459, + "step": 818 + }, + { + "epoch": 0.09, + "grad_norm": 1.8334734949147398, + "learning_rate": 9.920606961768689e-06, + "loss": 0.628, + "step": 819 + }, + { + "epoch": 0.09, + "grad_norm": 1.9869297816294038, + "learning_rate": 9.920307881905824e-06, + "loss": 0.6617, + "step": 820 + }, + { + "epoch": 0.09, + "grad_norm": 1.9449685673150783, + "learning_rate": 9.920008244298016e-06, + "loss": 0.6223, + "step": 821 + }, + { + "epoch": 0.09, + "grad_norm": 2.03168416029326, + "learning_rate": 9.91970804897923e-06, + "loss": 0.6794, + "step": 822 + }, + { + "epoch": 0.09, + "grad_norm": 1.878625929884055, + "learning_rate": 9.919407295983496e-06, + "loss": 0.6658, + "step": 823 + }, + { + "epoch": 0.09, + "grad_norm": 1.9650937939981765, + "learning_rate": 9.919105985344906e-06, + "loss": 0.5458, + "step": 824 + }, + { + "epoch": 0.09, + "grad_norm": 2.0909683014256286, + "learning_rate": 9.918804117097612e-06, + "loss": 0.7894, + "step": 825 + }, + { + "epoch": 0.09, + "grad_norm": 2.0613301951740173, + "learning_rate": 9.918501691275837e-06, + "loss": 0.6921, + "step": 826 + }, + { + "epoch": 0.09, + "grad_norm": 2.0419520750382265, + "learning_rate": 9.918198707913861e-06, + "loss": 0.6937, + "step": 827 + }, + { + "epoch": 0.09, + "grad_norm": 1.8581097977278722, + "learning_rate": 9.917895167046027e-06, + "loss": 0.6482, + "step": 828 + }, + { + "epoch": 0.09, + "grad_norm": 1.8748461988863785, + "learning_rate": 9.917591068706747e-06, + "loss": 0.6154, + "step": 829 + }, + { + "epoch": 0.09, + "grad_norm": 1.8925608747518752, + "learning_rate": 9.917286412930489e-06, + "loss": 0.6991, + "step": 830 + }, + { + "epoch": 0.09, + "grad_norm": 1.8160060819942032, + "learning_rate": 9.916981199751789e-06, + "loss": 0.5783, + "step": 831 + }, + { + "epoch": 0.09, + "grad_norm": 1.871521248122172, + "learning_rate": 9.916675429205243e-06, + "loss": 0.7109, + "step": 832 + }, + { + "epoch": 0.09, + "grad_norm": 2.1074885386758977, + "learning_rate": 9.916369101325514e-06, + "loss": 0.7108, + "step": 833 + }, + { + "epoch": 0.09, + "grad_norm": 2.1192414719954025, + "learning_rate": 9.916062216147324e-06, + "loss": 0.6341, + "step": 834 + }, + { + "epoch": 0.09, + "grad_norm": 2.08239120677423, + "learning_rate": 9.915754773705461e-06, + "loss": 0.6375, + "step": 835 + }, + { + "epoch": 0.09, + "grad_norm": 2.1316657605531093, + "learning_rate": 9.915446774034776e-06, + "loss": 0.7224, + "step": 836 + }, + { + "epoch": 0.09, + "grad_norm": 2.281939628017067, + "learning_rate": 9.915138217170184e-06, + "loss": 0.7129, + "step": 837 + }, + { + "epoch": 0.09, + "grad_norm": 1.8741130808070126, + "learning_rate": 9.914829103146658e-06, + "loss": 0.6406, + "step": 838 + }, + { + "epoch": 0.09, + "grad_norm": 1.9684244785118747, + "learning_rate": 9.91451943199924e-06, + "loss": 0.7195, + "step": 839 + }, + { + "epoch": 0.09, + "grad_norm": 2.0475888837677045, + "learning_rate": 9.914209203763032e-06, + "loss": 0.6547, + "step": 840 + }, + { + "epoch": 0.09, + "grad_norm": 2.04921767979233, + "learning_rate": 9.9138984184732e-06, + "loss": 0.6901, + "step": 841 + }, + { + "epoch": 0.09, + "grad_norm": 2.172629056653797, + "learning_rate": 9.913587076164976e-06, + "loss": 0.7042, + "step": 842 + }, + { + "epoch": 0.09, + "grad_norm": 2.267660676241233, + "learning_rate": 9.91327517687365e-06, + "loss": 0.685, + "step": 843 + }, + { + "epoch": 0.09, + "grad_norm": 1.7760733119676897, + "learning_rate": 9.912962720634575e-06, + "loss": 0.7247, + "step": 844 + }, + { + "epoch": 0.09, + "grad_norm": 1.8798377763004066, + "learning_rate": 9.912649707483174e-06, + "loss": 0.6346, + "step": 845 + }, + { + "epoch": 0.09, + "grad_norm": 1.869284735547129, + "learning_rate": 9.91233613745493e-06, + "loss": 0.6542, + "step": 846 + }, + { + "epoch": 0.09, + "grad_norm": 1.926516930261669, + "learning_rate": 9.912022010585385e-06, + "loss": 0.5949, + "step": 847 + }, + { + "epoch": 0.09, + "grad_norm": 1.972750668754731, + "learning_rate": 9.911707326910145e-06, + "loss": 0.5647, + "step": 848 + }, + { + "epoch": 0.09, + "grad_norm": 1.7730609944376714, + "learning_rate": 9.911392086464886e-06, + "loss": 0.69, + "step": 849 + }, + { + "epoch": 0.09, + "grad_norm": 2.0654042895948033, + "learning_rate": 9.911076289285338e-06, + "loss": 0.6786, + "step": 850 + }, + { + "epoch": 0.09, + "grad_norm": 1.9272137533364486, + "learning_rate": 9.910759935407301e-06, + "loss": 0.6813, + "step": 851 + }, + { + "epoch": 0.09, + "grad_norm": 2.076512024838693, + "learning_rate": 9.910443024866636e-06, + "loss": 0.7075, + "step": 852 + }, + { + "epoch": 0.09, + "grad_norm": 1.9792218894356493, + "learning_rate": 9.910125557699266e-06, + "loss": 0.5437, + "step": 853 + }, + { + "epoch": 0.09, + "grad_norm": 1.96480203139793, + "learning_rate": 9.909807533941176e-06, + "loss": 0.6791, + "step": 854 + }, + { + "epoch": 0.09, + "grad_norm": 2.571498964350275, + "learning_rate": 9.909488953628416e-06, + "loss": 0.7562, + "step": 855 + }, + { + "epoch": 0.09, + "grad_norm": 2.0928806202081334, + "learning_rate": 9.909169816797102e-06, + "loss": 0.6678, + "step": 856 + }, + { + "epoch": 0.09, + "grad_norm": 2.1045247209467752, + "learning_rate": 9.908850123483406e-06, + "loss": 0.63, + "step": 857 + }, + { + "epoch": 0.09, + "grad_norm": 2.0200550049787336, + "learning_rate": 9.908529873723571e-06, + "loss": 0.6311, + "step": 858 + }, + { + "epoch": 0.09, + "grad_norm": 2.152052863379161, + "learning_rate": 9.908209067553897e-06, + "loss": 0.6502, + "step": 859 + }, + { + "epoch": 0.09, + "grad_norm": 1.8592974529643835, + "learning_rate": 9.907887705010748e-06, + "loss": 0.6613, + "step": 860 + }, + { + "epoch": 0.09, + "grad_norm": 1.9787376817789002, + "learning_rate": 9.907565786130556e-06, + "loss": 0.6377, + "step": 861 + }, + { + "epoch": 0.09, + "grad_norm": 2.1440337219046173, + "learning_rate": 9.907243310949806e-06, + "loss": 0.7986, + "step": 862 + }, + { + "epoch": 0.09, + "grad_norm": 1.8768089367088299, + "learning_rate": 9.906920279505058e-06, + "loss": 0.6163, + "step": 863 + }, + { + "epoch": 0.09, + "grad_norm": 2.23557857803684, + "learning_rate": 9.90659669183293e-06, + "loss": 0.7218, + "step": 864 + }, + { + "epoch": 0.09, + "grad_norm": 1.739848221698496, + "learning_rate": 9.906272547970098e-06, + "loss": 0.6745, + "step": 865 + }, + { + "epoch": 0.09, + "grad_norm": 2.055645158811453, + "learning_rate": 9.90594784795331e-06, + "loss": 0.7725, + "step": 866 + }, + { + "epoch": 0.09, + "grad_norm": 1.9873627014566904, + "learning_rate": 9.905622591819368e-06, + "loss": 0.6261, + "step": 867 + }, + { + "epoch": 0.09, + "grad_norm": 2.2197485693258354, + "learning_rate": 9.905296779605144e-06, + "loss": 0.6184, + "step": 868 + }, + { + "epoch": 0.09, + "grad_norm": 1.8826852954404683, + "learning_rate": 9.904970411347574e-06, + "loss": 0.5683, + "step": 869 + }, + { + "epoch": 0.09, + "grad_norm": 1.8557977806798933, + "learning_rate": 9.904643487083648e-06, + "loss": 0.5471, + "step": 870 + }, + { + "epoch": 0.09, + "grad_norm": 2.871864907218477, + "learning_rate": 9.904316006850428e-06, + "loss": 0.6914, + "step": 871 + }, + { + "epoch": 0.09, + "grad_norm": 2.020931495944233, + "learning_rate": 9.903987970685034e-06, + "loss": 0.7881, + "step": 872 + }, + { + "epoch": 0.09, + "grad_norm": 1.9461530000097567, + "learning_rate": 9.903659378624652e-06, + "loss": 0.6323, + "step": 873 + }, + { + "epoch": 0.09, + "grad_norm": 1.9688203893811644, + "learning_rate": 9.903330230706529e-06, + "loss": 0.7003, + "step": 874 + }, + { + "epoch": 0.09, + "grad_norm": 1.8043702789163878, + "learning_rate": 9.903000526967977e-06, + "loss": 0.6036, + "step": 875 + }, + { + "epoch": 0.09, + "grad_norm": 1.950517288648671, + "learning_rate": 9.90267026744637e-06, + "loss": 0.6606, + "step": 876 + }, + { + "epoch": 0.09, + "grad_norm": 2.053483126005183, + "learning_rate": 9.902339452179142e-06, + "loss": 0.6498, + "step": 877 + }, + { + "epoch": 0.09, + "grad_norm": 1.8803225276792594, + "learning_rate": 9.902008081203796e-06, + "loss": 0.7156, + "step": 878 + }, + { + "epoch": 0.09, + "grad_norm": 1.8191419159760944, + "learning_rate": 9.901676154557893e-06, + "loss": 0.6745, + "step": 879 + }, + { + "epoch": 0.09, + "grad_norm": 1.8484125244211091, + "learning_rate": 9.90134367227906e-06, + "loss": 0.7064, + "step": 880 + }, + { + "epoch": 0.09, + "grad_norm": 2.0590899215765233, + "learning_rate": 9.901010634404983e-06, + "loss": 0.656, + "step": 881 + }, + { + "epoch": 0.09, + "grad_norm": 1.9746227071307032, + "learning_rate": 9.900677040973418e-06, + "loss": 0.7299, + "step": 882 + }, + { + "epoch": 0.09, + "grad_norm": 2.073087199185832, + "learning_rate": 9.900342892022176e-06, + "loss": 0.7405, + "step": 883 + }, + { + "epoch": 0.09, + "grad_norm": 2.0167336165515475, + "learning_rate": 9.900008187589138e-06, + "loss": 0.6863, + "step": 884 + }, + { + "epoch": 0.09, + "grad_norm": 1.938178039873934, + "learning_rate": 9.899672927712242e-06, + "loss": 0.8384, + "step": 885 + }, + { + "epoch": 0.09, + "grad_norm": 1.6977338968973716, + "learning_rate": 9.899337112429492e-06, + "loss": 0.6711, + "step": 886 + }, + { + "epoch": 0.09, + "grad_norm": 1.9888290143305485, + "learning_rate": 9.899000741778956e-06, + "loss": 0.6642, + "step": 887 + }, + { + "epoch": 0.09, + "grad_norm": 1.9917854977117735, + "learning_rate": 9.898663815798761e-06, + "loss": 0.6263, + "step": 888 + }, + { + "epoch": 0.09, + "grad_norm": 2.0969486651793816, + "learning_rate": 9.898326334527102e-06, + "loss": 0.6614, + "step": 889 + }, + { + "epoch": 0.09, + "grad_norm": 2.026670466094795, + "learning_rate": 9.897988298002233e-06, + "loss": 0.6663, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 1.9815021235689791, + "learning_rate": 9.897649706262474e-06, + "loss": 0.6395, + "step": 891 + }, + { + "epoch": 0.09, + "grad_norm": 2.2613612523304845, + "learning_rate": 9.897310559346203e-06, + "loss": 0.801, + "step": 892 + }, + { + "epoch": 0.09, + "grad_norm": 1.821267540525475, + "learning_rate": 9.896970857291868e-06, + "loss": 0.6847, + "step": 893 + }, + { + "epoch": 0.09, + "grad_norm": 2.0966029040749694, + "learning_rate": 9.896630600137974e-06, + "loss": 0.713, + "step": 894 + }, + { + "epoch": 0.09, + "grad_norm": 1.920981843170863, + "learning_rate": 9.896289787923092e-06, + "loss": 0.6597, + "step": 895 + }, + { + "epoch": 0.09, + "grad_norm": 1.9392140861418414, + "learning_rate": 9.895948420685855e-06, + "loss": 0.7259, + "step": 896 + }, + { + "epoch": 0.09, + "grad_norm": 2.1670282608670166, + "learning_rate": 9.895606498464956e-06, + "loss": 0.7213, + "step": 897 + }, + { + "epoch": 0.09, + "grad_norm": 2.0848482577609, + "learning_rate": 9.895264021299158e-06, + "loss": 0.7159, + "step": 898 + }, + { + "epoch": 0.09, + "grad_norm": 1.716703867604487, + "learning_rate": 9.894920989227282e-06, + "loss": 0.5575, + "step": 899 + }, + { + "epoch": 0.09, + "grad_norm": 1.7866881854548302, + "learning_rate": 9.89457740228821e-06, + "loss": 0.6121, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 1.8671996440775374, + "learning_rate": 9.894233260520893e-06, + "loss": 0.5676, + "step": 901 + }, + { + "epoch": 0.09, + "grad_norm": 1.9434408487574717, + "learning_rate": 9.89388856396434e-06, + "loss": 0.6724, + "step": 902 + }, + { + "epoch": 0.09, + "grad_norm": 1.7936908863669354, + "learning_rate": 9.893543312657623e-06, + "loss": 0.6487, + "step": 903 + }, + { + "epoch": 0.09, + "grad_norm": 2.027708079476974, + "learning_rate": 9.89319750663988e-06, + "loss": 0.6754, + "step": 904 + }, + { + "epoch": 0.09, + "grad_norm": 2.0114888734872203, + "learning_rate": 9.892851145950308e-06, + "loss": 0.6927, + "step": 905 + }, + { + "epoch": 0.09, + "grad_norm": 1.924424371946275, + "learning_rate": 9.89250423062817e-06, + "loss": 0.6884, + "step": 906 + }, + { + "epoch": 0.09, + "grad_norm": 1.8961310755643144, + "learning_rate": 9.892156760712793e-06, + "loss": 0.6769, + "step": 907 + }, + { + "epoch": 0.09, + "grad_norm": 1.9632794427660414, + "learning_rate": 9.891808736243563e-06, + "loss": 0.6999, + "step": 908 + }, + { + "epoch": 0.09, + "grad_norm": 2.495602941098346, + "learning_rate": 9.89146015725993e-06, + "loss": 0.7327, + "step": 909 + }, + { + "epoch": 0.09, + "grad_norm": 2.3402932804708834, + "learning_rate": 9.891111023801405e-06, + "loss": 0.636, + "step": 910 + }, + { + "epoch": 0.09, + "grad_norm": 1.9472235512569598, + "learning_rate": 9.89076133590757e-06, + "loss": 0.6924, + "step": 911 + }, + { + "epoch": 0.09, + "grad_norm": 1.9751978430386354, + "learning_rate": 9.89041109361806e-06, + "loss": 0.6588, + "step": 912 + }, + { + "epoch": 0.09, + "grad_norm": 1.9931745510211398, + "learning_rate": 9.89006029697258e-06, + "loss": 0.7378, + "step": 913 + }, + { + "epoch": 0.1, + "grad_norm": 1.9711240247848647, + "learning_rate": 9.88970894601089e-06, + "loss": 0.7044, + "step": 914 + }, + { + "epoch": 0.1, + "grad_norm": 1.945758925356476, + "learning_rate": 9.889357040772822e-06, + "loss": 0.7053, + "step": 915 + }, + { + "epoch": 0.1, + "grad_norm": 1.903319178019049, + "learning_rate": 9.889004581298265e-06, + "loss": 0.6292, + "step": 916 + }, + { + "epoch": 0.1, + "grad_norm": 1.9525247895967082, + "learning_rate": 9.888651567627173e-06, + "loss": 0.5822, + "step": 917 + }, + { + "epoch": 0.1, + "grad_norm": 2.0574072003726425, + "learning_rate": 9.88829799979956e-06, + "loss": 0.6102, + "step": 918 + }, + { + "epoch": 0.1, + "grad_norm": 2.044284692228813, + "learning_rate": 9.887943877855505e-06, + "loss": 0.6571, + "step": 919 + }, + { + "epoch": 0.1, + "grad_norm": 1.5942595611222257, + "learning_rate": 9.887589201835154e-06, + "loss": 0.5227, + "step": 920 + }, + { + "epoch": 0.1, + "grad_norm": 1.8786375947915854, + "learning_rate": 9.88723397177871e-06, + "loss": 0.586, + "step": 921 + }, + { + "epoch": 0.1, + "grad_norm": 1.8615389080365194, + "learning_rate": 9.886878187726435e-06, + "loss": 0.5845, + "step": 922 + }, + { + "epoch": 0.1, + "grad_norm": 2.064836397948735, + "learning_rate": 9.886521849718665e-06, + "loss": 0.614, + "step": 923 + }, + { + "epoch": 0.1, + "grad_norm": 1.8259649534315854, + "learning_rate": 9.886164957795792e-06, + "loss": 0.5673, + "step": 924 + }, + { + "epoch": 0.1, + "grad_norm": 1.7726574431040119, + "learning_rate": 9.885807511998269e-06, + "loss": 0.6505, + "step": 925 + }, + { + "epoch": 0.1, + "grad_norm": 1.9798645528179286, + "learning_rate": 9.885449512366617e-06, + "loss": 0.7142, + "step": 926 + }, + { + "epoch": 0.1, + "grad_norm": 1.7854622151133066, + "learning_rate": 9.885090958941416e-06, + "loss": 0.6493, + "step": 927 + }, + { + "epoch": 0.1, + "grad_norm": 1.9520396817553052, + "learning_rate": 9.884731851763313e-06, + "loss": 0.5635, + "step": 928 + }, + { + "epoch": 0.1, + "grad_norm": 2.002592135377459, + "learning_rate": 9.884372190873011e-06, + "loss": 0.7452, + "step": 929 + }, + { + "epoch": 0.1, + "grad_norm": 1.7836620496269653, + "learning_rate": 9.88401197631128e-06, + "loss": 0.6534, + "step": 930 + }, + { + "epoch": 0.1, + "grad_norm": 1.8592328756550733, + "learning_rate": 9.883651208118956e-06, + "loss": 0.6744, + "step": 931 + }, + { + "epoch": 0.1, + "grad_norm": 1.8997837483930826, + "learning_rate": 9.88328988633693e-06, + "loss": 0.7845, + "step": 932 + }, + { + "epoch": 0.1, + "grad_norm": 2.1336391293428023, + "learning_rate": 9.882928011006163e-06, + "loss": 0.5957, + "step": 933 + }, + { + "epoch": 0.1, + "grad_norm": 1.8537580965801874, + "learning_rate": 9.882565582167673e-06, + "loss": 0.6615, + "step": 934 + }, + { + "epoch": 0.1, + "grad_norm": 1.857686711318004, + "learning_rate": 9.882202599862545e-06, + "loss": 0.6753, + "step": 935 + }, + { + "epoch": 0.1, + "grad_norm": 1.9365530494057581, + "learning_rate": 9.881839064131925e-06, + "loss": 0.6454, + "step": 936 + }, + { + "epoch": 0.1, + "grad_norm": 2.0038963286408062, + "learning_rate": 9.88147497501702e-06, + "loss": 0.6664, + "step": 937 + }, + { + "epoch": 0.1, + "grad_norm": 2.3477040250832326, + "learning_rate": 9.881110332559104e-06, + "loss": 0.7557, + "step": 938 + }, + { + "epoch": 0.1, + "grad_norm": 2.0700186962715974, + "learning_rate": 9.88074513679951e-06, + "loss": 0.6809, + "step": 939 + }, + { + "epoch": 0.1, + "grad_norm": 1.7916439421510102, + "learning_rate": 9.880379387779637e-06, + "loss": 0.5874, + "step": 940 + }, + { + "epoch": 0.1, + "grad_norm": 2.0284864284873567, + "learning_rate": 9.880013085540942e-06, + "loss": 0.7315, + "step": 941 + }, + { + "epoch": 0.1, + "grad_norm": 2.079999840493714, + "learning_rate": 9.879646230124949e-06, + "loss": 0.7182, + "step": 942 + }, + { + "epoch": 0.1, + "grad_norm": 1.981513474191177, + "learning_rate": 9.879278821573241e-06, + "loss": 0.6527, + "step": 943 + }, + { + "epoch": 0.1, + "grad_norm": 2.082383292274721, + "learning_rate": 9.87891085992747e-06, + "loss": 0.6105, + "step": 944 + }, + { + "epoch": 0.1, + "grad_norm": 1.772693746109168, + "learning_rate": 9.878542345229342e-06, + "loss": 0.6322, + "step": 945 + }, + { + "epoch": 0.1, + "grad_norm": 1.7969774735537083, + "learning_rate": 9.878173277520636e-06, + "loss": 0.7354, + "step": 946 + }, + { + "epoch": 0.1, + "grad_norm": 1.7088315539160064, + "learning_rate": 9.877803656843182e-06, + "loss": 0.6111, + "step": 947 + }, + { + "epoch": 0.1, + "grad_norm": 2.0378349193456065, + "learning_rate": 9.877433483238881e-06, + "loss": 0.6576, + "step": 948 + }, + { + "epoch": 0.1, + "grad_norm": 1.9073383214298338, + "learning_rate": 9.877062756749694e-06, + "loss": 0.7225, + "step": 949 + }, + { + "epoch": 0.1, + "grad_norm": 1.9049859401892608, + "learning_rate": 9.876691477417644e-06, + "loss": 0.6158, + "step": 950 + }, + { + "epoch": 0.1, + "grad_norm": 1.8083745188555216, + "learning_rate": 9.876319645284821e-06, + "loss": 0.6074, + "step": 951 + }, + { + "epoch": 0.1, + "grad_norm": 2.0656455996647165, + "learning_rate": 9.875947260393371e-06, + "loss": 0.6854, + "step": 952 + }, + { + "epoch": 0.1, + "grad_norm": 2.1464444197411945, + "learning_rate": 9.875574322785508e-06, + "loss": 0.6364, + "step": 953 + }, + { + "epoch": 0.1, + "grad_norm": 1.8324837012367032, + "learning_rate": 9.875200832503505e-06, + "loss": 0.6757, + "step": 954 + }, + { + "epoch": 0.1, + "grad_norm": 1.9747983267467775, + "learning_rate": 9.8748267895897e-06, + "loss": 0.7113, + "step": 955 + }, + { + "epoch": 0.1, + "grad_norm": 1.7988094223419764, + "learning_rate": 9.874452194086492e-06, + "loss": 0.5959, + "step": 956 + }, + { + "epoch": 0.1, + "grad_norm": 2.297513189742376, + "learning_rate": 9.874077046036345e-06, + "loss": 0.71, + "step": 957 + }, + { + "epoch": 0.1, + "grad_norm": 2.0557038443885123, + "learning_rate": 9.873701345481784e-06, + "loss": 0.7535, + "step": 958 + }, + { + "epoch": 0.1, + "grad_norm": 2.1326435760926987, + "learning_rate": 9.873325092465395e-06, + "loss": 0.7603, + "step": 959 + }, + { + "epoch": 0.1, + "grad_norm": 1.980369170639488, + "learning_rate": 9.872948287029833e-06, + "loss": 0.6861, + "step": 960 + }, + { + "epoch": 0.1, + "grad_norm": 2.004930978839084, + "learning_rate": 9.872570929217804e-06, + "loss": 0.6836, + "step": 961 + }, + { + "epoch": 0.1, + "grad_norm": 2.0397373283212565, + "learning_rate": 9.87219301907209e-06, + "loss": 0.6653, + "step": 962 + }, + { + "epoch": 0.1, + "grad_norm": 1.932230567382412, + "learning_rate": 9.871814556635525e-06, + "loss": 0.674, + "step": 963 + }, + { + "epoch": 0.1, + "grad_norm": 1.9015175721561417, + "learning_rate": 9.871435541951011e-06, + "loss": 0.6211, + "step": 964 + }, + { + "epoch": 0.1, + "grad_norm": 1.8075315769084097, + "learning_rate": 9.871055975061514e-06, + "loss": 0.7, + "step": 965 + }, + { + "epoch": 0.1, + "grad_norm": 1.782486219766704, + "learning_rate": 9.870675856010058e-06, + "loss": 0.7312, + "step": 966 + }, + { + "epoch": 0.1, + "grad_norm": 1.9553211188111437, + "learning_rate": 9.87029518483973e-06, + "loss": 0.6937, + "step": 967 + }, + { + "epoch": 0.1, + "grad_norm": 2.077651871787083, + "learning_rate": 9.869913961593685e-06, + "loss": 0.7183, + "step": 968 + }, + { + "epoch": 0.1, + "grad_norm": 1.8307112275173123, + "learning_rate": 9.869532186315134e-06, + "loss": 0.6975, + "step": 969 + }, + { + "epoch": 0.1, + "grad_norm": 2.1319077853591404, + "learning_rate": 9.869149859047355e-06, + "loss": 0.6904, + "step": 970 + }, + { + "epoch": 0.1, + "grad_norm": 2.1805511045289396, + "learning_rate": 9.868766979833686e-06, + "loss": 0.6585, + "step": 971 + }, + { + "epoch": 0.1, + "grad_norm": 1.9877639959778115, + "learning_rate": 9.868383548717529e-06, + "loss": 0.6813, + "step": 972 + }, + { + "epoch": 0.1, + "grad_norm": 2.4334437231614343, + "learning_rate": 9.867999565742348e-06, + "loss": 0.7393, + "step": 973 + }, + { + "epoch": 0.1, + "grad_norm": 1.7359878345550932, + "learning_rate": 9.867615030951668e-06, + "loss": 0.6564, + "step": 974 + }, + { + "epoch": 0.1, + "grad_norm": 1.8375121930806249, + "learning_rate": 9.86722994438908e-06, + "loss": 0.7168, + "step": 975 + }, + { + "epoch": 0.1, + "grad_norm": 2.7151473512488256, + "learning_rate": 9.866844306098238e-06, + "loss": 0.6273, + "step": 976 + }, + { + "epoch": 0.1, + "grad_norm": 1.830403049133302, + "learning_rate": 9.866458116122852e-06, + "loss": 0.6368, + "step": 977 + }, + { + "epoch": 0.1, + "grad_norm": 1.970209094089418, + "learning_rate": 9.866071374506701e-06, + "loss": 0.769, + "step": 978 + }, + { + "epoch": 0.1, + "grad_norm": 2.141400584070093, + "learning_rate": 9.865684081293624e-06, + "loss": 0.679, + "step": 979 + }, + { + "epoch": 0.1, + "grad_norm": 1.690775547376312, + "learning_rate": 9.865296236527523e-06, + "loss": 0.6908, + "step": 980 + }, + { + "epoch": 0.1, + "grad_norm": 2.043177681315642, + "learning_rate": 9.86490784025236e-06, + "loss": 0.6868, + "step": 981 + }, + { + "epoch": 0.1, + "grad_norm": 1.833251578843401, + "learning_rate": 9.864518892512167e-06, + "loss": 0.6162, + "step": 982 + }, + { + "epoch": 0.1, + "grad_norm": 2.1456831160855403, + "learning_rate": 9.86412939335103e-06, + "loss": 0.7424, + "step": 983 + }, + { + "epoch": 0.1, + "grad_norm": 2.0893900408271007, + "learning_rate": 9.8637393428131e-06, + "loss": 0.6559, + "step": 984 + }, + { + "epoch": 0.1, + "grad_norm": 2.3334547598341455, + "learning_rate": 9.863348740942595e-06, + "loss": 0.6998, + "step": 985 + }, + { + "epoch": 0.1, + "grad_norm": 1.9053267391294475, + "learning_rate": 9.86295758778379e-06, + "loss": 0.5793, + "step": 986 + }, + { + "epoch": 0.1, + "grad_norm": 2.053265187490565, + "learning_rate": 9.862565883381025e-06, + "loss": 0.6087, + "step": 987 + }, + { + "epoch": 0.1, + "grad_norm": 1.8738984376187984, + "learning_rate": 9.862173627778699e-06, + "loss": 0.6344, + "step": 988 + }, + { + "epoch": 0.1, + "grad_norm": 1.9312070177429788, + "learning_rate": 9.861780821021282e-06, + "loss": 0.6113, + "step": 989 + }, + { + "epoch": 0.1, + "grad_norm": 2.0840209825648373, + "learning_rate": 9.861387463153295e-06, + "loss": 0.7107, + "step": 990 + }, + { + "epoch": 0.1, + "grad_norm": 2.053479103718914, + "learning_rate": 9.860993554219333e-06, + "loss": 0.6901, + "step": 991 + }, + { + "epoch": 0.1, + "grad_norm": 2.3013383389382795, + "learning_rate": 9.860599094264043e-06, + "loss": 0.6469, + "step": 992 + }, + { + "epoch": 0.1, + "grad_norm": 1.8821636541147546, + "learning_rate": 9.860204083332142e-06, + "loss": 0.7016, + "step": 993 + }, + { + "epoch": 0.1, + "grad_norm": 1.871488491257933, + "learning_rate": 9.859808521468404e-06, + "loss": 0.6663, + "step": 994 + }, + { + "epoch": 0.1, + "grad_norm": 1.8671302354097286, + "learning_rate": 9.859412408717672e-06, + "loss": 0.67, + "step": 995 + }, + { + "epoch": 0.1, + "grad_norm": 2.1881384088375335, + "learning_rate": 9.859015745124844e-06, + "loss": 0.5869, + "step": 996 + }, + { + "epoch": 0.1, + "grad_norm": 1.8839529894787386, + "learning_rate": 9.858618530734887e-06, + "loss": 0.6813, + "step": 997 + }, + { + "epoch": 0.1, + "grad_norm": 2.024864643623028, + "learning_rate": 9.858220765592828e-06, + "loss": 0.584, + "step": 998 + }, + { + "epoch": 0.1, + "grad_norm": 1.8817794170694133, + "learning_rate": 9.857822449743752e-06, + "loss": 0.6694, + "step": 999 + }, + { + "epoch": 0.1, + "grad_norm": 2.3342042069446327, + "learning_rate": 9.857423583232812e-06, + "loss": 0.8106, + "step": 1000 + }, + { + "epoch": 0.1, + "grad_norm": 1.7941891959216587, + "learning_rate": 9.857024166105224e-06, + "loss": 0.6335, + "step": 1001 + }, + { + "epoch": 0.1, + "grad_norm": 2.181704718868215, + "learning_rate": 9.856624198406262e-06, + "loss": 0.6525, + "step": 1002 + }, + { + "epoch": 0.1, + "grad_norm": 1.908208488414037, + "learning_rate": 9.856223680181267e-06, + "loss": 0.6317, + "step": 1003 + }, + { + "epoch": 0.1, + "grad_norm": 2.517562370686754, + "learning_rate": 9.855822611475636e-06, + "loss": 0.6677, + "step": 1004 + }, + { + "epoch": 0.1, + "grad_norm": 1.755038812115677, + "learning_rate": 9.855420992334836e-06, + "loss": 0.6095, + "step": 1005 + }, + { + "epoch": 0.1, + "grad_norm": 2.0790618780998917, + "learning_rate": 9.85501882280439e-06, + "loss": 0.7459, + "step": 1006 + }, + { + "epoch": 0.1, + "grad_norm": 1.9235236943259157, + "learning_rate": 9.85461610292989e-06, + "loss": 0.6692, + "step": 1007 + }, + { + "epoch": 0.1, + "grad_norm": 1.6253316092641337, + "learning_rate": 9.854212832756984e-06, + "loss": 0.6678, + "step": 1008 + }, + { + "epoch": 0.1, + "grad_norm": 2.117501284290816, + "learning_rate": 9.853809012331384e-06, + "loss": 0.7446, + "step": 1009 + }, + { + "epoch": 0.1, + "grad_norm": 2.156750435017462, + "learning_rate": 9.853404641698866e-06, + "loss": 0.6416, + "step": 1010 + }, + { + "epoch": 0.11, + "grad_norm": 1.7000772300008768, + "learning_rate": 9.85299972090527e-06, + "loss": 0.6848, + "step": 1011 + }, + { + "epoch": 0.11, + "grad_norm": 1.7492572101693626, + "learning_rate": 9.852594249996494e-06, + "loss": 0.5989, + "step": 1012 + }, + { + "epoch": 0.11, + "grad_norm": 1.9475380721456124, + "learning_rate": 9.852188229018502e-06, + "loss": 0.6094, + "step": 1013 + }, + { + "epoch": 0.11, + "grad_norm": 1.923991064008745, + "learning_rate": 9.851781658017317e-06, + "loss": 0.6641, + "step": 1014 + }, + { + "epoch": 0.11, + "grad_norm": 2.2032915876519743, + "learning_rate": 9.851374537039027e-06, + "loss": 0.6064, + "step": 1015 + }, + { + "epoch": 0.11, + "grad_norm": 2.112853405985047, + "learning_rate": 9.850966866129779e-06, + "loss": 0.6987, + "step": 1016 + }, + { + "epoch": 0.11, + "grad_norm": 2.257447982728473, + "learning_rate": 9.85055864533579e-06, + "loss": 0.6616, + "step": 1017 + }, + { + "epoch": 0.11, + "grad_norm": 1.9407354984677891, + "learning_rate": 9.85014987470333e-06, + "loss": 0.618, + "step": 1018 + }, + { + "epoch": 0.11, + "grad_norm": 2.0091396799893797, + "learning_rate": 9.849740554278738e-06, + "loss": 0.7182, + "step": 1019 + }, + { + "epoch": 0.11, + "grad_norm": 2.0832354129855615, + "learning_rate": 9.849330684108409e-06, + "loss": 0.6517, + "step": 1020 + }, + { + "epoch": 0.11, + "grad_norm": 2.1550178340245205, + "learning_rate": 9.848920264238809e-06, + "loss": 0.8038, + "step": 1021 + }, + { + "epoch": 0.11, + "grad_norm": 1.8398166027965528, + "learning_rate": 9.84850929471646e-06, + "loss": 0.5977, + "step": 1022 + }, + { + "epoch": 0.11, + "grad_norm": 2.4712742132470282, + "learning_rate": 9.848097775587944e-06, + "loss": 0.7969, + "step": 1023 + }, + { + "epoch": 0.11, + "grad_norm": 1.9800854460786408, + "learning_rate": 9.847685706899913e-06, + "loss": 0.6424, + "step": 1024 + }, + { + "epoch": 0.11, + "grad_norm": 2.160887630420676, + "learning_rate": 9.847273088699077e-06, + "loss": 0.5793, + "step": 1025 + }, + { + "epoch": 0.11, + "grad_norm": 1.7402490160067203, + "learning_rate": 9.846859921032207e-06, + "loss": 0.6306, + "step": 1026 + }, + { + "epoch": 0.11, + "grad_norm": 1.9858410317423765, + "learning_rate": 9.846446203946139e-06, + "loss": 0.6972, + "step": 1027 + }, + { + "epoch": 0.11, + "grad_norm": 1.7681585176033914, + "learning_rate": 9.84603193748777e-06, + "loss": 0.689, + "step": 1028 + }, + { + "epoch": 0.11, + "grad_norm": 1.8047938502571725, + "learning_rate": 9.84561712170406e-06, + "loss": 0.5521, + "step": 1029 + }, + { + "epoch": 0.11, + "grad_norm": 2.0028920908430434, + "learning_rate": 9.84520175664203e-06, + "loss": 0.693, + "step": 1030 + }, + { + "epoch": 0.11, + "grad_norm": 2.074323617732677, + "learning_rate": 9.844785842348764e-06, + "loss": 0.7734, + "step": 1031 + }, + { + "epoch": 0.11, + "grad_norm": 1.7873634371638705, + "learning_rate": 9.844369378871409e-06, + "loss": 0.5996, + "step": 1032 + }, + { + "epoch": 0.11, + "grad_norm": 1.8578006878258289, + "learning_rate": 9.84395236625717e-06, + "loss": 0.5908, + "step": 1033 + }, + { + "epoch": 0.11, + "grad_norm": 1.9206170170554828, + "learning_rate": 9.843534804553323e-06, + "loss": 0.6535, + "step": 1034 + }, + { + "epoch": 0.11, + "grad_norm": 1.8725064621105443, + "learning_rate": 9.843116693807199e-06, + "loss": 0.5975, + "step": 1035 + }, + { + "epoch": 0.11, + "grad_norm": 2.2188161598759315, + "learning_rate": 9.842698034066192e-06, + "loss": 0.7365, + "step": 1036 + }, + { + "epoch": 0.11, + "grad_norm": 1.921338516012938, + "learning_rate": 9.842278825377761e-06, + "loss": 0.7024, + "step": 1037 + }, + { + "epoch": 0.11, + "grad_norm": 2.2716226482375554, + "learning_rate": 9.841859067789425e-06, + "loss": 0.6117, + "step": 1038 + }, + { + "epoch": 0.11, + "grad_norm": 1.8260022112754144, + "learning_rate": 9.841438761348766e-06, + "loss": 0.5538, + "step": 1039 + }, + { + "epoch": 0.11, + "grad_norm": 1.8908219800095016, + "learning_rate": 9.841017906103427e-06, + "loss": 0.7356, + "step": 1040 + }, + { + "epoch": 0.11, + "grad_norm": 2.021688449597609, + "learning_rate": 9.840596502101117e-06, + "loss": 0.6077, + "step": 1041 + }, + { + "epoch": 0.11, + "grad_norm": 2.1832877381909452, + "learning_rate": 9.840174549389603e-06, + "loss": 0.6493, + "step": 1042 + }, + { + "epoch": 0.11, + "grad_norm": 2.174239711824139, + "learning_rate": 9.839752048016714e-06, + "loss": 0.633, + "step": 1043 + }, + { + "epoch": 0.11, + "grad_norm": 2.0734549390443435, + "learning_rate": 9.839328998030347e-06, + "loss": 0.7503, + "step": 1044 + }, + { + "epoch": 0.11, + "grad_norm": 1.7587601682627316, + "learning_rate": 9.838905399478453e-06, + "loss": 0.6627, + "step": 1045 + }, + { + "epoch": 0.11, + "grad_norm": 2.254845491574179, + "learning_rate": 9.838481252409053e-06, + "loss": 0.7206, + "step": 1046 + }, + { + "epoch": 0.11, + "grad_norm": 2.0682471001717357, + "learning_rate": 9.838056556870223e-06, + "loss": 0.6788, + "step": 1047 + }, + { + "epoch": 0.11, + "grad_norm": 1.9779991745810939, + "learning_rate": 9.837631312910107e-06, + "loss": 0.6749, + "step": 1048 + }, + { + "epoch": 0.11, + "grad_norm": 1.9718255224988028, + "learning_rate": 9.837205520576907e-06, + "loss": 0.6956, + "step": 1049 + }, + { + "epoch": 0.11, + "grad_norm": 1.8145469518157462, + "learning_rate": 9.836779179918891e-06, + "loss": 0.6854, + "step": 1050 + }, + { + "epoch": 0.11, + "grad_norm": 1.9375617890849886, + "learning_rate": 9.836352290984386e-06, + "loss": 0.6184, + "step": 1051 + }, + { + "epoch": 0.11, + "grad_norm": 1.9338031264153726, + "learning_rate": 9.835924853821783e-06, + "loss": 0.5955, + "step": 1052 + }, + { + "epoch": 0.11, + "grad_norm": 1.9522052631485587, + "learning_rate": 9.835496868479533e-06, + "loss": 0.5681, + "step": 1053 + }, + { + "epoch": 0.11, + "grad_norm": 1.8700014507217293, + "learning_rate": 9.835068335006153e-06, + "loss": 0.6794, + "step": 1054 + }, + { + "epoch": 0.11, + "grad_norm": 1.9556206221572114, + "learning_rate": 9.834639253450217e-06, + "loss": 0.5931, + "step": 1055 + }, + { + "epoch": 0.11, + "grad_norm": 2.1358836379045387, + "learning_rate": 9.834209623860367e-06, + "loss": 0.7265, + "step": 1056 + }, + { + "epoch": 0.11, + "grad_norm": 1.8328538254509168, + "learning_rate": 9.8337794462853e-06, + "loss": 0.6213, + "step": 1057 + }, + { + "epoch": 0.11, + "grad_norm": 1.9064305026210864, + "learning_rate": 9.833348720773782e-06, + "loss": 0.6486, + "step": 1058 + }, + { + "epoch": 0.11, + "grad_norm": 1.887679929354486, + "learning_rate": 9.832917447374637e-06, + "loss": 0.7484, + "step": 1059 + }, + { + "epoch": 0.11, + "grad_norm": 2.012238919175227, + "learning_rate": 9.832485626136751e-06, + "loss": 0.6625, + "step": 1060 + }, + { + "epoch": 0.11, + "grad_norm": 2.0410737340872145, + "learning_rate": 9.832053257109077e-06, + "loss": 0.7055, + "step": 1061 + }, + { + "epoch": 0.11, + "grad_norm": 1.7772590663935988, + "learning_rate": 9.831620340340626e-06, + "loss": 0.6214, + "step": 1062 + }, + { + "epoch": 0.11, + "grad_norm": 1.8966041179438153, + "learning_rate": 9.831186875880467e-06, + "loss": 0.6835, + "step": 1063 + }, + { + "epoch": 0.11, + "grad_norm": 1.8203185091064622, + "learning_rate": 9.830752863777741e-06, + "loss": 0.6473, + "step": 1064 + }, + { + "epoch": 0.11, + "grad_norm": 1.993896863641108, + "learning_rate": 9.830318304081642e-06, + "loss": 0.7101, + "step": 1065 + }, + { + "epoch": 0.11, + "grad_norm": 2.2042609661267836, + "learning_rate": 9.829883196841433e-06, + "loss": 0.6174, + "step": 1066 + }, + { + "epoch": 0.11, + "grad_norm": 2.0253946730893886, + "learning_rate": 9.829447542106434e-06, + "loss": 0.7096, + "step": 1067 + }, + { + "epoch": 0.11, + "grad_norm": 1.8318754462273392, + "learning_rate": 9.829011339926028e-06, + "loss": 0.662, + "step": 1068 + }, + { + "epoch": 0.11, + "grad_norm": 1.9474284559392272, + "learning_rate": 9.828574590349662e-06, + "loss": 0.7439, + "step": 1069 + }, + { + "epoch": 0.11, + "grad_norm": 2.07339427770848, + "learning_rate": 9.828137293426844e-06, + "loss": 0.7443, + "step": 1070 + }, + { + "epoch": 0.11, + "grad_norm": 1.9755027213686065, + "learning_rate": 9.827699449207147e-06, + "loss": 0.6352, + "step": 1071 + }, + { + "epoch": 0.11, + "grad_norm": 1.9897415411397834, + "learning_rate": 9.827261057740198e-06, + "loss": 0.7521, + "step": 1072 + }, + { + "epoch": 0.11, + "grad_norm": 2.04504258074226, + "learning_rate": 9.826822119075694e-06, + "loss": 0.7339, + "step": 1073 + }, + { + "epoch": 0.11, + "grad_norm": 2.2651880336477728, + "learning_rate": 9.826382633263392e-06, + "loss": 0.7575, + "step": 1074 + }, + { + "epoch": 0.11, + "grad_norm": 2.0751191341447877, + "learning_rate": 9.825942600353107e-06, + "loss": 0.6432, + "step": 1075 + }, + { + "epoch": 0.11, + "grad_norm": 1.9471220241261464, + "learning_rate": 9.825502020394724e-06, + "loss": 0.6165, + "step": 1076 + }, + { + "epoch": 0.11, + "grad_norm": 2.0197119177571525, + "learning_rate": 9.82506089343818e-06, + "loss": 0.6898, + "step": 1077 + }, + { + "epoch": 0.11, + "grad_norm": 2.2111513000318053, + "learning_rate": 9.824619219533482e-06, + "loss": 0.8225, + "step": 1078 + }, + { + "epoch": 0.11, + "grad_norm": 2.072255616698154, + "learning_rate": 9.824176998730698e-06, + "loss": 0.5451, + "step": 1079 + }, + { + "epoch": 0.11, + "grad_norm": 2.0203232264892574, + "learning_rate": 9.823734231079953e-06, + "loss": 0.6907, + "step": 1080 + }, + { + "epoch": 0.11, + "grad_norm": 1.9978646542749907, + "learning_rate": 9.823290916631438e-06, + "loss": 0.7582, + "step": 1081 + }, + { + "epoch": 0.11, + "grad_norm": 1.997655399908216, + "learning_rate": 9.822847055435407e-06, + "loss": 0.6675, + "step": 1082 + }, + { + "epoch": 0.11, + "grad_norm": 2.600826888649854, + "learning_rate": 9.822402647542173e-06, + "loss": 0.6995, + "step": 1083 + }, + { + "epoch": 0.11, + "grad_norm": 1.8108675661821911, + "learning_rate": 9.82195769300211e-06, + "loss": 0.7427, + "step": 1084 + }, + { + "epoch": 0.11, + "grad_norm": 2.0866686825001666, + "learning_rate": 9.821512191865662e-06, + "loss": 0.653, + "step": 1085 + }, + { + "epoch": 0.11, + "grad_norm": 1.9633193896611827, + "learning_rate": 9.821066144183322e-06, + "loss": 0.6388, + "step": 1086 + }, + { + "epoch": 0.11, + "grad_norm": 1.7284744819995272, + "learning_rate": 9.820619550005656e-06, + "loss": 0.5978, + "step": 1087 + }, + { + "epoch": 0.11, + "grad_norm": 1.9204126261783792, + "learning_rate": 9.820172409383288e-06, + "loss": 0.6652, + "step": 1088 + }, + { + "epoch": 0.11, + "grad_norm": 1.9950911805464604, + "learning_rate": 9.819724722366903e-06, + "loss": 0.5597, + "step": 1089 + }, + { + "epoch": 0.11, + "grad_norm": 1.9588977885654488, + "learning_rate": 9.81927648900725e-06, + "loss": 0.6817, + "step": 1090 + }, + { + "epoch": 0.11, + "grad_norm": 1.9645877951323167, + "learning_rate": 9.818827709355138e-06, + "loss": 0.6927, + "step": 1091 + }, + { + "epoch": 0.11, + "grad_norm": 1.9913277378900867, + "learning_rate": 9.818378383461438e-06, + "loss": 0.5648, + "step": 1092 + }, + { + "epoch": 0.11, + "grad_norm": 2.167093446329645, + "learning_rate": 9.817928511377085e-06, + "loss": 0.721, + "step": 1093 + }, + { + "epoch": 0.11, + "grad_norm": 2.006202235369134, + "learning_rate": 9.817478093153074e-06, + "loss": 0.6331, + "step": 1094 + }, + { + "epoch": 0.11, + "grad_norm": 1.8430866473192573, + "learning_rate": 9.817027128840462e-06, + "loss": 0.6632, + "step": 1095 + }, + { + "epoch": 0.11, + "grad_norm": 2.1072808477952307, + "learning_rate": 9.816575618490368e-06, + "loss": 0.677, + "step": 1096 + }, + { + "epoch": 0.11, + "grad_norm": 1.8674328585658524, + "learning_rate": 9.816123562153975e-06, + "loss": 0.6894, + "step": 1097 + }, + { + "epoch": 0.11, + "grad_norm": 1.9548566740415998, + "learning_rate": 9.815670959882526e-06, + "loss": 0.6354, + "step": 1098 + }, + { + "epoch": 0.11, + "grad_norm": 1.958662754789633, + "learning_rate": 9.815217811727325e-06, + "loss": 0.6949, + "step": 1099 + }, + { + "epoch": 0.11, + "grad_norm": 2.0309136527346965, + "learning_rate": 9.814764117739737e-06, + "loss": 0.6162, + "step": 1100 + }, + { + "epoch": 0.11, + "grad_norm": 1.866529266557843, + "learning_rate": 9.814309877971195e-06, + "loss": 0.7048, + "step": 1101 + }, + { + "epoch": 0.11, + "grad_norm": 1.7941888834949336, + "learning_rate": 9.813855092473189e-06, + "loss": 0.6341, + "step": 1102 + }, + { + "epoch": 0.11, + "grad_norm": 1.8740743463137512, + "learning_rate": 9.813399761297267e-06, + "loss": 0.6796, + "step": 1103 + }, + { + "epoch": 0.11, + "grad_norm": 1.9916876162956634, + "learning_rate": 9.81294388449505e-06, + "loss": 0.7041, + "step": 1104 + }, + { + "epoch": 0.11, + "grad_norm": 2.328375158894037, + "learning_rate": 9.812487462118207e-06, + "loss": 0.7484, + "step": 1105 + }, + { + "epoch": 0.11, + "grad_norm": 2.0851750400015714, + "learning_rate": 9.812030494218484e-06, + "loss": 0.6148, + "step": 1106 + }, + { + "epoch": 0.12, + "grad_norm": 2.0721317087262685, + "learning_rate": 9.811572980847674e-06, + "loss": 0.6666, + "step": 1107 + }, + { + "epoch": 0.12, + "grad_norm": 2.3735592824344534, + "learning_rate": 9.811114922057642e-06, + "loss": 0.7982, + "step": 1108 + }, + { + "epoch": 0.12, + "grad_norm": 2.0309673267983404, + "learning_rate": 9.810656317900312e-06, + "loss": 0.5905, + "step": 1109 + }, + { + "epoch": 0.12, + "grad_norm": 1.9784198177751062, + "learning_rate": 9.810197168427667e-06, + "loss": 0.7343, + "step": 1110 + }, + { + "epoch": 0.12, + "grad_norm": 1.8042340469617641, + "learning_rate": 9.809737473691758e-06, + "loss": 0.6254, + "step": 1111 + }, + { + "epoch": 0.12, + "grad_norm": 2.0968682452666974, + "learning_rate": 9.80927723374469e-06, + "loss": 0.6969, + "step": 1112 + }, + { + "epoch": 0.12, + "grad_norm": 2.354904562448494, + "learning_rate": 9.808816448638636e-06, + "loss": 0.7833, + "step": 1113 + }, + { + "epoch": 0.12, + "grad_norm": 2.163126833363332, + "learning_rate": 9.808355118425827e-06, + "loss": 0.6467, + "step": 1114 + }, + { + "epoch": 0.12, + "grad_norm": 2.0346429879262433, + "learning_rate": 9.807893243158562e-06, + "loss": 0.6335, + "step": 1115 + }, + { + "epoch": 0.12, + "grad_norm": 2.1759311495573788, + "learning_rate": 9.80743082288919e-06, + "loss": 0.7353, + "step": 1116 + }, + { + "epoch": 0.12, + "grad_norm": 1.8028313593352352, + "learning_rate": 9.806967857670135e-06, + "loss": 0.609, + "step": 1117 + }, + { + "epoch": 0.12, + "grad_norm": 1.7536865880903942, + "learning_rate": 9.806504347553874e-06, + "loss": 0.6642, + "step": 1118 + }, + { + "epoch": 0.12, + "grad_norm": 1.9892546388260939, + "learning_rate": 9.80604029259295e-06, + "loss": 0.7288, + "step": 1119 + }, + { + "epoch": 0.12, + "grad_norm": 1.9316832882932709, + "learning_rate": 9.805575692839964e-06, + "loss": 0.5975, + "step": 1120 + }, + { + "epoch": 0.12, + "grad_norm": 1.8501490970207397, + "learning_rate": 9.805110548347583e-06, + "loss": 0.7319, + "step": 1121 + }, + { + "epoch": 0.12, + "grad_norm": 1.9680500445329732, + "learning_rate": 9.804644859168534e-06, + "loss": 0.697, + "step": 1122 + }, + { + "epoch": 0.12, + "grad_norm": 2.109835278999002, + "learning_rate": 9.804178625355602e-06, + "loss": 0.6059, + "step": 1123 + }, + { + "epoch": 0.12, + "grad_norm": 2.145342051858608, + "learning_rate": 9.803711846961641e-06, + "loss": 0.7171, + "step": 1124 + }, + { + "epoch": 0.12, + "grad_norm": 2.088705701522735, + "learning_rate": 9.803244524039564e-06, + "loss": 0.745, + "step": 1125 + }, + { + "epoch": 0.12, + "grad_norm": 1.9112491216751644, + "learning_rate": 9.802776656642341e-06, + "loss": 0.6755, + "step": 1126 + }, + { + "epoch": 0.12, + "grad_norm": 2.317044885396738, + "learning_rate": 9.80230824482301e-06, + "loss": 0.7217, + "step": 1127 + }, + { + "epoch": 0.12, + "grad_norm": 2.359531160259817, + "learning_rate": 9.801839288634664e-06, + "loss": 0.7032, + "step": 1128 + }, + { + "epoch": 0.12, + "grad_norm": 1.8376679215636302, + "learning_rate": 9.801369788130468e-06, + "loss": 0.5589, + "step": 1129 + }, + { + "epoch": 0.12, + "grad_norm": 2.038462127646075, + "learning_rate": 9.800899743363638e-06, + "loss": 0.5897, + "step": 1130 + }, + { + "epoch": 0.12, + "grad_norm": 1.958505348413095, + "learning_rate": 9.80042915438746e-06, + "loss": 0.6568, + "step": 1131 + }, + { + "epoch": 0.12, + "grad_norm": 1.9417917565556433, + "learning_rate": 9.799958021255275e-06, + "loss": 0.7257, + "step": 1132 + }, + { + "epoch": 0.12, + "grad_norm": 1.9224737730859809, + "learning_rate": 9.799486344020488e-06, + "loss": 0.6253, + "step": 1133 + }, + { + "epoch": 0.12, + "grad_norm": 1.8633516811392656, + "learning_rate": 9.79901412273657e-06, + "loss": 0.6349, + "step": 1134 + }, + { + "epoch": 0.12, + "grad_norm": 2.020034769094531, + "learning_rate": 9.798541357457045e-06, + "loss": 0.5633, + "step": 1135 + }, + { + "epoch": 0.12, + "grad_norm": 1.9064146253836125, + "learning_rate": 9.79806804823551e-06, + "loss": 0.6964, + "step": 1136 + }, + { + "epoch": 0.12, + "grad_norm": 1.9919505999551952, + "learning_rate": 9.797594195125611e-06, + "loss": 0.5642, + "step": 1137 + }, + { + "epoch": 0.12, + "grad_norm": 2.09562120886189, + "learning_rate": 9.797119798181066e-06, + "loss": 0.6715, + "step": 1138 + }, + { + "epoch": 0.12, + "grad_norm": 1.8738420808946492, + "learning_rate": 9.796644857455648e-06, + "loss": 0.6187, + "step": 1139 + }, + { + "epoch": 0.12, + "grad_norm": 2.072085736447773, + "learning_rate": 9.796169373003194e-06, + "loss": 0.7333, + "step": 1140 + }, + { + "epoch": 0.12, + "grad_norm": 2.0711394615651577, + "learning_rate": 9.795693344877609e-06, + "loss": 0.697, + "step": 1141 + }, + { + "epoch": 0.12, + "grad_norm": 1.9046229443546778, + "learning_rate": 9.795216773132846e-06, + "loss": 0.7762, + "step": 1142 + }, + { + "epoch": 0.12, + "grad_norm": 1.9207903000528437, + "learning_rate": 9.794739657822929e-06, + "loss": 0.6793, + "step": 1143 + }, + { + "epoch": 0.12, + "grad_norm": 1.782458495479854, + "learning_rate": 9.794261999001944e-06, + "loss": 0.7346, + "step": 1144 + }, + { + "epoch": 0.12, + "grad_norm": 1.7754725000419709, + "learning_rate": 9.793783796724033e-06, + "loss": 0.5992, + "step": 1145 + }, + { + "epoch": 0.12, + "grad_norm": 1.8592470716968033, + "learning_rate": 9.793305051043407e-06, + "loss": 0.7468, + "step": 1146 + }, + { + "epoch": 0.12, + "grad_norm": 1.857487138917833, + "learning_rate": 9.792825762014333e-06, + "loss": 0.6613, + "step": 1147 + }, + { + "epoch": 0.12, + "grad_norm": 1.8265947454383122, + "learning_rate": 9.79234592969114e-06, + "loss": 0.6637, + "step": 1148 + }, + { + "epoch": 0.12, + "grad_norm": 1.9552167151750668, + "learning_rate": 9.79186555412822e-06, + "loss": 0.7084, + "step": 1149 + }, + { + "epoch": 0.12, + "grad_norm": 1.9545771112614327, + "learning_rate": 9.791384635380028e-06, + "loss": 0.7181, + "step": 1150 + }, + { + "epoch": 0.12, + "grad_norm": 1.958640881269104, + "learning_rate": 9.790903173501075e-06, + "loss": 0.6016, + "step": 1151 + }, + { + "epoch": 0.12, + "grad_norm": 1.9827078690692417, + "learning_rate": 9.790421168545942e-06, + "loss": 0.7443, + "step": 1152 + }, + { + "epoch": 0.12, + "grad_norm": 2.133727647342895, + "learning_rate": 9.789938620569265e-06, + "loss": 0.731, + "step": 1153 + }, + { + "epoch": 0.12, + "grad_norm": 2.2672150699453457, + "learning_rate": 9.789455529625743e-06, + "loss": 0.6824, + "step": 1154 + }, + { + "epoch": 0.12, + "grad_norm": 1.91456235430189, + "learning_rate": 9.788971895770138e-06, + "loss": 0.6286, + "step": 1155 + }, + { + "epoch": 0.12, + "grad_norm": 1.7757643292986403, + "learning_rate": 9.788487719057273e-06, + "loss": 0.614, + "step": 1156 + }, + { + "epoch": 0.12, + "grad_norm": 2.1250419908716687, + "learning_rate": 9.78800299954203e-06, + "loss": 0.6005, + "step": 1157 + }, + { + "epoch": 0.12, + "grad_norm": 1.8486976022057549, + "learning_rate": 9.787517737279357e-06, + "loss": 0.6048, + "step": 1158 + }, + { + "epoch": 0.12, + "grad_norm": 2.3506355338274845, + "learning_rate": 9.787031932324262e-06, + "loss": 0.7821, + "step": 1159 + }, + { + "epoch": 0.12, + "grad_norm": 1.8385308824731086, + "learning_rate": 9.78654558473181e-06, + "loss": 0.7732, + "step": 1160 + }, + { + "epoch": 0.12, + "grad_norm": 1.8744460433610264, + "learning_rate": 9.786058694557136e-06, + "loss": 0.7004, + "step": 1161 + }, + { + "epoch": 0.12, + "grad_norm": 1.7990251826604449, + "learning_rate": 9.785571261855429e-06, + "loss": 0.6008, + "step": 1162 + }, + { + "epoch": 0.12, + "grad_norm": 2.0387781703421046, + "learning_rate": 9.78508328668194e-06, + "loss": 0.7354, + "step": 1163 + }, + { + "epoch": 0.12, + "grad_norm": 1.8379294386830753, + "learning_rate": 9.784594769091989e-06, + "loss": 0.6562, + "step": 1164 + }, + { + "epoch": 0.12, + "grad_norm": 1.951858093945663, + "learning_rate": 9.78410570914095e-06, + "loss": 0.5601, + "step": 1165 + }, + { + "epoch": 0.12, + "grad_norm": 1.963636595805986, + "learning_rate": 9.783616106884258e-06, + "loss": 0.7216, + "step": 1166 + }, + { + "epoch": 0.12, + "grad_norm": 2.0298983330028193, + "learning_rate": 9.783125962377416e-06, + "loss": 0.6501, + "step": 1167 + }, + { + "epoch": 0.12, + "grad_norm": 2.273467968545706, + "learning_rate": 9.782635275675983e-06, + "loss": 0.7842, + "step": 1168 + }, + { + "epoch": 0.12, + "grad_norm": 1.872811853225732, + "learning_rate": 9.78214404683558e-06, + "loss": 0.6591, + "step": 1169 + }, + { + "epoch": 0.12, + "grad_norm": 1.8397528593162067, + "learning_rate": 9.781652275911894e-06, + "loss": 0.6264, + "step": 1170 + }, + { + "epoch": 0.12, + "grad_norm": 1.8859285853928063, + "learning_rate": 9.781159962960667e-06, + "loss": 0.5893, + "step": 1171 + }, + { + "epoch": 0.12, + "grad_norm": 1.905973149180084, + "learning_rate": 9.780667108037706e-06, + "loss": 0.6312, + "step": 1172 + }, + { + "epoch": 0.12, + "grad_norm": 1.9716374922186313, + "learning_rate": 9.78017371119888e-06, + "loss": 0.6401, + "step": 1173 + }, + { + "epoch": 0.12, + "grad_norm": 1.9769244317036858, + "learning_rate": 9.77967977250012e-06, + "loss": 0.6444, + "step": 1174 + }, + { + "epoch": 0.12, + "grad_norm": 2.0872679872136737, + "learning_rate": 9.779185291997412e-06, + "loss": 0.761, + "step": 1175 + }, + { + "epoch": 0.12, + "grad_norm": 1.9107214967319497, + "learning_rate": 9.778690269746811e-06, + "loss": 0.6832, + "step": 1176 + }, + { + "epoch": 0.12, + "grad_norm": 1.7386000757691096, + "learning_rate": 9.778194705804431e-06, + "loss": 0.619, + "step": 1177 + }, + { + "epoch": 0.12, + "grad_norm": 2.181891137096523, + "learning_rate": 9.777698600226446e-06, + "loss": 0.7406, + "step": 1178 + }, + { + "epoch": 0.12, + "grad_norm": 2.012111320499033, + "learning_rate": 9.777201953069092e-06, + "loss": 0.8356, + "step": 1179 + }, + { + "epoch": 0.12, + "grad_norm": 2.2266386752482403, + "learning_rate": 9.776704764388668e-06, + "loss": 0.7769, + "step": 1180 + }, + { + "epoch": 0.12, + "grad_norm": 2.0481630064459275, + "learning_rate": 9.776207034241534e-06, + "loss": 0.706, + "step": 1181 + }, + { + "epoch": 0.12, + "grad_norm": 2.1147866234960637, + "learning_rate": 9.775708762684106e-06, + "loss": 0.6764, + "step": 1182 + }, + { + "epoch": 0.12, + "grad_norm": 1.9980720033432766, + "learning_rate": 9.775209949772872e-06, + "loss": 0.5919, + "step": 1183 + }, + { + "epoch": 0.12, + "grad_norm": 1.9142784484222324, + "learning_rate": 9.774710595564371e-06, + "loss": 0.7484, + "step": 1184 + }, + { + "epoch": 0.12, + "grad_norm": 2.0104955568224896, + "learning_rate": 9.774210700115209e-06, + "loss": 0.6362, + "step": 1185 + }, + { + "epoch": 0.12, + "grad_norm": 2.060715102831267, + "learning_rate": 9.773710263482053e-06, + "loss": 0.8285, + "step": 1186 + }, + { + "epoch": 0.12, + "grad_norm": 2.375817415064652, + "learning_rate": 9.77320928572163e-06, + "loss": 0.6754, + "step": 1187 + }, + { + "epoch": 0.12, + "grad_norm": 1.9904053183850345, + "learning_rate": 9.772707766890726e-06, + "loss": 0.5919, + "step": 1188 + }, + { + "epoch": 0.12, + "grad_norm": 1.8189149884702953, + "learning_rate": 9.772205707046194e-06, + "loss": 0.6269, + "step": 1189 + }, + { + "epoch": 0.12, + "grad_norm": 2.0893307362744165, + "learning_rate": 9.771703106244945e-06, + "loss": 0.6025, + "step": 1190 + }, + { + "epoch": 0.12, + "grad_norm": 1.872217178188383, + "learning_rate": 9.77119996454395e-06, + "loss": 0.6917, + "step": 1191 + }, + { + "epoch": 0.12, + "grad_norm": 1.8516747196972398, + "learning_rate": 9.770696282000245e-06, + "loss": 0.6101, + "step": 1192 + }, + { + "epoch": 0.12, + "grad_norm": 1.7350140128893028, + "learning_rate": 9.770192058670924e-06, + "loss": 0.7053, + "step": 1193 + }, + { + "epoch": 0.12, + "grad_norm": 2.124522427032116, + "learning_rate": 9.769687294613143e-06, + "loss": 0.7225, + "step": 1194 + }, + { + "epoch": 0.12, + "grad_norm": 1.6443356705537862, + "learning_rate": 9.769181989884123e-06, + "loss": 0.6327, + "step": 1195 + }, + { + "epoch": 0.12, + "grad_norm": 1.8787937842514755, + "learning_rate": 9.76867614454114e-06, + "loss": 0.685, + "step": 1196 + }, + { + "epoch": 0.12, + "grad_norm": 2.054491694078201, + "learning_rate": 9.768169758641535e-06, + "loss": 0.6185, + "step": 1197 + }, + { + "epoch": 0.12, + "grad_norm": 2.0466021116932374, + "learning_rate": 9.767662832242711e-06, + "loss": 0.722, + "step": 1198 + }, + { + "epoch": 0.12, + "grad_norm": 1.9725740768686584, + "learning_rate": 9.76715536540213e-06, + "loss": 0.6419, + "step": 1199 + }, + { + "epoch": 0.12, + "grad_norm": 1.8607471593989677, + "learning_rate": 9.766647358177317e-06, + "loss": 0.5834, + "step": 1200 + }, + { + "epoch": 0.12, + "grad_norm": 1.8176363699369578, + "learning_rate": 9.766138810625855e-06, + "loss": 0.5901, + "step": 1201 + }, + { + "epoch": 0.12, + "grad_norm": 1.902595364021203, + "learning_rate": 9.765629722805394e-06, + "loss": 0.642, + "step": 1202 + }, + { + "epoch": 0.13, + "grad_norm": 1.9999598335834559, + "learning_rate": 9.765120094773641e-06, + "loss": 0.7298, + "step": 1203 + }, + { + "epoch": 0.13, + "grad_norm": 1.8506077363965423, + "learning_rate": 9.764609926588365e-06, + "loss": 0.6875, + "step": 1204 + }, + { + "epoch": 0.13, + "grad_norm": 2.0798876015732035, + "learning_rate": 9.764099218307396e-06, + "loss": 0.7775, + "step": 1205 + }, + { + "epoch": 0.13, + "grad_norm": 1.8982097105923708, + "learning_rate": 9.763587969988626e-06, + "loss": 0.6821, + "step": 1206 + }, + { + "epoch": 0.13, + "grad_norm": 2.051160655228281, + "learning_rate": 9.763076181690008e-06, + "loss": 0.6225, + "step": 1207 + }, + { + "epoch": 0.13, + "grad_norm": 1.7605857993358935, + "learning_rate": 9.762563853469557e-06, + "loss": 0.6409, + "step": 1208 + }, + { + "epoch": 0.13, + "grad_norm": 1.9105078969357518, + "learning_rate": 9.762050985385348e-06, + "loss": 0.5791, + "step": 1209 + }, + { + "epoch": 0.13, + "grad_norm": 1.7956662172842133, + "learning_rate": 9.761537577495514e-06, + "loss": 0.6378, + "step": 1210 + }, + { + "epoch": 0.13, + "grad_norm": 1.9382003285098017, + "learning_rate": 9.761023629858258e-06, + "loss": 0.5976, + "step": 1211 + }, + { + "epoch": 0.13, + "grad_norm": 1.7644171198864156, + "learning_rate": 9.760509142531837e-06, + "loss": 0.6155, + "step": 1212 + }, + { + "epoch": 0.13, + "grad_norm": 3.0937484763695258, + "learning_rate": 9.759994115574571e-06, + "loss": 0.7084, + "step": 1213 + }, + { + "epoch": 0.13, + "grad_norm": 2.145729104645392, + "learning_rate": 9.759478549044839e-06, + "loss": 0.6385, + "step": 1214 + }, + { + "epoch": 0.13, + "grad_norm": 1.7488025165240058, + "learning_rate": 9.758962443001086e-06, + "loss": 0.6194, + "step": 1215 + }, + { + "epoch": 0.13, + "grad_norm": 2.008568905377275, + "learning_rate": 9.758445797501816e-06, + "loss": 0.5682, + "step": 1216 + }, + { + "epoch": 0.13, + "grad_norm": 1.998000158709324, + "learning_rate": 9.757928612605592e-06, + "loss": 0.6302, + "step": 1217 + }, + { + "epoch": 0.13, + "grad_norm": 1.929368096839565, + "learning_rate": 9.75741088837104e-06, + "loss": 0.7117, + "step": 1218 + }, + { + "epoch": 0.13, + "grad_norm": 1.7873401858196971, + "learning_rate": 9.756892624856848e-06, + "loss": 0.5734, + "step": 1219 + }, + { + "epoch": 0.13, + "grad_norm": 1.8865122527560079, + "learning_rate": 9.756373822121762e-06, + "loss": 0.6212, + "step": 1220 + }, + { + "epoch": 0.13, + "grad_norm": 2.0688832226726075, + "learning_rate": 9.755854480224596e-06, + "loss": 0.6526, + "step": 1221 + }, + { + "epoch": 0.13, + "grad_norm": 1.8863437615329601, + "learning_rate": 9.755334599224215e-06, + "loss": 0.6192, + "step": 1222 + }, + { + "epoch": 0.13, + "grad_norm": 2.0972603354248753, + "learning_rate": 9.754814179179552e-06, + "loss": 0.6935, + "step": 1223 + }, + { + "epoch": 0.13, + "grad_norm": 1.8672839080556047, + "learning_rate": 9.754293220149602e-06, + "loss": 0.6415, + "step": 1224 + }, + { + "epoch": 0.13, + "grad_norm": 2.1056721713435014, + "learning_rate": 9.753771722193417e-06, + "loss": 0.6053, + "step": 1225 + }, + { + "epoch": 0.13, + "grad_norm": 1.9996835954094754, + "learning_rate": 9.753249685370112e-06, + "loss": 0.7002, + "step": 1226 + }, + { + "epoch": 0.13, + "grad_norm": 2.183109070932942, + "learning_rate": 9.752727109738859e-06, + "loss": 0.6372, + "step": 1227 + }, + { + "epoch": 0.13, + "grad_norm": 1.9844165571120553, + "learning_rate": 9.752203995358902e-06, + "loss": 0.6659, + "step": 1228 + }, + { + "epoch": 0.13, + "grad_norm": 1.910791627714843, + "learning_rate": 9.751680342289536e-06, + "loss": 0.6214, + "step": 1229 + }, + { + "epoch": 0.13, + "grad_norm": 1.9193510028750618, + "learning_rate": 9.751156150590117e-06, + "loss": 0.6494, + "step": 1230 + }, + { + "epoch": 0.13, + "grad_norm": 2.2006190540580146, + "learning_rate": 9.750631420320069e-06, + "loss": 0.6964, + "step": 1231 + }, + { + "epoch": 0.13, + "grad_norm": 1.8660242845944797, + "learning_rate": 9.750106151538871e-06, + "loss": 0.6691, + "step": 1232 + }, + { + "epoch": 0.13, + "grad_norm": 1.7139198365306583, + "learning_rate": 9.749580344306067e-06, + "loss": 0.5357, + "step": 1233 + }, + { + "epoch": 0.13, + "grad_norm": 1.9794675828758101, + "learning_rate": 9.74905399868126e-06, + "loss": 0.6414, + "step": 1234 + }, + { + "epoch": 0.13, + "grad_norm": 2.053972284777646, + "learning_rate": 9.748527114724111e-06, + "loss": 0.6585, + "step": 1235 + }, + { + "epoch": 0.13, + "grad_norm": 1.8160635181241953, + "learning_rate": 9.74799969249435e-06, + "loss": 0.5822, + "step": 1236 + }, + { + "epoch": 0.13, + "grad_norm": 1.9594988518586947, + "learning_rate": 9.74747173205176e-06, + "loss": 0.6526, + "step": 1237 + }, + { + "epoch": 0.13, + "grad_norm": 2.1104409643045696, + "learning_rate": 9.74694323345619e-06, + "loss": 0.6936, + "step": 1238 + }, + { + "epoch": 0.13, + "grad_norm": 1.9216660825585377, + "learning_rate": 9.746414196767548e-06, + "loss": 0.6798, + "step": 1239 + }, + { + "epoch": 0.13, + "grad_norm": 2.1185106363995607, + "learning_rate": 9.745884622045803e-06, + "loss": 0.6708, + "step": 1240 + }, + { + "epoch": 0.13, + "grad_norm": 1.7835909517236361, + "learning_rate": 9.745354509350983e-06, + "loss": 0.7385, + "step": 1241 + }, + { + "epoch": 0.13, + "grad_norm": 2.3602417280460326, + "learning_rate": 9.744823858743186e-06, + "loss": 0.643, + "step": 1242 + }, + { + "epoch": 0.13, + "grad_norm": 1.837677043941556, + "learning_rate": 9.744292670282557e-06, + "loss": 0.6202, + "step": 1243 + }, + { + "epoch": 0.13, + "grad_norm": 1.9538243258295147, + "learning_rate": 9.743760944029313e-06, + "loss": 0.6141, + "step": 1244 + }, + { + "epoch": 0.13, + "grad_norm": 1.7094725933192672, + "learning_rate": 9.743228680043729e-06, + "loss": 0.5873, + "step": 1245 + }, + { + "epoch": 0.13, + "grad_norm": 1.898722893037348, + "learning_rate": 9.742695878386136e-06, + "loss": 0.6622, + "step": 1246 + }, + { + "epoch": 0.13, + "grad_norm": 2.3475080247477265, + "learning_rate": 9.742162539116936e-06, + "loss": 0.6986, + "step": 1247 + }, + { + "epoch": 0.13, + "grad_norm": 1.8734682037675061, + "learning_rate": 9.74162866229658e-06, + "loss": 0.6931, + "step": 1248 + }, + { + "epoch": 0.13, + "grad_norm": 1.8891784901953723, + "learning_rate": 9.74109424798559e-06, + "loss": 0.6264, + "step": 1249 + }, + { + "epoch": 0.13, + "grad_norm": 2.0263925697740683, + "learning_rate": 9.740559296244543e-06, + "loss": 0.6856, + "step": 1250 + }, + { + "epoch": 0.13, + "grad_norm": 2.2614934669273516, + "learning_rate": 9.74002380713408e-06, + "loss": 0.6658, + "step": 1251 + }, + { + "epoch": 0.13, + "grad_norm": 1.9590626287273527, + "learning_rate": 9.7394877807149e-06, + "loss": 0.6018, + "step": 1252 + }, + { + "epoch": 0.13, + "grad_norm": 1.8968171625714494, + "learning_rate": 9.738951217047767e-06, + "loss": 0.6378, + "step": 1253 + }, + { + "epoch": 0.13, + "grad_norm": 2.7508689880032957, + "learning_rate": 9.738414116193503e-06, + "loss": 0.7191, + "step": 1254 + }, + { + "epoch": 0.13, + "grad_norm": 2.1485846944757117, + "learning_rate": 9.737876478212989e-06, + "loss": 0.7119, + "step": 1255 + }, + { + "epoch": 0.13, + "grad_norm": 2.0333547103913663, + "learning_rate": 9.737338303167173e-06, + "loss": 0.6638, + "step": 1256 + }, + { + "epoch": 0.13, + "grad_norm": 1.8862361580061715, + "learning_rate": 9.736799591117057e-06, + "loss": 0.7242, + "step": 1257 + }, + { + "epoch": 0.13, + "grad_norm": 1.654552718644028, + "learning_rate": 9.73626034212371e-06, + "loss": 0.6576, + "step": 1258 + }, + { + "epoch": 0.13, + "grad_norm": 1.8740868267530033, + "learning_rate": 9.735720556248256e-06, + "loss": 0.7337, + "step": 1259 + }, + { + "epoch": 0.13, + "grad_norm": 1.9350463131930447, + "learning_rate": 9.735180233551884e-06, + "loss": 0.823, + "step": 1260 + }, + { + "epoch": 0.13, + "grad_norm": 2.0570724020864093, + "learning_rate": 9.734639374095845e-06, + "loss": 0.6597, + "step": 1261 + }, + { + "epoch": 0.13, + "grad_norm": 1.8547501290708488, + "learning_rate": 9.734097977941446e-06, + "loss": 0.6362, + "step": 1262 + }, + { + "epoch": 0.13, + "grad_norm": 1.9035543204587644, + "learning_rate": 9.733556045150057e-06, + "loss": 0.6648, + "step": 1263 + }, + { + "epoch": 0.13, + "grad_norm": 2.0108959931171557, + "learning_rate": 9.73301357578311e-06, + "loss": 0.7228, + "step": 1264 + }, + { + "epoch": 0.13, + "grad_norm": 1.865013262710532, + "learning_rate": 9.732470569902097e-06, + "loss": 0.7365, + "step": 1265 + }, + { + "epoch": 0.13, + "grad_norm": 2.217716624827927, + "learning_rate": 9.731927027568569e-06, + "loss": 0.7753, + "step": 1266 + }, + { + "epoch": 0.13, + "grad_norm": 1.7979547620947482, + "learning_rate": 9.731382948844143e-06, + "loss": 0.5906, + "step": 1267 + }, + { + "epoch": 0.13, + "grad_norm": 1.7508977673648394, + "learning_rate": 9.730838333790493e-06, + "loss": 0.6117, + "step": 1268 + }, + { + "epoch": 0.13, + "grad_norm": 1.6893969959243762, + "learning_rate": 9.73029318246935e-06, + "loss": 0.6358, + "step": 1269 + }, + { + "epoch": 0.13, + "grad_norm": 2.166240330404318, + "learning_rate": 9.729747494942515e-06, + "loss": 0.6924, + "step": 1270 + }, + { + "epoch": 0.13, + "grad_norm": 2.000608514191742, + "learning_rate": 9.729201271271842e-06, + "loss": 0.7347, + "step": 1271 + }, + { + "epoch": 0.13, + "grad_norm": 2.103793789959187, + "learning_rate": 9.728654511519251e-06, + "loss": 0.7285, + "step": 1272 + }, + { + "epoch": 0.13, + "grad_norm": 1.8409733568718099, + "learning_rate": 9.728107215746717e-06, + "loss": 0.692, + "step": 1273 + }, + { + "epoch": 0.13, + "grad_norm": 1.963562475533344, + "learning_rate": 9.72755938401628e-06, + "loss": 0.6062, + "step": 1274 + }, + { + "epoch": 0.13, + "grad_norm": 1.8176381153691845, + "learning_rate": 9.727011016390044e-06, + "loss": 0.7648, + "step": 1275 + }, + { + "epoch": 0.13, + "grad_norm": 1.8358596377570153, + "learning_rate": 9.726462112930165e-06, + "loss": 0.6272, + "step": 1276 + }, + { + "epoch": 0.13, + "grad_norm": 1.8482133621035386, + "learning_rate": 9.725912673698863e-06, + "loss": 0.6437, + "step": 1277 + }, + { + "epoch": 0.13, + "grad_norm": 1.7093243741213524, + "learning_rate": 9.725362698758425e-06, + "loss": 0.5643, + "step": 1278 + }, + { + "epoch": 0.13, + "grad_norm": 1.9313380353034446, + "learning_rate": 9.72481218817119e-06, + "loss": 0.7326, + "step": 1279 + }, + { + "epoch": 0.13, + "grad_norm": 2.0110578630440696, + "learning_rate": 9.724261141999564e-06, + "loss": 0.7809, + "step": 1280 + }, + { + "epoch": 0.13, + "grad_norm": 2.0212582494727056, + "learning_rate": 9.723709560306009e-06, + "loss": 0.7539, + "step": 1281 + }, + { + "epoch": 0.13, + "grad_norm": 1.956126223537327, + "learning_rate": 9.723157443153053e-06, + "loss": 0.6653, + "step": 1282 + }, + { + "epoch": 0.13, + "grad_norm": 1.8182910112574044, + "learning_rate": 9.722604790603279e-06, + "loss": 0.7183, + "step": 1283 + }, + { + "epoch": 0.13, + "grad_norm": 2.108210363465132, + "learning_rate": 9.722051602719333e-06, + "loss": 0.712, + "step": 1284 + }, + { + "epoch": 0.13, + "grad_norm": 1.9131048983229353, + "learning_rate": 9.721497879563924e-06, + "loss": 0.7338, + "step": 1285 + }, + { + "epoch": 0.13, + "grad_norm": 1.949411903606035, + "learning_rate": 9.72094362119982e-06, + "loss": 0.6708, + "step": 1286 + }, + { + "epoch": 0.13, + "grad_norm": 2.0746178969752416, + "learning_rate": 9.720388827689843e-06, + "loss": 0.563, + "step": 1287 + }, + { + "epoch": 0.13, + "grad_norm": 2.190230314831356, + "learning_rate": 9.719833499096891e-06, + "loss": 0.6351, + "step": 1288 + }, + { + "epoch": 0.13, + "grad_norm": 1.8460044124762562, + "learning_rate": 9.71927763548391e-06, + "loss": 0.6958, + "step": 1289 + }, + { + "epoch": 0.13, + "grad_norm": 1.6752039016736493, + "learning_rate": 9.718721236913909e-06, + "loss": 0.5744, + "step": 1290 + }, + { + "epoch": 0.13, + "grad_norm": 1.9927244031221076, + "learning_rate": 9.718164303449961e-06, + "loss": 0.7696, + "step": 1291 + }, + { + "epoch": 0.13, + "grad_norm": 1.8995517099170962, + "learning_rate": 9.717606835155195e-06, + "loss": 0.6031, + "step": 1292 + }, + { + "epoch": 0.13, + "grad_norm": 1.940650266443028, + "learning_rate": 9.717048832092806e-06, + "loss": 0.6741, + "step": 1293 + }, + { + "epoch": 0.13, + "grad_norm": 1.578177156458987, + "learning_rate": 9.716490294326046e-06, + "loss": 0.5341, + "step": 1294 + }, + { + "epoch": 0.13, + "grad_norm": 2.3682750170322975, + "learning_rate": 9.715931221918227e-06, + "loss": 0.6361, + "step": 1295 + }, + { + "epoch": 0.13, + "grad_norm": 1.8435349334772175, + "learning_rate": 9.715371614932725e-06, + "loss": 0.6257, + "step": 1296 + }, + { + "epoch": 0.13, + "grad_norm": 1.9264375579621833, + "learning_rate": 9.714811473432973e-06, + "loss": 0.7395, + "step": 1297 + }, + { + "epoch": 0.13, + "grad_norm": 1.9461581602936884, + "learning_rate": 9.714250797482468e-06, + "loss": 0.6327, + "step": 1298 + }, + { + "epoch": 0.14, + "grad_norm": 2.1144212881507425, + "learning_rate": 9.713689587144762e-06, + "loss": 0.6785, + "step": 1299 + }, + { + "epoch": 0.14, + "grad_norm": 1.6928260272004876, + "learning_rate": 9.713127842483476e-06, + "loss": 0.5079, + "step": 1300 + }, + { + "epoch": 0.14, + "grad_norm": 2.3246045980360783, + "learning_rate": 9.712565563562286e-06, + "loss": 0.7336, + "step": 1301 + }, + { + "epoch": 0.14, + "grad_norm": 1.7227636255508532, + "learning_rate": 9.712002750444926e-06, + "loss": 0.7164, + "step": 1302 + }, + { + "epoch": 0.14, + "grad_norm": 1.8982343901775387, + "learning_rate": 9.7114394031952e-06, + "loss": 0.708, + "step": 1303 + }, + { + "epoch": 0.14, + "grad_norm": 2.1258743453940028, + "learning_rate": 9.71087552187696e-06, + "loss": 0.7317, + "step": 1304 + }, + { + "epoch": 0.14, + "grad_norm": 1.9721335267277356, + "learning_rate": 9.710311106554132e-06, + "loss": 0.6371, + "step": 1305 + }, + { + "epoch": 0.14, + "grad_norm": 1.7625782159118968, + "learning_rate": 9.70974615729069e-06, + "loss": 0.5839, + "step": 1306 + }, + { + "epoch": 0.14, + "grad_norm": 2.4807222261088127, + "learning_rate": 9.709180674150677e-06, + "loss": 0.7081, + "step": 1307 + }, + { + "epoch": 0.14, + "grad_norm": 1.9903239990967319, + "learning_rate": 9.708614657198194e-06, + "loss": 0.6491, + "step": 1308 + }, + { + "epoch": 0.14, + "grad_norm": 1.925530757651826, + "learning_rate": 9.7080481064974e-06, + "loss": 0.7496, + "step": 1309 + }, + { + "epoch": 0.14, + "grad_norm": 1.9292533506628393, + "learning_rate": 9.70748102211252e-06, + "loss": 0.6853, + "step": 1310 + }, + { + "epoch": 0.14, + "grad_norm": 1.9690106352432073, + "learning_rate": 9.706913404107832e-06, + "loss": 0.5786, + "step": 1311 + }, + { + "epoch": 0.14, + "grad_norm": 2.0319183887210044, + "learning_rate": 9.706345252547681e-06, + "loss": 0.6004, + "step": 1312 + }, + { + "epoch": 0.14, + "grad_norm": 1.9599275382294734, + "learning_rate": 9.705776567496473e-06, + "loss": 0.7236, + "step": 1313 + }, + { + "epoch": 0.14, + "grad_norm": 1.9230052599444754, + "learning_rate": 9.705207349018668e-06, + "loss": 0.708, + "step": 1314 + }, + { + "epoch": 0.14, + "grad_norm": 1.8972194347454603, + "learning_rate": 9.70463759717879e-06, + "loss": 0.716, + "step": 1315 + }, + { + "epoch": 0.14, + "grad_norm": 2.010675575041699, + "learning_rate": 9.704067312041426e-06, + "loss": 0.6905, + "step": 1316 + }, + { + "epoch": 0.14, + "grad_norm": 2.2746498328993505, + "learning_rate": 9.703496493671219e-06, + "loss": 0.5917, + "step": 1317 + }, + { + "epoch": 0.14, + "grad_norm": 1.9221483056550144, + "learning_rate": 9.702925142132876e-06, + "loss": 0.6797, + "step": 1318 + }, + { + "epoch": 0.14, + "grad_norm": 2.0044204535629047, + "learning_rate": 9.702353257491162e-06, + "loss": 0.7663, + "step": 1319 + }, + { + "epoch": 0.14, + "grad_norm": 2.133217438263875, + "learning_rate": 9.701780839810903e-06, + "loss": 0.7136, + "step": 1320 + }, + { + "epoch": 0.14, + "grad_norm": 1.84335445808742, + "learning_rate": 9.701207889156989e-06, + "loss": 0.6814, + "step": 1321 + }, + { + "epoch": 0.14, + "grad_norm": 1.984334565307144, + "learning_rate": 9.700634405594364e-06, + "loss": 0.6403, + "step": 1322 + }, + { + "epoch": 0.14, + "grad_norm": 1.827908300013016, + "learning_rate": 9.700060389188035e-06, + "loss": 0.6725, + "step": 1323 + }, + { + "epoch": 0.14, + "grad_norm": 2.009494835552701, + "learning_rate": 9.699485840003072e-06, + "loss": 0.6841, + "step": 1324 + }, + { + "epoch": 0.14, + "grad_norm": 1.91704570730664, + "learning_rate": 9.698910758104603e-06, + "loss": 0.6762, + "step": 1325 + }, + { + "epoch": 0.14, + "grad_norm": 2.005127161184001, + "learning_rate": 9.698335143557818e-06, + "loss": 0.7461, + "step": 1326 + }, + { + "epoch": 0.14, + "grad_norm": 2.1535407683860095, + "learning_rate": 9.697758996427962e-06, + "loss": 0.7321, + "step": 1327 + }, + { + "epoch": 0.14, + "grad_norm": 1.9080003028880403, + "learning_rate": 9.69718231678035e-06, + "loss": 0.672, + "step": 1328 + }, + { + "epoch": 0.14, + "grad_norm": 1.9437619321838724, + "learning_rate": 9.696605104680348e-06, + "loss": 0.7428, + "step": 1329 + }, + { + "epoch": 0.14, + "grad_norm": 2.037382120827923, + "learning_rate": 9.696027360193387e-06, + "loss": 0.7703, + "step": 1330 + }, + { + "epoch": 0.14, + "grad_norm": 1.8931689172830715, + "learning_rate": 9.69544908338496e-06, + "loss": 0.6491, + "step": 1331 + }, + { + "epoch": 0.14, + "grad_norm": 1.8790236237302318, + "learning_rate": 9.694870274320616e-06, + "loss": 0.6525, + "step": 1332 + }, + { + "epoch": 0.14, + "grad_norm": 1.910244499985207, + "learning_rate": 9.694290933065966e-06, + "loss": 0.6577, + "step": 1333 + }, + { + "epoch": 0.14, + "grad_norm": 1.9579528999546667, + "learning_rate": 9.693711059686682e-06, + "loss": 0.6902, + "step": 1334 + }, + { + "epoch": 0.14, + "grad_norm": 1.9396330418699732, + "learning_rate": 9.693130654248497e-06, + "loss": 0.7542, + "step": 1335 + }, + { + "epoch": 0.14, + "grad_norm": 1.9042496696960487, + "learning_rate": 9.692549716817202e-06, + "loss": 0.5711, + "step": 1336 + }, + { + "epoch": 0.14, + "grad_norm": 1.8895202592925349, + "learning_rate": 9.691968247458648e-06, + "loss": 0.6255, + "step": 1337 + }, + { + "epoch": 0.14, + "grad_norm": 2.0965642202557544, + "learning_rate": 9.691386246238753e-06, + "loss": 0.7484, + "step": 1338 + }, + { + "epoch": 0.14, + "grad_norm": 1.8015674513354294, + "learning_rate": 9.690803713223485e-06, + "loss": 0.6473, + "step": 1339 + }, + { + "epoch": 0.14, + "grad_norm": 1.9184186707119182, + "learning_rate": 9.690220648478879e-06, + "loss": 0.5988, + "step": 1340 + }, + { + "epoch": 0.14, + "grad_norm": 1.9770192689507424, + "learning_rate": 9.689637052071031e-06, + "loss": 0.7, + "step": 1341 + }, + { + "epoch": 0.14, + "grad_norm": 2.0449862402255508, + "learning_rate": 9.689052924066091e-06, + "loss": 0.6675, + "step": 1342 + }, + { + "epoch": 0.14, + "grad_norm": 1.8859554632950104, + "learning_rate": 9.688468264530278e-06, + "loss": 0.676, + "step": 1343 + }, + { + "epoch": 0.14, + "grad_norm": 2.006052287719628, + "learning_rate": 9.68788307352986e-06, + "loss": 0.7827, + "step": 1344 + }, + { + "epoch": 0.14, + "grad_norm": 1.8945746237944512, + "learning_rate": 9.687297351131179e-06, + "loss": 0.7134, + "step": 1345 + }, + { + "epoch": 0.14, + "grad_norm": 2.02227847196895, + "learning_rate": 9.686711097400625e-06, + "loss": 0.7382, + "step": 1346 + }, + { + "epoch": 0.14, + "grad_norm": 1.878484044957481, + "learning_rate": 9.686124312404656e-06, + "loss": 0.7726, + "step": 1347 + }, + { + "epoch": 0.14, + "grad_norm": 1.630184078610918, + "learning_rate": 9.685536996209785e-06, + "loss": 0.5358, + "step": 1348 + }, + { + "epoch": 0.14, + "grad_norm": 1.85620192643215, + "learning_rate": 9.68494914888259e-06, + "loss": 0.6286, + "step": 1349 + }, + { + "epoch": 0.14, + "grad_norm": 2.0114268912492403, + "learning_rate": 9.684360770489704e-06, + "loss": 0.6904, + "step": 1350 + }, + { + "epoch": 0.14, + "grad_norm": 1.6746633261022315, + "learning_rate": 9.683771861097825e-06, + "loss": 0.6124, + "step": 1351 + }, + { + "epoch": 0.14, + "grad_norm": 1.9084737163323533, + "learning_rate": 9.68318242077371e-06, + "loss": 0.6784, + "step": 1352 + }, + { + "epoch": 0.14, + "grad_norm": 2.053592952699433, + "learning_rate": 9.682592449584174e-06, + "loss": 0.7364, + "step": 1353 + }, + { + "epoch": 0.14, + "grad_norm": 1.8759874911112053, + "learning_rate": 9.682001947596093e-06, + "loss": 0.7605, + "step": 1354 + }, + { + "epoch": 0.14, + "grad_norm": 2.051882469686672, + "learning_rate": 9.681410914876406e-06, + "loss": 0.6258, + "step": 1355 + }, + { + "epoch": 0.14, + "grad_norm": 1.8668394817877236, + "learning_rate": 9.68081935149211e-06, + "loss": 0.6545, + "step": 1356 + }, + { + "epoch": 0.14, + "grad_norm": 2.151948478937282, + "learning_rate": 9.680227257510257e-06, + "loss": 0.7088, + "step": 1357 + }, + { + "epoch": 0.14, + "grad_norm": 1.896198321769729, + "learning_rate": 9.67963463299797e-06, + "loss": 0.6797, + "step": 1358 + }, + { + "epoch": 0.14, + "grad_norm": 1.8333904019961873, + "learning_rate": 9.679041478022424e-06, + "loss": 0.6455, + "step": 1359 + }, + { + "epoch": 0.14, + "grad_norm": 1.920785330687945, + "learning_rate": 9.678447792650858e-06, + "loss": 0.7701, + "step": 1360 + }, + { + "epoch": 0.14, + "grad_norm": 1.9740522491158696, + "learning_rate": 9.677853576950567e-06, + "loss": 0.6122, + "step": 1361 + }, + { + "epoch": 0.14, + "grad_norm": 2.0093433073912195, + "learning_rate": 9.677258830988911e-06, + "loss": 0.688, + "step": 1362 + }, + { + "epoch": 0.14, + "grad_norm": 1.9899501677577536, + "learning_rate": 9.676663554833307e-06, + "loss": 0.6969, + "step": 1363 + }, + { + "epoch": 0.14, + "grad_norm": 2.0662149446869846, + "learning_rate": 9.676067748551232e-06, + "loss": 0.7624, + "step": 1364 + }, + { + "epoch": 0.14, + "grad_norm": 1.8510578010237781, + "learning_rate": 9.675471412210225e-06, + "loss": 0.6667, + "step": 1365 + }, + { + "epoch": 0.14, + "grad_norm": 2.2586481507249565, + "learning_rate": 9.674874545877886e-06, + "loss": 0.7025, + "step": 1366 + }, + { + "epoch": 0.14, + "grad_norm": 1.774087072360693, + "learning_rate": 9.674277149621869e-06, + "loss": 0.7498, + "step": 1367 + }, + { + "epoch": 0.14, + "grad_norm": 1.9300929758419567, + "learning_rate": 9.673679223509895e-06, + "loss": 0.7127, + "step": 1368 + }, + { + "epoch": 0.14, + "grad_norm": 2.0352754562115294, + "learning_rate": 9.673080767609743e-06, + "loss": 0.6448, + "step": 1369 + }, + { + "epoch": 0.14, + "grad_norm": 1.8977730151853256, + "learning_rate": 9.67248178198925e-06, + "loss": 0.6987, + "step": 1370 + }, + { + "epoch": 0.14, + "grad_norm": 2.004055084209392, + "learning_rate": 9.671882266716315e-06, + "loss": 0.8061, + "step": 1371 + }, + { + "epoch": 0.14, + "grad_norm": 1.9454168688791278, + "learning_rate": 9.671282221858897e-06, + "loss": 0.7274, + "step": 1372 + }, + { + "epoch": 0.14, + "grad_norm": 1.9198819052306686, + "learning_rate": 9.670681647485012e-06, + "loss": 0.6711, + "step": 1373 + }, + { + "epoch": 0.14, + "grad_norm": 1.7097410217988573, + "learning_rate": 9.670080543662742e-06, + "loss": 0.6368, + "step": 1374 + }, + { + "epoch": 0.14, + "grad_norm": 2.050598473467362, + "learning_rate": 9.669478910460222e-06, + "loss": 0.7401, + "step": 1375 + }, + { + "epoch": 0.14, + "grad_norm": 1.9465646412985562, + "learning_rate": 9.668876747945652e-06, + "loss": 0.6021, + "step": 1376 + }, + { + "epoch": 0.14, + "grad_norm": 1.9335106076774247, + "learning_rate": 9.668274056187293e-06, + "loss": 0.6118, + "step": 1377 + }, + { + "epoch": 0.14, + "grad_norm": 2.022082750564337, + "learning_rate": 9.66767083525346e-06, + "loss": 0.6889, + "step": 1378 + }, + { + "epoch": 0.14, + "grad_norm": 1.755623299169476, + "learning_rate": 9.667067085212533e-06, + "loss": 0.7057, + "step": 1379 + }, + { + "epoch": 0.14, + "grad_norm": 1.928965446296919, + "learning_rate": 9.66646280613295e-06, + "loss": 0.5745, + "step": 1380 + }, + { + "epoch": 0.14, + "grad_norm": 1.8155202857113888, + "learning_rate": 9.665857998083212e-06, + "loss": 0.6229, + "step": 1381 + }, + { + "epoch": 0.14, + "grad_norm": 1.7813070676468963, + "learning_rate": 9.665252661131874e-06, + "loss": 0.5898, + "step": 1382 + }, + { + "epoch": 0.14, + "grad_norm": 1.9563053298185136, + "learning_rate": 9.664646795347556e-06, + "loss": 0.7274, + "step": 1383 + }, + { + "epoch": 0.14, + "grad_norm": 1.9349658516984791, + "learning_rate": 9.664040400798937e-06, + "loss": 0.6404, + "step": 1384 + }, + { + "epoch": 0.14, + "grad_norm": 2.188084590268582, + "learning_rate": 9.663433477554753e-06, + "loss": 0.6934, + "step": 1385 + }, + { + "epoch": 0.14, + "grad_norm": 2.2197372636124917, + "learning_rate": 9.662826025683805e-06, + "loss": 0.6701, + "step": 1386 + }, + { + "epoch": 0.14, + "grad_norm": 1.9856243079185785, + "learning_rate": 9.662218045254949e-06, + "loss": 0.667, + "step": 1387 + }, + { + "epoch": 0.14, + "grad_norm": 1.9225996019695955, + "learning_rate": 9.661609536337104e-06, + "loss": 0.6174, + "step": 1388 + }, + { + "epoch": 0.14, + "grad_norm": 2.0817089630736727, + "learning_rate": 9.661000498999248e-06, + "loss": 0.7104, + "step": 1389 + }, + { + "epoch": 0.14, + "grad_norm": 1.8238863336509912, + "learning_rate": 9.660390933310418e-06, + "loss": 0.612, + "step": 1390 + }, + { + "epoch": 0.14, + "grad_norm": 1.8431042408210576, + "learning_rate": 9.659780839339713e-06, + "loss": 0.6929, + "step": 1391 + }, + { + "epoch": 0.14, + "grad_norm": 1.889104137450392, + "learning_rate": 9.65917021715629e-06, + "loss": 0.6759, + "step": 1392 + }, + { + "epoch": 0.14, + "grad_norm": 1.9497779511731066, + "learning_rate": 9.658559066829365e-06, + "loss": 0.6041, + "step": 1393 + }, + { + "epoch": 0.14, + "grad_norm": 2.039428108567977, + "learning_rate": 9.65794738842822e-06, + "loss": 0.7342, + "step": 1394 + }, + { + "epoch": 0.15, + "grad_norm": 2.0211962356651734, + "learning_rate": 9.657335182022187e-06, + "loss": 0.7797, + "step": 1395 + }, + { + "epoch": 0.15, + "grad_norm": 1.9990273830284, + "learning_rate": 9.656722447680664e-06, + "loss": 0.7011, + "step": 1396 + }, + { + "epoch": 0.15, + "grad_norm": 1.88149406805676, + "learning_rate": 9.65610918547311e-06, + "loss": 0.6511, + "step": 1397 + }, + { + "epoch": 0.15, + "grad_norm": 1.6989907788026, + "learning_rate": 9.655495395469042e-06, + "loss": 0.5867, + "step": 1398 + }, + { + "epoch": 0.15, + "grad_norm": 1.9148308369176725, + "learning_rate": 9.654881077738035e-06, + "loss": 0.577, + "step": 1399 + }, + { + "epoch": 0.15, + "grad_norm": 1.8599257790893318, + "learning_rate": 9.654266232349727e-06, + "loss": 0.64, + "step": 1400 + }, + { + "epoch": 0.15, + "grad_norm": 2.0076623853739903, + "learning_rate": 9.653650859373811e-06, + "loss": 0.7321, + "step": 1401 + }, + { + "epoch": 0.15, + "grad_norm": 1.8313666092285423, + "learning_rate": 9.653034958880045e-06, + "loss": 0.6341, + "step": 1402 + }, + { + "epoch": 0.15, + "grad_norm": 1.8722180403613942, + "learning_rate": 9.652418530938248e-06, + "loss": 0.7881, + "step": 1403 + }, + { + "epoch": 0.15, + "grad_norm": 2.083375770806738, + "learning_rate": 9.651801575618289e-06, + "loss": 0.7394, + "step": 1404 + }, + { + "epoch": 0.15, + "grad_norm": 1.6813245697465693, + "learning_rate": 9.651184092990109e-06, + "loss": 0.5917, + "step": 1405 + }, + { + "epoch": 0.15, + "grad_norm": 1.8348700669336107, + "learning_rate": 9.650566083123701e-06, + "loss": 0.5931, + "step": 1406 + }, + { + "epoch": 0.15, + "grad_norm": 2.0187798705574207, + "learning_rate": 9.64994754608912e-06, + "loss": 0.7104, + "step": 1407 + }, + { + "epoch": 0.15, + "grad_norm": 1.949254115187299, + "learning_rate": 9.649328481956481e-06, + "loss": 0.7475, + "step": 1408 + }, + { + "epoch": 0.15, + "grad_norm": 1.9312680985383277, + "learning_rate": 9.64870889079596e-06, + "loss": 0.6517, + "step": 1409 + }, + { + "epoch": 0.15, + "grad_norm": 2.28060400124159, + "learning_rate": 9.648088772677787e-06, + "loss": 0.6683, + "step": 1410 + }, + { + "epoch": 0.15, + "grad_norm": 1.9498068405147828, + "learning_rate": 9.647468127672258e-06, + "loss": 0.6945, + "step": 1411 + }, + { + "epoch": 0.15, + "grad_norm": 2.3803787498905944, + "learning_rate": 9.64684695584973e-06, + "loss": 0.7755, + "step": 1412 + }, + { + "epoch": 0.15, + "grad_norm": 1.7106971204478838, + "learning_rate": 9.64622525728061e-06, + "loss": 0.6462, + "step": 1413 + }, + { + "epoch": 0.15, + "grad_norm": 1.720138728717716, + "learning_rate": 9.645603032035375e-06, + "loss": 0.5997, + "step": 1414 + }, + { + "epoch": 0.15, + "grad_norm": 1.8333700801882125, + "learning_rate": 9.644980280184559e-06, + "loss": 0.6211, + "step": 1415 + }, + { + "epoch": 0.15, + "grad_norm": 1.9341434824630561, + "learning_rate": 9.644357001798752e-06, + "loss": 0.5569, + "step": 1416 + }, + { + "epoch": 0.15, + "grad_norm": 1.9348393525012055, + "learning_rate": 9.643733196948607e-06, + "loss": 0.5925, + "step": 1417 + }, + { + "epoch": 0.15, + "grad_norm": 1.8537207494480576, + "learning_rate": 9.643108865704836e-06, + "loss": 0.686, + "step": 1418 + }, + { + "epoch": 0.15, + "grad_norm": 1.9794781788094675, + "learning_rate": 9.64248400813821e-06, + "loss": 0.6331, + "step": 1419 + }, + { + "epoch": 0.15, + "grad_norm": 1.8934282311552093, + "learning_rate": 9.64185862431956e-06, + "loss": 0.6109, + "step": 1420 + }, + { + "epoch": 0.15, + "grad_norm": 1.9403239182134335, + "learning_rate": 9.641232714319777e-06, + "loss": 0.7185, + "step": 1421 + }, + { + "epoch": 0.15, + "grad_norm": 2.157701765972057, + "learning_rate": 9.640606278209812e-06, + "loss": 0.7755, + "step": 1422 + }, + { + "epoch": 0.15, + "grad_norm": 1.6740846643105078, + "learning_rate": 9.639979316060675e-06, + "loss": 0.6013, + "step": 1423 + }, + { + "epoch": 0.15, + "grad_norm": 2.12677231177225, + "learning_rate": 9.639351827943436e-06, + "loss": 0.6144, + "step": 1424 + }, + { + "epoch": 0.15, + "grad_norm": 2.118780104812153, + "learning_rate": 9.638723813929224e-06, + "loss": 0.7439, + "step": 1425 + }, + { + "epoch": 0.15, + "grad_norm": 2.2357617900959728, + "learning_rate": 9.638095274089226e-06, + "loss": 0.6893, + "step": 1426 + }, + { + "epoch": 0.15, + "grad_norm": 1.7246115215231097, + "learning_rate": 9.637466208494694e-06, + "loss": 0.7229, + "step": 1427 + }, + { + "epoch": 0.15, + "grad_norm": 2.01688121939328, + "learning_rate": 9.636836617216934e-06, + "loss": 0.6563, + "step": 1428 + }, + { + "epoch": 0.15, + "grad_norm": 1.916960546551675, + "learning_rate": 9.636206500327316e-06, + "loss": 0.734, + "step": 1429 + }, + { + "epoch": 0.15, + "grad_norm": 1.9670238441866768, + "learning_rate": 9.635575857897264e-06, + "loss": 0.8007, + "step": 1430 + }, + { + "epoch": 0.15, + "grad_norm": 1.9222282752835942, + "learning_rate": 9.634944689998267e-06, + "loss": 0.6885, + "step": 1431 + }, + { + "epoch": 0.15, + "grad_norm": 1.8951313109327579, + "learning_rate": 9.63431299670187e-06, + "loss": 0.5511, + "step": 1432 + }, + { + "epoch": 0.15, + "grad_norm": 2.087254254458319, + "learning_rate": 9.633680778079682e-06, + "loss": 0.7344, + "step": 1433 + }, + { + "epoch": 0.15, + "grad_norm": 1.8839664631491326, + "learning_rate": 9.633048034203365e-06, + "loss": 0.7775, + "step": 1434 + }, + { + "epoch": 0.15, + "grad_norm": 1.90221214075658, + "learning_rate": 9.632414765144646e-06, + "loss": 0.6708, + "step": 1435 + }, + { + "epoch": 0.15, + "grad_norm": 2.078581620152747, + "learning_rate": 9.631780970975311e-06, + "loss": 0.6322, + "step": 1436 + }, + { + "epoch": 0.15, + "grad_norm": 1.83487929881022, + "learning_rate": 9.631146651767202e-06, + "loss": 0.669, + "step": 1437 + }, + { + "epoch": 0.15, + "grad_norm": 1.832923326521618, + "learning_rate": 9.630511807592224e-06, + "loss": 0.6101, + "step": 1438 + }, + { + "epoch": 0.15, + "grad_norm": 2.1141323443224436, + "learning_rate": 9.629876438522338e-06, + "loss": 0.7117, + "step": 1439 + }, + { + "epoch": 0.15, + "grad_norm": 2.06022961270754, + "learning_rate": 9.62924054462957e-06, + "loss": 0.7348, + "step": 1440 + }, + { + "epoch": 0.15, + "grad_norm": 1.8267609596216754, + "learning_rate": 9.628604125985999e-06, + "loss": 0.7431, + "step": 1441 + }, + { + "epoch": 0.15, + "grad_norm": 1.9004596571143735, + "learning_rate": 9.627967182663768e-06, + "loss": 0.7026, + "step": 1442 + }, + { + "epoch": 0.15, + "grad_norm": 2.0016223705105727, + "learning_rate": 9.627329714735079e-06, + "loss": 0.6671, + "step": 1443 + }, + { + "epoch": 0.15, + "grad_norm": 1.930460301179286, + "learning_rate": 9.626691722272193e-06, + "loss": 0.6102, + "step": 1444 + }, + { + "epoch": 0.15, + "grad_norm": 1.9009166256026644, + "learning_rate": 9.626053205347428e-06, + "loss": 0.7215, + "step": 1445 + }, + { + "epoch": 0.15, + "grad_norm": 2.18241662087263, + "learning_rate": 9.625414164033163e-06, + "loss": 0.7554, + "step": 1446 + }, + { + "epoch": 0.15, + "grad_norm": 1.7751878116739175, + "learning_rate": 9.62477459840184e-06, + "loss": 0.6397, + "step": 1447 + }, + { + "epoch": 0.15, + "grad_norm": 1.946828033661672, + "learning_rate": 9.624134508525957e-06, + "loss": 0.6376, + "step": 1448 + }, + { + "epoch": 0.15, + "grad_norm": 2.096187878367154, + "learning_rate": 9.623493894478069e-06, + "loss": 0.782, + "step": 1449 + }, + { + "epoch": 0.15, + "grad_norm": 1.6619296901186194, + "learning_rate": 9.622852756330797e-06, + "loss": 0.6232, + "step": 1450 + }, + { + "epoch": 0.15, + "grad_norm": 2.130541226400497, + "learning_rate": 9.622211094156812e-06, + "loss": 0.6484, + "step": 1451 + }, + { + "epoch": 0.15, + "grad_norm": 1.9178708685945536, + "learning_rate": 9.621568908028857e-06, + "loss": 0.7487, + "step": 1452 + }, + { + "epoch": 0.15, + "grad_norm": 1.9169807937008756, + "learning_rate": 9.620926198019724e-06, + "loss": 0.6049, + "step": 1453 + }, + { + "epoch": 0.15, + "grad_norm": 1.8835439671046228, + "learning_rate": 9.620282964202267e-06, + "loss": 0.7385, + "step": 1454 + }, + { + "epoch": 0.15, + "grad_norm": 1.92405276657743, + "learning_rate": 9.619639206649402e-06, + "loss": 0.646, + "step": 1455 + }, + { + "epoch": 0.15, + "grad_norm": 1.9741432328020077, + "learning_rate": 9.618994925434103e-06, + "loss": 0.6466, + "step": 1456 + }, + { + "epoch": 0.15, + "grad_norm": 2.0886770963919425, + "learning_rate": 9.618350120629398e-06, + "loss": 0.614, + "step": 1457 + }, + { + "epoch": 0.15, + "grad_norm": 1.8307278823762825, + "learning_rate": 9.617704792308387e-06, + "loss": 0.6774, + "step": 1458 + }, + { + "epoch": 0.15, + "grad_norm": 2.003204931637325, + "learning_rate": 9.617058940544216e-06, + "loss": 0.6881, + "step": 1459 + }, + { + "epoch": 0.15, + "grad_norm": 1.716683988327422, + "learning_rate": 9.616412565410097e-06, + "loss": 0.644, + "step": 1460 + }, + { + "epoch": 0.15, + "grad_norm": 1.9389630819810084, + "learning_rate": 9.615765666979302e-06, + "loss": 0.6889, + "step": 1461 + }, + { + "epoch": 0.15, + "grad_norm": 1.734722035180772, + "learning_rate": 9.615118245325161e-06, + "loss": 0.6487, + "step": 1462 + }, + { + "epoch": 0.15, + "grad_norm": 1.9708045285573386, + "learning_rate": 9.61447030052106e-06, + "loss": 0.7563, + "step": 1463 + }, + { + "epoch": 0.15, + "grad_norm": 2.045152715335636, + "learning_rate": 9.613821832640448e-06, + "loss": 0.6307, + "step": 1464 + }, + { + "epoch": 0.15, + "grad_norm": 1.8477983077439863, + "learning_rate": 9.613172841756835e-06, + "loss": 0.5452, + "step": 1465 + }, + { + "epoch": 0.15, + "grad_norm": 1.9003093700116294, + "learning_rate": 9.612523327943786e-06, + "loss": 0.6483, + "step": 1466 + }, + { + "epoch": 0.15, + "grad_norm": 1.8638599615140454, + "learning_rate": 9.611873291274927e-06, + "loss": 0.7114, + "step": 1467 + }, + { + "epoch": 0.15, + "grad_norm": 1.8520477349826896, + "learning_rate": 9.611222731823944e-06, + "loss": 0.6058, + "step": 1468 + }, + { + "epoch": 0.15, + "grad_norm": 1.9085000538619996, + "learning_rate": 9.61057164966458e-06, + "loss": 0.6045, + "step": 1469 + }, + { + "epoch": 0.15, + "grad_norm": 1.8006432662990515, + "learning_rate": 9.609920044870643e-06, + "loss": 0.6029, + "step": 1470 + }, + { + "epoch": 0.15, + "grad_norm": 1.9179518990616973, + "learning_rate": 9.60926791751599e-06, + "loss": 0.7078, + "step": 1471 + }, + { + "epoch": 0.15, + "grad_norm": 2.042582586101166, + "learning_rate": 9.608615267674548e-06, + "loss": 0.662, + "step": 1472 + }, + { + "epoch": 0.15, + "grad_norm": 1.7613901736843571, + "learning_rate": 9.607962095420297e-06, + "loss": 0.6698, + "step": 1473 + }, + { + "epoch": 0.15, + "grad_norm": 2.0988777706402897, + "learning_rate": 9.607308400827277e-06, + "loss": 0.6381, + "step": 1474 + }, + { + "epoch": 0.15, + "grad_norm": 2.378994760615408, + "learning_rate": 9.606654183969591e-06, + "loss": 0.7589, + "step": 1475 + }, + { + "epoch": 0.15, + "grad_norm": 1.8172655352330878, + "learning_rate": 9.605999444921394e-06, + "loss": 0.6398, + "step": 1476 + }, + { + "epoch": 0.15, + "grad_norm": 2.037892899762544, + "learning_rate": 9.605344183756908e-06, + "loss": 0.7299, + "step": 1477 + }, + { + "epoch": 0.15, + "grad_norm": 1.7584598825730333, + "learning_rate": 9.60468840055041e-06, + "loss": 0.7002, + "step": 1478 + }, + { + "epoch": 0.15, + "grad_norm": 1.743347054484532, + "learning_rate": 9.604032095376234e-06, + "loss": 0.6489, + "step": 1479 + }, + { + "epoch": 0.15, + "grad_norm": 1.8804620415715747, + "learning_rate": 9.603375268308779e-06, + "loss": 0.6259, + "step": 1480 + }, + { + "epoch": 0.15, + "grad_norm": 2.170002273856052, + "learning_rate": 9.602717919422499e-06, + "loss": 0.7196, + "step": 1481 + }, + { + "epoch": 0.15, + "grad_norm": 1.9421409150574973, + "learning_rate": 9.602060048791908e-06, + "loss": 0.6924, + "step": 1482 + }, + { + "epoch": 0.15, + "grad_norm": 1.9717758470116444, + "learning_rate": 9.60140165649158e-06, + "loss": 0.7171, + "step": 1483 + }, + { + "epoch": 0.15, + "grad_norm": 2.4743252790664467, + "learning_rate": 9.600742742596146e-06, + "loss": 0.7644, + "step": 1484 + }, + { + "epoch": 0.15, + "grad_norm": 2.03725397790848, + "learning_rate": 9.6000833071803e-06, + "loss": 0.6426, + "step": 1485 + }, + { + "epoch": 0.15, + "grad_norm": 2.004957380913632, + "learning_rate": 9.599423350318791e-06, + "loss": 0.7055, + "step": 1486 + }, + { + "epoch": 0.15, + "grad_norm": 1.8582757642070944, + "learning_rate": 9.598762872086428e-06, + "loss": 0.6467, + "step": 1487 + }, + { + "epoch": 0.15, + "grad_norm": 2.0838491494548603, + "learning_rate": 9.598101872558085e-06, + "loss": 0.6039, + "step": 1488 + }, + { + "epoch": 0.15, + "grad_norm": 1.8164477514953552, + "learning_rate": 9.597440351808684e-06, + "loss": 0.6671, + "step": 1489 + }, + { + "epoch": 0.15, + "grad_norm": 1.9808384701216841, + "learning_rate": 9.596778309913215e-06, + "loss": 0.6528, + "step": 1490 + }, + { + "epoch": 0.15, + "grad_norm": 2.003749745611465, + "learning_rate": 9.596115746946723e-06, + "loss": 0.7215, + "step": 1491 + }, + { + "epoch": 0.16, + "grad_norm": 1.9103354885895816, + "learning_rate": 9.595452662984314e-06, + "loss": 0.6408, + "step": 1492 + }, + { + "epoch": 0.16, + "grad_norm": 2.056675445688021, + "learning_rate": 9.594789058101154e-06, + "loss": 0.729, + "step": 1493 + }, + { + "epoch": 0.16, + "grad_norm": 2.008591475324911, + "learning_rate": 9.594124932372465e-06, + "loss": 0.6412, + "step": 1494 + }, + { + "epoch": 0.16, + "grad_norm": 2.091130750278084, + "learning_rate": 9.593460285873528e-06, + "loss": 0.6649, + "step": 1495 + }, + { + "epoch": 0.16, + "grad_norm": 1.920035920120745, + "learning_rate": 9.592795118679686e-06, + "loss": 0.6084, + "step": 1496 + }, + { + "epoch": 0.16, + "grad_norm": 1.8093080186617532, + "learning_rate": 9.59212943086634e-06, + "loss": 0.6288, + "step": 1497 + }, + { + "epoch": 0.16, + "grad_norm": 1.8862928362176083, + "learning_rate": 9.591463222508947e-06, + "loss": 0.6367, + "step": 1498 + }, + { + "epoch": 0.16, + "grad_norm": 1.8277193723517655, + "learning_rate": 9.590796493683028e-06, + "loss": 0.7249, + "step": 1499 + }, + { + "epoch": 0.16, + "grad_norm": 2.19331615809431, + "learning_rate": 9.59012924446416e-06, + "loss": 0.7191, + "step": 1500 + }, + { + "epoch": 0.16, + "grad_norm": 1.6401164370086485, + "learning_rate": 9.589461474927979e-06, + "loss": 0.6349, + "step": 1501 + }, + { + "epoch": 0.16, + "grad_norm": 2.357420255966235, + "learning_rate": 9.588793185150182e-06, + "loss": 0.6767, + "step": 1502 + }, + { + "epoch": 0.16, + "grad_norm": 1.9435062989464436, + "learning_rate": 9.58812437520652e-06, + "loss": 0.6545, + "step": 1503 + }, + { + "epoch": 0.16, + "grad_norm": 1.927844968000663, + "learning_rate": 9.58745504517281e-06, + "loss": 0.6596, + "step": 1504 + }, + { + "epoch": 0.16, + "grad_norm": 1.7570483201394154, + "learning_rate": 9.586785195124924e-06, + "loss": 0.673, + "step": 1505 + }, + { + "epoch": 0.16, + "grad_norm": 1.8155004596147315, + "learning_rate": 9.586114825138792e-06, + "loss": 0.683, + "step": 1506 + }, + { + "epoch": 0.16, + "grad_norm": 1.8583467053252918, + "learning_rate": 9.585443935290403e-06, + "loss": 0.6367, + "step": 1507 + }, + { + "epoch": 0.16, + "grad_norm": 1.9030635673277672, + "learning_rate": 9.58477252565581e-06, + "loss": 0.6275, + "step": 1508 + }, + { + "epoch": 0.16, + "grad_norm": 1.788718336973656, + "learning_rate": 9.584100596311117e-06, + "loss": 0.6945, + "step": 1509 + }, + { + "epoch": 0.16, + "grad_norm": 1.7351275011949883, + "learning_rate": 9.583428147332493e-06, + "loss": 0.5619, + "step": 1510 + }, + { + "epoch": 0.16, + "grad_norm": 1.8450518135634049, + "learning_rate": 9.582755178796164e-06, + "loss": 0.6165, + "step": 1511 + }, + { + "epoch": 0.16, + "grad_norm": 1.972540427042877, + "learning_rate": 9.582081690778415e-06, + "loss": 0.5639, + "step": 1512 + }, + { + "epoch": 0.16, + "grad_norm": 1.9050637486054645, + "learning_rate": 9.58140768335559e-06, + "loss": 0.6104, + "step": 1513 + }, + { + "epoch": 0.16, + "grad_norm": 2.0985873225265284, + "learning_rate": 9.58073315660409e-06, + "loss": 0.717, + "step": 1514 + }, + { + "epoch": 0.16, + "grad_norm": 3.3097437422761256, + "learning_rate": 9.580058110600377e-06, + "loss": 0.617, + "step": 1515 + }, + { + "epoch": 0.16, + "grad_norm": 1.9690754280591791, + "learning_rate": 9.579382545420972e-06, + "loss": 0.6871, + "step": 1516 + }, + { + "epoch": 0.16, + "grad_norm": 1.782107200466709, + "learning_rate": 9.578706461142455e-06, + "loss": 0.7152, + "step": 1517 + }, + { + "epoch": 0.16, + "grad_norm": 1.8507836163162588, + "learning_rate": 9.578029857841462e-06, + "loss": 0.673, + "step": 1518 + }, + { + "epoch": 0.16, + "grad_norm": 1.890334013229442, + "learning_rate": 9.577352735594692e-06, + "loss": 0.7108, + "step": 1519 + }, + { + "epoch": 0.16, + "grad_norm": 1.8323036417412149, + "learning_rate": 9.576675094478898e-06, + "loss": 0.5764, + "step": 1520 + }, + { + "epoch": 0.16, + "grad_norm": 2.088690846097015, + "learning_rate": 9.575996934570896e-06, + "loss": 0.7671, + "step": 1521 + }, + { + "epoch": 0.16, + "grad_norm": 1.896412011489828, + "learning_rate": 9.575318255947558e-06, + "loss": 0.6737, + "step": 1522 + }, + { + "epoch": 0.16, + "grad_norm": 1.8886169195982683, + "learning_rate": 9.57463905868582e-06, + "loss": 0.6384, + "step": 1523 + }, + { + "epoch": 0.16, + "grad_norm": 2.219113889815762, + "learning_rate": 9.573959342862667e-06, + "loss": 0.6598, + "step": 1524 + }, + { + "epoch": 0.16, + "grad_norm": 2.0301247271789857, + "learning_rate": 9.573279108555154e-06, + "loss": 0.6046, + "step": 1525 + }, + { + "epoch": 0.16, + "grad_norm": 1.9843830977227843, + "learning_rate": 9.572598355840386e-06, + "loss": 0.6969, + "step": 1526 + }, + { + "epoch": 0.16, + "grad_norm": 1.8464680774728668, + "learning_rate": 9.571917084795532e-06, + "loss": 0.5952, + "step": 1527 + }, + { + "epoch": 0.16, + "grad_norm": 1.7925551511247095, + "learning_rate": 9.571235295497818e-06, + "loss": 0.6387, + "step": 1528 + }, + { + "epoch": 0.16, + "grad_norm": 1.7944467845098961, + "learning_rate": 9.570552988024527e-06, + "loss": 0.6061, + "step": 1529 + }, + { + "epoch": 0.16, + "grad_norm": 2.1431104616084493, + "learning_rate": 9.569870162453004e-06, + "loss": 0.7645, + "step": 1530 + }, + { + "epoch": 0.16, + "grad_norm": 1.9799001726970609, + "learning_rate": 9.569186818860652e-06, + "loss": 0.8162, + "step": 1531 + }, + { + "epoch": 0.16, + "grad_norm": 2.029920054940862, + "learning_rate": 9.568502957324928e-06, + "loss": 0.6148, + "step": 1532 + }, + { + "epoch": 0.16, + "grad_norm": 2.2030292163042304, + "learning_rate": 9.567818577923356e-06, + "loss": 0.7205, + "step": 1533 + }, + { + "epoch": 0.16, + "grad_norm": 1.9233321976022704, + "learning_rate": 9.567133680733512e-06, + "loss": 0.6518, + "step": 1534 + }, + { + "epoch": 0.16, + "grad_norm": 1.7910034855382688, + "learning_rate": 9.566448265833034e-06, + "loss": 0.6862, + "step": 1535 + }, + { + "epoch": 0.16, + "grad_norm": 2.0613102871389777, + "learning_rate": 9.565762333299616e-06, + "loss": 0.7762, + "step": 1536 + }, + { + "epoch": 0.16, + "grad_norm": 2.1927507319579074, + "learning_rate": 9.565075883211015e-06, + "loss": 0.7046, + "step": 1537 + }, + { + "epoch": 0.16, + "grad_norm": 1.833817329880263, + "learning_rate": 9.564388915645042e-06, + "loss": 0.6771, + "step": 1538 + }, + { + "epoch": 0.16, + "grad_norm": 1.9954731003848947, + "learning_rate": 9.563701430679568e-06, + "loss": 0.7152, + "step": 1539 + }, + { + "epoch": 0.16, + "grad_norm": 1.9271801113839422, + "learning_rate": 9.563013428392528e-06, + "loss": 0.7178, + "step": 1540 + }, + { + "epoch": 0.16, + "grad_norm": 1.9666689051116464, + "learning_rate": 9.562324908861904e-06, + "loss": 0.5967, + "step": 1541 + }, + { + "epoch": 0.16, + "grad_norm": 2.020042739074386, + "learning_rate": 9.561635872165747e-06, + "loss": 0.6152, + "step": 1542 + }, + { + "epoch": 0.16, + "grad_norm": 1.7887557940007968, + "learning_rate": 9.560946318382166e-06, + "loss": 0.6381, + "step": 1543 + }, + { + "epoch": 0.16, + "grad_norm": 2.1556497061967823, + "learning_rate": 9.560256247589321e-06, + "loss": 0.5674, + "step": 1544 + }, + { + "epoch": 0.16, + "grad_norm": 1.7794594694424544, + "learning_rate": 9.559565659865439e-06, + "loss": 0.6416, + "step": 1545 + }, + { + "epoch": 0.16, + "grad_norm": 1.9390939469813973, + "learning_rate": 9.558874555288801e-06, + "loss": 0.7169, + "step": 1546 + }, + { + "epoch": 0.16, + "grad_norm": 1.8786485630013083, + "learning_rate": 9.558182933937747e-06, + "loss": 0.5799, + "step": 1547 + }, + { + "epoch": 0.16, + "grad_norm": 1.856557490530216, + "learning_rate": 9.557490795890679e-06, + "loss": 0.5993, + "step": 1548 + }, + { + "epoch": 0.16, + "grad_norm": 1.974287937611842, + "learning_rate": 9.55679814122605e-06, + "loss": 0.7003, + "step": 1549 + }, + { + "epoch": 0.16, + "grad_norm": 1.9169969331494634, + "learning_rate": 9.556104970022378e-06, + "loss": 0.7669, + "step": 1550 + }, + { + "epoch": 0.16, + "grad_norm": 1.978215919977572, + "learning_rate": 9.55541128235824e-06, + "loss": 0.6096, + "step": 1551 + }, + { + "epoch": 0.16, + "grad_norm": 1.8521148274637858, + "learning_rate": 9.554717078312269e-06, + "loss": 0.7282, + "step": 1552 + }, + { + "epoch": 0.16, + "grad_norm": 2.0251613160942914, + "learning_rate": 9.554022357963153e-06, + "loss": 0.7232, + "step": 1553 + }, + { + "epoch": 0.16, + "grad_norm": 2.0117740878267436, + "learning_rate": 9.553327121389648e-06, + "loss": 0.7499, + "step": 1554 + }, + { + "epoch": 0.16, + "grad_norm": 1.8316021424004985, + "learning_rate": 9.552631368670562e-06, + "loss": 0.6253, + "step": 1555 + }, + { + "epoch": 0.16, + "grad_norm": 1.7375512663566117, + "learning_rate": 9.55193509988476e-06, + "loss": 0.5862, + "step": 1556 + }, + { + "epoch": 0.16, + "grad_norm": 2.226876317475117, + "learning_rate": 9.55123831511117e-06, + "loss": 0.6528, + "step": 1557 + }, + { + "epoch": 0.16, + "grad_norm": 2.280983795980818, + "learning_rate": 9.550541014428773e-06, + "loss": 0.7361, + "step": 1558 + }, + { + "epoch": 0.16, + "grad_norm": 1.949008109587866, + "learning_rate": 9.549843197916616e-06, + "loss": 0.6629, + "step": 1559 + }, + { + "epoch": 0.16, + "grad_norm": 1.876199206288432, + "learning_rate": 9.5491448656538e-06, + "loss": 0.5759, + "step": 1560 + }, + { + "epoch": 0.16, + "grad_norm": 2.0055504701332407, + "learning_rate": 9.548446017719484e-06, + "loss": 0.6555, + "step": 1561 + }, + { + "epoch": 0.16, + "grad_norm": 1.8950761565227399, + "learning_rate": 9.547746654192887e-06, + "loss": 0.6419, + "step": 1562 + }, + { + "epoch": 0.16, + "grad_norm": 2.147644898855912, + "learning_rate": 9.547046775153285e-06, + "loss": 0.7729, + "step": 1563 + }, + { + "epoch": 0.16, + "grad_norm": 1.9930761501595675, + "learning_rate": 9.546346380680015e-06, + "loss": 0.757, + "step": 1564 + }, + { + "epoch": 0.16, + "grad_norm": 2.03876848772226, + "learning_rate": 9.545645470852466e-06, + "loss": 0.7823, + "step": 1565 + }, + { + "epoch": 0.16, + "grad_norm": 2.0341781603182385, + "learning_rate": 9.544944045750097e-06, + "loss": 0.6754, + "step": 1566 + }, + { + "epoch": 0.16, + "grad_norm": 1.9935660269354207, + "learning_rate": 9.544242105452414e-06, + "loss": 0.6485, + "step": 1567 + }, + { + "epoch": 0.16, + "grad_norm": 2.2846043842839254, + "learning_rate": 9.543539650038987e-06, + "loss": 0.6906, + "step": 1568 + }, + { + "epoch": 0.16, + "grad_norm": 2.1741854085761876, + "learning_rate": 9.542836679589443e-06, + "loss": 0.6777, + "step": 1569 + }, + { + "epoch": 0.16, + "grad_norm": 1.8573681266861257, + "learning_rate": 9.542133194183469e-06, + "loss": 0.6227, + "step": 1570 + }, + { + "epoch": 0.16, + "grad_norm": 1.768231953559204, + "learning_rate": 9.541429193900808e-06, + "loss": 0.6056, + "step": 1571 + }, + { + "epoch": 0.16, + "grad_norm": 1.8561209820293387, + "learning_rate": 9.540724678821261e-06, + "loss": 0.7023, + "step": 1572 + }, + { + "epoch": 0.16, + "grad_norm": 1.769977024069318, + "learning_rate": 9.540019649024692e-06, + "loss": 0.6391, + "step": 1573 + }, + { + "epoch": 0.16, + "grad_norm": 1.913878231055899, + "learning_rate": 9.539314104591019e-06, + "loss": 0.6818, + "step": 1574 + }, + { + "epoch": 0.16, + "grad_norm": 2.0615080474215635, + "learning_rate": 9.538608045600218e-06, + "loss": 0.6203, + "step": 1575 + }, + { + "epoch": 0.16, + "grad_norm": 1.872307597705844, + "learning_rate": 9.537901472132324e-06, + "loss": 0.6047, + "step": 1576 + }, + { + "epoch": 0.16, + "grad_norm": 2.001299949883912, + "learning_rate": 9.537194384267436e-06, + "loss": 0.6852, + "step": 1577 + }, + { + "epoch": 0.16, + "grad_norm": 1.9756233996839685, + "learning_rate": 9.536486782085703e-06, + "loss": 0.666, + "step": 1578 + }, + { + "epoch": 0.16, + "grad_norm": 1.7817231162157137, + "learning_rate": 9.535778665667334e-06, + "loss": 0.7383, + "step": 1579 + }, + { + "epoch": 0.16, + "grad_norm": 1.8093842277276013, + "learning_rate": 9.535070035092603e-06, + "loss": 0.7256, + "step": 1580 + }, + { + "epoch": 0.16, + "grad_norm": 1.7685386576759308, + "learning_rate": 9.534360890441833e-06, + "loss": 0.5198, + "step": 1581 + }, + { + "epoch": 0.16, + "grad_norm": 2.1297343383103717, + "learning_rate": 9.533651231795412e-06, + "loss": 0.706, + "step": 1582 + }, + { + "epoch": 0.16, + "grad_norm": 2.059039551713341, + "learning_rate": 9.532941059233782e-06, + "loss": 0.7166, + "step": 1583 + }, + { + "epoch": 0.16, + "grad_norm": 1.8142204295763857, + "learning_rate": 9.532230372837446e-06, + "loss": 0.7129, + "step": 1584 + }, + { + "epoch": 0.16, + "grad_norm": 2.039237252791234, + "learning_rate": 9.531519172686964e-06, + "loss": 0.7075, + "step": 1585 + }, + { + "epoch": 0.16, + "grad_norm": 1.8287133514681926, + "learning_rate": 9.530807458862956e-06, + "loss": 0.6532, + "step": 1586 + }, + { + "epoch": 0.16, + "grad_norm": 1.7667190625126445, + "learning_rate": 9.530095231446096e-06, + "loss": 0.5883, + "step": 1587 + }, + { + "epoch": 0.17, + "grad_norm": 1.8291627106185575, + "learning_rate": 9.529382490517123e-06, + "loss": 0.6794, + "step": 1588 + }, + { + "epoch": 0.17, + "grad_norm": 1.9608081910222792, + "learning_rate": 9.528669236156827e-06, + "loss": 0.6054, + "step": 1589 + }, + { + "epoch": 0.17, + "grad_norm": 1.9924170196955373, + "learning_rate": 9.52795546844606e-06, + "loss": 0.7171, + "step": 1590 + }, + { + "epoch": 0.17, + "grad_norm": 1.6524851266305414, + "learning_rate": 9.527241187465735e-06, + "loss": 0.6372, + "step": 1591 + }, + { + "epoch": 0.17, + "grad_norm": 2.160203220354208, + "learning_rate": 9.526526393296814e-06, + "loss": 0.7238, + "step": 1592 + }, + { + "epoch": 0.17, + "grad_norm": 1.9441794717429497, + "learning_rate": 9.525811086020327e-06, + "loss": 0.639, + "step": 1593 + }, + { + "epoch": 0.17, + "grad_norm": 2.0185775044915775, + "learning_rate": 9.525095265717357e-06, + "loss": 0.7046, + "step": 1594 + }, + { + "epoch": 0.17, + "grad_norm": 1.9035968009066015, + "learning_rate": 9.524378932469045e-06, + "loss": 0.7046, + "step": 1595 + }, + { + "epoch": 0.17, + "grad_norm": 1.616356917467379, + "learning_rate": 9.523662086356596e-06, + "loss": 0.6115, + "step": 1596 + }, + { + "epoch": 0.17, + "grad_norm": 2.175508595395853, + "learning_rate": 9.522944727461264e-06, + "loss": 0.6567, + "step": 1597 + }, + { + "epoch": 0.17, + "grad_norm": 2.1150702390919536, + "learning_rate": 9.522226855864366e-06, + "loss": 0.7182, + "step": 1598 + }, + { + "epoch": 0.17, + "grad_norm": 1.7610105376084912, + "learning_rate": 9.52150847164728e-06, + "loss": 0.7062, + "step": 1599 + }, + { + "epoch": 0.17, + "grad_norm": 1.8587370870268596, + "learning_rate": 9.520789574891436e-06, + "loss": 0.6725, + "step": 1600 + }, + { + "epoch": 0.17, + "grad_norm": 2.246019651567058, + "learning_rate": 9.520070165678325e-06, + "loss": 0.6444, + "step": 1601 + }, + { + "epoch": 0.17, + "grad_norm": 2.0514198751421873, + "learning_rate": 9.5193502440895e-06, + "loss": 0.7817, + "step": 1602 + }, + { + "epoch": 0.17, + "grad_norm": 1.7478983922533107, + "learning_rate": 9.518629810206564e-06, + "loss": 0.5888, + "step": 1603 + }, + { + "epoch": 0.17, + "grad_norm": 1.809367012317806, + "learning_rate": 9.517908864111182e-06, + "loss": 0.6072, + "step": 1604 + }, + { + "epoch": 0.17, + "grad_norm": 1.8021793691886772, + "learning_rate": 9.517187405885082e-06, + "loss": 0.653, + "step": 1605 + }, + { + "epoch": 0.17, + "grad_norm": 1.9639447106313237, + "learning_rate": 9.516465435610041e-06, + "loss": 0.7472, + "step": 1606 + }, + { + "epoch": 0.17, + "grad_norm": 1.7756152134174483, + "learning_rate": 9.515742953367899e-06, + "loss": 0.5715, + "step": 1607 + }, + { + "epoch": 0.17, + "grad_norm": 1.910782270315552, + "learning_rate": 9.515019959240555e-06, + "loss": 0.7156, + "step": 1608 + }, + { + "epoch": 0.17, + "grad_norm": 1.9990951548136033, + "learning_rate": 9.514296453309965e-06, + "loss": 0.7261, + "step": 1609 + }, + { + "epoch": 0.17, + "grad_norm": 1.6970077590281472, + "learning_rate": 9.51357243565814e-06, + "loss": 0.5351, + "step": 1610 + }, + { + "epoch": 0.17, + "grad_norm": 2.1610796647883306, + "learning_rate": 9.512847906367153e-06, + "loss": 0.7213, + "step": 1611 + }, + { + "epoch": 0.17, + "grad_norm": 2.0053270299753994, + "learning_rate": 9.512122865519135e-06, + "loss": 0.6144, + "step": 1612 + }, + { + "epoch": 0.17, + "grad_norm": 2.246635639941283, + "learning_rate": 9.51139731319627e-06, + "loss": 0.6258, + "step": 1613 + }, + { + "epoch": 0.17, + "grad_norm": 2.0140091119514767, + "learning_rate": 9.510671249480806e-06, + "loss": 0.6814, + "step": 1614 + }, + { + "epoch": 0.17, + "grad_norm": 2.0477698552459644, + "learning_rate": 9.509944674455047e-06, + "loss": 0.7852, + "step": 1615 + }, + { + "epoch": 0.17, + "grad_norm": 2.0320102733793246, + "learning_rate": 9.509217588201351e-06, + "loss": 0.7089, + "step": 1616 + }, + { + "epoch": 0.17, + "grad_norm": 1.8191152766376675, + "learning_rate": 9.508489990802142e-06, + "loss": 0.6528, + "step": 1617 + }, + { + "epoch": 0.17, + "grad_norm": 1.797405031205996, + "learning_rate": 9.507761882339895e-06, + "loss": 0.6468, + "step": 1618 + }, + { + "epoch": 0.17, + "grad_norm": 1.9825745171869442, + "learning_rate": 9.507033262897142e-06, + "loss": 0.706, + "step": 1619 + }, + { + "epoch": 0.17, + "grad_norm": 2.052359939793622, + "learning_rate": 9.506304132556484e-06, + "loss": 0.7695, + "step": 1620 + }, + { + "epoch": 0.17, + "grad_norm": 1.9565939487072954, + "learning_rate": 9.505574491400564e-06, + "loss": 0.7543, + "step": 1621 + }, + { + "epoch": 0.17, + "grad_norm": 1.819843813882747, + "learning_rate": 9.504844339512096e-06, + "loss": 0.5962, + "step": 1622 + }, + { + "epoch": 0.17, + "grad_norm": 1.9030655320222276, + "learning_rate": 9.504113676973846e-06, + "loss": 0.7255, + "step": 1623 + }, + { + "epoch": 0.17, + "grad_norm": 2.0056629019007954, + "learning_rate": 9.503382503868637e-06, + "loss": 0.8137, + "step": 1624 + }, + { + "epoch": 0.17, + "grad_norm": 1.9704662860200837, + "learning_rate": 9.502650820279354e-06, + "loss": 0.6582, + "step": 1625 + }, + { + "epoch": 0.17, + "grad_norm": 2.3756436822304345, + "learning_rate": 9.501918626288935e-06, + "loss": 0.7373, + "step": 1626 + }, + { + "epoch": 0.17, + "grad_norm": 1.7756355257502683, + "learning_rate": 9.50118592198038e-06, + "loss": 0.5558, + "step": 1627 + }, + { + "epoch": 0.17, + "grad_norm": 1.8277201108402616, + "learning_rate": 9.500452707436744e-06, + "loss": 0.6353, + "step": 1628 + }, + { + "epoch": 0.17, + "grad_norm": 1.8042213079432121, + "learning_rate": 9.499718982741143e-06, + "loss": 0.758, + "step": 1629 + }, + { + "epoch": 0.17, + "grad_norm": 1.8420878240688283, + "learning_rate": 9.498984747976747e-06, + "loss": 0.5722, + "step": 1630 + }, + { + "epoch": 0.17, + "grad_norm": 2.045825387663684, + "learning_rate": 9.498250003226787e-06, + "loss": 0.6005, + "step": 1631 + }, + { + "epoch": 0.17, + "grad_norm": 1.8251107667602975, + "learning_rate": 9.49751474857455e-06, + "loss": 0.6692, + "step": 1632 + }, + { + "epoch": 0.17, + "grad_norm": 1.8065273338239225, + "learning_rate": 9.496778984103381e-06, + "loss": 0.709, + "step": 1633 + }, + { + "epoch": 0.17, + "grad_norm": 1.8042150998335234, + "learning_rate": 9.496042709896684e-06, + "loss": 0.6163, + "step": 1634 + }, + { + "epoch": 0.17, + "grad_norm": 1.898311108212609, + "learning_rate": 9.495305926037918e-06, + "loss": 0.647, + "step": 1635 + }, + { + "epoch": 0.17, + "grad_norm": 1.9689807722120047, + "learning_rate": 9.494568632610603e-06, + "loss": 0.7747, + "step": 1636 + }, + { + "epoch": 0.17, + "grad_norm": 1.8341428636567332, + "learning_rate": 9.493830829698317e-06, + "loss": 0.5898, + "step": 1637 + }, + { + "epoch": 0.17, + "grad_norm": 2.1033082614827414, + "learning_rate": 9.493092517384692e-06, + "loss": 0.6607, + "step": 1638 + }, + { + "epoch": 0.17, + "grad_norm": 1.829421971966054, + "learning_rate": 9.49235369575342e-06, + "loss": 0.7291, + "step": 1639 + }, + { + "epoch": 0.17, + "grad_norm": 1.8996716409526258, + "learning_rate": 9.49161436488825e-06, + "loss": 0.7602, + "step": 1640 + }, + { + "epoch": 0.17, + "grad_norm": 2.1680339164477433, + "learning_rate": 9.49087452487299e-06, + "loss": 0.6938, + "step": 1641 + }, + { + "epoch": 0.17, + "grad_norm": 1.6508009166489983, + "learning_rate": 9.490134175791507e-06, + "loss": 0.6141, + "step": 1642 + }, + { + "epoch": 0.17, + "grad_norm": 1.7767239402836295, + "learning_rate": 9.489393317727724e-06, + "loss": 0.618, + "step": 1643 + }, + { + "epoch": 0.17, + "grad_norm": 2.038443432035079, + "learning_rate": 9.488651950765617e-06, + "loss": 0.6579, + "step": 1644 + }, + { + "epoch": 0.17, + "grad_norm": 1.741168133888333, + "learning_rate": 9.487910074989228e-06, + "loss": 0.6494, + "step": 1645 + }, + { + "epoch": 0.17, + "grad_norm": 1.857138742618028, + "learning_rate": 9.48716769048265e-06, + "loss": 0.5755, + "step": 1646 + }, + { + "epoch": 0.17, + "grad_norm": 2.1591345481710036, + "learning_rate": 9.48642479733004e-06, + "loss": 0.6383, + "step": 1647 + }, + { + "epoch": 0.17, + "grad_norm": 1.9364895503622959, + "learning_rate": 9.485681395615607e-06, + "loss": 0.6839, + "step": 1648 + }, + { + "epoch": 0.17, + "grad_norm": 2.031828191064373, + "learning_rate": 9.484937485423622e-06, + "loss": 0.6035, + "step": 1649 + }, + { + "epoch": 0.17, + "grad_norm": 1.754566896444676, + "learning_rate": 9.484193066838408e-06, + "loss": 0.6424, + "step": 1650 + }, + { + "epoch": 0.17, + "grad_norm": 1.9246941194236256, + "learning_rate": 9.48344813994435e-06, + "loss": 0.6513, + "step": 1651 + }, + { + "epoch": 0.17, + "grad_norm": 1.8684221297861972, + "learning_rate": 9.482702704825892e-06, + "loss": 0.7858, + "step": 1652 + }, + { + "epoch": 0.17, + "grad_norm": 1.9414626199310694, + "learning_rate": 9.481956761567531e-06, + "loss": 0.6826, + "step": 1653 + }, + { + "epoch": 0.17, + "grad_norm": 2.0316286317178727, + "learning_rate": 9.481210310253826e-06, + "loss": 0.7082, + "step": 1654 + }, + { + "epoch": 0.17, + "grad_norm": 1.7661332172941508, + "learning_rate": 9.480463350969388e-06, + "loss": 0.6982, + "step": 1655 + }, + { + "epoch": 0.17, + "grad_norm": 1.9932281813194153, + "learning_rate": 9.479715883798895e-06, + "loss": 0.64, + "step": 1656 + }, + { + "epoch": 0.17, + "grad_norm": 2.0257008912279417, + "learning_rate": 9.47896790882707e-06, + "loss": 0.6237, + "step": 1657 + }, + { + "epoch": 0.17, + "grad_norm": 1.7559027656428055, + "learning_rate": 9.478219426138703e-06, + "loss": 0.6753, + "step": 1658 + }, + { + "epoch": 0.17, + "grad_norm": 1.8541314724538382, + "learning_rate": 9.477470435818641e-06, + "loss": 0.7334, + "step": 1659 + }, + { + "epoch": 0.17, + "grad_norm": 1.9262745014920255, + "learning_rate": 9.476720937951785e-06, + "loss": 0.6096, + "step": 1660 + }, + { + "epoch": 0.17, + "grad_norm": 2.00672485707719, + "learning_rate": 9.47597093262309e-06, + "loss": 0.6902, + "step": 1661 + }, + { + "epoch": 0.17, + "grad_norm": 1.8392421439660025, + "learning_rate": 9.475220419917581e-06, + "loss": 0.7256, + "step": 1662 + }, + { + "epoch": 0.17, + "grad_norm": 1.9341160956399497, + "learning_rate": 9.47446939992033e-06, + "loss": 0.7523, + "step": 1663 + }, + { + "epoch": 0.17, + "grad_norm": 1.8634455622742643, + "learning_rate": 9.473717872716469e-06, + "loss": 0.7325, + "step": 1664 + }, + { + "epoch": 0.17, + "grad_norm": 1.8513564589120213, + "learning_rate": 9.472965838391187e-06, + "loss": 0.6888, + "step": 1665 + }, + { + "epoch": 0.17, + "grad_norm": 1.7884451440018407, + "learning_rate": 9.47221329702973e-06, + "loss": 0.5837, + "step": 1666 + }, + { + "epoch": 0.17, + "grad_norm": 1.9881558844202416, + "learning_rate": 9.471460248717406e-06, + "loss": 0.6877, + "step": 1667 + }, + { + "epoch": 0.17, + "grad_norm": 1.8388210711744746, + "learning_rate": 9.470706693539578e-06, + "loss": 0.6638, + "step": 1668 + }, + { + "epoch": 0.17, + "grad_norm": 2.0549800910188534, + "learning_rate": 9.469952631581663e-06, + "loss": 0.7381, + "step": 1669 + }, + { + "epoch": 0.17, + "grad_norm": 1.9226671283551982, + "learning_rate": 9.469198062929139e-06, + "loss": 0.6649, + "step": 1670 + }, + { + "epoch": 0.17, + "grad_norm": 2.0660185079140896, + "learning_rate": 9.468442987667542e-06, + "loss": 0.7564, + "step": 1671 + }, + { + "epoch": 0.17, + "grad_norm": 1.903836190555682, + "learning_rate": 9.467687405882463e-06, + "loss": 0.6271, + "step": 1672 + }, + { + "epoch": 0.17, + "grad_norm": 2.0450685561813824, + "learning_rate": 9.466931317659551e-06, + "loss": 0.7239, + "step": 1673 + }, + { + "epoch": 0.17, + "grad_norm": 2.005334746057944, + "learning_rate": 9.466174723084514e-06, + "loss": 0.7554, + "step": 1674 + }, + { + "epoch": 0.17, + "grad_norm": 1.6452748276607783, + "learning_rate": 9.465417622243116e-06, + "loss": 0.6588, + "step": 1675 + }, + { + "epoch": 0.17, + "grad_norm": 1.9042511113167548, + "learning_rate": 9.464660015221177e-06, + "loss": 0.5927, + "step": 1676 + }, + { + "epoch": 0.17, + "grad_norm": 1.8411751279551711, + "learning_rate": 9.463901902104579e-06, + "loss": 0.6422, + "step": 1677 + }, + { + "epoch": 0.17, + "grad_norm": 1.9047976740761143, + "learning_rate": 9.463143282979258e-06, + "loss": 0.7052, + "step": 1678 + }, + { + "epoch": 0.17, + "grad_norm": 2.0239698176707055, + "learning_rate": 9.462384157931203e-06, + "loss": 0.6387, + "step": 1679 + }, + { + "epoch": 0.17, + "grad_norm": 1.71917711556842, + "learning_rate": 9.461624527046472e-06, + "loss": 0.6492, + "step": 1680 + }, + { + "epoch": 0.17, + "grad_norm": 1.7473641280277281, + "learning_rate": 9.46086439041117e-06, + "loss": 0.6081, + "step": 1681 + }, + { + "epoch": 0.17, + "grad_norm": 2.038864253926127, + "learning_rate": 9.460103748111462e-06, + "loss": 0.6386, + "step": 1682 + }, + { + "epoch": 0.17, + "grad_norm": 1.9706571596391416, + "learning_rate": 9.459342600233575e-06, + "loss": 0.7428, + "step": 1683 + }, + { + "epoch": 0.18, + "grad_norm": 1.954883981835221, + "learning_rate": 9.458580946863784e-06, + "loss": 0.6086, + "step": 1684 + }, + { + "epoch": 0.18, + "grad_norm": 2.0798834277695146, + "learning_rate": 9.457818788088431e-06, + "loss": 0.6907, + "step": 1685 + }, + { + "epoch": 0.18, + "grad_norm": 1.854504688334079, + "learning_rate": 9.45705612399391e-06, + "loss": 0.6769, + "step": 1686 + }, + { + "epoch": 0.18, + "grad_norm": 1.940980581413887, + "learning_rate": 9.456292954666673e-06, + "loss": 0.6356, + "step": 1687 + }, + { + "epoch": 0.18, + "grad_norm": 2.010496752053228, + "learning_rate": 9.45552928019323e-06, + "loss": 0.6778, + "step": 1688 + }, + { + "epoch": 0.18, + "grad_norm": 2.357313247435537, + "learning_rate": 9.454765100660144e-06, + "loss": 0.7522, + "step": 1689 + }, + { + "epoch": 0.18, + "grad_norm": 2.084001616032715, + "learning_rate": 9.454000416154046e-06, + "loss": 0.7403, + "step": 1690 + }, + { + "epoch": 0.18, + "grad_norm": 2.038729610751833, + "learning_rate": 9.453235226761613e-06, + "loss": 0.6874, + "step": 1691 + }, + { + "epoch": 0.18, + "grad_norm": 1.9823748346716907, + "learning_rate": 9.452469532569585e-06, + "loss": 0.6065, + "step": 1692 + }, + { + "epoch": 0.18, + "grad_norm": 1.998207692487797, + "learning_rate": 9.451703333664756e-06, + "loss": 0.6758, + "step": 1693 + }, + { + "epoch": 0.18, + "grad_norm": 1.8763531828895923, + "learning_rate": 9.45093663013398e-06, + "loss": 0.6963, + "step": 1694 + }, + { + "epoch": 0.18, + "grad_norm": 1.6424440865727896, + "learning_rate": 9.45016942206417e-06, + "loss": 0.6559, + "step": 1695 + }, + { + "epoch": 0.18, + "grad_norm": 2.1430067998803564, + "learning_rate": 9.449401709542289e-06, + "loss": 0.7204, + "step": 1696 + }, + { + "epoch": 0.18, + "grad_norm": 2.0047992594876627, + "learning_rate": 9.448633492655363e-06, + "loss": 0.7242, + "step": 1697 + }, + { + "epoch": 0.18, + "grad_norm": 1.9198849998196943, + "learning_rate": 9.447864771490476e-06, + "loss": 0.743, + "step": 1698 + }, + { + "epoch": 0.18, + "grad_norm": 1.8212504164264498, + "learning_rate": 9.447095546134763e-06, + "loss": 0.5829, + "step": 1699 + }, + { + "epoch": 0.18, + "grad_norm": 1.8677748324266148, + "learning_rate": 9.446325816675423e-06, + "loss": 0.6371, + "step": 1700 + }, + { + "epoch": 0.18, + "grad_norm": 2.0007664737650885, + "learning_rate": 9.445555583199711e-06, + "loss": 0.6646, + "step": 1701 + }, + { + "epoch": 0.18, + "grad_norm": 1.8964503663253003, + "learning_rate": 9.444784845794932e-06, + "loss": 0.7622, + "step": 1702 + }, + { + "epoch": 0.18, + "grad_norm": 1.9525551773329504, + "learning_rate": 9.444013604548457e-06, + "loss": 0.5986, + "step": 1703 + }, + { + "epoch": 0.18, + "grad_norm": 2.357983873653802, + "learning_rate": 9.44324185954771e-06, + "loss": 0.5866, + "step": 1704 + }, + { + "epoch": 0.18, + "grad_norm": 1.974929397878334, + "learning_rate": 9.442469610880172e-06, + "loss": 0.7261, + "step": 1705 + }, + { + "epoch": 0.18, + "grad_norm": 2.422100306321085, + "learning_rate": 9.441696858633382e-06, + "loss": 0.7537, + "step": 1706 + }, + { + "epoch": 0.18, + "grad_norm": 1.7334732730860278, + "learning_rate": 9.440923602894937e-06, + "loss": 0.6426, + "step": 1707 + }, + { + "epoch": 0.18, + "grad_norm": 2.085845934820001, + "learning_rate": 9.44014984375249e-06, + "loss": 0.7491, + "step": 1708 + }, + { + "epoch": 0.18, + "grad_norm": 1.957209376001055, + "learning_rate": 9.439375581293751e-06, + "loss": 0.6847, + "step": 1709 + }, + { + "epoch": 0.18, + "grad_norm": 1.850758181214728, + "learning_rate": 9.438600815606483e-06, + "loss": 0.6593, + "step": 1710 + }, + { + "epoch": 0.18, + "grad_norm": 1.9311284970137075, + "learning_rate": 9.437825546778517e-06, + "loss": 0.6886, + "step": 1711 + }, + { + "epoch": 0.18, + "grad_norm": 1.7095134979680247, + "learning_rate": 9.437049774897728e-06, + "loss": 0.6179, + "step": 1712 + }, + { + "epoch": 0.18, + "grad_norm": 2.048187229194672, + "learning_rate": 9.436273500052056e-06, + "loss": 0.7139, + "step": 1713 + }, + { + "epoch": 0.18, + "grad_norm": 1.8280668981290464, + "learning_rate": 9.4354967223295e-06, + "loss": 0.6537, + "step": 1714 + }, + { + "epoch": 0.18, + "grad_norm": 1.7321260549808468, + "learning_rate": 9.434719441818106e-06, + "loss": 0.5556, + "step": 1715 + }, + { + "epoch": 0.18, + "grad_norm": 2.224063985682685, + "learning_rate": 9.433941658605987e-06, + "loss": 0.6958, + "step": 1716 + }, + { + "epoch": 0.18, + "grad_norm": 4.244857086101658, + "learning_rate": 9.43316337278131e-06, + "loss": 0.6439, + "step": 1717 + }, + { + "epoch": 0.18, + "grad_norm": 2.1301665791756372, + "learning_rate": 9.432384584432294e-06, + "loss": 0.7263, + "step": 1718 + }, + { + "epoch": 0.18, + "grad_norm": 2.204264658265296, + "learning_rate": 9.431605293647224e-06, + "loss": 0.6973, + "step": 1719 + }, + { + "epoch": 0.18, + "grad_norm": 1.9886649316067186, + "learning_rate": 9.430825500514433e-06, + "loss": 0.5987, + "step": 1720 + }, + { + "epoch": 0.18, + "grad_norm": 1.9015353627790044, + "learning_rate": 9.430045205122317e-06, + "loss": 0.6792, + "step": 1721 + }, + { + "epoch": 0.18, + "grad_norm": 1.746055037768087, + "learning_rate": 9.429264407559328e-06, + "loss": 0.6747, + "step": 1722 + }, + { + "epoch": 0.18, + "grad_norm": 1.8539817079671905, + "learning_rate": 9.42848310791397e-06, + "loss": 0.6928, + "step": 1723 + }, + { + "epoch": 0.18, + "grad_norm": 2.1077148459976724, + "learning_rate": 9.427701306274812e-06, + "loss": 0.6944, + "step": 1724 + }, + { + "epoch": 0.18, + "grad_norm": 2.2949928578619594, + "learning_rate": 9.426919002730473e-06, + "loss": 0.6411, + "step": 1725 + }, + { + "epoch": 0.18, + "grad_norm": 2.3089363293156584, + "learning_rate": 9.426136197369633e-06, + "loss": 0.7011, + "step": 1726 + }, + { + "epoch": 0.18, + "grad_norm": 2.3775684030564825, + "learning_rate": 9.425352890281028e-06, + "loss": 0.6686, + "step": 1727 + }, + { + "epoch": 0.18, + "grad_norm": 2.3000666445379516, + "learning_rate": 9.424569081553447e-06, + "loss": 0.576, + "step": 1728 + }, + { + "epoch": 0.18, + "grad_norm": 2.2271722236338998, + "learning_rate": 9.423784771275744e-06, + "loss": 0.6952, + "step": 1729 + }, + { + "epoch": 0.18, + "grad_norm": 1.9171166819419831, + "learning_rate": 9.422999959536819e-06, + "loss": 0.8124, + "step": 1730 + }, + { + "epoch": 0.18, + "grad_norm": 1.9396569476898367, + "learning_rate": 9.422214646425641e-06, + "loss": 0.5715, + "step": 1731 + }, + { + "epoch": 0.18, + "grad_norm": 2.1983267960374264, + "learning_rate": 9.421428832031226e-06, + "loss": 0.6675, + "step": 1732 + }, + { + "epoch": 0.18, + "grad_norm": 2.2057794317489123, + "learning_rate": 9.420642516442652e-06, + "loss": 0.6696, + "step": 1733 + }, + { + "epoch": 0.18, + "grad_norm": 1.719469875329039, + "learning_rate": 9.41985569974905e-06, + "loss": 0.6127, + "step": 1734 + }, + { + "epoch": 0.18, + "grad_norm": 2.083556833840758, + "learning_rate": 9.419068382039615e-06, + "loss": 0.7165, + "step": 1735 + }, + { + "epoch": 0.18, + "grad_norm": 2.1664458577953467, + "learning_rate": 9.41828056340359e-06, + "loss": 0.7521, + "step": 1736 + }, + { + "epoch": 0.18, + "grad_norm": 1.9227176468266851, + "learning_rate": 9.41749224393028e-06, + "loss": 0.7552, + "step": 1737 + }, + { + "epoch": 0.18, + "grad_norm": 1.9964207302492, + "learning_rate": 9.416703423709044e-06, + "loss": 0.6417, + "step": 1738 + }, + { + "epoch": 0.18, + "grad_norm": 1.8401124624218785, + "learning_rate": 9.415914102829302e-06, + "loss": 0.6845, + "step": 1739 + }, + { + "epoch": 0.18, + "grad_norm": 1.9982912657871517, + "learning_rate": 9.415124281380525e-06, + "loss": 0.7257, + "step": 1740 + }, + { + "epoch": 0.18, + "grad_norm": 1.931542910740269, + "learning_rate": 9.414333959452247e-06, + "loss": 0.7005, + "step": 1741 + }, + { + "epoch": 0.18, + "grad_norm": 2.0180011776973115, + "learning_rate": 9.413543137134053e-06, + "loss": 0.7342, + "step": 1742 + }, + { + "epoch": 0.18, + "grad_norm": 1.8253225407310454, + "learning_rate": 9.412751814515588e-06, + "loss": 0.697, + "step": 1743 + }, + { + "epoch": 0.18, + "grad_norm": 2.1203442175782348, + "learning_rate": 9.411959991686554e-06, + "loss": 0.7561, + "step": 1744 + }, + { + "epoch": 0.18, + "grad_norm": 1.9259566109424078, + "learning_rate": 9.411167668736707e-06, + "loss": 0.5717, + "step": 1745 + }, + { + "epoch": 0.18, + "grad_norm": 1.95383111394422, + "learning_rate": 9.410374845755862e-06, + "loss": 0.6709, + "step": 1746 + }, + { + "epoch": 0.18, + "grad_norm": 2.4064921390288863, + "learning_rate": 9.409581522833889e-06, + "loss": 0.6546, + "step": 1747 + }, + { + "epoch": 0.18, + "grad_norm": 1.9152540024539662, + "learning_rate": 9.408787700060718e-06, + "loss": 0.6543, + "step": 1748 + }, + { + "epoch": 0.18, + "grad_norm": 2.1151996633398142, + "learning_rate": 9.40799337752633e-06, + "loss": 0.6748, + "step": 1749 + }, + { + "epoch": 0.18, + "grad_norm": 1.8568166821966514, + "learning_rate": 9.40719855532077e-06, + "loss": 0.7002, + "step": 1750 + }, + { + "epoch": 0.18, + "grad_norm": 2.0767917401632983, + "learning_rate": 9.406403233534134e-06, + "loss": 0.6867, + "step": 1751 + }, + { + "epoch": 0.18, + "grad_norm": 2.143310125483721, + "learning_rate": 9.405607412256573e-06, + "loss": 0.7443, + "step": 1752 + }, + { + "epoch": 0.18, + "grad_norm": 2.02257859190377, + "learning_rate": 9.404811091578302e-06, + "loss": 0.6455, + "step": 1753 + }, + { + "epoch": 0.18, + "grad_norm": 1.9046396189464536, + "learning_rate": 9.404014271589588e-06, + "loss": 0.7036, + "step": 1754 + }, + { + "epoch": 0.18, + "grad_norm": 1.7750985666179104, + "learning_rate": 9.403216952380755e-06, + "loss": 0.6842, + "step": 1755 + }, + { + "epoch": 0.18, + "grad_norm": 1.9874375531893456, + "learning_rate": 9.40241913404218e-06, + "loss": 0.6559, + "step": 1756 + }, + { + "epoch": 0.18, + "grad_norm": 1.886971108135804, + "learning_rate": 9.401620816664305e-06, + "loss": 0.6458, + "step": 1757 + }, + { + "epoch": 0.18, + "grad_norm": 1.9427562819969475, + "learning_rate": 9.400822000337622e-06, + "loss": 0.7032, + "step": 1758 + }, + { + "epoch": 0.18, + "grad_norm": 1.8186870029716242, + "learning_rate": 9.400022685152683e-06, + "loss": 0.6731, + "step": 1759 + }, + { + "epoch": 0.18, + "grad_norm": 1.855438322366915, + "learning_rate": 9.399222871200091e-06, + "loss": 0.6359, + "step": 1760 + }, + { + "epoch": 0.18, + "grad_norm": 2.23468285760032, + "learning_rate": 9.398422558570512e-06, + "loss": 0.7936, + "step": 1761 + }, + { + "epoch": 0.18, + "grad_norm": 1.9850677319168017, + "learning_rate": 9.397621747354666e-06, + "loss": 0.5443, + "step": 1762 + }, + { + "epoch": 0.18, + "grad_norm": 2.0215518938615653, + "learning_rate": 9.39682043764333e-06, + "loss": 0.7142, + "step": 1763 + }, + { + "epoch": 0.18, + "grad_norm": 2.1960036942141676, + "learning_rate": 9.396018629527336e-06, + "loss": 0.6934, + "step": 1764 + }, + { + "epoch": 0.18, + "grad_norm": 1.9005397240127722, + "learning_rate": 9.395216323097573e-06, + "loss": 0.7108, + "step": 1765 + }, + { + "epoch": 0.18, + "grad_norm": 1.7738157201021318, + "learning_rate": 9.394413518444989e-06, + "loss": 0.6096, + "step": 1766 + }, + { + "epoch": 0.18, + "grad_norm": 1.936537266975992, + "learning_rate": 9.393610215660585e-06, + "loss": 0.6309, + "step": 1767 + }, + { + "epoch": 0.18, + "grad_norm": 1.9371372960509754, + "learning_rate": 9.392806414835422e-06, + "loss": 0.7551, + "step": 1768 + }, + { + "epoch": 0.18, + "grad_norm": 1.9333032902331897, + "learning_rate": 9.392002116060612e-06, + "loss": 0.6436, + "step": 1769 + }, + { + "epoch": 0.18, + "grad_norm": 2.027033584198372, + "learning_rate": 9.391197319427328e-06, + "loss": 0.6055, + "step": 1770 + }, + { + "epoch": 0.18, + "grad_norm": 2.196135164479654, + "learning_rate": 9.3903920250268e-06, + "loss": 0.7347, + "step": 1771 + }, + { + "epoch": 0.18, + "grad_norm": 1.731083189320279, + "learning_rate": 9.389586232950312e-06, + "loss": 0.5823, + "step": 1772 + }, + { + "epoch": 0.18, + "grad_norm": 1.9930184050076185, + "learning_rate": 9.388779943289204e-06, + "loss": 0.6757, + "step": 1773 + }, + { + "epoch": 0.18, + "grad_norm": 1.933884457327214, + "learning_rate": 9.387973156134872e-06, + "loss": 0.6756, + "step": 1774 + }, + { + "epoch": 0.18, + "grad_norm": 2.391449358670151, + "learning_rate": 9.387165871578774e-06, + "loss": 0.6467, + "step": 1775 + }, + { + "epoch": 0.18, + "grad_norm": 2.0315456443703153, + "learning_rate": 9.38635808971242e-06, + "loss": 0.629, + "step": 1776 + }, + { + "epoch": 0.18, + "grad_norm": 2.051657440819527, + "learning_rate": 9.385549810627374e-06, + "loss": 0.6349, + "step": 1777 + }, + { + "epoch": 0.18, + "grad_norm": 2.208159799053304, + "learning_rate": 9.38474103441526e-06, + "loss": 0.7332, + "step": 1778 + }, + { + "epoch": 0.18, + "grad_norm": 2.003368483790586, + "learning_rate": 9.383931761167757e-06, + "loss": 0.6808, + "step": 1779 + }, + { + "epoch": 0.19, + "grad_norm": 1.9662724949489288, + "learning_rate": 9.383121990976602e-06, + "loss": 0.7002, + "step": 1780 + }, + { + "epoch": 0.19, + "grad_norm": 2.046593510996878, + "learning_rate": 9.382311723933586e-06, + "loss": 0.6752, + "step": 1781 + }, + { + "epoch": 0.19, + "grad_norm": 1.7582074425722836, + "learning_rate": 9.381500960130558e-06, + "loss": 0.6253, + "step": 1782 + }, + { + "epoch": 0.19, + "grad_norm": 1.8022736464724076, + "learning_rate": 9.380689699659423e-06, + "loss": 0.6269, + "step": 1783 + }, + { + "epoch": 0.19, + "grad_norm": 1.850845380128415, + "learning_rate": 9.37987794261214e-06, + "loss": 0.6402, + "step": 1784 + }, + { + "epoch": 0.19, + "grad_norm": 1.8174325837103, + "learning_rate": 9.379065689080729e-06, + "loss": 0.6395, + "step": 1785 + }, + { + "epoch": 0.19, + "grad_norm": 1.965486494511778, + "learning_rate": 9.378252939157262e-06, + "loss": 0.6842, + "step": 1786 + }, + { + "epoch": 0.19, + "grad_norm": 1.7872036813111714, + "learning_rate": 9.377439692933869e-06, + "loss": 0.702, + "step": 1787 + }, + { + "epoch": 0.19, + "grad_norm": 2.056730874252554, + "learning_rate": 9.376625950502736e-06, + "loss": 0.6803, + "step": 1788 + }, + { + "epoch": 0.19, + "grad_norm": 1.7395674020136729, + "learning_rate": 9.375811711956106e-06, + "loss": 0.6639, + "step": 1789 + }, + { + "epoch": 0.19, + "grad_norm": 1.7736550746022077, + "learning_rate": 9.374996977386276e-06, + "loss": 0.6338, + "step": 1790 + }, + { + "epoch": 0.19, + "grad_norm": 1.8660714665304707, + "learning_rate": 9.374181746885603e-06, + "loss": 0.5861, + "step": 1791 + }, + { + "epoch": 0.19, + "grad_norm": 1.9014869405672314, + "learning_rate": 9.373366020546498e-06, + "loss": 0.7147, + "step": 1792 + }, + { + "epoch": 0.19, + "grad_norm": 1.7393522079638204, + "learning_rate": 9.372549798461426e-06, + "loss": 0.5423, + "step": 1793 + }, + { + "epoch": 0.19, + "grad_norm": 1.8812361480908122, + "learning_rate": 9.371733080722911e-06, + "loss": 0.7266, + "step": 1794 + }, + { + "epoch": 0.19, + "grad_norm": 1.808933104084343, + "learning_rate": 9.370915867423534e-06, + "loss": 0.7232, + "step": 1795 + }, + { + "epoch": 0.19, + "grad_norm": 1.7975636897079652, + "learning_rate": 9.37009815865593e-06, + "loss": 0.638, + "step": 1796 + }, + { + "epoch": 0.19, + "grad_norm": 1.9655333154002042, + "learning_rate": 9.369279954512791e-06, + "loss": 0.81, + "step": 1797 + }, + { + "epoch": 0.19, + "grad_norm": 2.1093837615576163, + "learning_rate": 9.368461255086866e-06, + "loss": 0.6529, + "step": 1798 + }, + { + "epoch": 0.19, + "grad_norm": 1.8458410099324793, + "learning_rate": 9.367642060470958e-06, + "loss": 0.5708, + "step": 1799 + }, + { + "epoch": 0.19, + "grad_norm": 1.7681980524691754, + "learning_rate": 9.366822370757927e-06, + "loss": 0.6468, + "step": 1800 + }, + { + "epoch": 0.19, + "grad_norm": 2.0440408178376956, + "learning_rate": 9.36600218604069e-06, + "loss": 0.6947, + "step": 1801 + }, + { + "epoch": 0.19, + "grad_norm": 1.709238322425783, + "learning_rate": 9.36518150641222e-06, + "loss": 0.6153, + "step": 1802 + }, + { + "epoch": 0.19, + "grad_norm": 1.8525472802523686, + "learning_rate": 9.364360331965545e-06, + "loss": 0.6379, + "step": 1803 + }, + { + "epoch": 0.19, + "grad_norm": 2.040402922812123, + "learning_rate": 9.363538662793752e-06, + "loss": 0.73, + "step": 1804 + }, + { + "epoch": 0.19, + "grad_norm": 1.9562526941554614, + "learning_rate": 9.362716498989979e-06, + "loss": 0.6633, + "step": 1805 + }, + { + "epoch": 0.19, + "grad_norm": 2.102318885795319, + "learning_rate": 9.361893840647425e-06, + "loss": 0.6032, + "step": 1806 + }, + { + "epoch": 0.19, + "grad_norm": 1.934206711167693, + "learning_rate": 9.361070687859341e-06, + "loss": 0.6323, + "step": 1807 + }, + { + "epoch": 0.19, + "grad_norm": 2.0505371589582104, + "learning_rate": 9.36024704071904e-06, + "loss": 0.7097, + "step": 1808 + }, + { + "epoch": 0.19, + "grad_norm": 1.9380575372950994, + "learning_rate": 9.359422899319882e-06, + "loss": 0.7334, + "step": 1809 + }, + { + "epoch": 0.19, + "grad_norm": 1.9703923692444525, + "learning_rate": 9.358598263755289e-06, + "loss": 0.6529, + "step": 1810 + }, + { + "epoch": 0.19, + "grad_norm": 2.0041847525050547, + "learning_rate": 9.357773134118743e-06, + "loss": 0.7106, + "step": 1811 + }, + { + "epoch": 0.19, + "grad_norm": 1.8824885667479772, + "learning_rate": 9.356947510503771e-06, + "loss": 0.6663, + "step": 1812 + }, + { + "epoch": 0.19, + "grad_norm": 1.9126377503709864, + "learning_rate": 9.356121393003968e-06, + "loss": 0.6731, + "step": 1813 + }, + { + "epoch": 0.19, + "grad_norm": 2.1234974613692432, + "learning_rate": 9.355294781712974e-06, + "loss": 0.668, + "step": 1814 + }, + { + "epoch": 0.19, + "grad_norm": 2.427177580084952, + "learning_rate": 9.354467676724491e-06, + "loss": 0.6898, + "step": 1815 + }, + { + "epoch": 0.19, + "grad_norm": 1.7188594239543076, + "learning_rate": 9.35364007813228e-06, + "loss": 0.6387, + "step": 1816 + }, + { + "epoch": 0.19, + "grad_norm": 2.1711651881047604, + "learning_rate": 9.352811986030147e-06, + "loss": 0.7751, + "step": 1817 + }, + { + "epoch": 0.19, + "grad_norm": 2.004661231210757, + "learning_rate": 9.35198340051197e-06, + "loss": 0.7885, + "step": 1818 + }, + { + "epoch": 0.19, + "grad_norm": 2.202027654675994, + "learning_rate": 9.351154321671667e-06, + "loss": 0.714, + "step": 1819 + }, + { + "epoch": 0.19, + "grad_norm": 2.108135480448231, + "learning_rate": 9.350324749603221e-06, + "loss": 0.6352, + "step": 1820 + }, + { + "epoch": 0.19, + "grad_norm": 2.0603342012079517, + "learning_rate": 9.34949468440067e-06, + "loss": 0.6355, + "step": 1821 + }, + { + "epoch": 0.19, + "grad_norm": 2.014812774582301, + "learning_rate": 9.348664126158103e-06, + "loss": 0.6858, + "step": 1822 + }, + { + "epoch": 0.19, + "grad_norm": 1.967906011677732, + "learning_rate": 9.34783307496967e-06, + "loss": 0.7113, + "step": 1823 + }, + { + "epoch": 0.19, + "grad_norm": 2.0691130215812827, + "learning_rate": 9.347001530929579e-06, + "loss": 0.7028, + "step": 1824 + }, + { + "epoch": 0.19, + "grad_norm": 1.9448319235913356, + "learning_rate": 9.346169494132086e-06, + "loss": 0.6687, + "step": 1825 + }, + { + "epoch": 0.19, + "grad_norm": 1.858179669232822, + "learning_rate": 9.345336964671507e-06, + "loss": 0.5961, + "step": 1826 + }, + { + "epoch": 0.19, + "grad_norm": 1.6801584205129731, + "learning_rate": 9.344503942642218e-06, + "loss": 0.5703, + "step": 1827 + }, + { + "epoch": 0.19, + "grad_norm": 1.8900232563367874, + "learning_rate": 9.343670428138644e-06, + "loss": 0.6756, + "step": 1828 + }, + { + "epoch": 0.19, + "grad_norm": 1.8460935273114019, + "learning_rate": 9.342836421255268e-06, + "loss": 0.6328, + "step": 1829 + }, + { + "epoch": 0.19, + "grad_norm": 1.8418240477099301, + "learning_rate": 9.34200192208663e-06, + "loss": 0.6728, + "step": 1830 + }, + { + "epoch": 0.19, + "grad_norm": 1.8254277059194648, + "learning_rate": 9.341166930727326e-06, + "loss": 0.7245, + "step": 1831 + }, + { + "epoch": 0.19, + "grad_norm": 1.88091998086299, + "learning_rate": 9.340331447272006e-06, + "loss": 0.6205, + "step": 1832 + }, + { + "epoch": 0.19, + "grad_norm": 1.822431851717622, + "learning_rate": 9.339495471815379e-06, + "loss": 0.6293, + "step": 1833 + }, + { + "epoch": 0.19, + "grad_norm": 1.8356744862913454, + "learning_rate": 9.338659004452204e-06, + "loss": 0.6785, + "step": 1834 + }, + { + "epoch": 0.19, + "grad_norm": 1.853780372536723, + "learning_rate": 9.337822045277303e-06, + "loss": 0.6948, + "step": 1835 + }, + { + "epoch": 0.19, + "grad_norm": 1.8247544720172622, + "learning_rate": 9.336984594385547e-06, + "loss": 0.546, + "step": 1836 + }, + { + "epoch": 0.19, + "grad_norm": 1.7757233288010839, + "learning_rate": 9.33614665187187e-06, + "loss": 0.7182, + "step": 1837 + }, + { + "epoch": 0.19, + "grad_norm": 2.1038639116244946, + "learning_rate": 9.335308217831252e-06, + "loss": 0.7425, + "step": 1838 + }, + { + "epoch": 0.19, + "grad_norm": 1.9022465803573188, + "learning_rate": 9.334469292358736e-06, + "loss": 0.7418, + "step": 1839 + }, + { + "epoch": 0.19, + "grad_norm": 1.982372166611791, + "learning_rate": 9.333629875549424e-06, + "loss": 0.6076, + "step": 1840 + }, + { + "epoch": 0.19, + "grad_norm": 1.866068994191335, + "learning_rate": 9.33278996749846e-06, + "loss": 0.6356, + "step": 1841 + }, + { + "epoch": 0.19, + "grad_norm": 1.9918446912193064, + "learning_rate": 9.331949568301062e-06, + "loss": 0.643, + "step": 1842 + }, + { + "epoch": 0.19, + "grad_norm": 1.9290363357787772, + "learning_rate": 9.331108678052485e-06, + "loss": 0.6967, + "step": 1843 + }, + { + "epoch": 0.19, + "grad_norm": 1.944946068496725, + "learning_rate": 9.330267296848056e-06, + "loss": 0.6828, + "step": 1844 + }, + { + "epoch": 0.19, + "grad_norm": 2.2391804890671683, + "learning_rate": 9.329425424783145e-06, + "loss": 0.66, + "step": 1845 + }, + { + "epoch": 0.19, + "grad_norm": 2.0239600614768927, + "learning_rate": 9.328583061953187e-06, + "loss": 0.6193, + "step": 1846 + }, + { + "epoch": 0.19, + "grad_norm": 1.7541080391109118, + "learning_rate": 9.327740208453666e-06, + "loss": 0.6133, + "step": 1847 + }, + { + "epoch": 0.19, + "grad_norm": 2.0824969747832474, + "learning_rate": 9.326896864380124e-06, + "loss": 0.7947, + "step": 1848 + }, + { + "epoch": 0.19, + "grad_norm": 1.9952093471075816, + "learning_rate": 9.326053029828162e-06, + "loss": 0.6064, + "step": 1849 + }, + { + "epoch": 0.19, + "grad_norm": 1.8993269296443138, + "learning_rate": 9.32520870489343e-06, + "loss": 0.6876, + "step": 1850 + }, + { + "epoch": 0.19, + "grad_norm": 1.8474386957130455, + "learning_rate": 9.32436388967164e-06, + "loss": 0.6825, + "step": 1851 + }, + { + "epoch": 0.19, + "grad_norm": 1.873580562276867, + "learning_rate": 9.323518584258554e-06, + "loss": 0.7735, + "step": 1852 + }, + { + "epoch": 0.19, + "grad_norm": 1.923520628246674, + "learning_rate": 9.32267278874999e-06, + "loss": 0.678, + "step": 1853 + }, + { + "epoch": 0.19, + "grad_norm": 1.7349404592036248, + "learning_rate": 9.32182650324183e-06, + "loss": 0.6314, + "step": 1854 + }, + { + "epoch": 0.19, + "grad_norm": 1.983232010650299, + "learning_rate": 9.320979727830004e-06, + "loss": 0.663, + "step": 1855 + }, + { + "epoch": 0.19, + "grad_norm": 2.116098848893127, + "learning_rate": 9.320132462610495e-06, + "loss": 0.6062, + "step": 1856 + }, + { + "epoch": 0.19, + "grad_norm": 1.954496745764822, + "learning_rate": 9.319284707679348e-06, + "loss": 0.6566, + "step": 1857 + }, + { + "epoch": 0.19, + "grad_norm": 1.944786321375141, + "learning_rate": 9.318436463132661e-06, + "loss": 0.7398, + "step": 1858 + }, + { + "epoch": 0.19, + "grad_norm": 1.9358948718792182, + "learning_rate": 9.317587729066585e-06, + "loss": 0.6646, + "step": 1859 + }, + { + "epoch": 0.19, + "grad_norm": 1.9699922116768356, + "learning_rate": 9.316738505577331e-06, + "loss": 0.7234, + "step": 1860 + }, + { + "epoch": 0.19, + "grad_norm": 2.093839616518822, + "learning_rate": 9.315888792761163e-06, + "loss": 0.659, + "step": 1861 + }, + { + "epoch": 0.19, + "grad_norm": 1.925384133731945, + "learning_rate": 9.3150385907144e-06, + "loss": 0.6902, + "step": 1862 + }, + { + "epoch": 0.19, + "grad_norm": 1.808062266664772, + "learning_rate": 9.31418789953342e-06, + "loss": 0.576, + "step": 1863 + }, + { + "epoch": 0.19, + "grad_norm": 2.073959612645908, + "learning_rate": 9.31333671931465e-06, + "loss": 0.7181, + "step": 1864 + }, + { + "epoch": 0.19, + "grad_norm": 1.8375303473816842, + "learning_rate": 9.312485050154578e-06, + "loss": 0.6194, + "step": 1865 + }, + { + "epoch": 0.19, + "grad_norm": 2.026136079077223, + "learning_rate": 9.311632892149744e-06, + "loss": 0.6778, + "step": 1866 + }, + { + "epoch": 0.19, + "grad_norm": 2.0988523898716376, + "learning_rate": 9.310780245396747e-06, + "loss": 0.7, + "step": 1867 + }, + { + "epoch": 0.19, + "grad_norm": 1.9595698067023832, + "learning_rate": 9.309927109992239e-06, + "loss": 0.5943, + "step": 1868 + }, + { + "epoch": 0.19, + "grad_norm": 2.395683206373414, + "learning_rate": 9.309073486032926e-06, + "loss": 0.6852, + "step": 1869 + }, + { + "epoch": 0.19, + "grad_norm": 1.8437940606159533, + "learning_rate": 9.308219373615574e-06, + "loss": 0.5946, + "step": 1870 + }, + { + "epoch": 0.19, + "grad_norm": 2.23571475537344, + "learning_rate": 9.307364772837e-06, + "loss": 0.7415, + "step": 1871 + }, + { + "epoch": 0.19, + "grad_norm": 2.038379272307801, + "learning_rate": 9.306509683794077e-06, + "loss": 0.7228, + "step": 1872 + }, + { + "epoch": 0.19, + "grad_norm": 1.9487725835663436, + "learning_rate": 9.305654106583735e-06, + "loss": 0.7507, + "step": 1873 + }, + { + "epoch": 0.19, + "grad_norm": 1.887653621297847, + "learning_rate": 9.304798041302959e-06, + "loss": 0.6303, + "step": 1874 + }, + { + "epoch": 0.19, + "grad_norm": 1.884397519021714, + "learning_rate": 9.30394148804879e-06, + "loss": 0.7304, + "step": 1875 + }, + { + "epoch": 0.2, + "grad_norm": 1.9021671805670946, + "learning_rate": 9.303084446918317e-06, + "loss": 0.6402, + "step": 1876 + }, + { + "epoch": 0.2, + "grad_norm": 1.8668810936120013, + "learning_rate": 9.302226918008699e-06, + "loss": 0.6011, + "step": 1877 + }, + { + "epoch": 0.2, + "grad_norm": 1.793475054066034, + "learning_rate": 9.301368901417138e-06, + "loss": 0.6594, + "step": 1878 + }, + { + "epoch": 0.2, + "grad_norm": 2.120762189756595, + "learning_rate": 9.300510397240894e-06, + "loss": 0.7769, + "step": 1879 + }, + { + "epoch": 0.2, + "grad_norm": 2.300340831850511, + "learning_rate": 9.299651405577286e-06, + "loss": 0.7338, + "step": 1880 + }, + { + "epoch": 0.2, + "grad_norm": 1.9583934032957513, + "learning_rate": 9.298791926523683e-06, + "loss": 0.6799, + "step": 1881 + }, + { + "epoch": 0.2, + "grad_norm": 1.9638398719688912, + "learning_rate": 9.29793196017751e-06, + "loss": 0.6121, + "step": 1882 + }, + { + "epoch": 0.2, + "grad_norm": 2.078735718161527, + "learning_rate": 9.297071506636256e-06, + "loss": 0.7302, + "step": 1883 + }, + { + "epoch": 0.2, + "grad_norm": 1.8398638850551559, + "learning_rate": 9.296210565997453e-06, + "loss": 0.6358, + "step": 1884 + }, + { + "epoch": 0.2, + "grad_norm": 1.7991017595645746, + "learning_rate": 9.295349138358693e-06, + "loss": 0.6302, + "step": 1885 + }, + { + "epoch": 0.2, + "grad_norm": 1.738381217294171, + "learning_rate": 9.294487223817628e-06, + "loss": 0.5517, + "step": 1886 + }, + { + "epoch": 0.2, + "grad_norm": 1.7996019958686276, + "learning_rate": 9.293624822471957e-06, + "loss": 0.7795, + "step": 1887 + }, + { + "epoch": 0.2, + "grad_norm": 2.04078306349116, + "learning_rate": 9.29276193441944e-06, + "loss": 0.7368, + "step": 1888 + }, + { + "epoch": 0.2, + "grad_norm": 1.91957856399538, + "learning_rate": 9.29189855975789e-06, + "loss": 0.6685, + "step": 1889 + }, + { + "epoch": 0.2, + "grad_norm": 1.8755599618267553, + "learning_rate": 9.291034698585174e-06, + "loss": 0.6246, + "step": 1890 + }, + { + "epoch": 0.2, + "grad_norm": 1.874547678759945, + "learning_rate": 9.290170350999217e-06, + "loss": 0.6128, + "step": 1891 + }, + { + "epoch": 0.2, + "grad_norm": 1.8915199997141332, + "learning_rate": 9.289305517098e-06, + "loss": 0.6731, + "step": 1892 + }, + { + "epoch": 0.2, + "grad_norm": 1.6748872030367143, + "learning_rate": 9.288440196979552e-06, + "loss": 0.5941, + "step": 1893 + }, + { + "epoch": 0.2, + "grad_norm": 1.7222395068337952, + "learning_rate": 9.287574390741965e-06, + "loss": 0.626, + "step": 1894 + }, + { + "epoch": 0.2, + "grad_norm": 2.162350680248037, + "learning_rate": 9.286708098483383e-06, + "loss": 0.6518, + "step": 1895 + }, + { + "epoch": 0.2, + "grad_norm": 1.8336812096087574, + "learning_rate": 9.285841320302005e-06, + "loss": 0.5188, + "step": 1896 + }, + { + "epoch": 0.2, + "grad_norm": 2.2796155535184344, + "learning_rate": 9.284974056296084e-06, + "loss": 0.6926, + "step": 1897 + }, + { + "epoch": 0.2, + "grad_norm": 1.7071784585343275, + "learning_rate": 9.284106306563929e-06, + "loss": 0.6714, + "step": 1898 + }, + { + "epoch": 0.2, + "grad_norm": 1.6506461809111215, + "learning_rate": 9.283238071203907e-06, + "loss": 0.4856, + "step": 1899 + }, + { + "epoch": 0.2, + "grad_norm": 1.8781568684820324, + "learning_rate": 9.282369350314436e-06, + "loss": 0.6134, + "step": 1900 + }, + { + "epoch": 0.2, + "grad_norm": 1.9914978185733683, + "learning_rate": 9.28150014399399e-06, + "loss": 0.669, + "step": 1901 + }, + { + "epoch": 0.2, + "grad_norm": 1.8618014748000549, + "learning_rate": 9.280630452341099e-06, + "loss": 0.7035, + "step": 1902 + }, + { + "epoch": 0.2, + "grad_norm": 2.0645327853298525, + "learning_rate": 9.279760275454348e-06, + "loss": 0.6232, + "step": 1903 + }, + { + "epoch": 0.2, + "grad_norm": 1.7147310105966709, + "learning_rate": 9.278889613432375e-06, + "loss": 0.571, + "step": 1904 + }, + { + "epoch": 0.2, + "grad_norm": 1.9961133817036107, + "learning_rate": 9.278018466373877e-06, + "loss": 0.7313, + "step": 1905 + }, + { + "epoch": 0.2, + "grad_norm": 1.7602136103564725, + "learning_rate": 9.277146834377601e-06, + "loss": 0.6825, + "step": 1906 + }, + { + "epoch": 0.2, + "grad_norm": 1.8653246475670127, + "learning_rate": 9.276274717542352e-06, + "loss": 0.5975, + "step": 1907 + }, + { + "epoch": 0.2, + "grad_norm": 1.9630113986010014, + "learning_rate": 9.275402115966991e-06, + "loss": 0.7427, + "step": 1908 + }, + { + "epoch": 0.2, + "grad_norm": 1.959845410358148, + "learning_rate": 9.27452902975043e-06, + "loss": 0.6803, + "step": 1909 + }, + { + "epoch": 0.2, + "grad_norm": 1.9040361049898615, + "learning_rate": 9.273655458991639e-06, + "loss": 0.5902, + "step": 1910 + }, + { + "epoch": 0.2, + "grad_norm": 2.0371368293310326, + "learning_rate": 9.272781403789644e-06, + "loss": 0.77, + "step": 1911 + }, + { + "epoch": 0.2, + "grad_norm": 1.8827798471230117, + "learning_rate": 9.271906864243523e-06, + "loss": 0.7044, + "step": 1912 + }, + { + "epoch": 0.2, + "grad_norm": 1.7981269024380084, + "learning_rate": 9.271031840452409e-06, + "loss": 0.6282, + "step": 1913 + }, + { + "epoch": 0.2, + "grad_norm": 1.927440464619091, + "learning_rate": 9.270156332515492e-06, + "loss": 0.6634, + "step": 1914 + }, + { + "epoch": 0.2, + "grad_norm": 1.946925561163631, + "learning_rate": 9.269280340532015e-06, + "loss": 0.6884, + "step": 1915 + }, + { + "epoch": 0.2, + "grad_norm": 1.7885164788997072, + "learning_rate": 9.268403864601279e-06, + "loss": 0.6771, + "step": 1916 + }, + { + "epoch": 0.2, + "grad_norm": 1.740210348011536, + "learning_rate": 9.267526904822634e-06, + "loss": 0.5771, + "step": 1917 + }, + { + "epoch": 0.2, + "grad_norm": 1.8515761495261118, + "learning_rate": 9.266649461295491e-06, + "loss": 0.5456, + "step": 1918 + }, + { + "epoch": 0.2, + "grad_norm": 1.7858501985347042, + "learning_rate": 9.265771534119313e-06, + "loss": 0.6791, + "step": 1919 + }, + { + "epoch": 0.2, + "grad_norm": 1.7471660234340287, + "learning_rate": 9.264893123393618e-06, + "loss": 0.6445, + "step": 1920 + }, + { + "epoch": 0.2, + "grad_norm": 2.0050209490973847, + "learning_rate": 9.264014229217978e-06, + "loss": 0.6816, + "step": 1921 + }, + { + "epoch": 0.2, + "grad_norm": 1.7946185867470288, + "learning_rate": 9.263134851692019e-06, + "loss": 0.6995, + "step": 1922 + }, + { + "epoch": 0.2, + "grad_norm": 1.9338460880907744, + "learning_rate": 9.262254990915427e-06, + "loss": 0.7106, + "step": 1923 + }, + { + "epoch": 0.2, + "grad_norm": 2.1389806917513488, + "learning_rate": 9.261374646987939e-06, + "loss": 0.73, + "step": 1924 + }, + { + "epoch": 0.2, + "grad_norm": 2.5347394068987805, + "learning_rate": 9.260493820009343e-06, + "loss": 0.7632, + "step": 1925 + }, + { + "epoch": 0.2, + "grad_norm": 1.9525928567638877, + "learning_rate": 9.259612510079492e-06, + "loss": 0.6774, + "step": 1926 + }, + { + "epoch": 0.2, + "grad_norm": 2.0893340477312625, + "learning_rate": 9.258730717298281e-06, + "loss": 0.6821, + "step": 1927 + }, + { + "epoch": 0.2, + "grad_norm": 2.0650730951225564, + "learning_rate": 9.25784844176567e-06, + "loss": 0.7423, + "step": 1928 + }, + { + "epoch": 0.2, + "grad_norm": 1.7756513259013247, + "learning_rate": 9.256965683581669e-06, + "loss": 0.6076, + "step": 1929 + }, + { + "epoch": 0.2, + "grad_norm": 1.8897211314293623, + "learning_rate": 9.256082442846347e-06, + "loss": 0.651, + "step": 1930 + }, + { + "epoch": 0.2, + "grad_norm": 1.8280216607096946, + "learning_rate": 9.255198719659818e-06, + "loss": 0.7047, + "step": 1931 + }, + { + "epoch": 0.2, + "grad_norm": 2.0086394653794373, + "learning_rate": 9.25431451412226e-06, + "loss": 0.7658, + "step": 1932 + }, + { + "epoch": 0.2, + "grad_norm": 1.9127465298594584, + "learning_rate": 9.253429826333904e-06, + "loss": 0.618, + "step": 1933 + }, + { + "epoch": 0.2, + "grad_norm": 1.826301070624459, + "learning_rate": 9.252544656395033e-06, + "loss": 0.7363, + "step": 1934 + }, + { + "epoch": 0.2, + "grad_norm": 1.9814272544927, + "learning_rate": 9.251659004405987e-06, + "loss": 0.6725, + "step": 1935 + }, + { + "epoch": 0.2, + "grad_norm": 1.9539602624207804, + "learning_rate": 9.25077287046716e-06, + "loss": 0.6789, + "step": 1936 + }, + { + "epoch": 0.2, + "grad_norm": 2.0032162419200046, + "learning_rate": 9.249886254678996e-06, + "loss": 0.6548, + "step": 1937 + }, + { + "epoch": 0.2, + "grad_norm": 1.7474671664321944, + "learning_rate": 9.248999157142006e-06, + "loss": 0.6904, + "step": 1938 + }, + { + "epoch": 0.2, + "grad_norm": 1.8925018918591514, + "learning_rate": 9.24811157795674e-06, + "loss": 0.6627, + "step": 1939 + }, + { + "epoch": 0.2, + "grad_norm": 2.102589642676861, + "learning_rate": 9.247223517223816e-06, + "loss": 0.6378, + "step": 1940 + }, + { + "epoch": 0.2, + "grad_norm": 1.9593907278097766, + "learning_rate": 9.246334975043896e-06, + "loss": 0.7016, + "step": 1941 + }, + { + "epoch": 0.2, + "grad_norm": 1.8473768840344285, + "learning_rate": 9.245445951517705e-06, + "loss": 0.6621, + "step": 1942 + }, + { + "epoch": 0.2, + "grad_norm": 2.0796609104335326, + "learning_rate": 9.244556446746014e-06, + "loss": 0.7231, + "step": 1943 + }, + { + "epoch": 0.2, + "grad_norm": 1.9370198559859941, + "learning_rate": 9.243666460829659e-06, + "loss": 0.7223, + "step": 1944 + }, + { + "epoch": 0.2, + "grad_norm": 2.053469252910239, + "learning_rate": 9.242775993869521e-06, + "loss": 0.7472, + "step": 1945 + }, + { + "epoch": 0.2, + "grad_norm": 1.9508358716738758, + "learning_rate": 9.241885045966543e-06, + "loss": 0.7247, + "step": 1946 + }, + { + "epoch": 0.2, + "grad_norm": 2.2450279545330183, + "learning_rate": 9.240993617221717e-06, + "loss": 0.7591, + "step": 1947 + }, + { + "epoch": 0.2, + "grad_norm": 1.98183928572626, + "learning_rate": 9.24010170773609e-06, + "loss": 0.5902, + "step": 1948 + }, + { + "epoch": 0.2, + "grad_norm": 1.8675456918185938, + "learning_rate": 9.239209317610766e-06, + "loss": 0.6223, + "step": 1949 + }, + { + "epoch": 0.2, + "grad_norm": 2.1191395908530715, + "learning_rate": 9.238316446946907e-06, + "loss": 0.5797, + "step": 1950 + }, + { + "epoch": 0.2, + "grad_norm": 1.875641872294237, + "learning_rate": 9.237423095845719e-06, + "loss": 0.7013, + "step": 1951 + }, + { + "epoch": 0.2, + "grad_norm": 1.7475234677089921, + "learning_rate": 9.23652926440847e-06, + "loss": 0.717, + "step": 1952 + }, + { + "epoch": 0.2, + "grad_norm": 2.167896282874592, + "learning_rate": 9.23563495273648e-06, + "loss": 0.7803, + "step": 1953 + }, + { + "epoch": 0.2, + "grad_norm": 1.7286001629951508, + "learning_rate": 9.234740160931128e-06, + "loss": 0.573, + "step": 1954 + }, + { + "epoch": 0.2, + "grad_norm": 2.266229733508544, + "learning_rate": 9.233844889093842e-06, + "loss": 0.5986, + "step": 1955 + }, + { + "epoch": 0.2, + "grad_norm": 2.032639531476447, + "learning_rate": 9.232949137326104e-06, + "loss": 0.6202, + "step": 1956 + }, + { + "epoch": 0.2, + "grad_norm": 1.8892463261859485, + "learning_rate": 9.232052905729455e-06, + "loss": 0.7258, + "step": 1957 + }, + { + "epoch": 0.2, + "grad_norm": 2.046805895646811, + "learning_rate": 9.231156194405488e-06, + "loss": 0.6311, + "step": 1958 + }, + { + "epoch": 0.2, + "grad_norm": 2.125870271699572, + "learning_rate": 9.230259003455849e-06, + "loss": 0.7484, + "step": 1959 + }, + { + "epoch": 0.2, + "grad_norm": 1.9802725052113708, + "learning_rate": 9.229361332982241e-06, + "loss": 0.7112, + "step": 1960 + }, + { + "epoch": 0.2, + "grad_norm": 1.8447348073498075, + "learning_rate": 9.228463183086417e-06, + "loss": 0.6662, + "step": 1961 + }, + { + "epoch": 0.2, + "grad_norm": 1.8752305247433687, + "learning_rate": 9.227564553870192e-06, + "loss": 0.6881, + "step": 1962 + }, + { + "epoch": 0.2, + "grad_norm": 1.8026957544784308, + "learning_rate": 9.226665445435428e-06, + "loss": 0.5558, + "step": 1963 + }, + { + "epoch": 0.2, + "grad_norm": 1.9002089055531495, + "learning_rate": 9.225765857884044e-06, + "loss": 0.6797, + "step": 1964 + }, + { + "epoch": 0.2, + "grad_norm": 1.81457632840884, + "learning_rate": 9.224865791318015e-06, + "loss": 0.6085, + "step": 1965 + }, + { + "epoch": 0.2, + "grad_norm": 1.7070620723972068, + "learning_rate": 9.223965245839367e-06, + "loss": 0.6723, + "step": 1966 + }, + { + "epoch": 0.2, + "grad_norm": 2.105875693810757, + "learning_rate": 9.223064221550183e-06, + "loss": 0.7318, + "step": 1967 + }, + { + "epoch": 0.2, + "grad_norm": 2.0605135978619633, + "learning_rate": 9.222162718552598e-06, + "loss": 0.8245, + "step": 1968 + }, + { + "epoch": 0.2, + "grad_norm": 2.238718777820951, + "learning_rate": 9.221260736948803e-06, + "loss": 0.7279, + "step": 1969 + }, + { + "epoch": 0.2, + "grad_norm": 2.106801695108455, + "learning_rate": 9.220358276841044e-06, + "loss": 0.6773, + "step": 1970 + }, + { + "epoch": 0.2, + "grad_norm": 1.665299995213328, + "learning_rate": 9.21945533833162e-06, + "loss": 0.6073, + "step": 1971 + }, + { + "epoch": 0.2, + "grad_norm": 1.7703299324370325, + "learning_rate": 9.218551921522882e-06, + "loss": 0.6295, + "step": 1972 + }, + { + "epoch": 0.21, + "grad_norm": 1.9740036444553057, + "learning_rate": 9.217648026517238e-06, + "loss": 0.7084, + "step": 1973 + }, + { + "epoch": 0.21, + "grad_norm": 1.950547982837811, + "learning_rate": 9.216743653417154e-06, + "loss": 0.6489, + "step": 1974 + }, + { + "epoch": 0.21, + "grad_norm": 1.8744414611735833, + "learning_rate": 9.215838802325139e-06, + "loss": 0.6199, + "step": 1975 + }, + { + "epoch": 0.21, + "grad_norm": 1.821711632746101, + "learning_rate": 9.214933473343765e-06, + "loss": 0.6074, + "step": 1976 + }, + { + "epoch": 0.21, + "grad_norm": 2.105716529248528, + "learning_rate": 9.21402766657566e-06, + "loss": 0.6654, + "step": 1977 + }, + { + "epoch": 0.21, + "grad_norm": 1.8437304299013715, + "learning_rate": 9.2131213821235e-06, + "loss": 0.5944, + "step": 1978 + }, + { + "epoch": 0.21, + "grad_norm": 1.9858141668801854, + "learning_rate": 9.212214620090016e-06, + "loss": 0.7049, + "step": 1979 + }, + { + "epoch": 0.21, + "grad_norm": 2.010065081274525, + "learning_rate": 9.211307380577997e-06, + "loss": 0.6696, + "step": 1980 + }, + { + "epoch": 0.21, + "grad_norm": 1.8158239557845401, + "learning_rate": 9.210399663690282e-06, + "loss": 0.5995, + "step": 1981 + }, + { + "epoch": 0.21, + "grad_norm": 1.81249302739702, + "learning_rate": 9.209491469529767e-06, + "loss": 0.5868, + "step": 1982 + }, + { + "epoch": 0.21, + "grad_norm": 1.786339404329761, + "learning_rate": 9.208582798199402e-06, + "loss": 0.5719, + "step": 1983 + }, + { + "epoch": 0.21, + "grad_norm": 1.9842702283554294, + "learning_rate": 9.207673649802187e-06, + "loss": 0.6408, + "step": 1984 + }, + { + "epoch": 0.21, + "grad_norm": 3.5838760747233813, + "learning_rate": 9.206764024441181e-06, + "loss": 0.699, + "step": 1985 + }, + { + "epoch": 0.21, + "grad_norm": 1.9062861026868705, + "learning_rate": 9.205853922219494e-06, + "loss": 0.5787, + "step": 1986 + }, + { + "epoch": 0.21, + "grad_norm": 1.7433682125475243, + "learning_rate": 9.204943343240293e-06, + "loss": 0.6315, + "step": 1987 + }, + { + "epoch": 0.21, + "grad_norm": 1.776540630355498, + "learning_rate": 9.2040322876068e-06, + "loss": 0.6349, + "step": 1988 + }, + { + "epoch": 0.21, + "grad_norm": 1.8949934734507197, + "learning_rate": 9.203120755422282e-06, + "loss": 0.621, + "step": 1989 + }, + { + "epoch": 0.21, + "grad_norm": 2.013754218571031, + "learning_rate": 9.202208746790069e-06, + "loss": 0.6994, + "step": 1990 + }, + { + "epoch": 0.21, + "grad_norm": 1.6216738548604914, + "learning_rate": 9.201296261813542e-06, + "loss": 0.5664, + "step": 1991 + }, + { + "epoch": 0.21, + "grad_norm": 1.746950760201878, + "learning_rate": 9.20038330059614e-06, + "loss": 0.591, + "step": 1992 + }, + { + "epoch": 0.21, + "grad_norm": 2.044019666809054, + "learning_rate": 9.199469863241349e-06, + "loss": 0.6995, + "step": 1993 + }, + { + "epoch": 0.21, + "grad_norm": 1.8783054256116338, + "learning_rate": 9.19855594985271e-06, + "loss": 0.6323, + "step": 1994 + }, + { + "epoch": 0.21, + "grad_norm": 1.8318861488098133, + "learning_rate": 9.197641560533826e-06, + "loss": 0.7148, + "step": 1995 + }, + { + "epoch": 0.21, + "grad_norm": 2.1884926195782684, + "learning_rate": 9.196726695388345e-06, + "loss": 0.6877, + "step": 1996 + }, + { + "epoch": 0.21, + "grad_norm": 1.805667438158848, + "learning_rate": 9.19581135451997e-06, + "loss": 0.5927, + "step": 1997 + }, + { + "epoch": 0.21, + "grad_norm": 1.6125439114431472, + "learning_rate": 9.194895538032464e-06, + "loss": 0.5711, + "step": 1998 + }, + { + "epoch": 0.21, + "grad_norm": 1.8686776317461966, + "learning_rate": 9.19397924602964e-06, + "loss": 0.6378, + "step": 1999 + }, + { + "epoch": 0.21, + "grad_norm": 1.726924867450233, + "learning_rate": 9.193062478615363e-06, + "loss": 0.6463, + "step": 2000 + }, + { + "epoch": 0.21, + "grad_norm": 2.146670051321336, + "learning_rate": 9.192145235893554e-06, + "loss": 0.6873, + "step": 2001 + }, + { + "epoch": 0.21, + "grad_norm": 1.902666731160066, + "learning_rate": 9.191227517968189e-06, + "loss": 0.6114, + "step": 2002 + }, + { + "epoch": 0.21, + "grad_norm": 2.0687284258365057, + "learning_rate": 9.190309324943294e-06, + "loss": 0.712, + "step": 2003 + }, + { + "epoch": 0.21, + "grad_norm": 1.9026767117952792, + "learning_rate": 9.189390656922955e-06, + "loss": 0.6617, + "step": 2004 + }, + { + "epoch": 0.21, + "grad_norm": 1.8653739251485049, + "learning_rate": 9.188471514011304e-06, + "loss": 0.6892, + "step": 2005 + }, + { + "epoch": 0.21, + "grad_norm": 1.954084908851961, + "learning_rate": 9.187551896312536e-06, + "loss": 0.6613, + "step": 2006 + }, + { + "epoch": 0.21, + "grad_norm": 1.855601522830974, + "learning_rate": 9.18663180393089e-06, + "loss": 0.5557, + "step": 2007 + }, + { + "epoch": 0.21, + "grad_norm": 2.0927778065386633, + "learning_rate": 9.185711236970666e-06, + "loss": 0.7094, + "step": 2008 + }, + { + "epoch": 0.21, + "grad_norm": 1.8476031345890764, + "learning_rate": 9.184790195536217e-06, + "loss": 0.6551, + "step": 2009 + }, + { + "epoch": 0.21, + "grad_norm": 1.9500926248954888, + "learning_rate": 9.183868679731947e-06, + "loss": 0.7358, + "step": 2010 + }, + { + "epoch": 0.21, + "grad_norm": 1.8204670833802736, + "learning_rate": 9.182946689662314e-06, + "loss": 0.5735, + "step": 2011 + }, + { + "epoch": 0.21, + "grad_norm": 1.8122772993948097, + "learning_rate": 9.182024225431832e-06, + "loss": 0.5838, + "step": 2012 + }, + { + "epoch": 0.21, + "grad_norm": 2.0100838581998874, + "learning_rate": 9.181101287145067e-06, + "loss": 0.7309, + "step": 2013 + }, + { + "epoch": 0.21, + "grad_norm": 2.0174504680138416, + "learning_rate": 9.180177874906641e-06, + "loss": 0.7442, + "step": 2014 + }, + { + "epoch": 0.21, + "grad_norm": 1.8246358548978066, + "learning_rate": 9.179253988821225e-06, + "loss": 0.6281, + "step": 2015 + }, + { + "epoch": 0.21, + "grad_norm": 1.894892646996983, + "learning_rate": 9.17832962899355e-06, + "loss": 0.7488, + "step": 2016 + }, + { + "epoch": 0.21, + "grad_norm": 1.9804467894690987, + "learning_rate": 9.177404795528395e-06, + "loss": 0.5651, + "step": 2017 + }, + { + "epoch": 0.21, + "grad_norm": 1.9679539263706014, + "learning_rate": 9.176479488530594e-06, + "loss": 0.7255, + "step": 2018 + }, + { + "epoch": 0.21, + "grad_norm": 1.8442427437055013, + "learning_rate": 9.17555370810504e-06, + "loss": 0.6942, + "step": 2019 + }, + { + "epoch": 0.21, + "grad_norm": 1.757489364681599, + "learning_rate": 9.174627454356673e-06, + "loss": 0.6402, + "step": 2020 + }, + { + "epoch": 0.21, + "grad_norm": 2.0898518246682896, + "learning_rate": 9.17370072739049e-06, + "loss": 0.7267, + "step": 2021 + }, + { + "epoch": 0.21, + "grad_norm": 1.620345040398183, + "learning_rate": 9.172773527311541e-06, + "loss": 0.5773, + "step": 2022 + }, + { + "epoch": 0.21, + "grad_norm": 1.8525961488620617, + "learning_rate": 9.171845854224925e-06, + "loss": 0.6378, + "step": 2023 + }, + { + "epoch": 0.21, + "grad_norm": 1.7575000021739289, + "learning_rate": 9.170917708235806e-06, + "loss": 0.7192, + "step": 2024 + }, + { + "epoch": 0.21, + "grad_norm": 1.693136988470001, + "learning_rate": 9.16998908944939e-06, + "loss": 0.6771, + "step": 2025 + }, + { + "epoch": 0.21, + "grad_norm": 1.9844607982204887, + "learning_rate": 9.169059997970943e-06, + "loss": 0.7001, + "step": 2026 + }, + { + "epoch": 0.21, + "grad_norm": 2.0172894851733534, + "learning_rate": 9.168130433905783e-06, + "loss": 0.6599, + "step": 2027 + }, + { + "epoch": 0.21, + "grad_norm": 1.721838081002367, + "learning_rate": 9.167200397359279e-06, + "loss": 0.6509, + "step": 2028 + }, + { + "epoch": 0.21, + "grad_norm": 1.6521500094494894, + "learning_rate": 9.16626988843686e-06, + "loss": 0.575, + "step": 2029 + }, + { + "epoch": 0.21, + "grad_norm": 1.8866447487668851, + "learning_rate": 9.165338907244e-06, + "loss": 0.6215, + "step": 2030 + }, + { + "epoch": 0.21, + "grad_norm": 1.8011492071948654, + "learning_rate": 9.164407453886234e-06, + "loss": 0.7151, + "step": 2031 + }, + { + "epoch": 0.21, + "grad_norm": 2.0920355496930485, + "learning_rate": 9.163475528469148e-06, + "loss": 0.6656, + "step": 2032 + }, + { + "epoch": 0.21, + "grad_norm": 2.070222708565936, + "learning_rate": 9.16254313109838e-06, + "loss": 0.6258, + "step": 2033 + }, + { + "epoch": 0.21, + "grad_norm": 2.123867145449913, + "learning_rate": 9.16161026187962e-06, + "loss": 0.66, + "step": 2034 + }, + { + "epoch": 0.21, + "grad_norm": 1.8609378519738486, + "learning_rate": 9.160676920918618e-06, + "loss": 0.6742, + "step": 2035 + }, + { + "epoch": 0.21, + "grad_norm": 1.9956472499189537, + "learning_rate": 9.159743108321173e-06, + "loss": 0.7092, + "step": 2036 + }, + { + "epoch": 0.21, + "grad_norm": 2.308642760350203, + "learning_rate": 9.158808824193135e-06, + "loss": 0.7591, + "step": 2037 + }, + { + "epoch": 0.21, + "grad_norm": 1.9751497331037764, + "learning_rate": 9.157874068640414e-06, + "loss": 0.6626, + "step": 2038 + }, + { + "epoch": 0.21, + "grad_norm": 2.172657081296383, + "learning_rate": 9.156938841768965e-06, + "loss": 0.6358, + "step": 2039 + }, + { + "epoch": 0.21, + "grad_norm": 1.7053858901126682, + "learning_rate": 9.156003143684808e-06, + "loss": 0.5645, + "step": 2040 + }, + { + "epoch": 0.21, + "grad_norm": 1.669173862779289, + "learning_rate": 9.155066974494005e-06, + "loss": 0.5918, + "step": 2041 + }, + { + "epoch": 0.21, + "grad_norm": 1.887929904534194, + "learning_rate": 9.154130334302677e-06, + "loss": 0.6318, + "step": 2042 + }, + { + "epoch": 0.21, + "grad_norm": 1.8383647204550129, + "learning_rate": 9.153193223216998e-06, + "loss": 0.5643, + "step": 2043 + }, + { + "epoch": 0.21, + "grad_norm": 1.8251294761789811, + "learning_rate": 9.152255641343196e-06, + "loss": 0.5895, + "step": 2044 + }, + { + "epoch": 0.21, + "grad_norm": 1.8955577533945258, + "learning_rate": 9.151317588787546e-06, + "loss": 0.6413, + "step": 2045 + }, + { + "epoch": 0.21, + "grad_norm": 1.6992665218554355, + "learning_rate": 9.150379065656389e-06, + "loss": 0.6495, + "step": 2046 + }, + { + "epoch": 0.21, + "grad_norm": 2.267187657771848, + "learning_rate": 9.149440072056109e-06, + "loss": 0.686, + "step": 2047 + }, + { + "epoch": 0.21, + "grad_norm": 1.8039609054222494, + "learning_rate": 9.148500608093144e-06, + "loss": 0.7414, + "step": 2048 + }, + { + "epoch": 0.21, + "grad_norm": 1.7381527742569263, + "learning_rate": 9.147560673873991e-06, + "loss": 0.6521, + "step": 2049 + }, + { + "epoch": 0.21, + "grad_norm": 1.9806331929152141, + "learning_rate": 9.146620269505194e-06, + "loss": 0.6065, + "step": 2050 + }, + { + "epoch": 0.21, + "grad_norm": 1.8251796761712276, + "learning_rate": 9.145679395093357e-06, + "loss": 0.6152, + "step": 2051 + }, + { + "epoch": 0.21, + "grad_norm": 1.9744130556131152, + "learning_rate": 9.144738050745129e-06, + "loss": 0.6398, + "step": 2052 + }, + { + "epoch": 0.21, + "grad_norm": 1.8608757900689004, + "learning_rate": 9.143796236567218e-06, + "loss": 0.6509, + "step": 2053 + }, + { + "epoch": 0.21, + "grad_norm": 2.0005118742578807, + "learning_rate": 9.142853952666385e-06, + "loss": 0.6495, + "step": 2054 + }, + { + "epoch": 0.21, + "grad_norm": 1.8723374871113159, + "learning_rate": 9.141911199149443e-06, + "loss": 0.7244, + "step": 2055 + }, + { + "epoch": 0.21, + "grad_norm": 1.9810936161123067, + "learning_rate": 9.14096797612326e-06, + "loss": 0.6901, + "step": 2056 + }, + { + "epoch": 0.21, + "grad_norm": 1.7755395689456877, + "learning_rate": 9.140024283694752e-06, + "loss": 0.6726, + "step": 2057 + }, + { + "epoch": 0.21, + "grad_norm": 2.0113646493029913, + "learning_rate": 9.139080121970895e-06, + "loss": 0.728, + "step": 2058 + }, + { + "epoch": 0.21, + "grad_norm": 1.9395275583029608, + "learning_rate": 9.138135491058715e-06, + "loss": 0.6925, + "step": 2059 + }, + { + "epoch": 0.21, + "grad_norm": 1.9086311561274867, + "learning_rate": 9.13719039106529e-06, + "loss": 0.6337, + "step": 2060 + }, + { + "epoch": 0.21, + "grad_norm": 2.0135221869102757, + "learning_rate": 9.136244822097754e-06, + "loss": 0.7061, + "step": 2061 + }, + { + "epoch": 0.21, + "grad_norm": 1.7953338697723547, + "learning_rate": 9.13529878426329e-06, + "loss": 0.6007, + "step": 2062 + }, + { + "epoch": 0.21, + "grad_norm": 1.9043752154264741, + "learning_rate": 9.134352277669139e-06, + "loss": 0.7779, + "step": 2063 + }, + { + "epoch": 0.21, + "grad_norm": 1.8572959317924327, + "learning_rate": 9.13340530242259e-06, + "loss": 0.6555, + "step": 2064 + }, + { + "epoch": 0.21, + "grad_norm": 1.901098228455696, + "learning_rate": 9.132457858630993e-06, + "loss": 0.6443, + "step": 2065 + }, + { + "epoch": 0.21, + "grad_norm": 1.8820392432706698, + "learning_rate": 9.131509946401744e-06, + "loss": 0.6921, + "step": 2066 + }, + { + "epoch": 0.21, + "grad_norm": 1.63515390188287, + "learning_rate": 9.130561565842293e-06, + "loss": 0.6493, + "step": 2067 + }, + { + "epoch": 0.21, + "grad_norm": 1.8004550658092822, + "learning_rate": 9.129612717060145e-06, + "loss": 0.6021, + "step": 2068 + }, + { + "epoch": 0.22, + "grad_norm": 1.8135533498191938, + "learning_rate": 9.128663400162859e-06, + "loss": 0.6516, + "step": 2069 + }, + { + "epoch": 0.22, + "grad_norm": 1.9883782219408912, + "learning_rate": 9.12771361525804e-06, + "loss": 0.5804, + "step": 2070 + }, + { + "epoch": 0.22, + "grad_norm": 1.9267639414496214, + "learning_rate": 9.12676336245336e-06, + "loss": 0.7172, + "step": 2071 + }, + { + "epoch": 0.22, + "grad_norm": 2.241296242570306, + "learning_rate": 9.125812641856529e-06, + "loss": 0.6466, + "step": 2072 + }, + { + "epoch": 0.22, + "grad_norm": 1.8465439639348022, + "learning_rate": 9.124861453575318e-06, + "loss": 0.5546, + "step": 2073 + }, + { + "epoch": 0.22, + "grad_norm": 1.8408819912841634, + "learning_rate": 9.123909797717551e-06, + "loss": 0.6116, + "step": 2074 + }, + { + "epoch": 0.22, + "grad_norm": 2.076263596235832, + "learning_rate": 9.122957674391103e-06, + "loss": 0.7471, + "step": 2075 + }, + { + "epoch": 0.22, + "grad_norm": 2.161931389146741, + "learning_rate": 9.122005083703901e-06, + "loss": 0.7757, + "step": 2076 + }, + { + "epoch": 0.22, + "grad_norm": 1.832263381482899, + "learning_rate": 9.12105202576393e-06, + "loss": 0.6342, + "step": 2077 + }, + { + "epoch": 0.22, + "grad_norm": 1.9116255589770395, + "learning_rate": 9.120098500679222e-06, + "loss": 0.5791, + "step": 2078 + }, + { + "epoch": 0.22, + "grad_norm": 1.7512087408271144, + "learning_rate": 9.119144508557867e-06, + "loss": 0.6596, + "step": 2079 + }, + { + "epoch": 0.22, + "grad_norm": 2.316242858851651, + "learning_rate": 9.118190049508003e-06, + "loss": 0.7156, + "step": 2080 + }, + { + "epoch": 0.22, + "grad_norm": 2.098398279289849, + "learning_rate": 9.117235123637822e-06, + "loss": 0.7091, + "step": 2081 + }, + { + "epoch": 0.22, + "grad_norm": 2.005617652197642, + "learning_rate": 9.116279731055574e-06, + "loss": 0.7734, + "step": 2082 + }, + { + "epoch": 0.22, + "grad_norm": 2.3181625277963342, + "learning_rate": 9.115323871869554e-06, + "loss": 0.7269, + "step": 2083 + }, + { + "epoch": 0.22, + "grad_norm": 2.09049499201878, + "learning_rate": 9.11436754618812e-06, + "loss": 0.6962, + "step": 2084 + }, + { + "epoch": 0.22, + "grad_norm": 1.8233048319574325, + "learning_rate": 9.113410754119671e-06, + "loss": 0.657, + "step": 2085 + }, + { + "epoch": 0.22, + "grad_norm": 1.7611399694198617, + "learning_rate": 9.112453495772668e-06, + "loss": 0.7066, + "step": 2086 + }, + { + "epoch": 0.22, + "grad_norm": 2.30757117904449, + "learning_rate": 9.111495771255623e-06, + "loss": 0.7161, + "step": 2087 + }, + { + "epoch": 0.22, + "grad_norm": 1.7222869131586123, + "learning_rate": 9.110537580677094e-06, + "loss": 0.6955, + "step": 2088 + }, + { + "epoch": 0.22, + "grad_norm": 1.777824577605693, + "learning_rate": 9.109578924145705e-06, + "loss": 0.6405, + "step": 2089 + }, + { + "epoch": 0.22, + "grad_norm": 1.867004091367024, + "learning_rate": 9.108619801770117e-06, + "loss": 0.7337, + "step": 2090 + }, + { + "epoch": 0.22, + "grad_norm": 1.8341940643841907, + "learning_rate": 9.10766021365906e-06, + "loss": 0.6397, + "step": 2091 + }, + { + "epoch": 0.22, + "grad_norm": 1.7108974210893473, + "learning_rate": 9.106700159921301e-06, + "loss": 0.6176, + "step": 2092 + }, + { + "epoch": 0.22, + "grad_norm": 1.763586609221222, + "learning_rate": 9.105739640665675e-06, + "loss": 0.7577, + "step": 2093 + }, + { + "epoch": 0.22, + "grad_norm": 1.8179769534117416, + "learning_rate": 9.104778656001057e-06, + "loss": 0.6602, + "step": 2094 + }, + { + "epoch": 0.22, + "grad_norm": 2.0710656099544944, + "learning_rate": 9.103817206036383e-06, + "loss": 0.6258, + "step": 2095 + }, + { + "epoch": 0.22, + "grad_norm": 1.9636515939076291, + "learning_rate": 9.102855290880637e-06, + "loss": 0.6908, + "step": 2096 + }, + { + "epoch": 0.22, + "grad_norm": 1.7697645394492914, + "learning_rate": 9.101892910642858e-06, + "loss": 0.5772, + "step": 2097 + }, + { + "epoch": 0.22, + "grad_norm": 1.8229562492894091, + "learning_rate": 9.100930065432136e-06, + "loss": 0.6319, + "step": 2098 + }, + { + "epoch": 0.22, + "grad_norm": 1.9665961596361246, + "learning_rate": 9.09996675535762e-06, + "loss": 0.6321, + "step": 2099 + }, + { + "epoch": 0.22, + "grad_norm": 1.9153659580729374, + "learning_rate": 9.099002980528502e-06, + "loss": 0.7386, + "step": 2100 + }, + { + "epoch": 0.22, + "grad_norm": 1.8929607008969653, + "learning_rate": 9.098038741054032e-06, + "loss": 0.6038, + "step": 2101 + }, + { + "epoch": 0.22, + "grad_norm": 2.122236111686674, + "learning_rate": 9.097074037043512e-06, + "loss": 0.7902, + "step": 2102 + }, + { + "epoch": 0.22, + "grad_norm": 1.8063786343678705, + "learning_rate": 9.0961088686063e-06, + "loss": 0.6154, + "step": 2103 + }, + { + "epoch": 0.22, + "grad_norm": 1.8859714107413783, + "learning_rate": 9.095143235851797e-06, + "loss": 0.625, + "step": 2104 + }, + { + "epoch": 0.22, + "grad_norm": 1.8899963332781613, + "learning_rate": 9.094177138889468e-06, + "loss": 0.6285, + "step": 2105 + }, + { + "epoch": 0.22, + "grad_norm": 1.823065246187759, + "learning_rate": 9.093210577828826e-06, + "loss": 0.5222, + "step": 2106 + }, + { + "epoch": 0.22, + "grad_norm": 1.8409605037443102, + "learning_rate": 9.092243552779434e-06, + "loss": 0.545, + "step": 2107 + }, + { + "epoch": 0.22, + "grad_norm": 1.7950723144319713, + "learning_rate": 9.091276063850909e-06, + "loss": 0.7386, + "step": 2108 + }, + { + "epoch": 0.22, + "grad_norm": 1.8619428941975487, + "learning_rate": 9.090308111152924e-06, + "loss": 0.6613, + "step": 2109 + }, + { + "epoch": 0.22, + "grad_norm": 1.8760132631684212, + "learning_rate": 9.0893396947952e-06, + "loss": 0.6567, + "step": 2110 + }, + { + "epoch": 0.22, + "grad_norm": 2.5393612059524457, + "learning_rate": 9.088370814887512e-06, + "loss": 0.6192, + "step": 2111 + }, + { + "epoch": 0.22, + "grad_norm": 2.1997633108009746, + "learning_rate": 9.08740147153969e-06, + "loss": 0.7291, + "step": 2112 + }, + { + "epoch": 0.22, + "grad_norm": 2.0003762945667263, + "learning_rate": 9.086431664861615e-06, + "loss": 0.6385, + "step": 2113 + }, + { + "epoch": 0.22, + "grad_norm": 1.9280568210847042, + "learning_rate": 9.085461394963218e-06, + "loss": 0.7926, + "step": 2114 + }, + { + "epoch": 0.22, + "grad_norm": 2.0839385743826337, + "learning_rate": 9.084490661954487e-06, + "loss": 0.7176, + "step": 2115 + }, + { + "epoch": 0.22, + "grad_norm": 1.751926913173965, + "learning_rate": 9.083519465945456e-06, + "loss": 0.6207, + "step": 2116 + }, + { + "epoch": 0.22, + "grad_norm": 1.6790647212307659, + "learning_rate": 9.082547807046218e-06, + "loss": 0.5995, + "step": 2117 + }, + { + "epoch": 0.22, + "grad_norm": 1.9768132587469367, + "learning_rate": 9.081575685366919e-06, + "loss": 0.6638, + "step": 2118 + }, + { + "epoch": 0.22, + "grad_norm": 1.873044803611899, + "learning_rate": 9.080603101017751e-06, + "loss": 0.6287, + "step": 2119 + }, + { + "epoch": 0.22, + "grad_norm": 1.9026675442854966, + "learning_rate": 9.079630054108962e-06, + "loss": 0.7277, + "step": 2120 + }, + { + "epoch": 0.22, + "grad_norm": 1.786052227401782, + "learning_rate": 9.078656544750854e-06, + "loss": 0.5283, + "step": 2121 + }, + { + "epoch": 0.22, + "grad_norm": 1.835617801364485, + "learning_rate": 9.07768257305378e-06, + "loss": 0.6734, + "step": 2122 + }, + { + "epoch": 0.22, + "grad_norm": 1.9059077413421042, + "learning_rate": 9.07670813912814e-06, + "loss": 0.5662, + "step": 2123 + }, + { + "epoch": 0.22, + "grad_norm": 1.8430365616658317, + "learning_rate": 9.0757332430844e-06, + "loss": 0.6786, + "step": 2124 + }, + { + "epoch": 0.22, + "grad_norm": 2.0912152253587233, + "learning_rate": 9.074757885033065e-06, + "loss": 0.6699, + "step": 2125 + }, + { + "epoch": 0.22, + "grad_norm": 1.8326045285025117, + "learning_rate": 9.073782065084699e-06, + "loss": 0.6516, + "step": 2126 + }, + { + "epoch": 0.22, + "grad_norm": 2.1141153087285747, + "learning_rate": 9.072805783349916e-06, + "loss": 0.6504, + "step": 2127 + }, + { + "epoch": 0.22, + "grad_norm": 1.7665606289047024, + "learning_rate": 9.071829039939382e-06, + "loss": 0.5843, + "step": 2128 + }, + { + "epoch": 0.22, + "grad_norm": 1.965086242675083, + "learning_rate": 9.070851834963818e-06, + "loss": 0.606, + "step": 2129 + }, + { + "epoch": 0.22, + "grad_norm": 1.8212064883638712, + "learning_rate": 9.069874168533996e-06, + "loss": 0.5638, + "step": 2130 + }, + { + "epoch": 0.22, + "grad_norm": 1.914061308706072, + "learning_rate": 9.06889604076074e-06, + "loss": 0.7702, + "step": 2131 + }, + { + "epoch": 0.22, + "grad_norm": 1.9281649798903766, + "learning_rate": 9.067917451754926e-06, + "loss": 0.6979, + "step": 2132 + }, + { + "epoch": 0.22, + "grad_norm": 1.9724035219322607, + "learning_rate": 9.06693840162748e-06, + "loss": 0.6326, + "step": 2133 + }, + { + "epoch": 0.22, + "grad_norm": 1.926538052376226, + "learning_rate": 9.065958890489388e-06, + "loss": 0.6708, + "step": 2134 + }, + { + "epoch": 0.22, + "grad_norm": 1.8135660948677115, + "learning_rate": 9.06497891845168e-06, + "loss": 0.6908, + "step": 2135 + }, + { + "epoch": 0.22, + "grad_norm": 2.0359781123918546, + "learning_rate": 9.063998485625442e-06, + "loss": 0.8177, + "step": 2136 + }, + { + "epoch": 0.22, + "grad_norm": 1.8070490443578695, + "learning_rate": 9.063017592121812e-06, + "loss": 0.6308, + "step": 2137 + }, + { + "epoch": 0.22, + "grad_norm": 2.1534040200827858, + "learning_rate": 9.062036238051978e-06, + "loss": 0.7568, + "step": 2138 + }, + { + "epoch": 0.22, + "grad_norm": 1.9932070854185635, + "learning_rate": 9.061054423527185e-06, + "loss": 0.6454, + "step": 2139 + }, + { + "epoch": 0.22, + "grad_norm": 2.06303057621518, + "learning_rate": 9.060072148658726e-06, + "loss": 0.7897, + "step": 2140 + }, + { + "epoch": 0.22, + "grad_norm": 1.8541768464221198, + "learning_rate": 9.059089413557946e-06, + "loss": 0.7146, + "step": 2141 + }, + { + "epoch": 0.22, + "grad_norm": 2.131577767008987, + "learning_rate": 9.058106218336244e-06, + "loss": 0.64, + "step": 2142 + }, + { + "epoch": 0.22, + "grad_norm": 1.957951302391662, + "learning_rate": 9.057122563105074e-06, + "loss": 0.6772, + "step": 2143 + }, + { + "epoch": 0.22, + "grad_norm": 1.8796034764382545, + "learning_rate": 9.056138447975936e-06, + "loss": 0.5828, + "step": 2144 + }, + { + "epoch": 0.22, + "grad_norm": 1.9984362920448873, + "learning_rate": 9.055153873060387e-06, + "loss": 0.6582, + "step": 2145 + }, + { + "epoch": 0.22, + "grad_norm": 1.818826471290181, + "learning_rate": 9.05416883847003e-06, + "loss": 0.6103, + "step": 2146 + }, + { + "epoch": 0.22, + "grad_norm": 1.9010961716566377, + "learning_rate": 9.05318334431653e-06, + "loss": 0.6502, + "step": 2147 + }, + { + "epoch": 0.22, + "grad_norm": 3.6452104619968555, + "learning_rate": 9.052197390711594e-06, + "loss": 0.6073, + "step": 2148 + }, + { + "epoch": 0.22, + "grad_norm": 2.0593507671841054, + "learning_rate": 9.051210977766987e-06, + "loss": 0.7939, + "step": 2149 + }, + { + "epoch": 0.22, + "grad_norm": 2.8947339812978212, + "learning_rate": 9.050224105594525e-06, + "loss": 0.7403, + "step": 2150 + }, + { + "epoch": 0.22, + "grad_norm": 1.7701582298112364, + "learning_rate": 9.049236774306073e-06, + "loss": 0.7139, + "step": 2151 + }, + { + "epoch": 0.22, + "grad_norm": 1.769772295770342, + "learning_rate": 9.048248984013557e-06, + "loss": 0.6318, + "step": 2152 + }, + { + "epoch": 0.22, + "grad_norm": 1.8668708776556142, + "learning_rate": 9.04726073482894e-06, + "loss": 0.6786, + "step": 2153 + }, + { + "epoch": 0.22, + "grad_norm": 2.8599052090254813, + "learning_rate": 9.046272026864253e-06, + "loss": 0.6792, + "step": 2154 + }, + { + "epoch": 0.22, + "grad_norm": 2.306307988225551, + "learning_rate": 9.045282860231567e-06, + "loss": 0.8036, + "step": 2155 + }, + { + "epoch": 0.22, + "grad_norm": 2.0248004612699835, + "learning_rate": 9.044293235043013e-06, + "loss": 0.5483, + "step": 2156 + }, + { + "epoch": 0.22, + "grad_norm": 2.0040416841239312, + "learning_rate": 9.04330315141077e-06, + "loss": 0.5838, + "step": 2157 + }, + { + "epoch": 0.22, + "grad_norm": 1.8934644260258988, + "learning_rate": 9.042312609447066e-06, + "loss": 0.7741, + "step": 2158 + }, + { + "epoch": 0.22, + "grad_norm": 1.8369307032598101, + "learning_rate": 9.041321609264189e-06, + "loss": 0.6881, + "step": 2159 + }, + { + "epoch": 0.22, + "grad_norm": 1.8734585208709953, + "learning_rate": 9.040330150974472e-06, + "loss": 0.7302, + "step": 2160 + }, + { + "epoch": 0.22, + "grad_norm": 2.274845630281377, + "learning_rate": 9.039338234690304e-06, + "loss": 0.7115, + "step": 2161 + }, + { + "epoch": 0.22, + "grad_norm": 1.9120910239340116, + "learning_rate": 9.038345860524123e-06, + "loss": 0.6324, + "step": 2162 + }, + { + "epoch": 0.22, + "grad_norm": 1.9268141093838203, + "learning_rate": 9.037353028588421e-06, + "loss": 0.6863, + "step": 2163 + }, + { + "epoch": 0.22, + "grad_norm": 1.7626090493830637, + "learning_rate": 9.036359738995741e-06, + "loss": 0.7137, + "step": 2164 + }, + { + "epoch": 0.23, + "grad_norm": 1.9785455080305587, + "learning_rate": 9.035365991858679e-06, + "loss": 0.6363, + "step": 2165 + }, + { + "epoch": 0.23, + "grad_norm": 1.7537820647478297, + "learning_rate": 9.034371787289879e-06, + "loss": 0.6999, + "step": 2166 + }, + { + "epoch": 0.23, + "grad_norm": 1.8144705683970634, + "learning_rate": 9.033377125402045e-06, + "loss": 0.6111, + "step": 2167 + }, + { + "epoch": 0.23, + "grad_norm": 2.0784711520480683, + "learning_rate": 9.032382006307923e-06, + "loss": 0.7317, + "step": 2168 + }, + { + "epoch": 0.23, + "grad_norm": 2.055321365353015, + "learning_rate": 9.031386430120315e-06, + "loss": 0.6697, + "step": 2169 + }, + { + "epoch": 0.23, + "grad_norm": 1.9973752694136535, + "learning_rate": 9.030390396952077e-06, + "loss": 0.6653, + "step": 2170 + }, + { + "epoch": 0.23, + "grad_norm": 2.1908786377684004, + "learning_rate": 9.029393906916118e-06, + "loss": 0.7126, + "step": 2171 + }, + { + "epoch": 0.23, + "grad_norm": 2.2196682641106045, + "learning_rate": 9.028396960125392e-06, + "loss": 0.6975, + "step": 2172 + }, + { + "epoch": 0.23, + "grad_norm": 2.153676140622691, + "learning_rate": 9.02739955669291e-06, + "loss": 0.7135, + "step": 2173 + }, + { + "epoch": 0.23, + "grad_norm": 1.7443698256344395, + "learning_rate": 9.02640169673173e-06, + "loss": 0.6252, + "step": 2174 + }, + { + "epoch": 0.23, + "grad_norm": 1.8900431422297443, + "learning_rate": 9.02540338035497e-06, + "loss": 0.733, + "step": 2175 + }, + { + "epoch": 0.23, + "grad_norm": 1.7568087601555695, + "learning_rate": 9.024404607675792e-06, + "loss": 0.673, + "step": 2176 + }, + { + "epoch": 0.23, + "grad_norm": 1.898316533786659, + "learning_rate": 9.023405378807413e-06, + "loss": 0.6297, + "step": 2177 + }, + { + "epoch": 0.23, + "grad_norm": 1.977725542941323, + "learning_rate": 9.022405693863102e-06, + "loss": 0.7268, + "step": 2178 + }, + { + "epoch": 0.23, + "grad_norm": 2.1481357518729243, + "learning_rate": 9.02140555295618e-06, + "loss": 0.6296, + "step": 2179 + }, + { + "epoch": 0.23, + "grad_norm": 1.9978345929549044, + "learning_rate": 9.020404956200016e-06, + "loss": 0.5896, + "step": 2180 + }, + { + "epoch": 0.23, + "grad_norm": 1.84587899229959, + "learning_rate": 9.019403903708036e-06, + "loss": 0.6713, + "step": 2181 + }, + { + "epoch": 0.23, + "grad_norm": 1.8092614755951855, + "learning_rate": 9.018402395593711e-06, + "loss": 0.6312, + "step": 2182 + }, + { + "epoch": 0.23, + "grad_norm": 2.0435447786587178, + "learning_rate": 9.017400431970572e-06, + "loss": 0.6963, + "step": 2183 + }, + { + "epoch": 0.23, + "grad_norm": 1.7252658369683462, + "learning_rate": 9.016398012952196e-06, + "loss": 0.5749, + "step": 2184 + }, + { + "epoch": 0.23, + "grad_norm": 1.7329348782754614, + "learning_rate": 9.015395138652212e-06, + "loss": 0.6343, + "step": 2185 + }, + { + "epoch": 0.23, + "grad_norm": 2.0542689022507004, + "learning_rate": 9.014391809184302e-06, + "loss": 0.754, + "step": 2186 + }, + { + "epoch": 0.23, + "grad_norm": 2.0538719175252464, + "learning_rate": 9.013388024662199e-06, + "loss": 0.7492, + "step": 2187 + }, + { + "epoch": 0.23, + "grad_norm": 2.194772716927874, + "learning_rate": 9.012383785199688e-06, + "loss": 0.6571, + "step": 2188 + }, + { + "epoch": 0.23, + "grad_norm": 1.7314238687560968, + "learning_rate": 9.011379090910605e-06, + "loss": 0.5858, + "step": 2189 + }, + { + "epoch": 0.23, + "grad_norm": 1.8309501359614229, + "learning_rate": 9.010373941908839e-06, + "loss": 0.561, + "step": 2190 + }, + { + "epoch": 0.23, + "grad_norm": 1.9401424308724535, + "learning_rate": 9.009368338308328e-06, + "loss": 0.6818, + "step": 2191 + }, + { + "epoch": 0.23, + "grad_norm": 2.0913313231859534, + "learning_rate": 9.008362280223062e-06, + "loss": 0.7318, + "step": 2192 + }, + { + "epoch": 0.23, + "grad_norm": 2.01779899943987, + "learning_rate": 9.007355767767085e-06, + "loss": 0.6818, + "step": 2193 + }, + { + "epoch": 0.23, + "grad_norm": 1.8462281597980958, + "learning_rate": 9.006348801054491e-06, + "loss": 0.6496, + "step": 2194 + }, + { + "epoch": 0.23, + "grad_norm": 1.9187146206986834, + "learning_rate": 9.005341380199426e-06, + "loss": 0.6457, + "step": 2195 + }, + { + "epoch": 0.23, + "grad_norm": 1.9328520519266845, + "learning_rate": 9.004333505316085e-06, + "loss": 0.7879, + "step": 2196 + }, + { + "epoch": 0.23, + "grad_norm": 1.708215185089118, + "learning_rate": 9.003325176518718e-06, + "loss": 0.5946, + "step": 2197 + }, + { + "epoch": 0.23, + "grad_norm": 1.9026124558937254, + "learning_rate": 9.002316393921623e-06, + "loss": 0.7187, + "step": 2198 + }, + { + "epoch": 0.23, + "grad_norm": 2.002549571319192, + "learning_rate": 9.001307157639153e-06, + "loss": 0.711, + "step": 2199 + }, + { + "epoch": 0.23, + "grad_norm": 1.7359598802500287, + "learning_rate": 9.000297467785708e-06, + "loss": 0.7036, + "step": 2200 + }, + { + "epoch": 0.23, + "grad_norm": 1.9435004256949489, + "learning_rate": 8.999287324475745e-06, + "loss": 0.6485, + "step": 2201 + }, + { + "epoch": 0.23, + "grad_norm": 1.9218778678631865, + "learning_rate": 8.998276727823769e-06, + "loss": 0.6781, + "step": 2202 + }, + { + "epoch": 0.23, + "grad_norm": 1.8933013319011063, + "learning_rate": 8.997265677944336e-06, + "loss": 0.6569, + "step": 2203 + }, + { + "epoch": 0.23, + "grad_norm": 1.9531255465327648, + "learning_rate": 8.996254174952056e-06, + "loss": 0.7081, + "step": 2204 + }, + { + "epoch": 0.23, + "grad_norm": 2.0631797487414527, + "learning_rate": 8.995242218961586e-06, + "loss": 0.6542, + "step": 2205 + }, + { + "epoch": 0.23, + "grad_norm": 2.0026963960183264, + "learning_rate": 8.99422981008764e-06, + "loss": 0.6282, + "step": 2206 + }, + { + "epoch": 0.23, + "grad_norm": 1.9931938931672886, + "learning_rate": 8.993216948444978e-06, + "loss": 0.6685, + "step": 2207 + }, + { + "epoch": 0.23, + "grad_norm": 2.03677992763092, + "learning_rate": 8.992203634148412e-06, + "loss": 0.5603, + "step": 2208 + }, + { + "epoch": 0.23, + "grad_norm": 1.9775804447589407, + "learning_rate": 8.991189867312813e-06, + "loss": 0.7024, + "step": 2209 + }, + { + "epoch": 0.23, + "grad_norm": 1.9936308622181156, + "learning_rate": 8.990175648053093e-06, + "loss": 0.7432, + "step": 2210 + }, + { + "epoch": 0.23, + "grad_norm": 1.7983836508704376, + "learning_rate": 8.989160976484218e-06, + "loss": 0.6267, + "step": 2211 + }, + { + "epoch": 0.23, + "grad_norm": 1.7341209015462455, + "learning_rate": 8.98814585272121e-06, + "loss": 0.6546, + "step": 2212 + }, + { + "epoch": 0.23, + "grad_norm": 1.7535109689213118, + "learning_rate": 8.987130276879137e-06, + "loss": 0.6133, + "step": 2213 + }, + { + "epoch": 0.23, + "grad_norm": 1.926070930858834, + "learning_rate": 8.986114249073122e-06, + "loss": 0.7027, + "step": 2214 + }, + { + "epoch": 0.23, + "grad_norm": 1.9392805940904663, + "learning_rate": 8.985097769418337e-06, + "loss": 0.6536, + "step": 2215 + }, + { + "epoch": 0.23, + "grad_norm": 2.0611644129881763, + "learning_rate": 8.984080838030005e-06, + "loss": 0.8039, + "step": 2216 + }, + { + "epoch": 0.23, + "grad_norm": 2.142950425462867, + "learning_rate": 8.983063455023402e-06, + "loss": 0.7716, + "step": 2217 + }, + { + "epoch": 0.23, + "grad_norm": 2.054058905566524, + "learning_rate": 8.982045620513855e-06, + "loss": 0.7707, + "step": 2218 + }, + { + "epoch": 0.23, + "grad_norm": 1.889097548556331, + "learning_rate": 8.981027334616737e-06, + "loss": 0.5825, + "step": 2219 + }, + { + "epoch": 0.23, + "grad_norm": 1.8795725986969594, + "learning_rate": 8.98000859744748e-06, + "loss": 0.6551, + "step": 2220 + }, + { + "epoch": 0.23, + "grad_norm": 1.882519672163551, + "learning_rate": 8.978989409121565e-06, + "loss": 0.682, + "step": 2221 + }, + { + "epoch": 0.23, + "grad_norm": 2.0537589705875603, + "learning_rate": 8.97796976975452e-06, + "loss": 0.7304, + "step": 2222 + }, + { + "epoch": 0.23, + "grad_norm": 2.0767498129779574, + "learning_rate": 8.976949679461928e-06, + "loss": 0.7041, + "step": 2223 + }, + { + "epoch": 0.23, + "grad_norm": 2.110118698229804, + "learning_rate": 8.975929138359423e-06, + "loss": 0.6485, + "step": 2224 + }, + { + "epoch": 0.23, + "grad_norm": 1.9109900497230985, + "learning_rate": 8.974908146562686e-06, + "loss": 0.7517, + "step": 2225 + }, + { + "epoch": 0.23, + "grad_norm": 2.015281369277223, + "learning_rate": 8.973886704187457e-06, + "loss": 0.6787, + "step": 2226 + }, + { + "epoch": 0.23, + "grad_norm": 1.8262978970480288, + "learning_rate": 8.972864811349518e-06, + "loss": 0.6435, + "step": 2227 + }, + { + "epoch": 0.23, + "grad_norm": 2.210288335596177, + "learning_rate": 8.97184246816471e-06, + "loss": 0.7442, + "step": 2228 + }, + { + "epoch": 0.23, + "grad_norm": 1.8288254991676658, + "learning_rate": 8.970819674748917e-06, + "loss": 0.6972, + "step": 2229 + }, + { + "epoch": 0.23, + "grad_norm": 1.7051979804580626, + "learning_rate": 8.969796431218081e-06, + "loss": 0.5441, + "step": 2230 + }, + { + "epoch": 0.23, + "grad_norm": 1.9169044631050145, + "learning_rate": 8.968772737688193e-06, + "loss": 0.6199, + "step": 2231 + }, + { + "epoch": 0.23, + "grad_norm": 2.0655732597769925, + "learning_rate": 8.967748594275294e-06, + "loss": 0.766, + "step": 2232 + }, + { + "epoch": 0.23, + "grad_norm": 1.978801046277295, + "learning_rate": 8.966724001095477e-06, + "loss": 0.6957, + "step": 2233 + }, + { + "epoch": 0.23, + "grad_norm": 1.9521332143429797, + "learning_rate": 8.965698958264883e-06, + "loss": 0.6472, + "step": 2234 + }, + { + "epoch": 0.23, + "grad_norm": 1.83147289535707, + "learning_rate": 8.96467346589971e-06, + "loss": 0.6152, + "step": 2235 + }, + { + "epoch": 0.23, + "grad_norm": 1.9956163284753403, + "learning_rate": 8.963647524116202e-06, + "loss": 0.6321, + "step": 2236 + }, + { + "epoch": 0.23, + "grad_norm": 2.133422804155597, + "learning_rate": 8.962621133030655e-06, + "loss": 0.6059, + "step": 2237 + }, + { + "epoch": 0.23, + "grad_norm": 1.9380345874646834, + "learning_rate": 8.961594292759416e-06, + "loss": 0.789, + "step": 2238 + }, + { + "epoch": 0.23, + "grad_norm": 1.8865426268162737, + "learning_rate": 8.960567003418882e-06, + "loss": 0.6757, + "step": 2239 + }, + { + "epoch": 0.23, + "grad_norm": 1.973482306954926, + "learning_rate": 8.959539265125507e-06, + "loss": 0.6109, + "step": 2240 + }, + { + "epoch": 0.23, + "grad_norm": 1.8719939069545541, + "learning_rate": 8.958511077995786e-06, + "loss": 0.6642, + "step": 2241 + }, + { + "epoch": 0.23, + "grad_norm": 2.027942589354299, + "learning_rate": 8.957482442146271e-06, + "loss": 0.6152, + "step": 2242 + }, + { + "epoch": 0.23, + "grad_norm": 1.9377797775087606, + "learning_rate": 8.956453357693565e-06, + "loss": 0.6306, + "step": 2243 + }, + { + "epoch": 0.23, + "grad_norm": 2.1397437040204927, + "learning_rate": 8.955423824754319e-06, + "loss": 0.6442, + "step": 2244 + }, + { + "epoch": 0.23, + "grad_norm": 1.850879857064714, + "learning_rate": 8.954393843445239e-06, + "loss": 0.7046, + "step": 2245 + }, + { + "epoch": 0.23, + "grad_norm": 1.9391040464860019, + "learning_rate": 8.953363413883077e-06, + "loss": 0.633, + "step": 2246 + }, + { + "epoch": 0.23, + "grad_norm": 1.7458294438697912, + "learning_rate": 8.952332536184639e-06, + "loss": 0.6093, + "step": 2247 + }, + { + "epoch": 0.23, + "grad_norm": 1.7462764140457245, + "learning_rate": 8.951301210466779e-06, + "loss": 0.6411, + "step": 2248 + }, + { + "epoch": 0.23, + "grad_norm": 2.0005982387947276, + "learning_rate": 8.950269436846405e-06, + "loss": 0.6312, + "step": 2249 + }, + { + "epoch": 0.23, + "grad_norm": 1.9155274879580744, + "learning_rate": 8.949237215440476e-06, + "loss": 0.6798, + "step": 2250 + }, + { + "epoch": 0.23, + "grad_norm": 1.855490598220143, + "learning_rate": 8.948204546365996e-06, + "loss": 0.648, + "step": 2251 + }, + { + "epoch": 0.23, + "grad_norm": 2.189347602052192, + "learning_rate": 8.94717142974003e-06, + "loss": 0.629, + "step": 2252 + }, + { + "epoch": 0.23, + "grad_norm": 1.6403712428379003, + "learning_rate": 8.946137865679683e-06, + "loss": 0.5136, + "step": 2253 + }, + { + "epoch": 0.23, + "grad_norm": 1.9275044475114163, + "learning_rate": 8.945103854302118e-06, + "loss": 0.6351, + "step": 2254 + }, + { + "epoch": 0.23, + "grad_norm": 1.813897721890047, + "learning_rate": 8.944069395724541e-06, + "loss": 0.7123, + "step": 2255 + }, + { + "epoch": 0.23, + "grad_norm": 2.0580374034292044, + "learning_rate": 8.943034490064222e-06, + "loss": 0.6749, + "step": 2256 + }, + { + "epoch": 0.23, + "grad_norm": 2.0017399104314246, + "learning_rate": 8.941999137438466e-06, + "loss": 0.695, + "step": 2257 + }, + { + "epoch": 0.23, + "grad_norm": 1.9378953715759386, + "learning_rate": 8.940963337964642e-06, + "loss": 0.6528, + "step": 2258 + }, + { + "epoch": 0.23, + "grad_norm": 2.041326591727269, + "learning_rate": 8.93992709176016e-06, + "loss": 0.7131, + "step": 2259 + }, + { + "epoch": 0.23, + "grad_norm": 1.9285147868178183, + "learning_rate": 8.938890398942482e-06, + "loss": 0.7492, + "step": 2260 + }, + { + "epoch": 0.24, + "grad_norm": 1.8238045339749287, + "learning_rate": 8.93785325962913e-06, + "loss": 0.6304, + "step": 2261 + }, + { + "epoch": 0.24, + "grad_norm": 2.0053183644082613, + "learning_rate": 8.936815673937665e-06, + "loss": 0.6946, + "step": 2262 + }, + { + "epoch": 0.24, + "grad_norm": 2.056091891002725, + "learning_rate": 8.935777641985704e-06, + "loss": 0.7323, + "step": 2263 + }, + { + "epoch": 0.24, + "grad_norm": 1.8523458813916833, + "learning_rate": 8.934739163890914e-06, + "loss": 0.6583, + "step": 2264 + }, + { + "epoch": 0.24, + "grad_norm": 1.9742354499979657, + "learning_rate": 8.933700239771013e-06, + "loss": 0.6349, + "step": 2265 + }, + { + "epoch": 0.24, + "grad_norm": 1.9663161560770108, + "learning_rate": 8.932660869743766e-06, + "loss": 0.6875, + "step": 2266 + }, + { + "epoch": 0.24, + "grad_norm": 1.7729449620047906, + "learning_rate": 8.931621053926998e-06, + "loss": 0.5993, + "step": 2267 + }, + { + "epoch": 0.24, + "grad_norm": 1.776396253549101, + "learning_rate": 8.930580792438571e-06, + "loss": 0.6088, + "step": 2268 + }, + { + "epoch": 0.24, + "grad_norm": 1.643385454952664, + "learning_rate": 8.929540085396409e-06, + "loss": 0.6575, + "step": 2269 + }, + { + "epoch": 0.24, + "grad_norm": 1.946490262609231, + "learning_rate": 8.92849893291848e-06, + "loss": 0.6845, + "step": 2270 + }, + { + "epoch": 0.24, + "grad_norm": 1.9913037797693673, + "learning_rate": 8.927457335122807e-06, + "loss": 0.6021, + "step": 2271 + }, + { + "epoch": 0.24, + "grad_norm": 1.9935906721529877, + "learning_rate": 8.926415292127458e-06, + "loss": 0.6359, + "step": 2272 + }, + { + "epoch": 0.24, + "grad_norm": 1.9874498733381067, + "learning_rate": 8.925372804050554e-06, + "loss": 0.8141, + "step": 2273 + }, + { + "epoch": 0.24, + "grad_norm": 1.8641504075702195, + "learning_rate": 8.924329871010271e-06, + "loss": 0.5803, + "step": 2274 + }, + { + "epoch": 0.24, + "grad_norm": 2.0248361904003906, + "learning_rate": 8.92328649312483e-06, + "loss": 0.7611, + "step": 2275 + }, + { + "epoch": 0.24, + "grad_norm": 1.8881478352562464, + "learning_rate": 8.922242670512501e-06, + "loss": 0.7352, + "step": 2276 + }, + { + "epoch": 0.24, + "grad_norm": 1.643021574441532, + "learning_rate": 8.92119840329161e-06, + "loss": 0.5815, + "step": 2277 + }, + { + "epoch": 0.24, + "grad_norm": 1.8384891810153987, + "learning_rate": 8.92015369158053e-06, + "loss": 0.6841, + "step": 2278 + }, + { + "epoch": 0.24, + "grad_norm": 1.8084240539765695, + "learning_rate": 8.919108535497684e-06, + "loss": 0.6603, + "step": 2279 + }, + { + "epoch": 0.24, + "grad_norm": 11.78225246310456, + "learning_rate": 8.91806293516155e-06, + "loss": 0.6892, + "step": 2280 + }, + { + "epoch": 0.24, + "grad_norm": 2.0298859406349905, + "learning_rate": 8.917016890690648e-06, + "loss": 0.7486, + "step": 2281 + }, + { + "epoch": 0.24, + "grad_norm": 2.0279639350940566, + "learning_rate": 8.915970402203555e-06, + "loss": 0.6696, + "step": 2282 + }, + { + "epoch": 0.24, + "grad_norm": 1.994856285908655, + "learning_rate": 8.914923469818897e-06, + "loss": 0.6637, + "step": 2283 + }, + { + "epoch": 0.24, + "grad_norm": 1.8271612294370347, + "learning_rate": 8.913876093655351e-06, + "loss": 0.6463, + "step": 2284 + }, + { + "epoch": 0.24, + "grad_norm": 1.784844135622662, + "learning_rate": 8.912828273831639e-06, + "loss": 0.6222, + "step": 2285 + }, + { + "epoch": 0.24, + "grad_norm": 1.7882797824424788, + "learning_rate": 8.911780010466542e-06, + "loss": 0.745, + "step": 2286 + }, + { + "epoch": 0.24, + "grad_norm": 2.087910231596911, + "learning_rate": 8.910731303678881e-06, + "loss": 0.6256, + "step": 2287 + }, + { + "epoch": 0.24, + "grad_norm": 1.9216932425265978, + "learning_rate": 8.90968215358754e-06, + "loss": 0.6616, + "step": 2288 + }, + { + "epoch": 0.24, + "grad_norm": 1.9486809117012627, + "learning_rate": 8.908632560311441e-06, + "loss": 0.6927, + "step": 2289 + }, + { + "epoch": 0.24, + "grad_norm": 1.8170629926411348, + "learning_rate": 8.907582523969562e-06, + "loss": 0.679, + "step": 2290 + }, + { + "epoch": 0.24, + "grad_norm": 1.761269885787969, + "learning_rate": 8.906532044680933e-06, + "loss": 0.6878, + "step": 2291 + }, + { + "epoch": 0.24, + "grad_norm": 1.8632205027463271, + "learning_rate": 8.905481122564628e-06, + "loss": 0.6873, + "step": 2292 + }, + { + "epoch": 0.24, + "grad_norm": 1.9046793180822075, + "learning_rate": 8.90442975773978e-06, + "loss": 0.6389, + "step": 2293 + }, + { + "epoch": 0.24, + "grad_norm": 1.9488081969903266, + "learning_rate": 8.903377950325563e-06, + "loss": 0.6598, + "step": 2294 + }, + { + "epoch": 0.24, + "grad_norm": 2.1749846253223257, + "learning_rate": 8.902325700441207e-06, + "loss": 0.6878, + "step": 2295 + }, + { + "epoch": 0.24, + "grad_norm": 1.7711244804453699, + "learning_rate": 8.901273008205991e-06, + "loss": 0.5985, + "step": 2296 + }, + { + "epoch": 0.24, + "grad_norm": 1.8428616471257615, + "learning_rate": 8.900219873739242e-06, + "loss": 0.7583, + "step": 2297 + }, + { + "epoch": 0.24, + "grad_norm": 2.2720506937274974, + "learning_rate": 8.89916629716034e-06, + "loss": 0.6609, + "step": 2298 + }, + { + "epoch": 0.24, + "grad_norm": 2.124027712091581, + "learning_rate": 8.898112278588713e-06, + "loss": 0.6003, + "step": 2299 + }, + { + "epoch": 0.24, + "grad_norm": 1.7844632084108558, + "learning_rate": 8.897057818143842e-06, + "loss": 0.6297, + "step": 2300 + }, + { + "epoch": 0.24, + "grad_norm": 1.902743216888347, + "learning_rate": 8.896002915945254e-06, + "loss": 0.6839, + "step": 2301 + }, + { + "epoch": 0.24, + "grad_norm": 1.7514502795812927, + "learning_rate": 8.89494757211253e-06, + "loss": 0.6339, + "step": 2302 + }, + { + "epoch": 0.24, + "grad_norm": 1.648375420162822, + "learning_rate": 8.893891786765298e-06, + "loss": 0.5742, + "step": 2303 + }, + { + "epoch": 0.24, + "grad_norm": 1.87874832019259, + "learning_rate": 8.892835560023236e-06, + "loss": 0.686, + "step": 2304 + }, + { + "epoch": 0.24, + "grad_norm": 2.119276599077235, + "learning_rate": 8.891778892006077e-06, + "loss": 0.7002, + "step": 2305 + }, + { + "epoch": 0.24, + "grad_norm": 1.8114144842978803, + "learning_rate": 8.890721782833596e-06, + "loss": 0.6562, + "step": 2306 + }, + { + "epoch": 0.24, + "grad_norm": 1.9513465793448206, + "learning_rate": 8.889664232625626e-06, + "loss": 0.6201, + "step": 2307 + }, + { + "epoch": 0.24, + "grad_norm": 1.9660798153237917, + "learning_rate": 8.888606241502044e-06, + "loss": 0.6004, + "step": 2308 + }, + { + "epoch": 0.24, + "grad_norm": 1.9794487766731317, + "learning_rate": 8.88754780958278e-06, + "loss": 0.7013, + "step": 2309 + }, + { + "epoch": 0.24, + "grad_norm": 1.7888928185017705, + "learning_rate": 8.886488936987817e-06, + "loss": 0.6845, + "step": 2310 + }, + { + "epoch": 0.24, + "grad_norm": 1.7762297353081091, + "learning_rate": 8.885429623837178e-06, + "loss": 0.6759, + "step": 2311 + }, + { + "epoch": 0.24, + "grad_norm": 1.96582047776657, + "learning_rate": 8.884369870250945e-06, + "loss": 0.6531, + "step": 2312 + }, + { + "epoch": 0.24, + "grad_norm": 1.8501988880713138, + "learning_rate": 8.883309676349247e-06, + "loss": 0.6091, + "step": 2313 + }, + { + "epoch": 0.24, + "grad_norm": 2.003392735286015, + "learning_rate": 8.882249042252262e-06, + "loss": 0.7573, + "step": 2314 + }, + { + "epoch": 0.24, + "grad_norm": 1.9134386167380948, + "learning_rate": 8.881187968080222e-06, + "loss": 0.6455, + "step": 2315 + }, + { + "epoch": 0.24, + "grad_norm": 1.870256379320453, + "learning_rate": 8.880126453953403e-06, + "loss": 0.655, + "step": 2316 + }, + { + "epoch": 0.24, + "grad_norm": 1.759883021241307, + "learning_rate": 8.879064499992133e-06, + "loss": 0.7253, + "step": 2317 + }, + { + "epoch": 0.24, + "grad_norm": 2.05074006883249, + "learning_rate": 8.878002106316795e-06, + "loss": 0.7489, + "step": 2318 + }, + { + "epoch": 0.24, + "grad_norm": 3.367048592554034, + "learning_rate": 8.876939273047813e-06, + "loss": 0.6746, + "step": 2319 + }, + { + "epoch": 0.24, + "grad_norm": 1.951477514551727, + "learning_rate": 8.875876000305666e-06, + "loss": 0.7181, + "step": 2320 + }, + { + "epoch": 0.24, + "grad_norm": 1.839227616535758, + "learning_rate": 8.874812288210883e-06, + "loss": 0.6724, + "step": 2321 + }, + { + "epoch": 0.24, + "grad_norm": 1.7580491487314909, + "learning_rate": 8.87374813688404e-06, + "loss": 0.7564, + "step": 2322 + }, + { + "epoch": 0.24, + "grad_norm": 2.082563087261863, + "learning_rate": 8.872683546445768e-06, + "loss": 0.7732, + "step": 2323 + }, + { + "epoch": 0.24, + "grad_norm": 1.947319163291198, + "learning_rate": 8.871618517016742e-06, + "loss": 0.6136, + "step": 2324 + }, + { + "epoch": 0.24, + "grad_norm": 1.8749719303586856, + "learning_rate": 8.870553048717689e-06, + "loss": 0.6375, + "step": 2325 + }, + { + "epoch": 0.24, + "grad_norm": 1.8189093114890664, + "learning_rate": 8.86948714166939e-06, + "loss": 0.6291, + "step": 2326 + }, + { + "epoch": 0.24, + "grad_norm": 1.7890150891185206, + "learning_rate": 8.868420795992662e-06, + "loss": 0.5662, + "step": 2327 + }, + { + "epoch": 0.24, + "grad_norm": 1.841854935089076, + "learning_rate": 8.867354011808391e-06, + "loss": 0.6774, + "step": 2328 + }, + { + "epoch": 0.24, + "grad_norm": 1.9138052891691462, + "learning_rate": 8.866286789237499e-06, + "loss": 0.6585, + "step": 2329 + }, + { + "epoch": 0.24, + "grad_norm": 1.9134802569985976, + "learning_rate": 8.865219128400964e-06, + "loss": 0.6528, + "step": 2330 + }, + { + "epoch": 0.24, + "grad_norm": 2.1272177907438614, + "learning_rate": 8.864151029419807e-06, + "loss": 0.6907, + "step": 2331 + }, + { + "epoch": 0.24, + "grad_norm": 1.8186258535110367, + "learning_rate": 8.863082492415107e-06, + "loss": 0.6954, + "step": 2332 + }, + { + "epoch": 0.24, + "grad_norm": 1.65082051016591, + "learning_rate": 8.86201351750799e-06, + "loss": 0.5637, + "step": 2333 + }, + { + "epoch": 0.24, + "grad_norm": 1.8293006727742915, + "learning_rate": 8.860944104819625e-06, + "loss": 0.719, + "step": 2334 + }, + { + "epoch": 0.24, + "grad_norm": 1.8125833581391186, + "learning_rate": 8.85987425447124e-06, + "loss": 0.5463, + "step": 2335 + }, + { + "epoch": 0.24, + "grad_norm": 1.8212561193038788, + "learning_rate": 8.858803966584108e-06, + "loss": 0.6743, + "step": 2336 + }, + { + "epoch": 0.24, + "grad_norm": 1.8575482610539216, + "learning_rate": 8.857733241279551e-06, + "loss": 0.6718, + "step": 2337 + }, + { + "epoch": 0.24, + "grad_norm": 2.1513939629386445, + "learning_rate": 8.856662078678944e-06, + "loss": 0.6537, + "step": 2338 + }, + { + "epoch": 0.24, + "grad_norm": 1.8719378153923891, + "learning_rate": 8.855590478903707e-06, + "loss": 0.644, + "step": 2339 + }, + { + "epoch": 0.24, + "grad_norm": 2.1098349957847162, + "learning_rate": 8.854518442075313e-06, + "loss": 0.7041, + "step": 2340 + }, + { + "epoch": 0.24, + "grad_norm": 2.140614121778238, + "learning_rate": 8.853445968315286e-06, + "loss": 0.7882, + "step": 2341 + }, + { + "epoch": 0.24, + "grad_norm": 1.79779569759238, + "learning_rate": 8.852373057745192e-06, + "loss": 0.6862, + "step": 2342 + }, + { + "epoch": 0.24, + "grad_norm": 1.8913245263343401, + "learning_rate": 8.851299710486655e-06, + "loss": 0.6563, + "step": 2343 + }, + { + "epoch": 0.24, + "grad_norm": 1.7848657576070133, + "learning_rate": 8.850225926661344e-06, + "loss": 0.5715, + "step": 2344 + }, + { + "epoch": 0.24, + "grad_norm": 2.042072907360112, + "learning_rate": 8.84915170639098e-06, + "loss": 0.6642, + "step": 2345 + }, + { + "epoch": 0.24, + "grad_norm": 1.882590587542723, + "learning_rate": 8.848077049797327e-06, + "loss": 0.5457, + "step": 2346 + }, + { + "epoch": 0.24, + "grad_norm": 2.0577242147645443, + "learning_rate": 8.847001957002211e-06, + "loss": 0.6979, + "step": 2347 + }, + { + "epoch": 0.24, + "grad_norm": 1.8692862797256795, + "learning_rate": 8.845926428127493e-06, + "loss": 0.678, + "step": 2348 + }, + { + "epoch": 0.24, + "grad_norm": 1.780826018929952, + "learning_rate": 8.844850463295096e-06, + "loss": 0.6109, + "step": 2349 + }, + { + "epoch": 0.24, + "grad_norm": 1.718886412604249, + "learning_rate": 8.843774062626982e-06, + "loss": 0.6187, + "step": 2350 + }, + { + "epoch": 0.24, + "grad_norm": 2.0780976828794624, + "learning_rate": 8.842697226245171e-06, + "loss": 0.6609, + "step": 2351 + }, + { + "epoch": 0.24, + "grad_norm": 2.1857009596687407, + "learning_rate": 8.841619954271725e-06, + "loss": 0.6179, + "step": 2352 + }, + { + "epoch": 0.24, + "grad_norm": 1.9946623483439587, + "learning_rate": 8.840542246828763e-06, + "loss": 0.5932, + "step": 2353 + }, + { + "epoch": 0.24, + "grad_norm": 1.9732400040300602, + "learning_rate": 8.839464104038445e-06, + "loss": 0.719, + "step": 2354 + }, + { + "epoch": 0.24, + "grad_norm": 1.9466418799081235, + "learning_rate": 8.838385526022989e-06, + "loss": 0.6472, + "step": 2355 + }, + { + "epoch": 0.24, + "grad_norm": 1.8709285020035453, + "learning_rate": 8.83730651290465e-06, + "loss": 0.6885, + "step": 2356 + }, + { + "epoch": 0.25, + "grad_norm": 1.8144321685258162, + "learning_rate": 8.836227064805751e-06, + "loss": 0.6892, + "step": 2357 + }, + { + "epoch": 0.25, + "grad_norm": 1.92972926188176, + "learning_rate": 8.835147181848646e-06, + "loss": 0.6428, + "step": 2358 + }, + { + "epoch": 0.25, + "grad_norm": 1.832178705416403, + "learning_rate": 8.83406686415575e-06, + "loss": 0.6024, + "step": 2359 + }, + { + "epoch": 0.25, + "grad_norm": 1.7708427249652576, + "learning_rate": 8.832986111849522e-06, + "loss": 0.5662, + "step": 2360 + }, + { + "epoch": 0.25, + "grad_norm": 1.7190953214028308, + "learning_rate": 8.831904925052468e-06, + "loss": 0.5733, + "step": 2361 + }, + { + "epoch": 0.25, + "grad_norm": 1.5950322378285189, + "learning_rate": 8.830823303887152e-06, + "loss": 0.5816, + "step": 2362 + }, + { + "epoch": 0.25, + "grad_norm": 1.9181816926545536, + "learning_rate": 8.829741248476178e-06, + "loss": 0.6566, + "step": 2363 + }, + { + "epoch": 0.25, + "grad_norm": 2.250978660572423, + "learning_rate": 8.828658758942206e-06, + "loss": 0.7142, + "step": 2364 + }, + { + "epoch": 0.25, + "grad_norm": 1.9235552583017665, + "learning_rate": 8.827575835407942e-06, + "loss": 0.6098, + "step": 2365 + }, + { + "epoch": 0.25, + "grad_norm": 1.6808897773713956, + "learning_rate": 8.826492477996138e-06, + "loss": 0.6614, + "step": 2366 + }, + { + "epoch": 0.25, + "grad_norm": 1.6939820996787394, + "learning_rate": 8.825408686829602e-06, + "loss": 0.6475, + "step": 2367 + }, + { + "epoch": 0.25, + "grad_norm": 2.005566835841052, + "learning_rate": 8.824324462031189e-06, + "loss": 0.7744, + "step": 2368 + }, + { + "epoch": 0.25, + "grad_norm": 1.8165929910866654, + "learning_rate": 8.823239803723799e-06, + "loss": 0.7205, + "step": 2369 + }, + { + "epoch": 0.25, + "grad_norm": 1.8947184852325152, + "learning_rate": 8.822154712030386e-06, + "loss": 0.6283, + "step": 2370 + }, + { + "epoch": 0.25, + "grad_norm": 1.847875985087468, + "learning_rate": 8.82106918707395e-06, + "loss": 0.6647, + "step": 2371 + }, + { + "epoch": 0.25, + "grad_norm": 1.7369026204140432, + "learning_rate": 8.819983228977543e-06, + "loss": 0.6001, + "step": 2372 + }, + { + "epoch": 0.25, + "grad_norm": 1.7589257105558311, + "learning_rate": 8.818896837864263e-06, + "loss": 0.6021, + "step": 2373 + }, + { + "epoch": 0.25, + "grad_norm": 1.782473528288657, + "learning_rate": 8.81781001385726e-06, + "loss": 0.6308, + "step": 2374 + }, + { + "epoch": 0.25, + "grad_norm": 2.24267074124853, + "learning_rate": 8.81672275707973e-06, + "loss": 0.6647, + "step": 2375 + }, + { + "epoch": 0.25, + "grad_norm": 1.7755566946531254, + "learning_rate": 8.815635067654924e-06, + "loss": 0.6279, + "step": 2376 + }, + { + "epoch": 0.25, + "grad_norm": 1.7420803857728735, + "learning_rate": 8.814546945706132e-06, + "loss": 0.6252, + "step": 2377 + }, + { + "epoch": 0.25, + "grad_norm": 2.00014949073347, + "learning_rate": 8.813458391356702e-06, + "loss": 0.6809, + "step": 2378 + }, + { + "epoch": 0.25, + "grad_norm": 1.988415369348472, + "learning_rate": 8.812369404730027e-06, + "loss": 0.6487, + "step": 2379 + }, + { + "epoch": 0.25, + "grad_norm": 1.9412075336525594, + "learning_rate": 8.811279985949551e-06, + "loss": 0.6755, + "step": 2380 + }, + { + "epoch": 0.25, + "grad_norm": 1.9139607368009248, + "learning_rate": 8.810190135138765e-06, + "loss": 0.722, + "step": 2381 + }, + { + "epoch": 0.25, + "grad_norm": 1.9545338507069623, + "learning_rate": 8.80909985242121e-06, + "loss": 0.6374, + "step": 2382 + }, + { + "epoch": 0.25, + "grad_norm": 1.748644293222741, + "learning_rate": 8.808009137920475e-06, + "loss": 0.6028, + "step": 2383 + }, + { + "epoch": 0.25, + "grad_norm": 2.0414545333900227, + "learning_rate": 8.8069179917602e-06, + "loss": 0.7474, + "step": 2384 + }, + { + "epoch": 0.25, + "grad_norm": 2.06277922678675, + "learning_rate": 8.805826414064071e-06, + "loss": 0.6891, + "step": 2385 + }, + { + "epoch": 0.25, + "grad_norm": 2.0083336554857874, + "learning_rate": 8.804734404955825e-06, + "loss": 0.656, + "step": 2386 + }, + { + "epoch": 0.25, + "grad_norm": 1.660482284498147, + "learning_rate": 8.80364196455925e-06, + "loss": 0.5254, + "step": 2387 + }, + { + "epoch": 0.25, + "grad_norm": 1.8909219127501342, + "learning_rate": 8.802549092998176e-06, + "loss": 0.6555, + "step": 2388 + }, + { + "epoch": 0.25, + "grad_norm": 1.9055724159754508, + "learning_rate": 8.80145579039649e-06, + "loss": 0.6571, + "step": 2389 + }, + { + "epoch": 0.25, + "grad_norm": 2.1750249859853925, + "learning_rate": 8.800362056878123e-06, + "loss": 0.7014, + "step": 2390 + }, + { + "epoch": 0.25, + "grad_norm": 2.17662140669081, + "learning_rate": 8.799267892567054e-06, + "loss": 0.6947, + "step": 2391 + }, + { + "epoch": 0.25, + "grad_norm": 1.8554578649297455, + "learning_rate": 8.798173297587316e-06, + "loss": 0.5301, + "step": 2392 + }, + { + "epoch": 0.25, + "grad_norm": 1.8693214266194902, + "learning_rate": 8.797078272062984e-06, + "loss": 0.6241, + "step": 2393 + }, + { + "epoch": 0.25, + "grad_norm": 10.116068708891031, + "learning_rate": 8.795982816118189e-06, + "loss": 0.7266, + "step": 2394 + }, + { + "epoch": 0.25, + "grad_norm": 1.8822655074150916, + "learning_rate": 8.794886929877104e-06, + "loss": 0.6358, + "step": 2395 + }, + { + "epoch": 0.25, + "grad_norm": 1.753515161356571, + "learning_rate": 8.793790613463956e-06, + "loss": 0.6498, + "step": 2396 + }, + { + "epoch": 0.25, + "grad_norm": 2.1198668025700362, + "learning_rate": 8.792693867003017e-06, + "loss": 0.8171, + "step": 2397 + }, + { + "epoch": 0.25, + "grad_norm": 1.9558213690977169, + "learning_rate": 8.791596690618611e-06, + "loss": 0.8011, + "step": 2398 + }, + { + "epoch": 0.25, + "grad_norm": 2.071651445112021, + "learning_rate": 8.79049908443511e-06, + "loss": 0.6883, + "step": 2399 + }, + { + "epoch": 0.25, + "grad_norm": 2.8405970903726203, + "learning_rate": 8.789401048576932e-06, + "loss": 0.7141, + "step": 2400 + }, + { + "epoch": 0.25, + "grad_norm": 1.8874216666737584, + "learning_rate": 8.788302583168546e-06, + "loss": 0.6946, + "step": 2401 + }, + { + "epoch": 0.25, + "grad_norm": 2.0114898090891806, + "learning_rate": 8.78720368833447e-06, + "loss": 0.5675, + "step": 2402 + }, + { + "epoch": 0.25, + "grad_norm": 1.9083054611199246, + "learning_rate": 8.78610436419927e-06, + "loss": 0.6291, + "step": 2403 + }, + { + "epoch": 0.25, + "grad_norm": 1.9970989061556155, + "learning_rate": 8.785004610887559e-06, + "loss": 0.6855, + "step": 2404 + }, + { + "epoch": 0.25, + "grad_norm": 1.7311843615333404, + "learning_rate": 8.783904428524002e-06, + "loss": 0.5969, + "step": 2405 + }, + { + "epoch": 0.25, + "grad_norm": 1.7874300778174452, + "learning_rate": 8.782803817233312e-06, + "loss": 0.6618, + "step": 2406 + }, + { + "epoch": 0.25, + "grad_norm": 1.745175178235986, + "learning_rate": 8.781702777140245e-06, + "loss": 0.6895, + "step": 2407 + }, + { + "epoch": 0.25, + "grad_norm": 1.9717874839652878, + "learning_rate": 8.780601308369615e-06, + "loss": 0.7072, + "step": 2408 + }, + { + "epoch": 0.25, + "grad_norm": 2.286414310695891, + "learning_rate": 8.779499411046279e-06, + "loss": 0.6198, + "step": 2409 + }, + { + "epoch": 0.25, + "grad_norm": 1.8423429664253486, + "learning_rate": 8.778397085295141e-06, + "loss": 0.5958, + "step": 2410 + }, + { + "epoch": 0.25, + "grad_norm": 1.7508724073956576, + "learning_rate": 8.777294331241157e-06, + "loss": 0.6431, + "step": 2411 + }, + { + "epoch": 0.25, + "grad_norm": 1.9141251716681176, + "learning_rate": 8.77619114900933e-06, + "loss": 0.6575, + "step": 2412 + }, + { + "epoch": 0.25, + "grad_norm": 1.9832722148946702, + "learning_rate": 8.775087538724714e-06, + "loss": 0.6805, + "step": 2413 + }, + { + "epoch": 0.25, + "grad_norm": 1.7600986147301019, + "learning_rate": 8.773983500512408e-06, + "loss": 0.7129, + "step": 2414 + }, + { + "epoch": 0.25, + "grad_norm": 1.9150612390096766, + "learning_rate": 8.772879034497561e-06, + "loss": 0.8497, + "step": 2415 + }, + { + "epoch": 0.25, + "grad_norm": 1.812339592854274, + "learning_rate": 8.771774140805372e-06, + "loss": 0.668, + "step": 2416 + }, + { + "epoch": 0.25, + "grad_norm": 1.9986647398148865, + "learning_rate": 8.770668819561085e-06, + "loss": 0.7162, + "step": 2417 + }, + { + "epoch": 0.25, + "grad_norm": 1.9456747118199362, + "learning_rate": 8.769563070889995e-06, + "loss": 0.7052, + "step": 2418 + }, + { + "epoch": 0.25, + "grad_norm": 1.965657407221823, + "learning_rate": 8.768456894917445e-06, + "loss": 0.7759, + "step": 2419 + }, + { + "epoch": 0.25, + "grad_norm": 1.949060659018235, + "learning_rate": 8.767350291768827e-06, + "loss": 0.7126, + "step": 2420 + }, + { + "epoch": 0.25, + "grad_norm": 1.8428955759377683, + "learning_rate": 8.76624326156958e-06, + "loss": 0.6419, + "step": 2421 + }, + { + "epoch": 0.25, + "grad_norm": 1.9139856823393193, + "learning_rate": 8.765135804445192e-06, + "loss": 0.6822, + "step": 2422 + }, + { + "epoch": 0.25, + "grad_norm": 2.12635949355531, + "learning_rate": 8.7640279205212e-06, + "loss": 0.7338, + "step": 2423 + }, + { + "epoch": 0.25, + "grad_norm": 1.871367384050979, + "learning_rate": 8.76291960992319e-06, + "loss": 0.6935, + "step": 2424 + }, + { + "epoch": 0.25, + "grad_norm": 1.9112013632683538, + "learning_rate": 8.761810872776793e-06, + "loss": 0.6928, + "step": 2425 + }, + { + "epoch": 0.25, + "grad_norm": 1.986560427331108, + "learning_rate": 8.760701709207693e-06, + "loss": 0.6297, + "step": 2426 + }, + { + "epoch": 0.25, + "grad_norm": 1.8219563126246536, + "learning_rate": 8.759592119341618e-06, + "loss": 0.6592, + "step": 2427 + }, + { + "epoch": 0.25, + "grad_norm": 2.063821372703502, + "learning_rate": 8.758482103304348e-06, + "loss": 0.6741, + "step": 2428 + }, + { + "epoch": 0.25, + "grad_norm": 1.834418853775193, + "learning_rate": 8.757371661221709e-06, + "loss": 0.5968, + "step": 2429 + }, + { + "epoch": 0.25, + "grad_norm": 1.8513650814208955, + "learning_rate": 8.756260793219575e-06, + "loss": 0.6706, + "step": 2430 + }, + { + "epoch": 0.25, + "grad_norm": 2.205372225245357, + "learning_rate": 8.755149499423871e-06, + "loss": 0.7245, + "step": 2431 + }, + { + "epoch": 0.25, + "grad_norm": 1.8523502602927382, + "learning_rate": 8.754037779960566e-06, + "loss": 0.6928, + "step": 2432 + }, + { + "epoch": 0.25, + "grad_norm": 2.164253711937142, + "learning_rate": 8.752925634955685e-06, + "loss": 0.6091, + "step": 2433 + }, + { + "epoch": 0.25, + "grad_norm": 1.7726757578138437, + "learning_rate": 8.751813064535288e-06, + "loss": 0.5928, + "step": 2434 + }, + { + "epoch": 0.25, + "grad_norm": 1.9559306596077315, + "learning_rate": 8.750700068825499e-06, + "loss": 0.6752, + "step": 2435 + }, + { + "epoch": 0.25, + "grad_norm": 1.7822366987620128, + "learning_rate": 8.749586647952478e-06, + "loss": 0.6989, + "step": 2436 + }, + { + "epoch": 0.25, + "grad_norm": 2.2000460835818703, + "learning_rate": 8.748472802042438e-06, + "loss": 0.6844, + "step": 2437 + }, + { + "epoch": 0.25, + "grad_norm": 2.0813206807667015, + "learning_rate": 8.74735853122164e-06, + "loss": 0.7764, + "step": 2438 + }, + { + "epoch": 0.25, + "grad_norm": 1.810316239654606, + "learning_rate": 8.746243835616392e-06, + "loss": 0.6687, + "step": 2439 + }, + { + "epoch": 0.25, + "grad_norm": 2.2509528243743127, + "learning_rate": 8.745128715353055e-06, + "loss": 0.576, + "step": 2440 + }, + { + "epoch": 0.25, + "grad_norm": 2.073047694712885, + "learning_rate": 8.74401317055803e-06, + "loss": 0.7311, + "step": 2441 + }, + { + "epoch": 0.25, + "grad_norm": 1.9258060000172623, + "learning_rate": 8.742897201357772e-06, + "loss": 0.6671, + "step": 2442 + }, + { + "epoch": 0.25, + "grad_norm": 1.8935603943634967, + "learning_rate": 8.741780807878783e-06, + "loss": 0.7318, + "step": 2443 + }, + { + "epoch": 0.25, + "grad_norm": 1.803740494965822, + "learning_rate": 8.740663990247612e-06, + "loss": 0.6143, + "step": 2444 + }, + { + "epoch": 0.25, + "grad_norm": 1.8225712920487915, + "learning_rate": 8.739546748590857e-06, + "loss": 0.6634, + "step": 2445 + }, + { + "epoch": 0.25, + "grad_norm": 1.79407696453161, + "learning_rate": 8.738429083035162e-06, + "loss": 0.6252, + "step": 2446 + }, + { + "epoch": 0.25, + "grad_norm": 1.7820233409510757, + "learning_rate": 8.737310993707225e-06, + "loss": 0.6748, + "step": 2447 + }, + { + "epoch": 0.25, + "grad_norm": 1.8736454414922534, + "learning_rate": 8.736192480733782e-06, + "loss": 0.6461, + "step": 2448 + }, + { + "epoch": 0.25, + "grad_norm": 1.8092214496718138, + "learning_rate": 8.735073544241627e-06, + "loss": 0.5736, + "step": 2449 + }, + { + "epoch": 0.25, + "grad_norm": 1.8901610250011938, + "learning_rate": 8.733954184357596e-06, + "loss": 0.5879, + "step": 2450 + }, + { + "epoch": 0.25, + "grad_norm": 1.9058356933274532, + "learning_rate": 8.732834401208575e-06, + "loss": 0.7635, + "step": 2451 + }, + { + "epoch": 0.25, + "grad_norm": 2.145236956293225, + "learning_rate": 8.731714194921498e-06, + "loss": 0.7627, + "step": 2452 + }, + { + "epoch": 0.25, + "grad_norm": 1.9661727069908106, + "learning_rate": 8.730593565623349e-06, + "loss": 0.6266, + "step": 2453 + }, + { + "epoch": 0.26, + "grad_norm": 1.9383169325757479, + "learning_rate": 8.729472513441152e-06, + "loss": 0.6724, + "step": 2454 + }, + { + "epoch": 0.26, + "grad_norm": 2.0297056180550634, + "learning_rate": 8.728351038501991e-06, + "loss": 0.7136, + "step": 2455 + }, + { + "epoch": 0.26, + "grad_norm": 2.2711745650680903, + "learning_rate": 8.727229140932988e-06, + "loss": 0.6877, + "step": 2456 + }, + { + "epoch": 0.26, + "grad_norm": 1.8425469528169693, + "learning_rate": 8.726106820861319e-06, + "loss": 0.645, + "step": 2457 + }, + { + "epoch": 0.26, + "grad_norm": 1.9820413892321256, + "learning_rate": 8.724984078414202e-06, + "loss": 0.6962, + "step": 2458 + }, + { + "epoch": 0.26, + "grad_norm": 1.943380318626068, + "learning_rate": 8.72386091371891e-06, + "loss": 0.6888, + "step": 2459 + }, + { + "epoch": 0.26, + "grad_norm": 1.8212199805904132, + "learning_rate": 8.722737326902757e-06, + "loss": 0.6929, + "step": 2460 + }, + { + "epoch": 0.26, + "grad_norm": 1.8554451720860514, + "learning_rate": 8.72161331809311e-06, + "loss": 0.7531, + "step": 2461 + }, + { + "epoch": 0.26, + "grad_norm": 1.9310112538498863, + "learning_rate": 8.720488887417379e-06, + "loss": 0.6156, + "step": 2462 + }, + { + "epoch": 0.26, + "grad_norm": 1.8574398933023148, + "learning_rate": 8.719364035003028e-06, + "loss": 0.6742, + "step": 2463 + }, + { + "epoch": 0.26, + "grad_norm": 1.9700711516872065, + "learning_rate": 8.718238760977562e-06, + "loss": 0.7087, + "step": 2464 + }, + { + "epoch": 0.26, + "grad_norm": 1.896331167359135, + "learning_rate": 8.71711306546854e-06, + "loss": 0.6402, + "step": 2465 + }, + { + "epoch": 0.26, + "grad_norm": 1.5871072247915825, + "learning_rate": 8.715986948603566e-06, + "loss": 0.5402, + "step": 2466 + }, + { + "epoch": 0.26, + "grad_norm": 1.9043062384767355, + "learning_rate": 8.71486041051029e-06, + "loss": 0.7504, + "step": 2467 + }, + { + "epoch": 0.26, + "grad_norm": 1.9304941660777277, + "learning_rate": 8.713733451316415e-06, + "loss": 0.7583, + "step": 2468 + }, + { + "epoch": 0.26, + "grad_norm": 1.9562908438044804, + "learning_rate": 8.712606071149683e-06, + "loss": 0.6654, + "step": 2469 + }, + { + "epoch": 0.26, + "grad_norm": 1.955293052258211, + "learning_rate": 8.711478270137892e-06, + "loss": 0.7673, + "step": 2470 + }, + { + "epoch": 0.26, + "grad_norm": 1.7622527529578897, + "learning_rate": 8.710350048408885e-06, + "loss": 0.6916, + "step": 2471 + }, + { + "epoch": 0.26, + "grad_norm": 1.9881603469858304, + "learning_rate": 8.709221406090552e-06, + "loss": 0.6934, + "step": 2472 + }, + { + "epoch": 0.26, + "grad_norm": 2.0033670173359495, + "learning_rate": 8.70809234331083e-06, + "loss": 0.7154, + "step": 2473 + }, + { + "epoch": 0.26, + "grad_norm": 1.777575663522896, + "learning_rate": 8.706962860197707e-06, + "loss": 0.6564, + "step": 2474 + }, + { + "epoch": 0.26, + "grad_norm": 1.967548242810914, + "learning_rate": 8.705832956879214e-06, + "loss": 0.6439, + "step": 2475 + }, + { + "epoch": 0.26, + "grad_norm": 1.94761621671732, + "learning_rate": 8.704702633483431e-06, + "loss": 0.6275, + "step": 2476 + }, + { + "epoch": 0.26, + "grad_norm": 2.017817753341322, + "learning_rate": 8.703571890138491e-06, + "loss": 0.737, + "step": 2477 + }, + { + "epoch": 0.26, + "grad_norm": 1.6563836459176866, + "learning_rate": 8.702440726972565e-06, + "loss": 0.5686, + "step": 2478 + }, + { + "epoch": 0.26, + "grad_norm": 2.0567605849561037, + "learning_rate": 8.701309144113881e-06, + "loss": 0.6831, + "step": 2479 + }, + { + "epoch": 0.26, + "grad_norm": 1.8114888761264734, + "learning_rate": 8.700177141690708e-06, + "loss": 0.6569, + "step": 2480 + }, + { + "epoch": 0.26, + "grad_norm": 2.1350886758417156, + "learning_rate": 8.699044719831368e-06, + "loss": 0.712, + "step": 2481 + }, + { + "epoch": 0.26, + "grad_norm": 1.8611495208476372, + "learning_rate": 8.697911878664222e-06, + "loss": 0.6709, + "step": 2482 + }, + { + "epoch": 0.26, + "grad_norm": 1.940942812297296, + "learning_rate": 8.69677861831769e-06, + "loss": 0.656, + "step": 2483 + }, + { + "epoch": 0.26, + "grad_norm": 1.8476105475176288, + "learning_rate": 8.695644938920229e-06, + "loss": 0.6289, + "step": 2484 + }, + { + "epoch": 0.26, + "grad_norm": 2.0744387596970384, + "learning_rate": 8.69451084060035e-06, + "loss": 0.7183, + "step": 2485 + }, + { + "epoch": 0.26, + "grad_norm": 1.9263953191832632, + "learning_rate": 8.693376323486609e-06, + "loss": 0.5893, + "step": 2486 + }, + { + "epoch": 0.26, + "grad_norm": 1.6553134124256639, + "learning_rate": 8.69224138770761e-06, + "loss": 0.5539, + "step": 2487 + }, + { + "epoch": 0.26, + "grad_norm": 1.7912036322986, + "learning_rate": 8.691106033392004e-06, + "loss": 0.6439, + "step": 2488 + }, + { + "epoch": 0.26, + "grad_norm": 1.9551293930565374, + "learning_rate": 8.689970260668494e-06, + "loss": 0.6473, + "step": 2489 + }, + { + "epoch": 0.26, + "grad_norm": 2.4836561713045784, + "learning_rate": 8.688834069665819e-06, + "loss": 0.7666, + "step": 2490 + }, + { + "epoch": 0.26, + "grad_norm": 1.9702181315374108, + "learning_rate": 8.687697460512779e-06, + "loss": 0.5778, + "step": 2491 + }, + { + "epoch": 0.26, + "grad_norm": 1.8888601359095911, + "learning_rate": 8.686560433338212e-06, + "loss": 0.6791, + "step": 2492 + }, + { + "epoch": 0.26, + "grad_norm": 1.8130047281565391, + "learning_rate": 8.685422988271005e-06, + "loss": 0.7071, + "step": 2493 + }, + { + "epoch": 0.26, + "grad_norm": 1.8530436031260942, + "learning_rate": 8.684285125440099e-06, + "loss": 0.6628, + "step": 2494 + }, + { + "epoch": 0.26, + "grad_norm": 1.8370355003345429, + "learning_rate": 8.683146844974473e-06, + "loss": 0.6431, + "step": 2495 + }, + { + "epoch": 0.26, + "grad_norm": 1.872017642208934, + "learning_rate": 8.682008147003159e-06, + "loss": 0.6496, + "step": 2496 + }, + { + "epoch": 0.26, + "grad_norm": 2.194564775682107, + "learning_rate": 8.680869031655234e-06, + "loss": 0.7402, + "step": 2497 + }, + { + "epoch": 0.26, + "grad_norm": 1.7391627982124838, + "learning_rate": 8.679729499059826e-06, + "loss": 0.6092, + "step": 2498 + }, + { + "epoch": 0.26, + "grad_norm": 1.8041114226784951, + "learning_rate": 8.678589549346103e-06, + "loss": 0.6696, + "step": 2499 + }, + { + "epoch": 0.26, + "grad_norm": 1.9286629979286871, + "learning_rate": 8.67744918264329e-06, + "loss": 0.6261, + "step": 2500 + }, + { + "epoch": 0.26, + "grad_norm": 2.034026955174544, + "learning_rate": 8.67630839908065e-06, + "loss": 0.6834, + "step": 2501 + }, + { + "epoch": 0.26, + "grad_norm": 1.7611055542338694, + "learning_rate": 8.675167198787497e-06, + "loss": 0.6267, + "step": 2502 + }, + { + "epoch": 0.26, + "grad_norm": 2.0451242410055697, + "learning_rate": 8.674025581893197e-06, + "loss": 0.6848, + "step": 2503 + }, + { + "epoch": 0.26, + "grad_norm": 1.939191563492884, + "learning_rate": 8.672883548527156e-06, + "loss": 0.6619, + "step": 2504 + }, + { + "epoch": 0.26, + "grad_norm": 1.5780790969065914, + "learning_rate": 8.671741098818829e-06, + "loss": 0.7253, + "step": 2505 + }, + { + "epoch": 0.26, + "grad_norm": 1.9719588150251077, + "learning_rate": 8.67059823289772e-06, + "loss": 0.667, + "step": 2506 + }, + { + "epoch": 0.26, + "grad_norm": 1.9648668702584733, + "learning_rate": 8.669454950893381e-06, + "loss": 0.7402, + "step": 2507 + }, + { + "epoch": 0.26, + "grad_norm": 1.9258196939936445, + "learning_rate": 8.668311252935407e-06, + "loss": 0.6772, + "step": 2508 + }, + { + "epoch": 0.26, + "grad_norm": 2.0081060724360444, + "learning_rate": 8.667167139153443e-06, + "loss": 0.6405, + "step": 2509 + }, + { + "epoch": 0.26, + "grad_norm": 1.6742198835544373, + "learning_rate": 8.666022609677183e-06, + "loss": 0.5199, + "step": 2510 + }, + { + "epoch": 0.26, + "grad_norm": 1.8564714159417233, + "learning_rate": 8.664877664636365e-06, + "loss": 0.7075, + "step": 2511 + }, + { + "epoch": 0.26, + "grad_norm": 1.7875909865802426, + "learning_rate": 8.663732304160772e-06, + "loss": 0.6749, + "step": 2512 + }, + { + "epoch": 0.26, + "grad_norm": 1.8538473774971693, + "learning_rate": 8.66258652838024e-06, + "loss": 0.6399, + "step": 2513 + }, + { + "epoch": 0.26, + "grad_norm": 1.8806935770428046, + "learning_rate": 8.661440337424652e-06, + "loss": 0.6695, + "step": 2514 + }, + { + "epoch": 0.26, + "grad_norm": 1.8938221894953242, + "learning_rate": 8.660293731423929e-06, + "loss": 0.5308, + "step": 2515 + }, + { + "epoch": 0.26, + "grad_norm": 1.7300456189211295, + "learning_rate": 8.659146710508052e-06, + "loss": 0.6649, + "step": 2516 + }, + { + "epoch": 0.26, + "grad_norm": 1.9670066452324917, + "learning_rate": 8.657999274807036e-06, + "loss": 0.6295, + "step": 2517 + }, + { + "epoch": 0.26, + "grad_norm": 2.421249790517567, + "learning_rate": 8.656851424450954e-06, + "loss": 0.5711, + "step": 2518 + }, + { + "epoch": 0.26, + "grad_norm": 1.7941611225137957, + "learning_rate": 8.655703159569919e-06, + "loss": 0.6625, + "step": 2519 + }, + { + "epoch": 0.26, + "grad_norm": 1.8645532699032437, + "learning_rate": 8.654554480294094e-06, + "loss": 0.6557, + "step": 2520 + }, + { + "epoch": 0.26, + "grad_norm": 1.7914760505530758, + "learning_rate": 8.653405386753688e-06, + "loss": 0.6373, + "step": 2521 + }, + { + "epoch": 0.26, + "grad_norm": 1.9904369150091747, + "learning_rate": 8.652255879078959e-06, + "loss": 0.6712, + "step": 2522 + }, + { + "epoch": 0.26, + "grad_norm": 2.0797350090765514, + "learning_rate": 8.651105957400208e-06, + "loss": 0.75, + "step": 2523 + }, + { + "epoch": 0.26, + "grad_norm": 2.0592593153415586, + "learning_rate": 8.649955621847787e-06, + "loss": 0.6536, + "step": 2524 + }, + { + "epoch": 0.26, + "grad_norm": 1.9125424066671115, + "learning_rate": 8.648804872552092e-06, + "loss": 0.7074, + "step": 2525 + }, + { + "epoch": 0.26, + "grad_norm": 1.9638190738119656, + "learning_rate": 8.647653709643566e-06, + "loss": 0.7199, + "step": 2526 + }, + { + "epoch": 0.26, + "grad_norm": 1.7916417184920295, + "learning_rate": 8.646502133252702e-06, + "loss": 0.6874, + "step": 2527 + }, + { + "epoch": 0.26, + "grad_norm": 1.8071976978774118, + "learning_rate": 8.645350143510036e-06, + "loss": 0.8068, + "step": 2528 + }, + { + "epoch": 0.26, + "grad_norm": 1.9139163770847392, + "learning_rate": 8.644197740546153e-06, + "loss": 0.7273, + "step": 2529 + }, + { + "epoch": 0.26, + "grad_norm": 1.902494211289062, + "learning_rate": 8.643044924491688e-06, + "loss": 0.7555, + "step": 2530 + }, + { + "epoch": 0.26, + "grad_norm": 1.7887237449586197, + "learning_rate": 8.641891695477314e-06, + "loss": 0.7074, + "step": 2531 + }, + { + "epoch": 0.26, + "grad_norm": 1.6889392543519213, + "learning_rate": 8.640738053633758e-06, + "loss": 0.6728, + "step": 2532 + }, + { + "epoch": 0.26, + "grad_norm": 2.045472851926104, + "learning_rate": 8.639583999091792e-06, + "loss": 0.7177, + "step": 2533 + }, + { + "epoch": 0.26, + "grad_norm": 1.8990351279183315, + "learning_rate": 8.638429531982235e-06, + "loss": 0.7403, + "step": 2534 + }, + { + "epoch": 0.26, + "grad_norm": 1.7419245372014982, + "learning_rate": 8.637274652435954e-06, + "loss": 0.5934, + "step": 2535 + }, + { + "epoch": 0.26, + "grad_norm": 2.0825259365129316, + "learning_rate": 8.636119360583857e-06, + "loss": 0.6051, + "step": 2536 + }, + { + "epoch": 0.26, + "grad_norm": 1.8906745088867434, + "learning_rate": 8.634963656556904e-06, + "loss": 0.6227, + "step": 2537 + }, + { + "epoch": 0.26, + "grad_norm": 1.8357251628237146, + "learning_rate": 8.633807540486105e-06, + "loss": 0.6398, + "step": 2538 + }, + { + "epoch": 0.26, + "grad_norm": 1.8989039062807476, + "learning_rate": 8.632651012502508e-06, + "loss": 0.6443, + "step": 2539 + }, + { + "epoch": 0.26, + "grad_norm": 1.7261862083314314, + "learning_rate": 8.631494072737215e-06, + "loss": 0.6061, + "step": 2540 + }, + { + "epoch": 0.26, + "grad_norm": 1.76712139420288, + "learning_rate": 8.630336721321368e-06, + "loss": 0.564, + "step": 2541 + }, + { + "epoch": 0.26, + "grad_norm": 1.8432886950858667, + "learning_rate": 8.629178958386162e-06, + "loss": 0.6403, + "step": 2542 + }, + { + "epoch": 0.26, + "grad_norm": 1.9592036465215612, + "learning_rate": 8.628020784062837e-06, + "loss": 0.5919, + "step": 2543 + }, + { + "epoch": 0.26, + "grad_norm": 1.8828642264460216, + "learning_rate": 8.626862198482676e-06, + "loss": 0.6855, + "step": 2544 + }, + { + "epoch": 0.26, + "grad_norm": 1.8946806088170294, + "learning_rate": 8.62570320177701e-06, + "loss": 0.527, + "step": 2545 + }, + { + "epoch": 0.26, + "grad_norm": 1.9271775419029487, + "learning_rate": 8.624543794077223e-06, + "loss": 0.7033, + "step": 2546 + }, + { + "epoch": 0.26, + "grad_norm": 2.079607663238331, + "learning_rate": 8.623383975514736e-06, + "loss": 0.6912, + "step": 2547 + }, + { + "epoch": 0.26, + "grad_norm": 1.923055418753926, + "learning_rate": 8.622223746221024e-06, + "loss": 0.7492, + "step": 2548 + }, + { + "epoch": 0.26, + "grad_norm": 1.8699947533876318, + "learning_rate": 8.621063106327604e-06, + "loss": 0.6444, + "step": 2549 + }, + { + "epoch": 0.27, + "grad_norm": 2.0911040969561068, + "learning_rate": 8.619902055966043e-06, + "loss": 0.7458, + "step": 2550 + }, + { + "epoch": 0.27, + "grad_norm": 2.0972275911219986, + "learning_rate": 8.618740595267949e-06, + "loss": 0.7584, + "step": 2551 + }, + { + "epoch": 0.27, + "grad_norm": 1.8754911401151375, + "learning_rate": 8.617578724364984e-06, + "loss": 0.636, + "step": 2552 + }, + { + "epoch": 0.27, + "grad_norm": 2.0750752889395683, + "learning_rate": 8.616416443388849e-06, + "loss": 0.643, + "step": 2553 + }, + { + "epoch": 0.27, + "grad_norm": 1.8972081219498664, + "learning_rate": 8.615253752471297e-06, + "loss": 0.6895, + "step": 2554 + }, + { + "epoch": 0.27, + "grad_norm": 1.8417165581548547, + "learning_rate": 8.61409065174413e-06, + "loss": 0.6493, + "step": 2555 + }, + { + "epoch": 0.27, + "grad_norm": 1.7181649813698556, + "learning_rate": 8.612927141339184e-06, + "loss": 0.5672, + "step": 2556 + }, + { + "epoch": 0.27, + "grad_norm": 1.8312017035141213, + "learning_rate": 8.611763221388356e-06, + "loss": 0.6054, + "step": 2557 + }, + { + "epoch": 0.27, + "grad_norm": 2.618425682813372, + "learning_rate": 8.610598892023578e-06, + "loss": 0.7074, + "step": 2558 + }, + { + "epoch": 0.27, + "grad_norm": 1.9623646243149804, + "learning_rate": 8.609434153376839e-06, + "loss": 0.6801, + "step": 2559 + }, + { + "epoch": 0.27, + "grad_norm": 1.7973271647036762, + "learning_rate": 8.608269005580164e-06, + "loss": 0.5592, + "step": 2560 + }, + { + "epoch": 0.27, + "grad_norm": 1.8497187638997796, + "learning_rate": 8.60710344876563e-06, + "loss": 0.6174, + "step": 2561 + }, + { + "epoch": 0.27, + "grad_norm": 1.6527277765494606, + "learning_rate": 8.605937483065361e-06, + "loss": 0.6983, + "step": 2562 + }, + { + "epoch": 0.27, + "grad_norm": 1.8421684748336877, + "learning_rate": 8.604771108611525e-06, + "loss": 0.6781, + "step": 2563 + }, + { + "epoch": 0.27, + "grad_norm": 1.9856957053064686, + "learning_rate": 8.603604325536338e-06, + "loss": 0.6826, + "step": 2564 + }, + { + "epoch": 0.27, + "grad_norm": 1.9390978240487184, + "learning_rate": 8.60243713397206e-06, + "loss": 0.6792, + "step": 2565 + }, + { + "epoch": 0.27, + "grad_norm": 1.8237849953951095, + "learning_rate": 8.601269534051e-06, + "loss": 0.7281, + "step": 2566 + }, + { + "epoch": 0.27, + "grad_norm": 2.0146077881578974, + "learning_rate": 8.600101525905512e-06, + "loss": 0.6844, + "step": 2567 + }, + { + "epoch": 0.27, + "grad_norm": 1.8280236844474076, + "learning_rate": 8.598933109667995e-06, + "loss": 0.6021, + "step": 2568 + }, + { + "epoch": 0.27, + "grad_norm": 1.7861961197050547, + "learning_rate": 8.597764285470897e-06, + "loss": 0.5682, + "step": 2569 + }, + { + "epoch": 0.27, + "grad_norm": 1.9280191246157654, + "learning_rate": 8.596595053446713e-06, + "loss": 0.5181, + "step": 2570 + }, + { + "epoch": 0.27, + "grad_norm": 1.822416375671748, + "learning_rate": 8.595425413727979e-06, + "loss": 0.6518, + "step": 2571 + }, + { + "epoch": 0.27, + "grad_norm": 2.2655542490958416, + "learning_rate": 8.59425536644728e-06, + "loss": 0.6768, + "step": 2572 + }, + { + "epoch": 0.27, + "grad_norm": 1.8253287565260945, + "learning_rate": 8.593084911737249e-06, + "loss": 0.5769, + "step": 2573 + }, + { + "epoch": 0.27, + "grad_norm": 2.1113111922024586, + "learning_rate": 8.591914049730561e-06, + "loss": 0.6449, + "step": 2574 + }, + { + "epoch": 0.27, + "grad_norm": 2.0885463074387056, + "learning_rate": 8.590742780559945e-06, + "loss": 0.6036, + "step": 2575 + }, + { + "epoch": 0.27, + "grad_norm": 2.0324217465623797, + "learning_rate": 8.589571104358168e-06, + "loss": 0.6981, + "step": 2576 + }, + { + "epoch": 0.27, + "grad_norm": 2.1051060069157144, + "learning_rate": 8.588399021258046e-06, + "loss": 0.6511, + "step": 2577 + }, + { + "epoch": 0.27, + "grad_norm": 1.849243572140936, + "learning_rate": 8.587226531392443e-06, + "loss": 0.5977, + "step": 2578 + }, + { + "epoch": 0.27, + "grad_norm": 1.9913608116550334, + "learning_rate": 8.586053634894264e-06, + "loss": 0.6965, + "step": 2579 + }, + { + "epoch": 0.27, + "grad_norm": 2.015092471598591, + "learning_rate": 8.584880331896467e-06, + "loss": 0.7351, + "step": 2580 + }, + { + "epoch": 0.27, + "grad_norm": 1.9372041452614617, + "learning_rate": 8.58370662253205e-06, + "loss": 0.7634, + "step": 2581 + }, + { + "epoch": 0.27, + "grad_norm": 1.832886288665305, + "learning_rate": 8.582532506934063e-06, + "loss": 0.6648, + "step": 2582 + }, + { + "epoch": 0.27, + "grad_norm": 1.9608951852109446, + "learning_rate": 8.581357985235595e-06, + "loss": 0.6949, + "step": 2583 + }, + { + "epoch": 0.27, + "grad_norm": 1.957191923854249, + "learning_rate": 8.580183057569788e-06, + "loss": 0.665, + "step": 2584 + }, + { + "epoch": 0.27, + "grad_norm": 2.0127234717259053, + "learning_rate": 8.579007724069823e-06, + "loss": 0.6466, + "step": 2585 + }, + { + "epoch": 0.27, + "grad_norm": 1.8341028012931, + "learning_rate": 8.577831984868934e-06, + "loss": 0.6804, + "step": 2586 + }, + { + "epoch": 0.27, + "grad_norm": 1.8705519139581988, + "learning_rate": 8.576655840100397e-06, + "loss": 0.6218, + "step": 2587 + }, + { + "epoch": 0.27, + "grad_norm": 1.776464815901934, + "learning_rate": 8.575479289897533e-06, + "loss": 0.6891, + "step": 2588 + }, + { + "epoch": 0.27, + "grad_norm": 1.7028339642895287, + "learning_rate": 8.574302334393712e-06, + "loss": 0.5559, + "step": 2589 + }, + { + "epoch": 0.27, + "grad_norm": 1.9135910153508136, + "learning_rate": 8.573124973722349e-06, + "loss": 0.6412, + "step": 2590 + }, + { + "epoch": 0.27, + "grad_norm": 1.7931655177631687, + "learning_rate": 8.571947208016904e-06, + "loss": 0.6621, + "step": 2591 + }, + { + "epoch": 0.27, + "grad_norm": 1.8046680636422858, + "learning_rate": 8.570769037410885e-06, + "loss": 0.6493, + "step": 2592 + }, + { + "epoch": 0.27, + "grad_norm": 1.7590315838924429, + "learning_rate": 8.56959046203784e-06, + "loss": 0.6755, + "step": 2593 + }, + { + "epoch": 0.27, + "grad_norm": 1.8703325066678136, + "learning_rate": 8.568411482031372e-06, + "loss": 0.644, + "step": 2594 + }, + { + "epoch": 0.27, + "grad_norm": 1.9303932644741164, + "learning_rate": 8.567232097525123e-06, + "loss": 0.6309, + "step": 2595 + }, + { + "epoch": 0.27, + "grad_norm": 1.8518576249903604, + "learning_rate": 8.566052308652783e-06, + "loss": 0.6986, + "step": 2596 + }, + { + "epoch": 0.27, + "grad_norm": 1.8176810029507842, + "learning_rate": 8.56487211554809e-06, + "loss": 0.6124, + "step": 2597 + }, + { + "epoch": 0.27, + "grad_norm": 1.806871269882299, + "learning_rate": 8.563691518344822e-06, + "loss": 0.5968, + "step": 2598 + }, + { + "epoch": 0.27, + "grad_norm": 1.9774455484279878, + "learning_rate": 8.562510517176807e-06, + "loss": 0.6801, + "step": 2599 + }, + { + "epoch": 0.27, + "grad_norm": 1.8888905845005723, + "learning_rate": 8.561329112177918e-06, + "loss": 0.6651, + "step": 2600 + }, + { + "epoch": 0.27, + "grad_norm": 1.9363855520254039, + "learning_rate": 8.560147303482078e-06, + "loss": 0.7647, + "step": 2601 + }, + { + "epoch": 0.27, + "grad_norm": 2.1291448409247398, + "learning_rate": 8.558965091223248e-06, + "loss": 0.7111, + "step": 2602 + }, + { + "epoch": 0.27, + "grad_norm": 2.0391902138624625, + "learning_rate": 8.55778247553544e-06, + "loss": 0.6569, + "step": 2603 + }, + { + "epoch": 0.27, + "grad_norm": 2.0624650689714334, + "learning_rate": 8.55659945655271e-06, + "loss": 0.6995, + "step": 2604 + }, + { + "epoch": 0.27, + "grad_norm": 1.6672557776522174, + "learning_rate": 8.555416034409158e-06, + "loss": 0.5768, + "step": 2605 + }, + { + "epoch": 0.27, + "grad_norm": 1.8330406340117817, + "learning_rate": 8.554232209238935e-06, + "loss": 0.7001, + "step": 2606 + }, + { + "epoch": 0.27, + "grad_norm": 1.9879512597907347, + "learning_rate": 8.553047981176232e-06, + "loss": 0.6188, + "step": 2607 + }, + { + "epoch": 0.27, + "grad_norm": 1.945608350625391, + "learning_rate": 8.55186335035529e-06, + "loss": 0.6776, + "step": 2608 + }, + { + "epoch": 0.27, + "grad_norm": 2.0410863507175847, + "learning_rate": 8.55067831691039e-06, + "loss": 0.8017, + "step": 2609 + }, + { + "epoch": 0.27, + "grad_norm": 1.9040127392524986, + "learning_rate": 8.549492880975866e-06, + "loss": 0.7686, + "step": 2610 + }, + { + "epoch": 0.27, + "grad_norm": 1.7321283064625452, + "learning_rate": 8.548307042686093e-06, + "loss": 0.5713, + "step": 2611 + }, + { + "epoch": 0.27, + "grad_norm": 2.016949276627831, + "learning_rate": 8.54712080217549e-06, + "loss": 0.6542, + "step": 2612 + }, + { + "epoch": 0.27, + "grad_norm": 1.6897248893935806, + "learning_rate": 8.545934159578527e-06, + "loss": 0.6149, + "step": 2613 + }, + { + "epoch": 0.27, + "grad_norm": 1.7965717499797786, + "learning_rate": 8.544747115029717e-06, + "loss": 0.6785, + "step": 2614 + }, + { + "epoch": 0.27, + "grad_norm": 1.869943587447054, + "learning_rate": 8.543559668663616e-06, + "loss": 0.6481, + "step": 2615 + }, + { + "epoch": 0.27, + "grad_norm": 1.9399110552269427, + "learning_rate": 8.54237182061483e-06, + "loss": 0.5535, + "step": 2616 + }, + { + "epoch": 0.27, + "grad_norm": 1.8931184533667633, + "learning_rate": 8.541183571018006e-06, + "loss": 0.6471, + "step": 2617 + }, + { + "epoch": 0.27, + "grad_norm": 1.9115419127837032, + "learning_rate": 8.53999492000784e-06, + "loss": 0.6745, + "step": 2618 + }, + { + "epoch": 0.27, + "grad_norm": 1.8487042882352491, + "learning_rate": 8.538805867719073e-06, + "loss": 0.7526, + "step": 2619 + }, + { + "epoch": 0.27, + "grad_norm": 1.9130883130049305, + "learning_rate": 8.537616414286491e-06, + "loss": 0.6541, + "step": 2620 + }, + { + "epoch": 0.27, + "grad_norm": 1.804592790152373, + "learning_rate": 8.536426559844923e-06, + "loss": 0.6664, + "step": 2621 + }, + { + "epoch": 0.27, + "grad_norm": 1.9861938106078298, + "learning_rate": 8.53523630452925e-06, + "loss": 0.7571, + "step": 2622 + }, + { + "epoch": 0.27, + "grad_norm": 1.6363374746611283, + "learning_rate": 8.53404564847439e-06, + "loss": 0.5683, + "step": 2623 + }, + { + "epoch": 0.27, + "grad_norm": 1.77836846394158, + "learning_rate": 8.532854591815313e-06, + "loss": 0.6264, + "step": 2624 + }, + { + "epoch": 0.27, + "grad_norm": 1.8115689056920217, + "learning_rate": 8.531663134687031e-06, + "loss": 0.649, + "step": 2625 + }, + { + "epoch": 0.27, + "grad_norm": 1.734131206157076, + "learning_rate": 8.530471277224603e-06, + "loss": 0.5828, + "step": 2626 + }, + { + "epoch": 0.27, + "grad_norm": 2.1028348732686246, + "learning_rate": 8.529279019563133e-06, + "loss": 0.6539, + "step": 2627 + }, + { + "epoch": 0.27, + "grad_norm": 2.1826963739866594, + "learning_rate": 8.528086361837771e-06, + "loss": 0.7534, + "step": 2628 + }, + { + "epoch": 0.27, + "grad_norm": 1.852410404097433, + "learning_rate": 8.526893304183708e-06, + "loss": 0.591, + "step": 2629 + }, + { + "epoch": 0.27, + "grad_norm": 1.6674908894896996, + "learning_rate": 8.525699846736189e-06, + "loss": 0.5761, + "step": 2630 + }, + { + "epoch": 0.27, + "grad_norm": 2.1536689365949493, + "learning_rate": 8.524505989630493e-06, + "loss": 0.6215, + "step": 2631 + }, + { + "epoch": 0.27, + "grad_norm": 1.7369192872406605, + "learning_rate": 8.523311733001957e-06, + "loss": 0.6372, + "step": 2632 + }, + { + "epoch": 0.27, + "grad_norm": 2.491539215247505, + "learning_rate": 8.522117076985955e-06, + "loss": 0.6566, + "step": 2633 + }, + { + "epoch": 0.27, + "grad_norm": 1.8660934135316272, + "learning_rate": 8.520922021717903e-06, + "loss": 0.6222, + "step": 2634 + }, + { + "epoch": 0.27, + "grad_norm": 2.058563343023167, + "learning_rate": 8.519726567333273e-06, + "loss": 0.6289, + "step": 2635 + }, + { + "epoch": 0.27, + "grad_norm": 2.0092068256350415, + "learning_rate": 8.518530713967575e-06, + "loss": 0.6672, + "step": 2636 + }, + { + "epoch": 0.27, + "grad_norm": 1.8080677008085388, + "learning_rate": 8.517334461756366e-06, + "loss": 0.6223, + "step": 2637 + }, + { + "epoch": 0.27, + "grad_norm": 1.9543683749556946, + "learning_rate": 8.516137810835248e-06, + "loss": 0.5789, + "step": 2638 + }, + { + "epoch": 0.27, + "grad_norm": 2.2093630526599357, + "learning_rate": 8.514940761339867e-06, + "loss": 0.7268, + "step": 2639 + }, + { + "epoch": 0.27, + "grad_norm": 2.2424846594153105, + "learning_rate": 8.513743313405916e-06, + "loss": 0.6877, + "step": 2640 + }, + { + "epoch": 0.27, + "grad_norm": 4.710291182584347, + "learning_rate": 8.512545467169133e-06, + "loss": 0.6036, + "step": 2641 + }, + { + "epoch": 0.27, + "grad_norm": 1.6276412226705352, + "learning_rate": 8.5113472227653e-06, + "loss": 0.517, + "step": 2642 + }, + { + "epoch": 0.27, + "grad_norm": 1.802571808921564, + "learning_rate": 8.510148580330246e-06, + "loss": 0.5983, + "step": 2643 + }, + { + "epoch": 0.27, + "grad_norm": 2.067739256881607, + "learning_rate": 8.508949539999845e-06, + "loss": 0.6387, + "step": 2644 + }, + { + "epoch": 0.27, + "grad_norm": 1.7614511548342808, + "learning_rate": 8.50775010191001e-06, + "loss": 0.6034, + "step": 2645 + }, + { + "epoch": 0.28, + "grad_norm": 1.852977153736663, + "learning_rate": 8.50655026619671e-06, + "loss": 0.6811, + "step": 2646 + }, + { + "epoch": 0.28, + "grad_norm": 1.7666746273554579, + "learning_rate": 8.505350032995946e-06, + "loss": 0.662, + "step": 2647 + }, + { + "epoch": 0.28, + "grad_norm": 1.9788840184781433, + "learning_rate": 8.504149402443782e-06, + "loss": 0.6714, + "step": 2648 + }, + { + "epoch": 0.28, + "grad_norm": 1.763046798035941, + "learning_rate": 8.502948374676307e-06, + "loss": 0.6534, + "step": 2649 + }, + { + "epoch": 0.28, + "grad_norm": 1.6872062703419626, + "learning_rate": 8.501746949829668e-06, + "loss": 0.6629, + "step": 2650 + }, + { + "epoch": 0.28, + "grad_norm": 1.9880445367487545, + "learning_rate": 8.500545128040052e-06, + "loss": 0.6643, + "step": 2651 + }, + { + "epoch": 0.28, + "grad_norm": 1.65755205744229, + "learning_rate": 8.499342909443697e-06, + "loss": 0.6829, + "step": 2652 + }, + { + "epoch": 0.28, + "grad_norm": 1.9105964610284794, + "learning_rate": 8.498140294176874e-06, + "loss": 0.5986, + "step": 2653 + }, + { + "epoch": 0.28, + "grad_norm": 1.7850157986482749, + "learning_rate": 8.496937282375912e-06, + "loss": 0.598, + "step": 2654 + }, + { + "epoch": 0.28, + "grad_norm": 1.830316459639485, + "learning_rate": 8.495733874177176e-06, + "loss": 0.6988, + "step": 2655 + }, + { + "epoch": 0.28, + "grad_norm": 1.8863090070314195, + "learning_rate": 8.49453006971708e-06, + "loss": 0.6376, + "step": 2656 + }, + { + "epoch": 0.28, + "grad_norm": 1.9381452812775855, + "learning_rate": 8.493325869132083e-06, + "loss": 0.5406, + "step": 2657 + }, + { + "epoch": 0.28, + "grad_norm": 1.9249092805322674, + "learning_rate": 8.492121272558687e-06, + "loss": 0.6696, + "step": 2658 + }, + { + "epoch": 0.28, + "grad_norm": 2.110242213687224, + "learning_rate": 8.49091628013344e-06, + "loss": 0.7029, + "step": 2659 + }, + { + "epoch": 0.28, + "grad_norm": 1.797032481255268, + "learning_rate": 8.489710891992938e-06, + "loss": 0.8027, + "step": 2660 + }, + { + "epoch": 0.28, + "grad_norm": 2.070668589376598, + "learning_rate": 8.488505108273813e-06, + "loss": 0.7835, + "step": 2661 + }, + { + "epoch": 0.28, + "grad_norm": 1.8527254181639246, + "learning_rate": 8.487298929112751e-06, + "loss": 0.5424, + "step": 2662 + }, + { + "epoch": 0.28, + "grad_norm": 1.9530291879277955, + "learning_rate": 8.486092354646478e-06, + "loss": 0.6221, + "step": 2663 + }, + { + "epoch": 0.28, + "grad_norm": 2.0275784280967564, + "learning_rate": 8.484885385011765e-06, + "loss": 0.6157, + "step": 2664 + }, + { + "epoch": 0.28, + "grad_norm": 1.7405853747938145, + "learning_rate": 8.483678020345433e-06, + "loss": 0.6301, + "step": 2665 + }, + { + "epoch": 0.28, + "grad_norm": 2.249414015385447, + "learning_rate": 8.48247026078434e-06, + "loss": 0.7239, + "step": 2666 + }, + { + "epoch": 0.28, + "grad_norm": 1.8456342897721276, + "learning_rate": 8.481262106465395e-06, + "loss": 0.6632, + "step": 2667 + }, + { + "epoch": 0.28, + "grad_norm": 1.9602153469722, + "learning_rate": 8.480053557525544e-06, + "loss": 0.795, + "step": 2668 + }, + { + "epoch": 0.28, + "grad_norm": 1.969295867379851, + "learning_rate": 8.478844614101792e-06, + "loss": 0.6968, + "step": 2669 + }, + { + "epoch": 0.28, + "grad_norm": 1.966399979210272, + "learning_rate": 8.47763527633117e-06, + "loss": 0.7567, + "step": 2670 + }, + { + "epoch": 0.28, + "grad_norm": 1.9079604001646893, + "learning_rate": 8.476425544350768e-06, + "loss": 0.6965, + "step": 2671 + }, + { + "epoch": 0.28, + "grad_norm": 1.8146309399032956, + "learning_rate": 8.475215418297718e-06, + "loss": 0.6707, + "step": 2672 + }, + { + "epoch": 0.28, + "grad_norm": 1.6768376922133157, + "learning_rate": 8.47400489830919e-06, + "loss": 0.6126, + "step": 2673 + }, + { + "epoch": 0.28, + "grad_norm": 1.9499300869990017, + "learning_rate": 8.472793984522406e-06, + "loss": 0.5496, + "step": 2674 + }, + { + "epoch": 0.28, + "grad_norm": 2.1684600034276653, + "learning_rate": 8.47158267707463e-06, + "loss": 0.6461, + "step": 2675 + }, + { + "epoch": 0.28, + "grad_norm": 1.817694567431662, + "learning_rate": 8.470370976103171e-06, + "loss": 0.5364, + "step": 2676 + }, + { + "epoch": 0.28, + "grad_norm": 2.217641057731939, + "learning_rate": 8.46915888174538e-06, + "loss": 0.689, + "step": 2677 + }, + { + "epoch": 0.28, + "grad_norm": 1.8021492176632457, + "learning_rate": 8.467946394138657e-06, + "loss": 0.6299, + "step": 2678 + }, + { + "epoch": 0.28, + "grad_norm": 1.9107035431154975, + "learning_rate": 8.466733513420442e-06, + "loss": 0.5893, + "step": 2679 + }, + { + "epoch": 0.28, + "grad_norm": 1.8742357813979735, + "learning_rate": 8.465520239728225e-06, + "loss": 0.6131, + "step": 2680 + }, + { + "epoch": 0.28, + "grad_norm": 1.9727390371432398, + "learning_rate": 8.464306573199536e-06, + "loss": 0.6842, + "step": 2681 + }, + { + "epoch": 0.28, + "grad_norm": 1.7240250249963895, + "learning_rate": 8.46309251397195e-06, + "loss": 0.6465, + "step": 2682 + }, + { + "epoch": 0.28, + "grad_norm": 1.9189298287066967, + "learning_rate": 8.461878062183092e-06, + "loss": 0.6417, + "step": 2683 + }, + { + "epoch": 0.28, + "grad_norm": 1.800900644472255, + "learning_rate": 8.46066321797062e-06, + "loss": 0.7715, + "step": 2684 + }, + { + "epoch": 0.28, + "grad_norm": 1.8212217697731836, + "learning_rate": 8.459447981472249e-06, + "loss": 0.7056, + "step": 2685 + }, + { + "epoch": 0.28, + "grad_norm": 1.9752112975069571, + "learning_rate": 8.45823235282573e-06, + "loss": 0.6689, + "step": 2686 + }, + { + "epoch": 0.28, + "grad_norm": 1.9746070435343586, + "learning_rate": 8.457016332168862e-06, + "loss": 0.7456, + "step": 2687 + }, + { + "epoch": 0.28, + "grad_norm": 3.432989926354537, + "learning_rate": 8.455799919639489e-06, + "loss": 0.593, + "step": 2688 + }, + { + "epoch": 0.28, + "grad_norm": 1.982266556212586, + "learning_rate": 8.454583115375498e-06, + "loss": 0.6421, + "step": 2689 + }, + { + "epoch": 0.28, + "grad_norm": 1.7985726638165644, + "learning_rate": 8.45336591951482e-06, + "loss": 0.5681, + "step": 2690 + }, + { + "epoch": 0.28, + "grad_norm": 2.1677613588933418, + "learning_rate": 8.452148332195434e-06, + "loss": 0.7061, + "step": 2691 + }, + { + "epoch": 0.28, + "grad_norm": 1.8779569941032115, + "learning_rate": 8.450930353555355e-06, + "loss": 0.6452, + "step": 2692 + }, + { + "epoch": 0.28, + "grad_norm": 1.8980254003942776, + "learning_rate": 8.449711983732652e-06, + "loss": 0.7254, + "step": 2693 + }, + { + "epoch": 0.28, + "grad_norm": 1.7453475241560552, + "learning_rate": 8.448493222865432e-06, + "loss": 0.6741, + "step": 2694 + }, + { + "epoch": 0.28, + "grad_norm": 1.8262450441941662, + "learning_rate": 8.447274071091848e-06, + "loss": 0.7417, + "step": 2695 + }, + { + "epoch": 0.28, + "grad_norm": 1.8776699685839715, + "learning_rate": 8.446054528550104e-06, + "loss": 0.6622, + "step": 2696 + }, + { + "epoch": 0.28, + "grad_norm": 2.0960061725496963, + "learning_rate": 8.444834595378434e-06, + "loss": 0.6906, + "step": 2697 + }, + { + "epoch": 0.28, + "grad_norm": 1.9118518648617624, + "learning_rate": 8.443614271715128e-06, + "loss": 0.6912, + "step": 2698 + }, + { + "epoch": 0.28, + "grad_norm": 1.9194363297045363, + "learning_rate": 8.442393557698517e-06, + "loss": 0.5579, + "step": 2699 + }, + { + "epoch": 0.28, + "grad_norm": 1.9081413101253448, + "learning_rate": 8.441172453466974e-06, + "loss": 0.6608, + "step": 2700 + }, + { + "epoch": 0.28, + "grad_norm": 1.9249403721722926, + "learning_rate": 8.43995095915892e-06, + "loss": 0.7314, + "step": 2701 + }, + { + "epoch": 0.28, + "grad_norm": 1.9096647611073485, + "learning_rate": 8.438729074912819e-06, + "loss": 0.5881, + "step": 2702 + }, + { + "epoch": 0.28, + "grad_norm": 2.1034642501676415, + "learning_rate": 8.437506800867176e-06, + "loss": 0.7437, + "step": 2703 + }, + { + "epoch": 0.28, + "grad_norm": 2.0906568978132567, + "learning_rate": 8.436284137160544e-06, + "loss": 0.685, + "step": 2704 + }, + { + "epoch": 0.28, + "grad_norm": 1.7856893381417736, + "learning_rate": 8.435061083931519e-06, + "loss": 0.6323, + "step": 2705 + }, + { + "epoch": 0.28, + "grad_norm": 1.9666249344130147, + "learning_rate": 8.433837641318741e-06, + "loss": 0.6732, + "step": 2706 + }, + { + "epoch": 0.28, + "grad_norm": 2.122621535668428, + "learning_rate": 8.432613809460895e-06, + "loss": 0.6447, + "step": 2707 + }, + { + "epoch": 0.28, + "grad_norm": 2.0054957981285253, + "learning_rate": 8.431389588496708e-06, + "loss": 0.6584, + "step": 2708 + }, + { + "epoch": 0.28, + "grad_norm": 1.9393041610845445, + "learning_rate": 8.430164978564952e-06, + "loss": 0.6929, + "step": 2709 + }, + { + "epoch": 0.28, + "grad_norm": 1.8108812484917312, + "learning_rate": 8.428939979804445e-06, + "loss": 0.6626, + "step": 2710 + }, + { + "epoch": 0.28, + "grad_norm": 2.021521385977441, + "learning_rate": 8.427714592354046e-06, + "loss": 0.7671, + "step": 2711 + }, + { + "epoch": 0.28, + "grad_norm": 2.0810704085129377, + "learning_rate": 8.426488816352662e-06, + "loss": 0.7076, + "step": 2712 + }, + { + "epoch": 0.28, + "grad_norm": 1.9862619828862162, + "learning_rate": 8.425262651939238e-06, + "loss": 0.669, + "step": 2713 + }, + { + "epoch": 0.28, + "grad_norm": 1.951010662132229, + "learning_rate": 8.424036099252772e-06, + "loss": 0.6356, + "step": 2714 + }, + { + "epoch": 0.28, + "grad_norm": 1.9863665255721001, + "learning_rate": 8.422809158432296e-06, + "loss": 0.6026, + "step": 2715 + }, + { + "epoch": 0.28, + "grad_norm": 1.6639586057276734, + "learning_rate": 8.421581829616893e-06, + "loss": 0.6527, + "step": 2716 + }, + { + "epoch": 0.28, + "grad_norm": 2.103415737649651, + "learning_rate": 8.42035411294569e-06, + "loss": 0.5964, + "step": 2717 + }, + { + "epoch": 0.28, + "grad_norm": 1.5707603548039, + "learning_rate": 8.41912600855785e-06, + "loss": 0.6068, + "step": 2718 + }, + { + "epoch": 0.28, + "grad_norm": 1.8106080608902415, + "learning_rate": 8.417897516592589e-06, + "loss": 0.6864, + "step": 2719 + }, + { + "epoch": 0.28, + "grad_norm": 2.1259873253643327, + "learning_rate": 8.416668637189162e-06, + "loss": 0.7358, + "step": 2720 + }, + { + "epoch": 0.28, + "grad_norm": 2.083747722616653, + "learning_rate": 8.415439370486872e-06, + "loss": 0.6944, + "step": 2721 + }, + { + "epoch": 0.28, + "grad_norm": 2.063196434811401, + "learning_rate": 8.414209716625062e-06, + "loss": 0.741, + "step": 2722 + }, + { + "epoch": 0.28, + "grad_norm": 1.7907869386100392, + "learning_rate": 8.41297967574312e-06, + "loss": 0.7097, + "step": 2723 + }, + { + "epoch": 0.28, + "grad_norm": 1.74621161890205, + "learning_rate": 8.411749247980478e-06, + "loss": 0.6878, + "step": 2724 + }, + { + "epoch": 0.28, + "grad_norm": 1.7515260020420864, + "learning_rate": 8.410518433476613e-06, + "loss": 0.7515, + "step": 2725 + }, + { + "epoch": 0.28, + "grad_norm": 1.9016574392369556, + "learning_rate": 8.409287232371043e-06, + "loss": 0.5963, + "step": 2726 + }, + { + "epoch": 0.28, + "grad_norm": 1.8126661652045508, + "learning_rate": 8.408055644803335e-06, + "loss": 0.6511, + "step": 2727 + }, + { + "epoch": 0.28, + "grad_norm": 1.8555841277858098, + "learning_rate": 8.406823670913093e-06, + "loss": 0.5926, + "step": 2728 + }, + { + "epoch": 0.28, + "grad_norm": 2.0801913638478475, + "learning_rate": 8.405591310839972e-06, + "loss": 0.7424, + "step": 2729 + }, + { + "epoch": 0.28, + "grad_norm": 1.8777432597286765, + "learning_rate": 8.404358564723663e-06, + "loss": 0.7011, + "step": 2730 + }, + { + "epoch": 0.28, + "grad_norm": 1.8320426461813335, + "learning_rate": 8.403125432703904e-06, + "loss": 0.6469, + "step": 2731 + }, + { + "epoch": 0.28, + "grad_norm": 1.8676234472685271, + "learning_rate": 8.401891914920483e-06, + "loss": 0.7347, + "step": 2732 + }, + { + "epoch": 0.28, + "grad_norm": 2.1091699541707456, + "learning_rate": 8.400658011513223e-06, + "loss": 0.6774, + "step": 2733 + }, + { + "epoch": 0.28, + "grad_norm": 1.778219564489563, + "learning_rate": 8.399423722621994e-06, + "loss": 0.708, + "step": 2734 + }, + { + "epoch": 0.28, + "grad_norm": 1.791116263128344, + "learning_rate": 8.398189048386708e-06, + "loss": 0.5954, + "step": 2735 + }, + { + "epoch": 0.28, + "grad_norm": 1.9654234959498464, + "learning_rate": 8.396953988947327e-06, + "loss": 0.7262, + "step": 2736 + }, + { + "epoch": 0.28, + "grad_norm": 1.923570947146556, + "learning_rate": 8.39571854444385e-06, + "loss": 0.6263, + "step": 2737 + }, + { + "epoch": 0.28, + "grad_norm": 1.9762691203393106, + "learning_rate": 8.394482715016318e-06, + "loss": 0.7634, + "step": 2738 + }, + { + "epoch": 0.28, + "grad_norm": 1.785056907642776, + "learning_rate": 8.393246500804825e-06, + "loss": 0.6711, + "step": 2739 + }, + { + "epoch": 0.28, + "grad_norm": 1.92259639664194, + "learning_rate": 8.3920099019495e-06, + "loss": 0.6447, + "step": 2740 + }, + { + "epoch": 0.28, + "grad_norm": 1.9325806794278027, + "learning_rate": 8.390772918590517e-06, + "loss": 0.6193, + "step": 2741 + }, + { + "epoch": 0.29, + "grad_norm": 1.796897789122905, + "learning_rate": 8.389535550868098e-06, + "loss": 0.5864, + "step": 2742 + }, + { + "epoch": 0.29, + "grad_norm": 1.5789258620706366, + "learning_rate": 8.388297798922505e-06, + "loss": 0.6275, + "step": 2743 + }, + { + "epoch": 0.29, + "grad_norm": 1.9466379670977967, + "learning_rate": 8.387059662894043e-06, + "loss": 0.6843, + "step": 2744 + }, + { + "epoch": 0.29, + "grad_norm": 1.7072700706186492, + "learning_rate": 8.385821142923064e-06, + "loss": 0.669, + "step": 2745 + }, + { + "epoch": 0.29, + "grad_norm": 1.848227815607721, + "learning_rate": 8.38458223914996e-06, + "loss": 0.6242, + "step": 2746 + }, + { + "epoch": 0.29, + "grad_norm": 1.8308049578888184, + "learning_rate": 8.383342951715165e-06, + "loss": 0.6314, + "step": 2747 + }, + { + "epoch": 0.29, + "grad_norm": 2.1493743941726224, + "learning_rate": 8.382103280759164e-06, + "loss": 0.6465, + "step": 2748 + }, + { + "epoch": 0.29, + "grad_norm": 1.9101945267736256, + "learning_rate": 8.380863226422478e-06, + "loss": 0.6816, + "step": 2749 + }, + { + "epoch": 0.29, + "grad_norm": 1.9718539473467542, + "learning_rate": 8.379622788845675e-06, + "loss": 0.7033, + "step": 2750 + }, + { + "epoch": 0.29, + "grad_norm": 2.038835831098079, + "learning_rate": 8.378381968169368e-06, + "loss": 0.6988, + "step": 2751 + }, + { + "epoch": 0.29, + "grad_norm": 1.7900118328139933, + "learning_rate": 8.377140764534206e-06, + "loss": 0.5064, + "step": 2752 + }, + { + "epoch": 0.29, + "grad_norm": 1.7774886330392579, + "learning_rate": 8.37589917808089e-06, + "loss": 0.6096, + "step": 2753 + }, + { + "epoch": 0.29, + "grad_norm": 1.809054839530392, + "learning_rate": 8.37465720895016e-06, + "loss": 0.6773, + "step": 2754 + }, + { + "epoch": 0.29, + "grad_norm": 1.5992093185602265, + "learning_rate": 8.373414857282802e-06, + "loss": 0.5207, + "step": 2755 + }, + { + "epoch": 0.29, + "grad_norm": 2.010983378957128, + "learning_rate": 8.372172123219639e-06, + "loss": 0.7494, + "step": 2756 + }, + { + "epoch": 0.29, + "grad_norm": 1.9938803891005408, + "learning_rate": 8.370929006901547e-06, + "loss": 0.5927, + "step": 2757 + }, + { + "epoch": 0.29, + "grad_norm": 1.8945446825004, + "learning_rate": 8.36968550846944e-06, + "loss": 0.6086, + "step": 2758 + }, + { + "epoch": 0.29, + "grad_norm": 1.9169728105853157, + "learning_rate": 8.368441628064273e-06, + "loss": 0.6921, + "step": 2759 + }, + { + "epoch": 0.29, + "grad_norm": 1.9901330548989136, + "learning_rate": 8.367197365827047e-06, + "loss": 0.6753, + "step": 2760 + }, + { + "epoch": 0.29, + "grad_norm": 2.0824344592889297, + "learning_rate": 8.365952721898806e-06, + "loss": 0.6964, + "step": 2761 + }, + { + "epoch": 0.29, + "grad_norm": 2.1360108572284773, + "learning_rate": 8.364707696420642e-06, + "loss": 0.7696, + "step": 2762 + }, + { + "epoch": 0.29, + "grad_norm": 1.6831167248161611, + "learning_rate": 8.363462289533681e-06, + "loss": 0.5605, + "step": 2763 + }, + { + "epoch": 0.29, + "grad_norm": 1.989356336067944, + "learning_rate": 8.362216501379096e-06, + "loss": 0.7506, + "step": 2764 + }, + { + "epoch": 0.29, + "grad_norm": 1.5598840637317084, + "learning_rate": 8.360970332098111e-06, + "loss": 0.6175, + "step": 2765 + }, + { + "epoch": 0.29, + "grad_norm": 2.1096417150952536, + "learning_rate": 8.359723781831978e-06, + "loss": 0.7308, + "step": 2766 + }, + { + "epoch": 0.29, + "grad_norm": 1.6011126729548892, + "learning_rate": 8.358476850722007e-06, + "loss": 0.6811, + "step": 2767 + }, + { + "epoch": 0.29, + "grad_norm": 1.9464596799550342, + "learning_rate": 8.357229538909542e-06, + "loss": 0.663, + "step": 2768 + }, + { + "epoch": 0.29, + "grad_norm": 1.9137397893266275, + "learning_rate": 8.355981846535972e-06, + "loss": 0.6147, + "step": 2769 + }, + { + "epoch": 0.29, + "grad_norm": 1.764067948839555, + "learning_rate": 8.354733773742734e-06, + "loss": 0.6074, + "step": 2770 + }, + { + "epoch": 0.29, + "grad_norm": 1.9148936860052028, + "learning_rate": 8.353485320671298e-06, + "loss": 0.7877, + "step": 2771 + }, + { + "epoch": 0.29, + "grad_norm": 1.7112798128547464, + "learning_rate": 8.352236487463188e-06, + "loss": 0.5686, + "step": 2772 + }, + { + "epoch": 0.29, + "grad_norm": 1.992101185683906, + "learning_rate": 8.350987274259966e-06, + "loss": 0.7142, + "step": 2773 + }, + { + "epoch": 0.29, + "grad_norm": 1.8753999534127264, + "learning_rate": 8.349737681203234e-06, + "loss": 0.6127, + "step": 2774 + }, + { + "epoch": 0.29, + "grad_norm": 2.142936199132452, + "learning_rate": 8.348487708434644e-06, + "loss": 0.6528, + "step": 2775 + }, + { + "epoch": 0.29, + "grad_norm": 2.1341310375948317, + "learning_rate": 8.347237356095888e-06, + "loss": 0.6646, + "step": 2776 + }, + { + "epoch": 0.29, + "grad_norm": 1.9282855617561891, + "learning_rate": 8.3459866243287e-06, + "loss": 0.7142, + "step": 2777 + }, + { + "epoch": 0.29, + "grad_norm": 1.8331210754757201, + "learning_rate": 8.344735513274853e-06, + "loss": 0.7037, + "step": 2778 + }, + { + "epoch": 0.29, + "grad_norm": 1.9595823974640136, + "learning_rate": 8.343484023076175e-06, + "loss": 0.6657, + "step": 2779 + }, + { + "epoch": 0.29, + "grad_norm": 1.8681476948952542, + "learning_rate": 8.342232153874521e-06, + "loss": 0.6335, + "step": 2780 + }, + { + "epoch": 0.29, + "grad_norm": 1.7368048649716803, + "learning_rate": 8.340979905811805e-06, + "loss": 0.6599, + "step": 2781 + }, + { + "epoch": 0.29, + "grad_norm": 2.688572412587195, + "learning_rate": 8.339727279029974e-06, + "loss": 0.6573, + "step": 2782 + }, + { + "epoch": 0.29, + "grad_norm": 1.7254670078388807, + "learning_rate": 8.33847427367102e-06, + "loss": 0.7033, + "step": 2783 + }, + { + "epoch": 0.29, + "grad_norm": 1.7445016985559958, + "learning_rate": 8.337220889876978e-06, + "loss": 0.6888, + "step": 2784 + }, + { + "epoch": 0.29, + "grad_norm": 1.8956447209405711, + "learning_rate": 8.335967127789929e-06, + "loss": 0.6261, + "step": 2785 + }, + { + "epoch": 0.29, + "grad_norm": 1.8284382496111118, + "learning_rate": 8.334712987551989e-06, + "loss": 0.6161, + "step": 2786 + }, + { + "epoch": 0.29, + "grad_norm": 1.828969267565181, + "learning_rate": 8.333458469305324e-06, + "loss": 0.7489, + "step": 2787 + }, + { + "epoch": 0.29, + "grad_norm": 1.7603821492238025, + "learning_rate": 8.332203573192143e-06, + "loss": 0.5797, + "step": 2788 + }, + { + "epoch": 0.29, + "grad_norm": 2.0787481786361366, + "learning_rate": 8.330948299354694e-06, + "loss": 0.6936, + "step": 2789 + }, + { + "epoch": 0.29, + "grad_norm": 2.0677329033718657, + "learning_rate": 8.329692647935269e-06, + "loss": 0.6972, + "step": 2790 + }, + { + "epoch": 0.29, + "grad_norm": 1.775462102298857, + "learning_rate": 8.328436619076203e-06, + "loss": 0.6864, + "step": 2791 + }, + { + "epoch": 0.29, + "grad_norm": 2.0303699301163443, + "learning_rate": 8.327180212919877e-06, + "loss": 0.6823, + "step": 2792 + }, + { + "epoch": 0.29, + "grad_norm": 2.210558036257578, + "learning_rate": 8.32592342960871e-06, + "loss": 0.7242, + "step": 2793 + }, + { + "epoch": 0.29, + "grad_norm": 2.1828645450156468, + "learning_rate": 8.324666269285161e-06, + "loss": 0.6827, + "step": 2794 + }, + { + "epoch": 0.29, + "grad_norm": 1.6089239490667053, + "learning_rate": 8.323408732091743e-06, + "loss": 0.5493, + "step": 2795 + }, + { + "epoch": 0.29, + "grad_norm": 1.7779698846657657, + "learning_rate": 8.322150818171002e-06, + "loss": 0.6296, + "step": 2796 + }, + { + "epoch": 0.29, + "grad_norm": 2.0643568991443635, + "learning_rate": 8.32089252766553e-06, + "loss": 0.5536, + "step": 2797 + }, + { + "epoch": 0.29, + "grad_norm": 1.86510472363077, + "learning_rate": 8.319633860717963e-06, + "loss": 0.6896, + "step": 2798 + }, + { + "epoch": 0.29, + "grad_norm": 1.9510095493605686, + "learning_rate": 8.318374817470976e-06, + "loss": 0.6448, + "step": 2799 + }, + { + "epoch": 0.29, + "grad_norm": 1.885516169850448, + "learning_rate": 8.317115398067289e-06, + "loss": 0.669, + "step": 2800 + }, + { + "epoch": 0.29, + "grad_norm": 2.2572097683864305, + "learning_rate": 8.315855602649662e-06, + "loss": 0.5818, + "step": 2801 + }, + { + "epoch": 0.29, + "grad_norm": 1.759056075445115, + "learning_rate": 8.314595431360906e-06, + "loss": 0.6457, + "step": 2802 + }, + { + "epoch": 0.29, + "grad_norm": 1.6425939825571452, + "learning_rate": 8.313334884343866e-06, + "loss": 0.5231, + "step": 2803 + }, + { + "epoch": 0.29, + "grad_norm": 1.7130113252654398, + "learning_rate": 8.31207396174143e-06, + "loss": 0.7554, + "step": 2804 + }, + { + "epoch": 0.29, + "grad_norm": 1.955914565838696, + "learning_rate": 8.310812663696531e-06, + "loss": 0.627, + "step": 2805 + }, + { + "epoch": 0.29, + "grad_norm": 1.6720147103849268, + "learning_rate": 8.309550990352146e-06, + "loss": 0.6079, + "step": 2806 + }, + { + "epoch": 0.29, + "grad_norm": 2.237069935203191, + "learning_rate": 8.308288941851295e-06, + "loss": 0.7222, + "step": 2807 + }, + { + "epoch": 0.29, + "grad_norm": 1.8290522376346103, + "learning_rate": 8.307026518337033e-06, + "loss": 0.5718, + "step": 2808 + }, + { + "epoch": 0.29, + "grad_norm": 1.8328742008169372, + "learning_rate": 8.305763719952467e-06, + "loss": 0.6836, + "step": 2809 + }, + { + "epoch": 0.29, + "grad_norm": 2.0006170517993533, + "learning_rate": 8.304500546840742e-06, + "loss": 0.6471, + "step": 2810 + }, + { + "epoch": 0.29, + "grad_norm": 1.863878297149142, + "learning_rate": 8.303236999145044e-06, + "loss": 0.7006, + "step": 2811 + }, + { + "epoch": 0.29, + "grad_norm": 1.9753606075612662, + "learning_rate": 8.301973077008604e-06, + "loss": 0.6209, + "step": 2812 + }, + { + "epoch": 0.29, + "grad_norm": 1.85924478525618, + "learning_rate": 8.300708780574695e-06, + "loss": 0.58, + "step": 2813 + }, + { + "epoch": 0.29, + "grad_norm": 1.9617682288103584, + "learning_rate": 8.299444109986631e-06, + "loss": 0.6333, + "step": 2814 + }, + { + "epoch": 0.29, + "grad_norm": 1.9932942836246554, + "learning_rate": 8.298179065387774e-06, + "loss": 0.6311, + "step": 2815 + }, + { + "epoch": 0.29, + "grad_norm": 1.9582877206850164, + "learning_rate": 8.29691364692152e-06, + "loss": 0.6961, + "step": 2816 + }, + { + "epoch": 0.29, + "grad_norm": 2.011029682430342, + "learning_rate": 8.295647854731312e-06, + "loss": 0.6541, + "step": 2817 + }, + { + "epoch": 0.29, + "grad_norm": 1.9245441043938138, + "learning_rate": 8.294381688960634e-06, + "loss": 0.6018, + "step": 2818 + }, + { + "epoch": 0.29, + "grad_norm": 2.1310980060323508, + "learning_rate": 8.293115149753016e-06, + "loss": 0.7579, + "step": 2819 + }, + { + "epoch": 0.29, + "grad_norm": 1.775629952527209, + "learning_rate": 8.291848237252025e-06, + "loss": 0.6678, + "step": 2820 + }, + { + "epoch": 0.29, + "grad_norm": 1.8880663986965478, + "learning_rate": 8.290580951601272e-06, + "loss": 0.5957, + "step": 2821 + }, + { + "epoch": 0.29, + "grad_norm": 2.269045732221966, + "learning_rate": 8.289313292944415e-06, + "loss": 0.6567, + "step": 2822 + }, + { + "epoch": 0.29, + "grad_norm": 2.157095157644829, + "learning_rate": 8.288045261425146e-06, + "loss": 0.7711, + "step": 2823 + }, + { + "epoch": 0.29, + "grad_norm": 1.7717605727416954, + "learning_rate": 8.286776857187205e-06, + "loss": 0.5993, + "step": 2824 + }, + { + "epoch": 0.29, + "grad_norm": 2.2377882664385167, + "learning_rate": 8.285508080374376e-06, + "loss": 0.6189, + "step": 2825 + }, + { + "epoch": 0.29, + "grad_norm": 1.9421413495128026, + "learning_rate": 8.284238931130476e-06, + "loss": 0.6647, + "step": 2826 + }, + { + "epoch": 0.29, + "grad_norm": 1.9791339729087056, + "learning_rate": 8.282969409599375e-06, + "loss": 0.7541, + "step": 2827 + }, + { + "epoch": 0.29, + "grad_norm": 2.2188976160973986, + "learning_rate": 8.28169951592498e-06, + "loss": 0.6117, + "step": 2828 + }, + { + "epoch": 0.29, + "grad_norm": 2.018124174549173, + "learning_rate": 8.280429250251238e-06, + "loss": 0.6828, + "step": 2829 + }, + { + "epoch": 0.29, + "grad_norm": 1.9341218682789678, + "learning_rate": 8.279158612722145e-06, + "loss": 0.628, + "step": 2830 + }, + { + "epoch": 0.29, + "grad_norm": 1.861973629599436, + "learning_rate": 8.27788760348173e-06, + "loss": 0.6198, + "step": 2831 + }, + { + "epoch": 0.29, + "grad_norm": 1.7662295415002316, + "learning_rate": 8.276616222674072e-06, + "loss": 0.6158, + "step": 2832 + }, + { + "epoch": 0.29, + "grad_norm": 1.7746613606468469, + "learning_rate": 8.275344470443292e-06, + "loss": 0.6234, + "step": 2833 + }, + { + "epoch": 0.29, + "grad_norm": 1.77084820465646, + "learning_rate": 8.274072346933544e-06, + "loss": 0.6325, + "step": 2834 + }, + { + "epoch": 0.29, + "grad_norm": 1.9053142262270337, + "learning_rate": 8.272799852289036e-06, + "loss": 0.5998, + "step": 2835 + }, + { + "epoch": 0.29, + "grad_norm": 1.9878612490236216, + "learning_rate": 8.27152698665401e-06, + "loss": 0.8256, + "step": 2836 + }, + { + "epoch": 0.29, + "grad_norm": 1.7506593303708355, + "learning_rate": 8.270253750172754e-06, + "loss": 0.6899, + "step": 2837 + }, + { + "epoch": 0.3, + "grad_norm": 1.9372791674384715, + "learning_rate": 8.268980142989594e-06, + "loss": 0.6556, + "step": 2838 + }, + { + "epoch": 0.3, + "grad_norm": 1.9580485272446944, + "learning_rate": 8.267706165248901e-06, + "loss": 0.6014, + "step": 2839 + }, + { + "epoch": 0.3, + "grad_norm": 1.94607284887436, + "learning_rate": 8.266431817095094e-06, + "loss": 0.7343, + "step": 2840 + }, + { + "epoch": 0.3, + "grad_norm": 2.0233162109195444, + "learning_rate": 8.265157098672617e-06, + "loss": 0.6424, + "step": 2841 + }, + { + "epoch": 0.3, + "grad_norm": 1.7612601997749828, + "learning_rate": 8.263882010125974e-06, + "loss": 0.6468, + "step": 2842 + }, + { + "epoch": 0.3, + "grad_norm": 1.8166003406613074, + "learning_rate": 8.262606551599701e-06, + "loss": 0.7198, + "step": 2843 + }, + { + "epoch": 0.3, + "grad_norm": 2.1777514169067005, + "learning_rate": 8.261330723238381e-06, + "loss": 0.6666, + "step": 2844 + }, + { + "epoch": 0.3, + "grad_norm": 1.8431539782484165, + "learning_rate": 8.260054525186634e-06, + "loss": 0.6959, + "step": 2845 + }, + { + "epoch": 0.3, + "grad_norm": 1.9351049160946372, + "learning_rate": 8.258777957589124e-06, + "loss": 0.6634, + "step": 2846 + }, + { + "epoch": 0.3, + "grad_norm": 1.785975155097984, + "learning_rate": 8.257501020590557e-06, + "loss": 0.7822, + "step": 2847 + }, + { + "epoch": 0.3, + "grad_norm": 1.7172233034670437, + "learning_rate": 8.256223714335685e-06, + "loss": 0.593, + "step": 2848 + }, + { + "epoch": 0.3, + "grad_norm": 1.8331082125147111, + "learning_rate": 8.254946038969294e-06, + "loss": 0.5986, + "step": 2849 + }, + { + "epoch": 0.3, + "grad_norm": 1.5989269880200359, + "learning_rate": 8.253667994636216e-06, + "loss": 0.6494, + "step": 2850 + }, + { + "epoch": 0.3, + "grad_norm": 1.8126460703093885, + "learning_rate": 8.252389581481328e-06, + "loss": 0.6088, + "step": 2851 + }, + { + "epoch": 0.3, + "grad_norm": 1.7710013320024187, + "learning_rate": 8.25111079964954e-06, + "loss": 0.692, + "step": 2852 + }, + { + "epoch": 0.3, + "grad_norm": 1.7261220654564688, + "learning_rate": 8.249831649285813e-06, + "loss": 0.6587, + "step": 2853 + }, + { + "epoch": 0.3, + "grad_norm": 1.8275573012523363, + "learning_rate": 8.248552130535146e-06, + "loss": 0.6926, + "step": 2854 + }, + { + "epoch": 0.3, + "grad_norm": 1.8146408797397333, + "learning_rate": 8.247272243542579e-06, + "loss": 0.603, + "step": 2855 + }, + { + "epoch": 0.3, + "grad_norm": 1.9636780696037397, + "learning_rate": 8.245991988453193e-06, + "loss": 0.7721, + "step": 2856 + }, + { + "epoch": 0.3, + "grad_norm": 1.7874387137159384, + "learning_rate": 8.244711365412113e-06, + "loss": 0.5659, + "step": 2857 + }, + { + "epoch": 0.3, + "grad_norm": 1.836389067622902, + "learning_rate": 8.243430374564507e-06, + "loss": 0.6837, + "step": 2858 + }, + { + "epoch": 0.3, + "grad_norm": 1.7888108908675027, + "learning_rate": 8.242149016055582e-06, + "loss": 0.5791, + "step": 2859 + }, + { + "epoch": 0.3, + "grad_norm": 1.8335701181284372, + "learning_rate": 8.240867290030585e-06, + "loss": 0.6948, + "step": 2860 + }, + { + "epoch": 0.3, + "grad_norm": 2.0901484654745426, + "learning_rate": 8.239585196634808e-06, + "loss": 0.748, + "step": 2861 + }, + { + "epoch": 0.3, + "grad_norm": 1.7432176517498563, + "learning_rate": 8.238302736013587e-06, + "loss": 0.6321, + "step": 2862 + }, + { + "epoch": 0.3, + "grad_norm": 1.8447676911357693, + "learning_rate": 8.237019908312289e-06, + "loss": 0.6296, + "step": 2863 + }, + { + "epoch": 0.3, + "grad_norm": 1.6818923204848246, + "learning_rate": 8.235736713676336e-06, + "loss": 0.5814, + "step": 2864 + }, + { + "epoch": 0.3, + "grad_norm": 1.8930984108194275, + "learning_rate": 8.234453152251183e-06, + "loss": 0.5862, + "step": 2865 + }, + { + "epoch": 0.3, + "grad_norm": 1.7856506372753727, + "learning_rate": 8.23316922418233e-06, + "loss": 0.6485, + "step": 2866 + }, + { + "epoch": 0.3, + "grad_norm": 1.7134171960426727, + "learning_rate": 8.231884929615315e-06, + "loss": 0.7412, + "step": 2867 + }, + { + "epoch": 0.3, + "grad_norm": 2.034932618216676, + "learning_rate": 8.230600268695724e-06, + "loss": 0.8004, + "step": 2868 + }, + { + "epoch": 0.3, + "grad_norm": 1.8280220472034374, + "learning_rate": 8.229315241569177e-06, + "loss": 0.6732, + "step": 2869 + }, + { + "epoch": 0.3, + "grad_norm": 2.2761201014870887, + "learning_rate": 8.228029848381343e-06, + "loss": 0.6222, + "step": 2870 + }, + { + "epoch": 0.3, + "grad_norm": 1.9435641548208933, + "learning_rate": 8.226744089277927e-06, + "loss": 0.706, + "step": 2871 + }, + { + "epoch": 0.3, + "grad_norm": 1.7705164240793172, + "learning_rate": 8.225457964404675e-06, + "loss": 0.6072, + "step": 2872 + }, + { + "epoch": 0.3, + "grad_norm": 1.7796138793949614, + "learning_rate": 8.224171473907379e-06, + "loss": 0.7511, + "step": 2873 + }, + { + "epoch": 0.3, + "grad_norm": 2.294358735763917, + "learning_rate": 8.222884617931868e-06, + "loss": 0.7495, + "step": 2874 + }, + { + "epoch": 0.3, + "grad_norm": 1.9779657166432165, + "learning_rate": 8.221597396624017e-06, + "loss": 0.6661, + "step": 2875 + }, + { + "epoch": 0.3, + "grad_norm": 1.821430815356854, + "learning_rate": 8.220309810129739e-06, + "loss": 0.6018, + "step": 2876 + }, + { + "epoch": 0.3, + "grad_norm": 1.9649085361169925, + "learning_rate": 8.219021858594989e-06, + "loss": 0.6216, + "step": 2877 + }, + { + "epoch": 0.3, + "grad_norm": 1.9075400329966081, + "learning_rate": 8.217733542165762e-06, + "loss": 0.5262, + "step": 2878 + }, + { + "epoch": 0.3, + "grad_norm": 1.787468247986847, + "learning_rate": 8.216444860988098e-06, + "loss": 0.7078, + "step": 2879 + }, + { + "epoch": 0.3, + "grad_norm": 1.9202828017719724, + "learning_rate": 8.215155815208075e-06, + "loss": 0.6898, + "step": 2880 + }, + { + "epoch": 0.3, + "grad_norm": 2.074321753545804, + "learning_rate": 8.213866404971817e-06, + "loss": 0.6434, + "step": 2881 + }, + { + "epoch": 0.3, + "grad_norm": 1.9239815198105046, + "learning_rate": 8.212576630425482e-06, + "loss": 0.6708, + "step": 2882 + }, + { + "epoch": 0.3, + "grad_norm": 2.0109810934562273, + "learning_rate": 8.211286491715274e-06, + "loss": 0.6282, + "step": 2883 + }, + { + "epoch": 0.3, + "grad_norm": 1.877253484772073, + "learning_rate": 8.20999598898744e-06, + "loss": 0.5615, + "step": 2884 + }, + { + "epoch": 0.3, + "grad_norm": 1.9652383152117894, + "learning_rate": 8.208705122388263e-06, + "loss": 0.6393, + "step": 2885 + }, + { + "epoch": 0.3, + "grad_norm": 1.9491407973172676, + "learning_rate": 8.207413892064073e-06, + "loss": 0.6992, + "step": 2886 + }, + { + "epoch": 0.3, + "grad_norm": 1.7003145780895765, + "learning_rate": 8.206122298161236e-06, + "loss": 0.5622, + "step": 2887 + }, + { + "epoch": 0.3, + "grad_norm": 1.8707928859913363, + "learning_rate": 8.204830340826161e-06, + "loss": 0.751, + "step": 2888 + }, + { + "epoch": 0.3, + "grad_norm": 1.9977017222580762, + "learning_rate": 8.203538020205301e-06, + "loss": 0.6936, + "step": 2889 + }, + { + "epoch": 0.3, + "grad_norm": 1.7954445118532534, + "learning_rate": 8.202245336445146e-06, + "loss": 0.6332, + "step": 2890 + }, + { + "epoch": 0.3, + "grad_norm": 2.1212523482878796, + "learning_rate": 8.200952289692233e-06, + "loss": 0.7295, + "step": 2891 + }, + { + "epoch": 0.3, + "grad_norm": 1.9203803961045498, + "learning_rate": 8.199658880093132e-06, + "loss": 0.6838, + "step": 2892 + }, + { + "epoch": 0.3, + "grad_norm": 1.9804119787254033, + "learning_rate": 8.198365107794457e-06, + "loss": 0.6464, + "step": 2893 + }, + { + "epoch": 0.3, + "grad_norm": 1.7118907526474592, + "learning_rate": 8.19707097294287e-06, + "loss": 0.6199, + "step": 2894 + }, + { + "epoch": 0.3, + "grad_norm": 1.8173439975730064, + "learning_rate": 8.195776475685061e-06, + "loss": 0.6636, + "step": 2895 + }, + { + "epoch": 0.3, + "grad_norm": 1.9947503806681284, + "learning_rate": 8.194481616167777e-06, + "loss": 0.7348, + "step": 2896 + }, + { + "epoch": 0.3, + "grad_norm": 2.0059450385174444, + "learning_rate": 8.193186394537792e-06, + "loss": 0.671, + "step": 2897 + }, + { + "epoch": 0.3, + "grad_norm": 1.8995286372082676, + "learning_rate": 8.191890810941932e-06, + "loss": 0.646, + "step": 2898 + }, + { + "epoch": 0.3, + "grad_norm": 1.6680630849647606, + "learning_rate": 8.190594865527052e-06, + "loss": 0.6015, + "step": 2899 + }, + { + "epoch": 0.3, + "grad_norm": 2.0300903756694364, + "learning_rate": 8.18929855844006e-06, + "loss": 0.6532, + "step": 2900 + }, + { + "epoch": 0.3, + "grad_norm": 2.03213398770465, + "learning_rate": 8.188001889827897e-06, + "loss": 0.745, + "step": 2901 + }, + { + "epoch": 0.3, + "grad_norm": 1.9409976976712575, + "learning_rate": 8.18670485983755e-06, + "loss": 0.6487, + "step": 2902 + }, + { + "epoch": 0.3, + "grad_norm": 2.0263585046810744, + "learning_rate": 8.185407468616042e-06, + "loss": 0.5663, + "step": 2903 + }, + { + "epoch": 0.3, + "grad_norm": 2.003893682630391, + "learning_rate": 8.18410971631044e-06, + "loss": 0.7368, + "step": 2904 + }, + { + "epoch": 0.3, + "grad_norm": 1.8199743964798658, + "learning_rate": 8.182811603067855e-06, + "loss": 0.6108, + "step": 2905 + }, + { + "epoch": 0.3, + "grad_norm": 1.9114442676440553, + "learning_rate": 8.181513129035434e-06, + "loss": 0.7262, + "step": 2906 + }, + { + "epoch": 0.3, + "grad_norm": 2.138825809398571, + "learning_rate": 8.180214294360365e-06, + "loss": 0.6708, + "step": 2907 + }, + { + "epoch": 0.3, + "grad_norm": 1.7635191853249457, + "learning_rate": 8.178915099189877e-06, + "loss": 0.5041, + "step": 2908 + }, + { + "epoch": 0.3, + "grad_norm": 1.7070240123779603, + "learning_rate": 8.177615543671247e-06, + "loss": 0.5458, + "step": 2909 + }, + { + "epoch": 0.3, + "grad_norm": 1.9043026323384222, + "learning_rate": 8.176315627951781e-06, + "loss": 0.6708, + "step": 2910 + }, + { + "epoch": 0.3, + "grad_norm": 1.8512788197178098, + "learning_rate": 8.175015352178835e-06, + "loss": 0.6638, + "step": 2911 + }, + { + "epoch": 0.3, + "grad_norm": 1.69736575656168, + "learning_rate": 8.173714716499801e-06, + "loss": 0.6792, + "step": 2912 + }, + { + "epoch": 0.3, + "grad_norm": 1.7299590678396837, + "learning_rate": 8.172413721062115e-06, + "loss": 0.576, + "step": 2913 + }, + { + "epoch": 0.3, + "grad_norm": 1.8580558642183693, + "learning_rate": 8.171112366013252e-06, + "loss": 0.6543, + "step": 2914 + }, + { + "epoch": 0.3, + "grad_norm": 1.8468439712054643, + "learning_rate": 8.169810651500728e-06, + "loss": 0.6622, + "step": 2915 + }, + { + "epoch": 0.3, + "grad_norm": 1.8010109531550258, + "learning_rate": 8.168508577672096e-06, + "loss": 0.6466, + "step": 2916 + }, + { + "epoch": 0.3, + "grad_norm": 1.9057592922152613, + "learning_rate": 8.167206144674959e-06, + "loss": 0.5711, + "step": 2917 + }, + { + "epoch": 0.3, + "grad_norm": 2.050683545427632, + "learning_rate": 8.165903352656954e-06, + "loss": 0.6602, + "step": 2918 + }, + { + "epoch": 0.3, + "grad_norm": 1.8054973854791319, + "learning_rate": 8.164600201765758e-06, + "loss": 0.6585, + "step": 2919 + }, + { + "epoch": 0.3, + "grad_norm": 1.9761637090582729, + "learning_rate": 8.163296692149093e-06, + "loss": 0.6079, + "step": 2920 + }, + { + "epoch": 0.3, + "grad_norm": 1.88880213967494, + "learning_rate": 8.161992823954715e-06, + "loss": 0.6007, + "step": 2921 + }, + { + "epoch": 0.3, + "grad_norm": 1.8416518885541622, + "learning_rate": 8.160688597330428e-06, + "loss": 0.6713, + "step": 2922 + }, + { + "epoch": 0.3, + "grad_norm": 1.873829102606367, + "learning_rate": 8.159384012424074e-06, + "loss": 0.6165, + "step": 2923 + }, + { + "epoch": 0.3, + "grad_norm": 1.8160124127691164, + "learning_rate": 8.158079069383535e-06, + "loss": 0.7594, + "step": 2924 + }, + { + "epoch": 0.3, + "grad_norm": 1.863086365003482, + "learning_rate": 8.156773768356733e-06, + "loss": 0.6637, + "step": 2925 + }, + { + "epoch": 0.3, + "grad_norm": 1.6903193748478216, + "learning_rate": 8.155468109491632e-06, + "loss": 0.5883, + "step": 2926 + }, + { + "epoch": 0.3, + "grad_norm": 1.9542532517773603, + "learning_rate": 8.154162092936233e-06, + "loss": 0.7481, + "step": 2927 + }, + { + "epoch": 0.3, + "grad_norm": 1.772285860147228, + "learning_rate": 8.152855718838583e-06, + "loss": 0.5753, + "step": 2928 + }, + { + "epoch": 0.3, + "grad_norm": 1.9175001165132863, + "learning_rate": 8.151548987346768e-06, + "loss": 0.6404, + "step": 2929 + }, + { + "epoch": 0.3, + "grad_norm": 2.1361197261230918, + "learning_rate": 8.150241898608911e-06, + "loss": 0.7385, + "step": 2930 + }, + { + "epoch": 0.3, + "grad_norm": 1.6543334650805261, + "learning_rate": 8.14893445277318e-06, + "loss": 0.5488, + "step": 2931 + }, + { + "epoch": 0.3, + "grad_norm": 2.2819717034684923, + "learning_rate": 8.147626649987779e-06, + "loss": 0.5632, + "step": 2932 + }, + { + "epoch": 0.3, + "grad_norm": 1.9251150848421499, + "learning_rate": 8.146318490400958e-06, + "loss": 0.73, + "step": 2933 + }, + { + "epoch": 0.3, + "grad_norm": 1.9096484516775554, + "learning_rate": 8.145009974161002e-06, + "loss": 0.6355, + "step": 2934 + }, + { + "epoch": 0.31, + "grad_norm": 2.014743746968432, + "learning_rate": 8.14370110141624e-06, + "loss": 0.604, + "step": 2935 + }, + { + "epoch": 0.31, + "grad_norm": 1.7574927081153062, + "learning_rate": 8.142391872315038e-06, + "loss": 0.5701, + "step": 2936 + }, + { + "epoch": 0.31, + "grad_norm": 1.7754012395866758, + "learning_rate": 8.141082287005808e-06, + "loss": 0.7174, + "step": 2937 + }, + { + "epoch": 0.31, + "grad_norm": 1.7225274879562285, + "learning_rate": 8.139772345636996e-06, + "loss": 0.5452, + "step": 2938 + }, + { + "epoch": 0.31, + "grad_norm": 1.88076198186382, + "learning_rate": 8.138462048357093e-06, + "loss": 0.6864, + "step": 2939 + }, + { + "epoch": 0.31, + "grad_norm": 2.099299123705203, + "learning_rate": 8.137151395314628e-06, + "loss": 0.6782, + "step": 2940 + }, + { + "epoch": 0.31, + "grad_norm": 1.8633877364101679, + "learning_rate": 8.13584038665817e-06, + "loss": 0.6702, + "step": 2941 + }, + { + "epoch": 0.31, + "grad_norm": 1.7629071756590513, + "learning_rate": 8.134529022536332e-06, + "loss": 0.6396, + "step": 2942 + }, + { + "epoch": 0.31, + "grad_norm": 1.8434231548356848, + "learning_rate": 8.133217303097764e-06, + "loss": 0.7665, + "step": 2943 + }, + { + "epoch": 0.31, + "grad_norm": 2.0000437586736637, + "learning_rate": 8.131905228491155e-06, + "loss": 0.655, + "step": 2944 + }, + { + "epoch": 0.31, + "grad_norm": 2.0562680118666377, + "learning_rate": 8.130592798865237e-06, + "loss": 0.623, + "step": 2945 + }, + { + "epoch": 0.31, + "grad_norm": 1.8727560280886, + "learning_rate": 8.129280014368781e-06, + "loss": 0.5804, + "step": 2946 + }, + { + "epoch": 0.31, + "grad_norm": 1.7842157138360704, + "learning_rate": 8.1279668751506e-06, + "loss": 0.7247, + "step": 2947 + }, + { + "epoch": 0.31, + "grad_norm": 1.8652448850439833, + "learning_rate": 8.126653381359543e-06, + "loss": 0.6968, + "step": 2948 + }, + { + "epoch": 0.31, + "grad_norm": 2.2497973252323638, + "learning_rate": 8.125339533144507e-06, + "loss": 0.6329, + "step": 2949 + }, + { + "epoch": 0.31, + "grad_norm": 1.7867873852205007, + "learning_rate": 8.12402533065442e-06, + "loss": 0.5789, + "step": 2950 + }, + { + "epoch": 0.31, + "grad_norm": 2.077399777170234, + "learning_rate": 8.122710774038253e-06, + "loss": 0.6199, + "step": 2951 + }, + { + "epoch": 0.31, + "grad_norm": 1.9012648354483244, + "learning_rate": 8.121395863445023e-06, + "loss": 0.7289, + "step": 2952 + }, + { + "epoch": 0.31, + "grad_norm": 1.824018411625984, + "learning_rate": 8.120080599023781e-06, + "loss": 0.6642, + "step": 2953 + }, + { + "epoch": 0.31, + "grad_norm": 1.9300181390182036, + "learning_rate": 8.118764980923619e-06, + "loss": 0.6938, + "step": 2954 + }, + { + "epoch": 0.31, + "grad_norm": 1.8854008336004575, + "learning_rate": 8.117449009293668e-06, + "loss": 0.6819, + "step": 2955 + }, + { + "epoch": 0.31, + "grad_norm": 1.966959364350495, + "learning_rate": 8.116132684283104e-06, + "loss": 0.6794, + "step": 2956 + }, + { + "epoch": 0.31, + "grad_norm": 1.905465999690345, + "learning_rate": 8.11481600604114e-06, + "loss": 0.6242, + "step": 2957 + }, + { + "epoch": 0.31, + "grad_norm": 1.945562288125105, + "learning_rate": 8.113498974717027e-06, + "loss": 0.5482, + "step": 2958 + }, + { + "epoch": 0.31, + "grad_norm": 2.086970708973281, + "learning_rate": 8.11218159046006e-06, + "loss": 0.7742, + "step": 2959 + }, + { + "epoch": 0.31, + "grad_norm": 1.8443539974736436, + "learning_rate": 8.110863853419568e-06, + "loss": 0.7127, + "step": 2960 + }, + { + "epoch": 0.31, + "grad_norm": 2.890393352231358, + "learning_rate": 8.10954576374493e-06, + "loss": 0.6969, + "step": 2961 + }, + { + "epoch": 0.31, + "grad_norm": 2.033669898074698, + "learning_rate": 8.108227321585554e-06, + "loss": 0.6983, + "step": 2962 + }, + { + "epoch": 0.31, + "grad_norm": 1.9019326562773995, + "learning_rate": 8.106908527090895e-06, + "loss": 0.6637, + "step": 2963 + }, + { + "epoch": 0.31, + "grad_norm": 1.9436938800459622, + "learning_rate": 8.105589380410448e-06, + "loss": 0.6138, + "step": 2964 + }, + { + "epoch": 0.31, + "grad_norm": 1.931517898910024, + "learning_rate": 8.10426988169374e-06, + "loss": 0.6288, + "step": 2965 + }, + { + "epoch": 0.31, + "grad_norm": 2.01533596468131, + "learning_rate": 8.10295003109035e-06, + "loss": 0.6901, + "step": 2966 + }, + { + "epoch": 0.31, + "grad_norm": 1.9307107562996824, + "learning_rate": 8.101629828749887e-06, + "loss": 0.6474, + "step": 2967 + }, + { + "epoch": 0.31, + "grad_norm": 1.7937486278251964, + "learning_rate": 8.100309274822002e-06, + "loss": 0.6022, + "step": 2968 + }, + { + "epoch": 0.31, + "grad_norm": 1.8036146364537236, + "learning_rate": 8.098988369456392e-06, + "loss": 0.6685, + "step": 2969 + }, + { + "epoch": 0.31, + "grad_norm": 1.8502562122591037, + "learning_rate": 8.097667112802784e-06, + "loss": 0.7181, + "step": 2970 + }, + { + "epoch": 0.31, + "grad_norm": 1.8619088903053767, + "learning_rate": 8.096345505010956e-06, + "loss": 0.6382, + "step": 2971 + }, + { + "epoch": 0.31, + "grad_norm": 2.0072848292508727, + "learning_rate": 8.095023546230715e-06, + "loss": 0.6269, + "step": 2972 + }, + { + "epoch": 0.31, + "grad_norm": 2.0408515277535035, + "learning_rate": 8.093701236611914e-06, + "loss": 0.7009, + "step": 2973 + }, + { + "epoch": 0.31, + "grad_norm": 1.8022765607025482, + "learning_rate": 8.092378576304443e-06, + "loss": 0.6222, + "step": 2974 + }, + { + "epoch": 0.31, + "grad_norm": 2.1356198828409583, + "learning_rate": 8.091055565458236e-06, + "loss": 0.8086, + "step": 2975 + }, + { + "epoch": 0.31, + "grad_norm": 1.7900732105124129, + "learning_rate": 8.089732204223263e-06, + "loss": 0.6219, + "step": 2976 + }, + { + "epoch": 0.31, + "grad_norm": 1.9052543682026897, + "learning_rate": 8.088408492749534e-06, + "loss": 0.6403, + "step": 2977 + }, + { + "epoch": 0.31, + "grad_norm": 1.831209876851521, + "learning_rate": 8.087084431187096e-06, + "loss": 0.6012, + "step": 2978 + }, + { + "epoch": 0.31, + "grad_norm": 2.015018382951914, + "learning_rate": 8.085760019686044e-06, + "loss": 0.7143, + "step": 2979 + }, + { + "epoch": 0.31, + "grad_norm": 1.990275012744501, + "learning_rate": 8.084435258396504e-06, + "loss": 0.6116, + "step": 2980 + }, + { + "epoch": 0.31, + "grad_norm": 1.7622362259074167, + "learning_rate": 8.08311014746865e-06, + "loss": 0.62, + "step": 2981 + }, + { + "epoch": 0.31, + "grad_norm": 2.102214569940529, + "learning_rate": 8.081784687052683e-06, + "loss": 0.5395, + "step": 2982 + }, + { + "epoch": 0.31, + "grad_norm": 1.8920824650503145, + "learning_rate": 8.080458877298861e-06, + "loss": 0.6173, + "step": 2983 + }, + { + "epoch": 0.31, + "grad_norm": 1.825716574244372, + "learning_rate": 8.079132718357465e-06, + "loss": 0.715, + "step": 2984 + }, + { + "epoch": 0.31, + "grad_norm": 1.8137038198264506, + "learning_rate": 8.077806210378824e-06, + "loss": 0.6763, + "step": 2985 + }, + { + "epoch": 0.31, + "grad_norm": 1.9100852008920197, + "learning_rate": 8.076479353513308e-06, + "loss": 0.7405, + "step": 2986 + }, + { + "epoch": 0.31, + "grad_norm": 2.008643615507193, + "learning_rate": 8.07515214791132e-06, + "loss": 0.6724, + "step": 2987 + }, + { + "epoch": 0.31, + "grad_norm": 1.9322056861892911, + "learning_rate": 8.073824593723309e-06, + "loss": 0.7223, + "step": 2988 + }, + { + "epoch": 0.31, + "grad_norm": 1.9330171400421932, + "learning_rate": 8.07249669109976e-06, + "loss": 0.6259, + "step": 2989 + }, + { + "epoch": 0.31, + "grad_norm": 2.132240605171131, + "learning_rate": 8.071168440191199e-06, + "loss": 0.6683, + "step": 2990 + }, + { + "epoch": 0.31, + "grad_norm": 1.71224736007814, + "learning_rate": 8.06983984114819e-06, + "loss": 0.6645, + "step": 2991 + }, + { + "epoch": 0.31, + "grad_norm": 2.1475200177604536, + "learning_rate": 8.068510894121338e-06, + "loss": 0.7044, + "step": 2992 + }, + { + "epoch": 0.31, + "grad_norm": 1.8431375374949153, + "learning_rate": 8.067181599261285e-06, + "loss": 0.5972, + "step": 2993 + }, + { + "epoch": 0.31, + "grad_norm": 1.823307176827159, + "learning_rate": 8.065851956718716e-06, + "loss": 0.6707, + "step": 2994 + }, + { + "epoch": 0.31, + "grad_norm": 1.9937691220905867, + "learning_rate": 8.064521966644351e-06, + "loss": 0.6577, + "step": 2995 + }, + { + "epoch": 0.31, + "grad_norm": 1.7340263444241693, + "learning_rate": 8.063191629188958e-06, + "loss": 0.5489, + "step": 2996 + }, + { + "epoch": 0.31, + "grad_norm": 1.9549885348959344, + "learning_rate": 8.06186094450333e-06, + "loss": 0.7293, + "step": 2997 + }, + { + "epoch": 0.31, + "grad_norm": 2.759727563771705, + "learning_rate": 8.060529912738316e-06, + "loss": 0.7183, + "step": 2998 + }, + { + "epoch": 0.31, + "grad_norm": 1.8394811728272813, + "learning_rate": 8.05919853404479e-06, + "loss": 0.6787, + "step": 2999 + }, + { + "epoch": 0.31, + "grad_norm": 1.8405697026850263, + "learning_rate": 8.057866808573672e-06, + "loss": 0.6668, + "step": 3000 + }, + { + "epoch": 0.31, + "grad_norm": 1.8378753932405179, + "learning_rate": 8.056534736475923e-06, + "loss": 0.6546, + "step": 3001 + }, + { + "epoch": 0.31, + "grad_norm": 1.7713096886810131, + "learning_rate": 8.05520231790254e-06, + "loss": 0.7496, + "step": 3002 + }, + { + "epoch": 0.31, + "grad_norm": 1.8134475570918778, + "learning_rate": 8.053869553004561e-06, + "loss": 0.6435, + "step": 3003 + }, + { + "epoch": 0.31, + "grad_norm": 1.847138795743933, + "learning_rate": 8.052536441933062e-06, + "loss": 0.626, + "step": 3004 + }, + { + "epoch": 0.31, + "grad_norm": 1.8868783461313385, + "learning_rate": 8.051202984839157e-06, + "loss": 0.6443, + "step": 3005 + }, + { + "epoch": 0.31, + "grad_norm": 1.6274506637183248, + "learning_rate": 8.049869181874002e-06, + "loss": 0.6456, + "step": 3006 + }, + { + "epoch": 0.31, + "grad_norm": 1.7934457191553317, + "learning_rate": 8.048535033188794e-06, + "loss": 0.5856, + "step": 3007 + }, + { + "epoch": 0.31, + "grad_norm": 1.925470936240325, + "learning_rate": 8.04720053893476e-06, + "loss": 0.7103, + "step": 3008 + }, + { + "epoch": 0.31, + "grad_norm": 2.0746079297930913, + "learning_rate": 8.04586569926318e-06, + "loss": 0.8232, + "step": 3009 + }, + { + "epoch": 0.31, + "grad_norm": 1.8229354357781145, + "learning_rate": 8.04453051432536e-06, + "loss": 0.5973, + "step": 3010 + }, + { + "epoch": 0.31, + "grad_norm": 2.17337743919196, + "learning_rate": 8.043194984272656e-06, + "loss": 0.7325, + "step": 3011 + }, + { + "epoch": 0.31, + "grad_norm": 1.8513812475733578, + "learning_rate": 8.041859109256452e-06, + "loss": 0.6306, + "step": 3012 + }, + { + "epoch": 0.31, + "grad_norm": 2.117685022304302, + "learning_rate": 8.04052288942818e-06, + "loss": 0.7018, + "step": 3013 + }, + { + "epoch": 0.31, + "grad_norm": 1.8467774888427202, + "learning_rate": 8.03918632493931e-06, + "loss": 0.6535, + "step": 3014 + }, + { + "epoch": 0.31, + "grad_norm": 1.841308511882857, + "learning_rate": 8.037849415941346e-06, + "loss": 0.5757, + "step": 3015 + }, + { + "epoch": 0.31, + "grad_norm": 1.971472468230098, + "learning_rate": 8.036512162585834e-06, + "loss": 0.641, + "step": 3016 + }, + { + "epoch": 0.31, + "grad_norm": 2.0178646419829653, + "learning_rate": 8.035174565024362e-06, + "loss": 0.671, + "step": 3017 + }, + { + "epoch": 0.31, + "grad_norm": 1.6146581959501902, + "learning_rate": 8.033836623408556e-06, + "loss": 0.6094, + "step": 3018 + }, + { + "epoch": 0.31, + "grad_norm": 1.873134384365632, + "learning_rate": 8.032498337890073e-06, + "loss": 0.7449, + "step": 3019 + }, + { + "epoch": 0.31, + "grad_norm": 2.0365935522496765, + "learning_rate": 8.03115970862062e-06, + "loss": 0.6673, + "step": 3020 + }, + { + "epoch": 0.31, + "grad_norm": 1.8359556335408065, + "learning_rate": 8.029820735751936e-06, + "loss": 0.5654, + "step": 3021 + }, + { + "epoch": 0.31, + "grad_norm": 1.7270952911249733, + "learning_rate": 8.028481419435803e-06, + "loss": 0.6331, + "step": 3022 + }, + { + "epoch": 0.31, + "grad_norm": 2.2156910867047936, + "learning_rate": 8.02714175982404e-06, + "loss": 0.7084, + "step": 3023 + }, + { + "epoch": 0.31, + "grad_norm": 1.995649963769197, + "learning_rate": 8.025801757068504e-06, + "loss": 0.6732, + "step": 3024 + }, + { + "epoch": 0.31, + "grad_norm": 1.9708209577525355, + "learning_rate": 8.024461411321092e-06, + "loss": 0.6283, + "step": 3025 + }, + { + "epoch": 0.31, + "grad_norm": 1.7503337006314423, + "learning_rate": 8.02312072273374e-06, + "loss": 0.6433, + "step": 3026 + }, + { + "epoch": 0.31, + "grad_norm": 1.9351669006196723, + "learning_rate": 8.021779691458422e-06, + "loss": 0.6875, + "step": 3027 + }, + { + "epoch": 0.31, + "grad_norm": 1.654807381732136, + "learning_rate": 8.020438317647155e-06, + "loss": 0.6628, + "step": 3028 + }, + { + "epoch": 0.31, + "grad_norm": 1.8618456773903507, + "learning_rate": 8.019096601451987e-06, + "loss": 0.6953, + "step": 3029 + }, + { + "epoch": 0.31, + "grad_norm": 1.9551349205252637, + "learning_rate": 8.017754543025012e-06, + "loss": 0.6496, + "step": 3030 + }, + { + "epoch": 0.32, + "grad_norm": 1.780040566722473, + "learning_rate": 8.016412142518356e-06, + "loss": 0.5778, + "step": 3031 + }, + { + "epoch": 0.32, + "grad_norm": 1.8404946597473568, + "learning_rate": 8.015069400084194e-06, + "loss": 0.7011, + "step": 3032 + }, + { + "epoch": 0.32, + "grad_norm": 2.0243422271373612, + "learning_rate": 8.013726315874729e-06, + "loss": 0.579, + "step": 3033 + }, + { + "epoch": 0.32, + "grad_norm": 1.7886309414150088, + "learning_rate": 8.012382890042208e-06, + "loss": 0.6459, + "step": 3034 + }, + { + "epoch": 0.32, + "grad_norm": 1.8805496619276878, + "learning_rate": 8.011039122738918e-06, + "loss": 0.6622, + "step": 3035 + }, + { + "epoch": 0.32, + "grad_norm": 1.839648694046673, + "learning_rate": 8.00969501411718e-06, + "loss": 0.6199, + "step": 3036 + }, + { + "epoch": 0.32, + "grad_norm": 2.397442942159792, + "learning_rate": 8.008350564329356e-06, + "loss": 0.6051, + "step": 3037 + }, + { + "epoch": 0.32, + "grad_norm": 2.031673857387813, + "learning_rate": 8.00700577352785e-06, + "loss": 0.6209, + "step": 3038 + }, + { + "epoch": 0.32, + "grad_norm": 1.6669416983842271, + "learning_rate": 8.0056606418651e-06, + "loss": 0.6872, + "step": 3039 + }, + { + "epoch": 0.32, + "grad_norm": 1.8867515977043507, + "learning_rate": 8.004315169493586e-06, + "loss": 0.5904, + "step": 3040 + }, + { + "epoch": 0.32, + "grad_norm": 1.755108581811021, + "learning_rate": 8.002969356565822e-06, + "loss": 0.7178, + "step": 3041 + }, + { + "epoch": 0.32, + "grad_norm": 1.955755740139808, + "learning_rate": 8.001623203234366e-06, + "loss": 0.6451, + "step": 3042 + }, + { + "epoch": 0.32, + "grad_norm": 1.7756454101534862, + "learning_rate": 8.00027670965181e-06, + "loss": 0.6337, + "step": 3043 + }, + { + "epoch": 0.32, + "grad_norm": 1.8908982993281924, + "learning_rate": 7.998929875970788e-06, + "loss": 0.7074, + "step": 3044 + }, + { + "epoch": 0.32, + "grad_norm": 1.8663129056512406, + "learning_rate": 7.99758270234397e-06, + "loss": 0.5291, + "step": 3045 + }, + { + "epoch": 0.32, + "grad_norm": 1.701969315380072, + "learning_rate": 7.99623518892407e-06, + "loss": 0.6038, + "step": 3046 + }, + { + "epoch": 0.32, + "grad_norm": 2.104064850307174, + "learning_rate": 7.994887335863832e-06, + "loss": 0.7409, + "step": 3047 + }, + { + "epoch": 0.32, + "grad_norm": 1.8828855956551462, + "learning_rate": 7.993539143316044e-06, + "loss": 0.6762, + "step": 3048 + }, + { + "epoch": 0.32, + "grad_norm": 1.7104446736903443, + "learning_rate": 7.992190611433532e-06, + "loss": 0.5661, + "step": 3049 + }, + { + "epoch": 0.32, + "grad_norm": 2.07219151701992, + "learning_rate": 7.99084174036916e-06, + "loss": 0.69, + "step": 3050 + }, + { + "epoch": 0.32, + "grad_norm": 1.9078394195572277, + "learning_rate": 7.989492530275829e-06, + "loss": 0.767, + "step": 3051 + }, + { + "epoch": 0.32, + "grad_norm": 1.8675014184687138, + "learning_rate": 7.988142981306479e-06, + "loss": 0.615, + "step": 3052 + }, + { + "epoch": 0.32, + "grad_norm": 1.713066568608695, + "learning_rate": 7.98679309361409e-06, + "loss": 0.6929, + "step": 3053 + }, + { + "epoch": 0.32, + "grad_norm": 1.9366046584274337, + "learning_rate": 7.985442867351682e-06, + "loss": 0.7056, + "step": 3054 + }, + { + "epoch": 0.32, + "grad_norm": 1.7526108241072664, + "learning_rate": 7.984092302672306e-06, + "loss": 0.618, + "step": 3055 + }, + { + "epoch": 0.32, + "grad_norm": 1.8741774360912886, + "learning_rate": 7.98274139972906e-06, + "loss": 0.7573, + "step": 3056 + }, + { + "epoch": 0.32, + "grad_norm": 1.8106219976633589, + "learning_rate": 7.981390158675076e-06, + "loss": 0.6292, + "step": 3057 + }, + { + "epoch": 0.32, + "grad_norm": 1.8965265207010533, + "learning_rate": 7.980038579663523e-06, + "loss": 0.7095, + "step": 3058 + }, + { + "epoch": 0.32, + "grad_norm": 1.7954077915009716, + "learning_rate": 7.978686662847612e-06, + "loss": 0.6326, + "step": 3059 + }, + { + "epoch": 0.32, + "grad_norm": 2.0120709348450947, + "learning_rate": 7.977334408380588e-06, + "loss": 0.6684, + "step": 3060 + }, + { + "epoch": 0.32, + "grad_norm": 1.7127762611593402, + "learning_rate": 7.975981816415741e-06, + "loss": 0.5592, + "step": 3061 + }, + { + "epoch": 0.32, + "grad_norm": 1.9838766263136216, + "learning_rate": 7.974628887106391e-06, + "loss": 0.5739, + "step": 3062 + }, + { + "epoch": 0.32, + "grad_norm": 1.8024132426453374, + "learning_rate": 7.973275620605903e-06, + "loss": 0.6057, + "step": 3063 + }, + { + "epoch": 0.32, + "grad_norm": 1.662452177175643, + "learning_rate": 7.971922017067674e-06, + "loss": 0.5933, + "step": 3064 + }, + { + "epoch": 0.32, + "grad_norm": 1.9625977664980205, + "learning_rate": 7.970568076645149e-06, + "loss": 0.5994, + "step": 3065 + }, + { + "epoch": 0.32, + "grad_norm": 2.006758324902627, + "learning_rate": 7.969213799491799e-06, + "loss": 0.6422, + "step": 3066 + }, + { + "epoch": 0.32, + "grad_norm": 1.883116643768088, + "learning_rate": 7.96785918576114e-06, + "loss": 0.6657, + "step": 3067 + }, + { + "epoch": 0.32, + "grad_norm": 2.162848986767221, + "learning_rate": 7.966504235606726e-06, + "loss": 0.7306, + "step": 3068 + }, + { + "epoch": 0.32, + "grad_norm": 2.1148968110067687, + "learning_rate": 7.965148949182148e-06, + "loss": 0.6631, + "step": 3069 + }, + { + "epoch": 0.32, + "grad_norm": 1.7761884516910715, + "learning_rate": 7.963793326641038e-06, + "loss": 0.6629, + "step": 3070 + }, + { + "epoch": 0.32, + "grad_norm": 1.9040220164550217, + "learning_rate": 7.96243736813706e-06, + "loss": 0.6321, + "step": 3071 + }, + { + "epoch": 0.32, + "grad_norm": 1.993692247613143, + "learning_rate": 7.961081073823921e-06, + "loss": 0.6548, + "step": 3072 + }, + { + "epoch": 0.32, + "grad_norm": 1.7461663526967959, + "learning_rate": 7.959724443855366e-06, + "loss": 0.6201, + "step": 3073 + }, + { + "epoch": 0.32, + "grad_norm": 1.9928755049849687, + "learning_rate": 7.958367478385172e-06, + "loss": 0.6746, + "step": 3074 + }, + { + "epoch": 0.32, + "grad_norm": 1.7754560830370942, + "learning_rate": 7.957010177567167e-06, + "loss": 0.6824, + "step": 3075 + }, + { + "epoch": 0.32, + "grad_norm": 2.019669666508399, + "learning_rate": 7.955652541555198e-06, + "loss": 0.6759, + "step": 3076 + }, + { + "epoch": 0.32, + "grad_norm": 1.868951468579273, + "learning_rate": 7.954294570503171e-06, + "loss": 0.7548, + "step": 3077 + }, + { + "epoch": 0.32, + "grad_norm": 1.845669333759357, + "learning_rate": 7.952936264565015e-06, + "loss": 0.6105, + "step": 3078 + }, + { + "epoch": 0.32, + "grad_norm": 1.821381002259593, + "learning_rate": 7.951577623894701e-06, + "loss": 0.6323, + "step": 3079 + }, + { + "epoch": 0.32, + "grad_norm": 1.9570419308484384, + "learning_rate": 7.950218648646242e-06, + "loss": 0.708, + "step": 3080 + }, + { + "epoch": 0.32, + "grad_norm": 1.8858894304328615, + "learning_rate": 7.948859338973682e-06, + "loss": 0.6006, + "step": 3081 + }, + { + "epoch": 0.32, + "grad_norm": 1.7831579814605896, + "learning_rate": 7.947499695031108e-06, + "loss": 0.7099, + "step": 3082 + }, + { + "epoch": 0.32, + "grad_norm": 1.8783313094793848, + "learning_rate": 7.946139716972644e-06, + "loss": 0.732, + "step": 3083 + }, + { + "epoch": 0.32, + "grad_norm": 2.0736000767971667, + "learning_rate": 7.94477940495245e-06, + "loss": 0.7732, + "step": 3084 + }, + { + "epoch": 0.32, + "grad_norm": 2.077886034109067, + "learning_rate": 7.943418759124727e-06, + "loss": 0.6822, + "step": 3085 + }, + { + "epoch": 0.32, + "grad_norm": 1.7930850277354924, + "learning_rate": 7.94205777964371e-06, + "loss": 0.6518, + "step": 3086 + }, + { + "epoch": 0.32, + "grad_norm": 1.8907545370407564, + "learning_rate": 7.940696466663674e-06, + "loss": 0.6399, + "step": 3087 + }, + { + "epoch": 0.32, + "grad_norm": 1.838018871743233, + "learning_rate": 7.939334820338933e-06, + "loss": 0.5909, + "step": 3088 + }, + { + "epoch": 0.32, + "grad_norm": 1.9243946073025104, + "learning_rate": 7.937972840823836e-06, + "loss": 0.6856, + "step": 3089 + }, + { + "epoch": 0.32, + "grad_norm": 2.0112204626320285, + "learning_rate": 7.93661052827277e-06, + "loss": 0.7325, + "step": 3090 + }, + { + "epoch": 0.32, + "grad_norm": 1.9773351694370116, + "learning_rate": 7.935247882840164e-06, + "loss": 0.6951, + "step": 3091 + }, + { + "epoch": 0.32, + "grad_norm": 1.8575930787334212, + "learning_rate": 7.933884904680482e-06, + "loss": 0.6292, + "step": 3092 + }, + { + "epoch": 0.32, + "grad_norm": 1.8549941118391462, + "learning_rate": 7.93252159394822e-06, + "loss": 0.7249, + "step": 3093 + }, + { + "epoch": 0.32, + "grad_norm": 1.8390883570912002, + "learning_rate": 7.931157950797923e-06, + "loss": 0.5553, + "step": 3094 + }, + { + "epoch": 0.32, + "grad_norm": 1.979121650127033, + "learning_rate": 7.929793975384164e-06, + "loss": 0.6871, + "step": 3095 + }, + { + "epoch": 0.32, + "grad_norm": 1.8303793323181003, + "learning_rate": 7.92842966786156e-06, + "loss": 0.6484, + "step": 3096 + }, + { + "epoch": 0.32, + "grad_norm": 1.8775457173794086, + "learning_rate": 7.92706502838476e-06, + "loss": 0.7069, + "step": 3097 + }, + { + "epoch": 0.32, + "grad_norm": 1.7512552354294855, + "learning_rate": 7.925700057108455e-06, + "loss": 0.6866, + "step": 3098 + }, + { + "epoch": 0.32, + "grad_norm": 2.036055977573231, + "learning_rate": 7.924334754187373e-06, + "loss": 0.6938, + "step": 3099 + }, + { + "epoch": 0.32, + "grad_norm": 1.9444583395675479, + "learning_rate": 7.92296911977628e-06, + "loss": 0.6017, + "step": 3100 + }, + { + "epoch": 0.32, + "grad_norm": 1.7671077634825694, + "learning_rate": 7.921603154029976e-06, + "loss": 0.6438, + "step": 3101 + }, + { + "epoch": 0.32, + "grad_norm": 1.7985327525448345, + "learning_rate": 7.920236857103301e-06, + "loss": 0.5896, + "step": 3102 + }, + { + "epoch": 0.32, + "grad_norm": 1.9693985426170422, + "learning_rate": 7.918870229151134e-06, + "loss": 0.6559, + "step": 3103 + }, + { + "epoch": 0.32, + "grad_norm": 1.756742744849359, + "learning_rate": 7.91750327032839e-06, + "loss": 0.6919, + "step": 3104 + }, + { + "epoch": 0.32, + "grad_norm": 1.7404813813771645, + "learning_rate": 7.91613598079002e-06, + "loss": 0.6627, + "step": 3105 + }, + { + "epoch": 0.32, + "grad_norm": 1.7682475752042812, + "learning_rate": 7.914768360691017e-06, + "loss": 0.6542, + "step": 3106 + }, + { + "epoch": 0.32, + "grad_norm": 1.9117815109314706, + "learning_rate": 7.913400410186406e-06, + "loss": 0.6916, + "step": 3107 + }, + { + "epoch": 0.32, + "grad_norm": 2.0622229043342157, + "learning_rate": 7.912032129431251e-06, + "loss": 0.7196, + "step": 3108 + }, + { + "epoch": 0.32, + "grad_norm": 1.767446056151579, + "learning_rate": 7.91066351858066e-06, + "loss": 0.6045, + "step": 3109 + }, + { + "epoch": 0.32, + "grad_norm": 1.8688982836066614, + "learning_rate": 7.909294577789765e-06, + "loss": 0.6622, + "step": 3110 + }, + { + "epoch": 0.32, + "grad_norm": 2.005588406771868, + "learning_rate": 7.907925307213748e-06, + "loss": 0.7336, + "step": 3111 + }, + { + "epoch": 0.32, + "grad_norm": 1.8008940358030987, + "learning_rate": 7.906555707007823e-06, + "loss": 0.6675, + "step": 3112 + }, + { + "epoch": 0.32, + "grad_norm": 1.912571767612056, + "learning_rate": 7.905185777327242e-06, + "loss": 0.6646, + "step": 3113 + }, + { + "epoch": 0.32, + "grad_norm": 1.6446609630621263, + "learning_rate": 7.903815518327295e-06, + "loss": 0.6515, + "step": 3114 + }, + { + "epoch": 0.32, + "grad_norm": 1.982274144676952, + "learning_rate": 7.902444930163308e-06, + "loss": 0.6789, + "step": 3115 + }, + { + "epoch": 0.32, + "grad_norm": 1.8551387879880945, + "learning_rate": 7.901074012990645e-06, + "loss": 0.5955, + "step": 3116 + }, + { + "epoch": 0.32, + "grad_norm": 1.943758584895994, + "learning_rate": 7.899702766964705e-06, + "loss": 0.7111, + "step": 3117 + }, + { + "epoch": 0.32, + "grad_norm": 1.8648274627942667, + "learning_rate": 7.898331192240929e-06, + "loss": 0.7082, + "step": 3118 + }, + { + "epoch": 0.32, + "grad_norm": 1.9477847308322247, + "learning_rate": 7.896959288974792e-06, + "loss": 0.7108, + "step": 3119 + }, + { + "epoch": 0.32, + "grad_norm": 1.885086662138018, + "learning_rate": 7.89558705732181e-06, + "loss": 0.6246, + "step": 3120 + }, + { + "epoch": 0.32, + "grad_norm": 1.9888061668612529, + "learning_rate": 7.894214497437528e-06, + "loss": 0.7098, + "step": 3121 + }, + { + "epoch": 0.32, + "grad_norm": 1.9341899975077337, + "learning_rate": 7.892841609477538e-06, + "loss": 0.5994, + "step": 3122 + }, + { + "epoch": 0.32, + "grad_norm": 1.9079692239038495, + "learning_rate": 7.891468393597464e-06, + "loss": 0.6772, + "step": 3123 + }, + { + "epoch": 0.32, + "grad_norm": 2.038691877984083, + "learning_rate": 7.890094849952964e-06, + "loss": 0.6783, + "step": 3124 + }, + { + "epoch": 0.32, + "grad_norm": 1.7539539211322792, + "learning_rate": 7.888720978699742e-06, + "loss": 0.6765, + "step": 3125 + }, + { + "epoch": 0.32, + "grad_norm": 1.8011122274916558, + "learning_rate": 7.88734677999353e-06, + "loss": 0.6645, + "step": 3126 + }, + { + "epoch": 0.33, + "grad_norm": 1.9309782177573798, + "learning_rate": 7.885972253990104e-06, + "loss": 0.711, + "step": 3127 + }, + { + "epoch": 0.33, + "grad_norm": 1.8744956494093434, + "learning_rate": 7.884597400845273e-06, + "loss": 0.6176, + "step": 3128 + }, + { + "epoch": 0.33, + "grad_norm": 1.8638622827816949, + "learning_rate": 7.883222220714886e-06, + "loss": 0.7032, + "step": 3129 + }, + { + "epoch": 0.33, + "grad_norm": 1.776577837690079, + "learning_rate": 7.881846713754826e-06, + "loss": 0.6416, + "step": 3130 + }, + { + "epoch": 0.33, + "grad_norm": 1.9796058522752333, + "learning_rate": 7.880470880121015e-06, + "loss": 0.628, + "step": 3131 + }, + { + "epoch": 0.33, + "grad_norm": 1.9181012593383686, + "learning_rate": 7.879094719969412e-06, + "loss": 0.6968, + "step": 3132 + }, + { + "epoch": 0.33, + "grad_norm": 1.7820935673678577, + "learning_rate": 7.87771823345601e-06, + "loss": 0.6633, + "step": 3133 + }, + { + "epoch": 0.33, + "grad_norm": 1.7651158173417194, + "learning_rate": 7.876341420736847e-06, + "loss": 0.6512, + "step": 3134 + }, + { + "epoch": 0.33, + "grad_norm": 1.8206169688036722, + "learning_rate": 7.874964281967988e-06, + "loss": 0.6045, + "step": 3135 + }, + { + "epoch": 0.33, + "grad_norm": 2.0786032378811115, + "learning_rate": 7.87358681730554e-06, + "loss": 0.7924, + "step": 3136 + }, + { + "epoch": 0.33, + "grad_norm": 2.1478641904169677, + "learning_rate": 7.872209026905648e-06, + "loss": 0.8035, + "step": 3137 + }, + { + "epoch": 0.33, + "grad_norm": 1.8482328317441383, + "learning_rate": 7.870830910924491e-06, + "loss": 0.6109, + "step": 3138 + }, + { + "epoch": 0.33, + "grad_norm": 1.9746459109210937, + "learning_rate": 7.869452469518291e-06, + "loss": 0.6593, + "step": 3139 + }, + { + "epoch": 0.33, + "grad_norm": 2.3263447551499503, + "learning_rate": 7.868073702843294e-06, + "loss": 0.7088, + "step": 3140 + }, + { + "epoch": 0.33, + "grad_norm": 1.790599477651934, + "learning_rate": 7.866694611055796e-06, + "loss": 0.6343, + "step": 3141 + }, + { + "epoch": 0.33, + "grad_norm": 2.031239757144938, + "learning_rate": 7.865315194312125e-06, + "loss": 0.6643, + "step": 3142 + }, + { + "epoch": 0.33, + "grad_norm": 2.0555372887136754, + "learning_rate": 7.863935452768645e-06, + "loss": 0.6539, + "step": 3143 + }, + { + "epoch": 0.33, + "grad_norm": 1.9539749091015681, + "learning_rate": 7.862555386581758e-06, + "loss": 0.6238, + "step": 3144 + }, + { + "epoch": 0.33, + "grad_norm": 1.7643599459994184, + "learning_rate": 7.861174995907901e-06, + "loss": 0.6501, + "step": 3145 + }, + { + "epoch": 0.33, + "grad_norm": 1.7358705962081895, + "learning_rate": 7.85979428090355e-06, + "loss": 0.6352, + "step": 3146 + }, + { + "epoch": 0.33, + "grad_norm": 1.8716834585310764, + "learning_rate": 7.858413241725219e-06, + "loss": 0.6743, + "step": 3147 + }, + { + "epoch": 0.33, + "grad_norm": 1.9704127723150717, + "learning_rate": 7.857031878529452e-06, + "loss": 0.7033, + "step": 3148 + }, + { + "epoch": 0.33, + "grad_norm": 1.9876812359612106, + "learning_rate": 7.855650191472836e-06, + "loss": 0.6894, + "step": 3149 + }, + { + "epoch": 0.33, + "grad_norm": 1.8676202472340744, + "learning_rate": 7.854268180711997e-06, + "loss": 0.6776, + "step": 3150 + }, + { + "epoch": 0.33, + "grad_norm": 2.016838800702593, + "learning_rate": 7.852885846403591e-06, + "loss": 0.5783, + "step": 3151 + }, + { + "epoch": 0.33, + "grad_norm": 2.059826072634291, + "learning_rate": 7.851503188704312e-06, + "loss": 0.6996, + "step": 3152 + }, + { + "epoch": 0.33, + "grad_norm": 1.9390691952182482, + "learning_rate": 7.850120207770893e-06, + "loss": 0.7219, + "step": 3153 + }, + { + "epoch": 0.33, + "grad_norm": 1.8477942788865895, + "learning_rate": 7.848736903760106e-06, + "loss": 0.6796, + "step": 3154 + }, + { + "epoch": 0.33, + "grad_norm": 1.7632667358112966, + "learning_rate": 7.847353276828751e-06, + "loss": 0.719, + "step": 3155 + }, + { + "epoch": 0.33, + "grad_norm": 1.7943354805743594, + "learning_rate": 7.845969327133673e-06, + "loss": 0.6816, + "step": 3156 + }, + { + "epoch": 0.33, + "grad_norm": 2.114565393931848, + "learning_rate": 7.84458505483175e-06, + "loss": 0.7039, + "step": 3157 + }, + { + "epoch": 0.33, + "grad_norm": 2.0487194410229215, + "learning_rate": 7.8432004600799e-06, + "loss": 0.6646, + "step": 3158 + }, + { + "epoch": 0.33, + "grad_norm": 2.196991395817911, + "learning_rate": 7.84181554303507e-06, + "loss": 0.7162, + "step": 3159 + }, + { + "epoch": 0.33, + "grad_norm": 1.805330609982835, + "learning_rate": 7.840430303854251e-06, + "loss": 0.6998, + "step": 3160 + }, + { + "epoch": 0.33, + "grad_norm": 1.945875095821067, + "learning_rate": 7.839044742694466e-06, + "loss": 0.6201, + "step": 3161 + }, + { + "epoch": 0.33, + "grad_norm": 1.8994297100188413, + "learning_rate": 7.83765885971278e-06, + "loss": 0.6215, + "step": 3162 + }, + { + "epoch": 0.33, + "grad_norm": 1.7735390094819343, + "learning_rate": 7.836272655066286e-06, + "loss": 0.6738, + "step": 3163 + }, + { + "epoch": 0.33, + "grad_norm": 2.0506083726297994, + "learning_rate": 7.83488612891212e-06, + "loss": 0.6146, + "step": 3164 + }, + { + "epoch": 0.33, + "grad_norm": 2.0616518744706984, + "learning_rate": 7.833499281407455e-06, + "loss": 0.6962, + "step": 3165 + }, + { + "epoch": 0.33, + "grad_norm": 1.9640484511919805, + "learning_rate": 7.832112112709496e-06, + "loss": 0.6537, + "step": 3166 + }, + { + "epoch": 0.33, + "grad_norm": 1.8916743371522864, + "learning_rate": 7.830724622975485e-06, + "loss": 0.6021, + "step": 3167 + }, + { + "epoch": 0.33, + "grad_norm": 2.068753121157626, + "learning_rate": 7.829336812362703e-06, + "loss": 0.6653, + "step": 3168 + }, + { + "epoch": 0.33, + "grad_norm": 1.8414392119014225, + "learning_rate": 7.827948681028467e-06, + "loss": 0.6749, + "step": 3169 + }, + { + "epoch": 0.33, + "grad_norm": 1.8041772814742751, + "learning_rate": 7.826560229130132e-06, + "loss": 0.6561, + "step": 3170 + }, + { + "epoch": 0.33, + "grad_norm": 2.277001270558421, + "learning_rate": 7.82517145682508e-06, + "loss": 0.7026, + "step": 3171 + }, + { + "epoch": 0.33, + "grad_norm": 1.9932702774934739, + "learning_rate": 7.823782364270743e-06, + "loss": 0.6764, + "step": 3172 + }, + { + "epoch": 0.33, + "grad_norm": 1.9152540765115291, + "learning_rate": 7.82239295162458e-06, + "loss": 0.5991, + "step": 3173 + }, + { + "epoch": 0.33, + "grad_norm": 2.0171036692649142, + "learning_rate": 7.821003219044087e-06, + "loss": 0.6495, + "step": 3174 + }, + { + "epoch": 0.33, + "grad_norm": 1.9017373728063542, + "learning_rate": 7.819613166686802e-06, + "loss": 0.6444, + "step": 3175 + }, + { + "epoch": 0.33, + "grad_norm": 1.9178976145922144, + "learning_rate": 7.818222794710293e-06, + "loss": 0.61, + "step": 3176 + }, + { + "epoch": 0.33, + "grad_norm": 2.3536736709136723, + "learning_rate": 7.816832103272165e-06, + "loss": 0.5589, + "step": 3177 + }, + { + "epoch": 0.33, + "grad_norm": 2.5473526112248663, + "learning_rate": 7.815441092530064e-06, + "loss": 0.7009, + "step": 3178 + }, + { + "epoch": 0.33, + "grad_norm": 1.9025338748284415, + "learning_rate": 7.814049762641668e-06, + "loss": 0.665, + "step": 3179 + }, + { + "epoch": 0.33, + "grad_norm": 1.9662648376077074, + "learning_rate": 7.812658113764691e-06, + "loss": 0.8229, + "step": 3180 + }, + { + "epoch": 0.33, + "grad_norm": 1.8632716806392016, + "learning_rate": 7.811266146056886e-06, + "loss": 0.7367, + "step": 3181 + }, + { + "epoch": 0.33, + "grad_norm": 2.082465852054203, + "learning_rate": 7.80987385967604e-06, + "loss": 0.696, + "step": 3182 + }, + { + "epoch": 0.33, + "grad_norm": 1.9024013485348124, + "learning_rate": 7.808481254779975e-06, + "loss": 0.7804, + "step": 3183 + }, + { + "epoch": 0.33, + "grad_norm": 1.785436285512577, + "learning_rate": 7.807088331526553e-06, + "loss": 0.4849, + "step": 3184 + }, + { + "epoch": 0.33, + "grad_norm": 1.933460760859134, + "learning_rate": 7.805695090073668e-06, + "loss": 0.6479, + "step": 3185 + }, + { + "epoch": 0.33, + "grad_norm": 1.8728996996369847, + "learning_rate": 7.804301530579253e-06, + "loss": 0.6551, + "step": 3186 + }, + { + "epoch": 0.33, + "grad_norm": 1.9368897835831256, + "learning_rate": 7.802907653201275e-06, + "loss": 0.7015, + "step": 3187 + }, + { + "epoch": 0.33, + "grad_norm": 1.8425480322102283, + "learning_rate": 7.801513458097741e-06, + "loss": 0.6353, + "step": 3188 + }, + { + "epoch": 0.33, + "grad_norm": 1.9522627071910101, + "learning_rate": 7.800118945426684e-06, + "loss": 0.6558, + "step": 3189 + }, + { + "epoch": 0.33, + "grad_norm": 1.9459414208532624, + "learning_rate": 7.798724115346188e-06, + "loss": 0.6414, + "step": 3190 + }, + { + "epoch": 0.33, + "grad_norm": 1.8198890780658505, + "learning_rate": 7.797328968014359e-06, + "loss": 0.6503, + "step": 3191 + }, + { + "epoch": 0.33, + "grad_norm": 1.989239342204068, + "learning_rate": 7.795933503589349e-06, + "loss": 0.6533, + "step": 3192 + }, + { + "epoch": 0.33, + "grad_norm": 1.9017341494606674, + "learning_rate": 7.79453772222934e-06, + "loss": 0.6688, + "step": 3193 + }, + { + "epoch": 0.33, + "grad_norm": 1.790821468643048, + "learning_rate": 7.793141624092551e-06, + "loss": 0.6868, + "step": 3194 + }, + { + "epoch": 0.33, + "grad_norm": 2.1081525065527806, + "learning_rate": 7.791745209337239e-06, + "loss": 0.7204, + "step": 3195 + }, + { + "epoch": 0.33, + "grad_norm": 1.9121572482641518, + "learning_rate": 7.790348478121695e-06, + "loss": 0.7305, + "step": 3196 + }, + { + "epoch": 0.33, + "grad_norm": 2.197598958597094, + "learning_rate": 7.788951430604246e-06, + "loss": 0.7218, + "step": 3197 + }, + { + "epoch": 0.33, + "grad_norm": 1.7643539813858016, + "learning_rate": 7.787554066943256e-06, + "loss": 0.5945, + "step": 3198 + }, + { + "epoch": 0.33, + "grad_norm": 1.9199918036679502, + "learning_rate": 7.786156387297126e-06, + "loss": 0.6765, + "step": 3199 + }, + { + "epoch": 0.33, + "grad_norm": 1.9184094585478977, + "learning_rate": 7.784758391824286e-06, + "loss": 0.649, + "step": 3200 + }, + { + "epoch": 0.33, + "grad_norm": 1.9359112716415736, + "learning_rate": 7.783360080683212e-06, + "loss": 0.6215, + "step": 3201 + }, + { + "epoch": 0.33, + "grad_norm": 1.6650658879147164, + "learning_rate": 7.781961454032407e-06, + "loss": 0.6007, + "step": 3202 + }, + { + "epoch": 0.33, + "grad_norm": 1.8712142665583873, + "learning_rate": 7.780562512030414e-06, + "loss": 0.6509, + "step": 3203 + }, + { + "epoch": 0.33, + "grad_norm": 1.9979865163071713, + "learning_rate": 7.77916325483581e-06, + "loss": 0.7924, + "step": 3204 + }, + { + "epoch": 0.33, + "grad_norm": 1.9307099915203076, + "learning_rate": 7.777763682607214e-06, + "loss": 0.6831, + "step": 3205 + }, + { + "epoch": 0.33, + "grad_norm": 1.9719246908216845, + "learning_rate": 7.77636379550327e-06, + "loss": 0.6781, + "step": 3206 + }, + { + "epoch": 0.33, + "grad_norm": 2.281909905417469, + "learning_rate": 7.774963593682667e-06, + "loss": 0.7001, + "step": 3207 + }, + { + "epoch": 0.33, + "grad_norm": 1.9134686519705317, + "learning_rate": 7.773563077304123e-06, + "loss": 0.6533, + "step": 3208 + }, + { + "epoch": 0.33, + "grad_norm": 1.7375846606061787, + "learning_rate": 7.772162246526394e-06, + "loss": 0.6, + "step": 3209 + }, + { + "epoch": 0.33, + "grad_norm": 1.9210680843386594, + "learning_rate": 7.770761101508274e-06, + "loss": 0.6591, + "step": 3210 + }, + { + "epoch": 0.33, + "grad_norm": 1.858930054263607, + "learning_rate": 7.769359642408591e-06, + "loss": 0.7036, + "step": 3211 + }, + { + "epoch": 0.33, + "grad_norm": 2.024890925973368, + "learning_rate": 7.767957869386208e-06, + "loss": 0.6565, + "step": 3212 + }, + { + "epoch": 0.33, + "grad_norm": 1.8619952533612665, + "learning_rate": 7.766555782600023e-06, + "loss": 0.5979, + "step": 3213 + }, + { + "epoch": 0.33, + "grad_norm": 1.6727164069719362, + "learning_rate": 7.765153382208972e-06, + "loss": 0.6987, + "step": 3214 + }, + { + "epoch": 0.33, + "grad_norm": 1.8507203779819241, + "learning_rate": 7.763750668372023e-06, + "loss": 0.628, + "step": 3215 + }, + { + "epoch": 0.33, + "grad_norm": 1.9341511347273956, + "learning_rate": 7.762347641248182e-06, + "loss": 0.607, + "step": 3216 + }, + { + "epoch": 0.33, + "grad_norm": 1.8954280950807452, + "learning_rate": 7.760944300996494e-06, + "loss": 0.6493, + "step": 3217 + }, + { + "epoch": 0.33, + "grad_norm": 2.103132213288938, + "learning_rate": 7.759540647776031e-06, + "loss": 0.656, + "step": 3218 + }, + { + "epoch": 0.33, + "grad_norm": 1.964201049747272, + "learning_rate": 7.758136681745907e-06, + "loss": 0.6294, + "step": 3219 + }, + { + "epoch": 0.33, + "grad_norm": 1.731784592727429, + "learning_rate": 7.756732403065269e-06, + "loss": 0.633, + "step": 3220 + }, + { + "epoch": 0.33, + "grad_norm": 1.8723486158466771, + "learning_rate": 7.7553278118933e-06, + "loss": 0.5642, + "step": 3221 + }, + { + "epoch": 0.33, + "grad_norm": 1.912846816349759, + "learning_rate": 7.75392290838922e-06, + "loss": 0.6209, + "step": 3222 + }, + { + "epoch": 0.34, + "grad_norm": 2.0355518598418763, + "learning_rate": 7.75251769271228e-06, + "loss": 0.6866, + "step": 3223 + }, + { + "epoch": 0.34, + "grad_norm": 1.8024911163155974, + "learning_rate": 7.751112165021771e-06, + "loss": 0.6965, + "step": 3224 + }, + { + "epoch": 0.34, + "grad_norm": 1.992035602097195, + "learning_rate": 7.749706325477017e-06, + "loss": 0.727, + "step": 3225 + }, + { + "epoch": 0.34, + "grad_norm": 2.012344638600577, + "learning_rate": 7.74830017423738e-06, + "loss": 0.6522, + "step": 3226 + }, + { + "epoch": 0.34, + "grad_norm": 1.7334097631987118, + "learning_rate": 7.74689371146225e-06, + "loss": 0.7054, + "step": 3227 + }, + { + "epoch": 0.34, + "grad_norm": 1.910822938912061, + "learning_rate": 7.745486937311065e-06, + "loss": 0.699, + "step": 3228 + }, + { + "epoch": 0.34, + "grad_norm": 2.189272683241941, + "learning_rate": 7.744079851943286e-06, + "loss": 0.6648, + "step": 3229 + }, + { + "epoch": 0.34, + "grad_norm": 1.777075786244854, + "learning_rate": 7.742672455518413e-06, + "loss": 0.7164, + "step": 3230 + }, + { + "epoch": 0.34, + "grad_norm": 1.7858757362545894, + "learning_rate": 7.741264748195984e-06, + "loss": 0.6449, + "step": 3231 + }, + { + "epoch": 0.34, + "grad_norm": 1.9753861500283088, + "learning_rate": 7.739856730135575e-06, + "loss": 0.6902, + "step": 3232 + }, + { + "epoch": 0.34, + "grad_norm": 1.6698498743407861, + "learning_rate": 7.738448401496785e-06, + "loss": 0.6012, + "step": 3233 + }, + { + "epoch": 0.34, + "grad_norm": 2.0095261492551115, + "learning_rate": 7.737039762439263e-06, + "loss": 0.7093, + "step": 3234 + }, + { + "epoch": 0.34, + "grad_norm": 1.7863240773519469, + "learning_rate": 7.735630813122683e-06, + "loss": 0.6411, + "step": 3235 + }, + { + "epoch": 0.34, + "grad_norm": 1.7699717846306777, + "learning_rate": 7.734221553706756e-06, + "loss": 0.6196, + "step": 3236 + }, + { + "epoch": 0.34, + "grad_norm": 1.8298275662451435, + "learning_rate": 7.732811984351232e-06, + "loss": 0.6401, + "step": 3237 + }, + { + "epoch": 0.34, + "grad_norm": 1.8757903882230937, + "learning_rate": 7.731402105215892e-06, + "loss": 0.6505, + "step": 3238 + }, + { + "epoch": 0.34, + "grad_norm": 1.8771166804066328, + "learning_rate": 7.729991916460558e-06, + "loss": 0.6448, + "step": 3239 + }, + { + "epoch": 0.34, + "grad_norm": 2.0422800495620725, + "learning_rate": 7.728581418245078e-06, + "loss": 0.7242, + "step": 3240 + }, + { + "epoch": 0.34, + "grad_norm": 1.9500585202348317, + "learning_rate": 7.727170610729341e-06, + "loss": 0.6406, + "step": 3241 + }, + { + "epoch": 0.34, + "grad_norm": 1.9179487116731881, + "learning_rate": 7.725759494073272e-06, + "loss": 0.709, + "step": 3242 + }, + { + "epoch": 0.34, + "grad_norm": 1.8442830499995486, + "learning_rate": 7.72434806843683e-06, + "loss": 0.5964, + "step": 3243 + }, + { + "epoch": 0.34, + "grad_norm": 1.9596900219378959, + "learning_rate": 7.722936333980002e-06, + "loss": 0.6349, + "step": 3244 + }, + { + "epoch": 0.34, + "grad_norm": 1.8473944971850758, + "learning_rate": 7.721524290862821e-06, + "loss": 0.6101, + "step": 3245 + }, + { + "epoch": 0.34, + "grad_norm": 1.903216419149971, + "learning_rate": 7.720111939245351e-06, + "loss": 0.6553, + "step": 3246 + }, + { + "epoch": 0.34, + "grad_norm": 2.1706479120378255, + "learning_rate": 7.71869927928769e-06, + "loss": 0.5984, + "step": 3247 + }, + { + "epoch": 0.34, + "grad_norm": 1.9082270614604948, + "learning_rate": 7.717286311149967e-06, + "loss": 0.5559, + "step": 3248 + }, + { + "epoch": 0.34, + "grad_norm": 1.8074082902284876, + "learning_rate": 7.715873034992354e-06, + "loss": 0.6667, + "step": 3249 + }, + { + "epoch": 0.34, + "grad_norm": 2.4590764946541355, + "learning_rate": 7.714459450975052e-06, + "loss": 0.6183, + "step": 3250 + }, + { + "epoch": 0.34, + "grad_norm": 1.7625406627291564, + "learning_rate": 7.7130455592583e-06, + "loss": 0.7122, + "step": 3251 + }, + { + "epoch": 0.34, + "grad_norm": 2.104638475133327, + "learning_rate": 7.71163136000237e-06, + "loss": 0.6527, + "step": 3252 + }, + { + "epoch": 0.34, + "grad_norm": 1.8891265825485541, + "learning_rate": 7.710216853367568e-06, + "loss": 0.6901, + "step": 3253 + }, + { + "epoch": 0.34, + "grad_norm": 2.2146816583793743, + "learning_rate": 7.708802039514238e-06, + "loss": 0.7759, + "step": 3254 + }, + { + "epoch": 0.34, + "grad_norm": 1.9382427214432836, + "learning_rate": 7.707386918602759e-06, + "loss": 0.6797, + "step": 3255 + }, + { + "epoch": 0.34, + "grad_norm": 2.070382342712708, + "learning_rate": 7.70597149079354e-06, + "loss": 0.6483, + "step": 3256 + }, + { + "epoch": 0.34, + "grad_norm": 1.767128407067787, + "learning_rate": 7.70455575624703e-06, + "loss": 0.5626, + "step": 3257 + }, + { + "epoch": 0.34, + "grad_norm": 1.7058603097337257, + "learning_rate": 7.70313971512371e-06, + "loss": 0.4997, + "step": 3258 + }, + { + "epoch": 0.34, + "grad_norm": 1.7513131467756609, + "learning_rate": 7.701723367584094e-06, + "loss": 0.6709, + "step": 3259 + }, + { + "epoch": 0.34, + "grad_norm": 1.8236846567523217, + "learning_rate": 7.700306713788735e-06, + "loss": 0.6769, + "step": 3260 + }, + { + "epoch": 0.34, + "grad_norm": 2.1815094050601966, + "learning_rate": 7.69888975389822e-06, + "loss": 0.7312, + "step": 3261 + }, + { + "epoch": 0.34, + "grad_norm": 1.8049118711336998, + "learning_rate": 7.697472488073168e-06, + "loss": 0.6029, + "step": 3262 + }, + { + "epoch": 0.34, + "grad_norm": 1.6981106891148934, + "learning_rate": 7.696054916474235e-06, + "loss": 0.6433, + "step": 3263 + }, + { + "epoch": 0.34, + "grad_norm": 1.891118335430464, + "learning_rate": 7.694637039262109e-06, + "loss": 0.6485, + "step": 3264 + }, + { + "epoch": 0.34, + "grad_norm": 1.7889227423611733, + "learning_rate": 7.693218856597515e-06, + "loss": 0.6024, + "step": 3265 + }, + { + "epoch": 0.34, + "grad_norm": 1.9373484978474884, + "learning_rate": 7.691800368641214e-06, + "loss": 0.6609, + "step": 3266 + }, + { + "epoch": 0.34, + "grad_norm": 1.7546189984312635, + "learning_rate": 7.690381575553998e-06, + "loss": 0.6001, + "step": 3267 + }, + { + "epoch": 0.34, + "grad_norm": 1.742031273653571, + "learning_rate": 7.688962477496696e-06, + "loss": 0.6134, + "step": 3268 + }, + { + "epoch": 0.34, + "grad_norm": 1.8461145661001945, + "learning_rate": 7.68754307463017e-06, + "loss": 0.6021, + "step": 3269 + }, + { + "epoch": 0.34, + "grad_norm": 1.852835261703502, + "learning_rate": 7.68612336711532e-06, + "loss": 0.6454, + "step": 3270 + }, + { + "epoch": 0.34, + "grad_norm": 2.2319112287773493, + "learning_rate": 7.684703355113074e-06, + "loss": 0.799, + "step": 3271 + }, + { + "epoch": 0.34, + "grad_norm": 1.8107744027642172, + "learning_rate": 7.683283038784402e-06, + "loss": 0.6029, + "step": 3272 + }, + { + "epoch": 0.34, + "grad_norm": 2.0367129090031293, + "learning_rate": 7.681862418290302e-06, + "loss": 0.7111, + "step": 3273 + }, + { + "epoch": 0.34, + "grad_norm": 1.8995914346640856, + "learning_rate": 7.68044149379181e-06, + "loss": 0.6072, + "step": 3274 + }, + { + "epoch": 0.34, + "grad_norm": 1.8071434072693784, + "learning_rate": 7.679020265449999e-06, + "loss": 0.6942, + "step": 3275 + }, + { + "epoch": 0.34, + "grad_norm": 1.8946083630450248, + "learning_rate": 7.67759873342597e-06, + "loss": 0.666, + "step": 3276 + }, + { + "epoch": 0.34, + "grad_norm": 1.8185369692074362, + "learning_rate": 7.676176897880862e-06, + "loss": 0.6498, + "step": 3277 + }, + { + "epoch": 0.34, + "grad_norm": 1.8510420885532568, + "learning_rate": 7.67475475897585e-06, + "loss": 0.5591, + "step": 3278 + }, + { + "epoch": 0.34, + "grad_norm": 1.8125241947077075, + "learning_rate": 7.67333231687214e-06, + "loss": 0.6652, + "step": 3279 + }, + { + "epoch": 0.34, + "grad_norm": 1.6649027457440944, + "learning_rate": 7.671909571730974e-06, + "loss": 0.6202, + "step": 3280 + }, + { + "epoch": 0.34, + "grad_norm": 1.6610788823267293, + "learning_rate": 7.67048652371363e-06, + "loss": 0.5144, + "step": 3281 + }, + { + "epoch": 0.34, + "grad_norm": 2.051859824910609, + "learning_rate": 7.669063172981415e-06, + "loss": 0.8101, + "step": 3282 + }, + { + "epoch": 0.34, + "grad_norm": 1.8135258363900928, + "learning_rate": 7.667639519695678e-06, + "loss": 0.6548, + "step": 3283 + }, + { + "epoch": 0.34, + "grad_norm": 2.0789065749765734, + "learning_rate": 7.666215564017797e-06, + "loss": 0.7535, + "step": 3284 + }, + { + "epoch": 0.34, + "grad_norm": 1.9976097148077065, + "learning_rate": 7.664791306109183e-06, + "loss": 0.6333, + "step": 3285 + }, + { + "epoch": 0.34, + "grad_norm": 1.792724714904917, + "learning_rate": 7.663366746131286e-06, + "loss": 0.6677, + "step": 3286 + }, + { + "epoch": 0.34, + "grad_norm": 2.01046056944132, + "learning_rate": 7.661941884245589e-06, + "loss": 0.5846, + "step": 3287 + }, + { + "epoch": 0.34, + "grad_norm": 1.8378314305059873, + "learning_rate": 7.660516720613606e-06, + "loss": 0.7189, + "step": 3288 + }, + { + "epoch": 0.34, + "grad_norm": 2.116097571217621, + "learning_rate": 7.659091255396888e-06, + "loss": 0.6876, + "step": 3289 + }, + { + "epoch": 0.34, + "grad_norm": 2.452763925603077, + "learning_rate": 7.65766548875702e-06, + "loss": 0.6209, + "step": 3290 + }, + { + "epoch": 0.34, + "grad_norm": 1.7086733403610908, + "learning_rate": 7.656239420855621e-06, + "loss": 0.6561, + "step": 3291 + }, + { + "epoch": 0.34, + "grad_norm": 2.1055982116655105, + "learning_rate": 7.654813051854345e-06, + "loss": 0.6497, + "step": 3292 + }, + { + "epoch": 0.34, + "grad_norm": 1.945103058631345, + "learning_rate": 7.653386381914874e-06, + "loss": 0.7458, + "step": 3293 + }, + { + "epoch": 0.34, + "grad_norm": 1.8639467359374053, + "learning_rate": 7.651959411198934e-06, + "loss": 0.6667, + "step": 3294 + }, + { + "epoch": 0.34, + "grad_norm": 1.9314142195588189, + "learning_rate": 7.65053213986828e-06, + "loss": 0.6434, + "step": 3295 + }, + { + "epoch": 0.34, + "grad_norm": 1.968743977461504, + "learning_rate": 7.649104568084701e-06, + "loss": 0.6727, + "step": 3296 + }, + { + "epoch": 0.34, + "grad_norm": 1.6173702518058752, + "learning_rate": 7.64767669601002e-06, + "loss": 0.6738, + "step": 3297 + }, + { + "epoch": 0.34, + "grad_norm": 1.819260191517557, + "learning_rate": 7.646248523806092e-06, + "loss": 0.7187, + "step": 3298 + }, + { + "epoch": 0.34, + "grad_norm": 1.711390241856095, + "learning_rate": 7.644820051634813e-06, + "loss": 0.5777, + "step": 3299 + }, + { + "epoch": 0.34, + "grad_norm": 2.371999947619344, + "learning_rate": 7.643391279658106e-06, + "loss": 0.5607, + "step": 3300 + }, + { + "epoch": 0.34, + "grad_norm": 2.080268194932253, + "learning_rate": 7.64196220803793e-06, + "loss": 0.6454, + "step": 3301 + }, + { + "epoch": 0.34, + "grad_norm": 1.8161890897658788, + "learning_rate": 7.640532836936279e-06, + "loss": 0.6145, + "step": 3302 + }, + { + "epoch": 0.34, + "grad_norm": 1.8867414994265745, + "learning_rate": 7.639103166515179e-06, + "loss": 0.7367, + "step": 3303 + }, + { + "epoch": 0.34, + "grad_norm": 2.0829266047249013, + "learning_rate": 7.637673196936694e-06, + "loss": 0.7419, + "step": 3304 + }, + { + "epoch": 0.34, + "grad_norm": 2.021524026140808, + "learning_rate": 7.636242928362918e-06, + "loss": 0.7149, + "step": 3305 + }, + { + "epoch": 0.34, + "grad_norm": 1.9487343164474362, + "learning_rate": 7.634812360955982e-06, + "loss": 0.7065, + "step": 3306 + }, + { + "epoch": 0.34, + "grad_norm": 2.0147112197597092, + "learning_rate": 7.633381494878042e-06, + "loss": 0.6809, + "step": 3307 + }, + { + "epoch": 0.34, + "grad_norm": 2.033897246264916, + "learning_rate": 7.631950330291305e-06, + "loss": 0.6792, + "step": 3308 + }, + { + "epoch": 0.34, + "grad_norm": 1.7911815368426338, + "learning_rate": 7.630518867357994e-06, + "loss": 0.6244, + "step": 3309 + }, + { + "epoch": 0.34, + "grad_norm": 1.8465211407886768, + "learning_rate": 7.629087106240376e-06, + "loss": 0.6385, + "step": 3310 + }, + { + "epoch": 0.34, + "grad_norm": 1.8768778769605858, + "learning_rate": 7.6276550471007486e-06, + "loss": 0.6325, + "step": 3311 + }, + { + "epoch": 0.34, + "grad_norm": 1.8761662832688346, + "learning_rate": 7.626222690101445e-06, + "loss": 0.6617, + "step": 3312 + }, + { + "epoch": 0.34, + "grad_norm": 1.8842109296938618, + "learning_rate": 7.624790035404831e-06, + "loss": 0.6187, + "step": 3313 + }, + { + "epoch": 0.34, + "grad_norm": 1.998556004167101, + "learning_rate": 7.623357083173306e-06, + "loss": 0.6287, + "step": 3314 + }, + { + "epoch": 0.34, + "grad_norm": 1.9369889183368985, + "learning_rate": 7.621923833569301e-06, + "loss": 0.6092, + "step": 3315 + }, + { + "epoch": 0.34, + "grad_norm": 1.8729477963392878, + "learning_rate": 7.620490286755286e-06, + "loss": 0.6911, + "step": 3316 + }, + { + "epoch": 0.34, + "grad_norm": 1.9822928384022405, + "learning_rate": 7.619056442893762e-06, + "loss": 0.662, + "step": 3317 + }, + { + "epoch": 0.34, + "grad_norm": 1.7248626081500673, + "learning_rate": 7.61762230214726e-06, + "loss": 0.6479, + "step": 3318 + }, + { + "epoch": 0.35, + "grad_norm": 2.0789146795929487, + "learning_rate": 7.616187864678352e-06, + "loss": 0.6182, + "step": 3319 + }, + { + "epoch": 0.35, + "grad_norm": 1.838839059416301, + "learning_rate": 7.614753130649638e-06, + "loss": 0.656, + "step": 3320 + }, + { + "epoch": 0.35, + "grad_norm": 1.7187828108507008, + "learning_rate": 7.613318100223752e-06, + "loss": 0.6863, + "step": 3321 + }, + { + "epoch": 0.35, + "grad_norm": 1.695488574423953, + "learning_rate": 7.611882773563364e-06, + "loss": 0.6258, + "step": 3322 + }, + { + "epoch": 0.35, + "grad_norm": 1.758042093943283, + "learning_rate": 7.610447150831175e-06, + "loss": 0.6084, + "step": 3323 + }, + { + "epoch": 0.35, + "grad_norm": 1.9600377009168812, + "learning_rate": 7.609011232189925e-06, + "loss": 0.6382, + "step": 3324 + }, + { + "epoch": 0.35, + "grad_norm": 2.004029691758048, + "learning_rate": 7.60757501780238e-06, + "loss": 0.6046, + "step": 3325 + }, + { + "epoch": 0.35, + "grad_norm": 1.6927785653074294, + "learning_rate": 7.6061385078313424e-06, + "loss": 0.6248, + "step": 3326 + }, + { + "epoch": 0.35, + "grad_norm": 1.7976204301469354, + "learning_rate": 7.604701702439652e-06, + "loss": 0.6413, + "step": 3327 + }, + { + "epoch": 0.35, + "grad_norm": 1.8544041865004253, + "learning_rate": 7.603264601790178e-06, + "loss": 0.6129, + "step": 3328 + }, + { + "epoch": 0.35, + "grad_norm": 1.708847319417589, + "learning_rate": 7.601827206045822e-06, + "loss": 0.5682, + "step": 3329 + }, + { + "epoch": 0.35, + "grad_norm": 1.8900069868675693, + "learning_rate": 7.600389515369522e-06, + "loss": 0.6713, + "step": 3330 + }, + { + "epoch": 0.35, + "grad_norm": 1.667577105224544, + "learning_rate": 7.59895152992425e-06, + "loss": 0.5176, + "step": 3331 + }, + { + "epoch": 0.35, + "grad_norm": 1.823563725373987, + "learning_rate": 7.597513249873008e-06, + "loss": 0.6509, + "step": 3332 + }, + { + "epoch": 0.35, + "grad_norm": 2.1014119569769187, + "learning_rate": 7.5960746753788335e-06, + "loss": 0.7477, + "step": 3333 + }, + { + "epoch": 0.35, + "grad_norm": 1.9398729490943276, + "learning_rate": 7.594635806604797e-06, + "loss": 0.6432, + "step": 3334 + }, + { + "epoch": 0.35, + "grad_norm": 1.689141864675774, + "learning_rate": 7.593196643714005e-06, + "loss": 0.6297, + "step": 3335 + }, + { + "epoch": 0.35, + "grad_norm": 1.8997762411676566, + "learning_rate": 7.5917571868695905e-06, + "loss": 0.6331, + "step": 3336 + }, + { + "epoch": 0.35, + "grad_norm": 2.0613676536332335, + "learning_rate": 7.5903174362347265e-06, + "loss": 0.6842, + "step": 3337 + }, + { + "epoch": 0.35, + "grad_norm": 1.9957895885750523, + "learning_rate": 7.5888773919726176e-06, + "loss": 0.7165, + "step": 3338 + }, + { + "epoch": 0.35, + "grad_norm": 2.018970903898421, + "learning_rate": 7.5874370542465005e-06, + "loss": 0.6857, + "step": 3339 + }, + { + "epoch": 0.35, + "grad_norm": 1.961825625490136, + "learning_rate": 7.585996423219643e-06, + "loss": 0.6977, + "step": 3340 + }, + { + "epoch": 0.35, + "grad_norm": 2.1271983391631766, + "learning_rate": 7.584555499055355e-06, + "loss": 0.7688, + "step": 3341 + }, + { + "epoch": 0.35, + "grad_norm": 1.7660688433101426, + "learning_rate": 7.5831142819169664e-06, + "loss": 0.5719, + "step": 3342 + }, + { + "epoch": 0.35, + "grad_norm": 1.9487869971090308, + "learning_rate": 7.581672771967854e-06, + "loss": 0.7091, + "step": 3343 + }, + { + "epoch": 0.35, + "grad_norm": 1.9584035184171076, + "learning_rate": 7.5802309693714145e-06, + "loss": 0.6265, + "step": 3344 + }, + { + "epoch": 0.35, + "grad_norm": 1.8973868826432616, + "learning_rate": 7.57878887429109e-06, + "loss": 0.5412, + "step": 3345 + }, + { + "epoch": 0.35, + "grad_norm": 1.8292437136670205, + "learning_rate": 7.5773464868903465e-06, + "loss": 0.5505, + "step": 3346 + }, + { + "epoch": 0.35, + "grad_norm": 2.110059781420011, + "learning_rate": 7.57590380733269e-06, + "loss": 0.6323, + "step": 3347 + }, + { + "epoch": 0.35, + "grad_norm": 1.8430200609730625, + "learning_rate": 7.574460835781654e-06, + "loss": 0.5947, + "step": 3348 + }, + { + "epoch": 0.35, + "grad_norm": 1.8841432114527707, + "learning_rate": 7.573017572400807e-06, + "loss": 0.6783, + "step": 3349 + }, + { + "epoch": 0.35, + "grad_norm": 2.013283679598883, + "learning_rate": 7.571574017353755e-06, + "loss": 0.6922, + "step": 3350 + }, + { + "epoch": 0.35, + "grad_norm": 1.773668795659689, + "learning_rate": 7.570130170804129e-06, + "loss": 0.6641, + "step": 3351 + }, + { + "epoch": 0.35, + "grad_norm": 1.746076214972598, + "learning_rate": 7.5686860329156e-06, + "loss": 0.5992, + "step": 3352 + }, + { + "epoch": 0.35, + "grad_norm": 1.8380004264405159, + "learning_rate": 7.567241603851866e-06, + "loss": 0.6498, + "step": 3353 + }, + { + "epoch": 0.35, + "grad_norm": 1.908526988253742, + "learning_rate": 7.565796883776666e-06, + "loss": 0.679, + "step": 3354 + }, + { + "epoch": 0.35, + "grad_norm": 1.7663503430585459, + "learning_rate": 7.564351872853763e-06, + "loss": 0.6488, + "step": 3355 + }, + { + "epoch": 0.35, + "grad_norm": 1.8698403441208586, + "learning_rate": 7.5629065712469595e-06, + "loss": 0.6297, + "step": 3356 + }, + { + "epoch": 0.35, + "grad_norm": 1.7693710236384135, + "learning_rate": 7.561460979120088e-06, + "loss": 0.6881, + "step": 3357 + }, + { + "epoch": 0.35, + "grad_norm": 1.8736827010560169, + "learning_rate": 7.560015096637015e-06, + "loss": 0.643, + "step": 3358 + }, + { + "epoch": 0.35, + "grad_norm": 1.9932305255398057, + "learning_rate": 7.558568923961638e-06, + "loss": 0.6615, + "step": 3359 + }, + { + "epoch": 0.35, + "grad_norm": 1.7329221135497108, + "learning_rate": 7.557122461257891e-06, + "loss": 0.5559, + "step": 3360 + }, + { + "epoch": 0.35, + "grad_norm": 1.7334989136140522, + "learning_rate": 7.555675708689738e-06, + "loss": 0.7119, + "step": 3361 + }, + { + "epoch": 0.35, + "grad_norm": 1.941030820066443, + "learning_rate": 7.554228666421176e-06, + "loss": 0.7062, + "step": 3362 + }, + { + "epoch": 0.35, + "grad_norm": 1.6938476484173985, + "learning_rate": 7.552781334616237e-06, + "loss": 0.7022, + "step": 3363 + }, + { + "epoch": 0.35, + "grad_norm": 2.086372882874223, + "learning_rate": 7.551333713438982e-06, + "loss": 0.5912, + "step": 3364 + }, + { + "epoch": 0.35, + "grad_norm": 2.0681430639202976, + "learning_rate": 7.549885803053509e-06, + "loss": 0.6965, + "step": 3365 + }, + { + "epoch": 0.35, + "grad_norm": 1.8155626765570805, + "learning_rate": 7.548437603623947e-06, + "loss": 0.6455, + "step": 3366 + }, + { + "epoch": 0.35, + "grad_norm": 2.0742329496020178, + "learning_rate": 7.546989115314456e-06, + "loss": 0.6947, + "step": 3367 + }, + { + "epoch": 0.35, + "grad_norm": 1.7556510523856663, + "learning_rate": 7.5455403382892325e-06, + "loss": 0.6135, + "step": 3368 + }, + { + "epoch": 0.35, + "grad_norm": 1.9078576489775798, + "learning_rate": 7.544091272712501e-06, + "loss": 0.6017, + "step": 3369 + }, + { + "epoch": 0.35, + "grad_norm": 1.6812110139609036, + "learning_rate": 7.542641918748526e-06, + "loss": 0.584, + "step": 3370 + }, + { + "epoch": 0.35, + "grad_norm": 2.1753283760243174, + "learning_rate": 7.541192276561594e-06, + "loss": 0.6506, + "step": 3371 + }, + { + "epoch": 0.35, + "grad_norm": 1.9246927540441021, + "learning_rate": 7.539742346316035e-06, + "loss": 0.6418, + "step": 3372 + }, + { + "epoch": 0.35, + "grad_norm": 2.0764921092291715, + "learning_rate": 7.538292128176204e-06, + "loss": 0.7311, + "step": 3373 + }, + { + "epoch": 0.35, + "grad_norm": 2.0016266904008444, + "learning_rate": 7.536841622306491e-06, + "loss": 0.6698, + "step": 3374 + }, + { + "epoch": 0.35, + "grad_norm": 1.7476993025192442, + "learning_rate": 7.535390828871322e-06, + "loss": 0.5843, + "step": 3375 + }, + { + "epoch": 0.35, + "grad_norm": 2.1443050656742795, + "learning_rate": 7.5339397480351525e-06, + "loss": 0.6661, + "step": 3376 + }, + { + "epoch": 0.35, + "grad_norm": 1.9640498566143108, + "learning_rate": 7.532488379962468e-06, + "loss": 0.6737, + "step": 3377 + }, + { + "epoch": 0.35, + "grad_norm": 1.894660279182222, + "learning_rate": 7.531036724817791e-06, + "loss": 0.6363, + "step": 3378 + }, + { + "epoch": 0.35, + "grad_norm": 1.865129793714187, + "learning_rate": 7.529584782765675e-06, + "loss": 0.6154, + "step": 3379 + }, + { + "epoch": 0.35, + "grad_norm": 2.0631351014194634, + "learning_rate": 7.528132553970706e-06, + "loss": 0.6748, + "step": 3380 + }, + { + "epoch": 0.35, + "grad_norm": 1.6704861005463274, + "learning_rate": 7.526680038597502e-06, + "loss": 0.6883, + "step": 3381 + }, + { + "epoch": 0.35, + "grad_norm": 2.093149404086043, + "learning_rate": 7.525227236810715e-06, + "loss": 0.5749, + "step": 3382 + }, + { + "epoch": 0.35, + "grad_norm": 2.0045663651778387, + "learning_rate": 7.523774148775027e-06, + "loss": 0.7112, + "step": 3383 + }, + { + "epoch": 0.35, + "grad_norm": 1.879588638216167, + "learning_rate": 7.522320774655154e-06, + "loss": 0.632, + "step": 3384 + }, + { + "epoch": 0.35, + "grad_norm": 1.796450452129187, + "learning_rate": 7.520867114615844e-06, + "loss": 0.6143, + "step": 3385 + }, + { + "epoch": 0.35, + "grad_norm": 1.9221269954863394, + "learning_rate": 7.519413168821878e-06, + "loss": 0.6371, + "step": 3386 + }, + { + "epoch": 0.35, + "grad_norm": 2.245278008970985, + "learning_rate": 7.5179589374380705e-06, + "loss": 0.6296, + "step": 3387 + }, + { + "epoch": 0.35, + "grad_norm": 1.928866713539612, + "learning_rate": 7.516504420629264e-06, + "loss": 0.6113, + "step": 3388 + }, + { + "epoch": 0.35, + "grad_norm": 2.295181510678045, + "learning_rate": 7.515049618560337e-06, + "loss": 0.6031, + "step": 3389 + }, + { + "epoch": 0.35, + "grad_norm": 2.195352983128163, + "learning_rate": 7.513594531396202e-06, + "loss": 0.6993, + "step": 3390 + }, + { + "epoch": 0.35, + "grad_norm": 1.7327908331334299, + "learning_rate": 7.512139159301801e-06, + "loss": 0.5751, + "step": 3391 + }, + { + "epoch": 0.35, + "grad_norm": 1.7569641738070485, + "learning_rate": 7.510683502442105e-06, + "loss": 0.6413, + "step": 3392 + }, + { + "epoch": 0.35, + "grad_norm": 1.6720116832024288, + "learning_rate": 7.5092275609821254e-06, + "loss": 0.6304, + "step": 3393 + }, + { + "epoch": 0.35, + "grad_norm": 2.164668470616303, + "learning_rate": 7.5077713350869e-06, + "loss": 0.7014, + "step": 3394 + }, + { + "epoch": 0.35, + "grad_norm": 2.0418743773840204, + "learning_rate": 7.506314824921498e-06, + "loss": 0.7441, + "step": 3395 + }, + { + "epoch": 0.35, + "grad_norm": 1.9352876217904793, + "learning_rate": 7.504858030651026e-06, + "loss": 0.7133, + "step": 3396 + }, + { + "epoch": 0.35, + "grad_norm": 1.9893676571328642, + "learning_rate": 7.503400952440618e-06, + "loss": 0.679, + "step": 3397 + }, + { + "epoch": 0.35, + "grad_norm": 2.0088136972631725, + "learning_rate": 7.501943590455445e-06, + "loss": 0.622, + "step": 3398 + }, + { + "epoch": 0.35, + "grad_norm": 1.635984512656272, + "learning_rate": 7.500485944860705e-06, + "loss": 0.5652, + "step": 3399 + }, + { + "epoch": 0.35, + "grad_norm": 1.870219344342775, + "learning_rate": 7.49902801582163e-06, + "loss": 0.676, + "step": 3400 + }, + { + "epoch": 0.35, + "grad_norm": 2.19177716835321, + "learning_rate": 7.497569803503486e-06, + "loss": 0.7368, + "step": 3401 + }, + { + "epoch": 0.35, + "grad_norm": 1.9242849370350925, + "learning_rate": 7.49611130807157e-06, + "loss": 0.7252, + "step": 3402 + }, + { + "epoch": 0.35, + "grad_norm": 1.9182235720120104, + "learning_rate": 7.494652529691209e-06, + "loss": 0.713, + "step": 3403 + }, + { + "epoch": 0.35, + "grad_norm": 1.7468572198863737, + "learning_rate": 7.493193468527764e-06, + "loss": 0.643, + "step": 3404 + }, + { + "epoch": 0.35, + "grad_norm": 1.7308546976807355, + "learning_rate": 7.491734124746628e-06, + "loss": 0.6003, + "step": 3405 + }, + { + "epoch": 0.35, + "grad_norm": 1.8679022278259034, + "learning_rate": 7.490274498513228e-06, + "loss": 0.7414, + "step": 3406 + }, + { + "epoch": 0.35, + "grad_norm": 2.0451720769063906, + "learning_rate": 7.488814589993019e-06, + "loss": 0.6454, + "step": 3407 + }, + { + "epoch": 0.35, + "grad_norm": 1.954452155359003, + "learning_rate": 7.487354399351491e-06, + "loss": 0.7069, + "step": 3408 + }, + { + "epoch": 0.35, + "grad_norm": 2.0412457459452837, + "learning_rate": 7.485893926754164e-06, + "loss": 0.706, + "step": 3409 + }, + { + "epoch": 0.35, + "grad_norm": 1.756504654826153, + "learning_rate": 7.484433172366592e-06, + "loss": 0.5949, + "step": 3410 + }, + { + "epoch": 0.35, + "grad_norm": 2.0975397217258207, + "learning_rate": 7.482972136354359e-06, + "loss": 0.827, + "step": 3411 + }, + { + "epoch": 0.35, + "grad_norm": 1.9019894290221375, + "learning_rate": 7.48151081888308e-06, + "loss": 0.6112, + "step": 3412 + }, + { + "epoch": 0.35, + "grad_norm": 1.961151827754993, + "learning_rate": 7.480049220118407e-06, + "loss": 0.6559, + "step": 3413 + }, + { + "epoch": 0.35, + "grad_norm": 2.1285287007697966, + "learning_rate": 7.478587340226019e-06, + "loss": 0.6444, + "step": 3414 + }, + { + "epoch": 0.35, + "grad_norm": 1.986160997932849, + "learning_rate": 7.477125179371628e-06, + "loss": 0.6846, + "step": 3415 + }, + { + "epoch": 0.36, + "grad_norm": 1.8237130387040967, + "learning_rate": 7.475662737720981e-06, + "loss": 0.6, + "step": 3416 + }, + { + "epoch": 0.36, + "grad_norm": 1.851416521972734, + "learning_rate": 7.47420001543985e-06, + "loss": 0.7632, + "step": 3417 + }, + { + "epoch": 0.36, + "grad_norm": 1.9104748163467893, + "learning_rate": 7.472737012694045e-06, + "loss": 0.5842, + "step": 3418 + }, + { + "epoch": 0.36, + "grad_norm": 1.9035555954382448, + "learning_rate": 7.471273729649404e-06, + "loss": 0.7967, + "step": 3419 + }, + { + "epoch": 0.36, + "grad_norm": 2.0514769058465228, + "learning_rate": 7.469810166471802e-06, + "loss": 0.6309, + "step": 3420 + }, + { + "epoch": 0.36, + "grad_norm": 1.831229869638447, + "learning_rate": 7.46834632332714e-06, + "loss": 0.6609, + "step": 3421 + }, + { + "epoch": 0.36, + "grad_norm": 2.0840899836154985, + "learning_rate": 7.466882200381352e-06, + "loss": 0.6543, + "step": 3422 + }, + { + "epoch": 0.36, + "grad_norm": 1.7908059902561366, + "learning_rate": 7.465417797800406e-06, + "loss": 0.6881, + "step": 3423 + }, + { + "epoch": 0.36, + "grad_norm": 1.9837969625415905, + "learning_rate": 7.463953115750302e-06, + "loss": 0.633, + "step": 3424 + }, + { + "epoch": 0.36, + "grad_norm": 1.6172551301392455, + "learning_rate": 7.462488154397067e-06, + "loss": 0.6234, + "step": 3425 + }, + { + "epoch": 0.36, + "grad_norm": 1.927272898423346, + "learning_rate": 7.461022913906764e-06, + "loss": 0.6417, + "step": 3426 + }, + { + "epoch": 0.36, + "grad_norm": 2.0551613282894583, + "learning_rate": 7.459557394445486e-06, + "loss": 0.6011, + "step": 3427 + }, + { + "epoch": 0.36, + "grad_norm": 2.0067846400605243, + "learning_rate": 7.458091596179359e-06, + "loss": 0.6854, + "step": 3428 + }, + { + "epoch": 0.36, + "grad_norm": 2.242882214853698, + "learning_rate": 7.4566255192745384e-06, + "loss": 0.6827, + "step": 3429 + }, + { + "epoch": 0.36, + "grad_norm": 1.9060789364117876, + "learning_rate": 7.455159163897213e-06, + "loss": 0.6727, + "step": 3430 + }, + { + "epoch": 0.36, + "grad_norm": 1.9612708236391978, + "learning_rate": 7.453692530213603e-06, + "loss": 0.6663, + "step": 3431 + }, + { + "epoch": 0.36, + "grad_norm": 2.3743911219968425, + "learning_rate": 7.452225618389959e-06, + "loss": 0.6794, + "step": 3432 + }, + { + "epoch": 0.36, + "grad_norm": 1.8840472801485948, + "learning_rate": 7.4507584285925625e-06, + "loss": 0.6563, + "step": 3433 + }, + { + "epoch": 0.36, + "grad_norm": 1.7866444653145974, + "learning_rate": 7.4492909609877304e-06, + "loss": 0.7027, + "step": 3434 + }, + { + "epoch": 0.36, + "grad_norm": 1.8941736336086783, + "learning_rate": 7.447823215741807e-06, + "loss": 0.6318, + "step": 3435 + }, + { + "epoch": 0.36, + "grad_norm": 1.782248833519918, + "learning_rate": 7.446355193021171e-06, + "loss": 0.7005, + "step": 3436 + }, + { + "epoch": 0.36, + "grad_norm": 1.9330931951718446, + "learning_rate": 7.444886892992229e-06, + "loss": 0.7213, + "step": 3437 + }, + { + "epoch": 0.36, + "grad_norm": 1.8453021176715279, + "learning_rate": 7.443418315821422e-06, + "loss": 0.6963, + "step": 3438 + }, + { + "epoch": 0.36, + "grad_norm": 1.7615865281146448, + "learning_rate": 7.441949461675223e-06, + "loss": 0.6233, + "step": 3439 + }, + { + "epoch": 0.36, + "grad_norm": 1.9187017752193514, + "learning_rate": 7.4404803307201345e-06, + "loss": 0.6402, + "step": 3440 + }, + { + "epoch": 0.36, + "grad_norm": 1.8966505444836648, + "learning_rate": 7.4390109231226895e-06, + "loss": 0.6392, + "step": 3441 + }, + { + "epoch": 0.36, + "grad_norm": 1.6365254365980388, + "learning_rate": 7.437541239049453e-06, + "loss": 0.6298, + "step": 3442 + }, + { + "epoch": 0.36, + "grad_norm": 1.7596906938070285, + "learning_rate": 7.436071278667024e-06, + "loss": 0.599, + "step": 3443 + }, + { + "epoch": 0.36, + "grad_norm": 1.787527162481923, + "learning_rate": 7.4346010421420314e-06, + "loss": 0.6659, + "step": 3444 + }, + { + "epoch": 0.36, + "grad_norm": 2.1953328947218864, + "learning_rate": 7.433130529641133e-06, + "loss": 0.7025, + "step": 3445 + }, + { + "epoch": 0.36, + "grad_norm": 1.8839951723071167, + "learning_rate": 7.431659741331022e-06, + "loss": 0.6574, + "step": 3446 + }, + { + "epoch": 0.36, + "grad_norm": 1.7025871231466374, + "learning_rate": 7.430188677378418e-06, + "loss": 0.6093, + "step": 3447 + }, + { + "epoch": 0.36, + "grad_norm": 1.8718629399629925, + "learning_rate": 7.4287173379500764e-06, + "loss": 0.6797, + "step": 3448 + }, + { + "epoch": 0.36, + "grad_norm": 1.9082997423204202, + "learning_rate": 7.427245723212781e-06, + "loss": 0.5676, + "step": 3449 + }, + { + "epoch": 0.36, + "grad_norm": 2.4745611002431454, + "learning_rate": 7.425773833333349e-06, + "loss": 0.6926, + "step": 3450 + }, + { + "epoch": 0.36, + "grad_norm": 1.7970418489642042, + "learning_rate": 7.424301668478626e-06, + "loss": 0.7225, + "step": 3451 + }, + { + "epoch": 0.36, + "grad_norm": 2.0818049027970567, + "learning_rate": 7.422829228815491e-06, + "loss": 0.6821, + "step": 3452 + }, + { + "epoch": 0.36, + "grad_norm": 1.9152217689013808, + "learning_rate": 7.421356514510853e-06, + "loss": 0.674, + "step": 3453 + }, + { + "epoch": 0.36, + "grad_norm": 1.948628354944815, + "learning_rate": 7.419883525731653e-06, + "loss": 0.6712, + "step": 3454 + }, + { + "epoch": 0.36, + "grad_norm": 1.7921525560109597, + "learning_rate": 7.418410262644862e-06, + "loss": 0.7164, + "step": 3455 + }, + { + "epoch": 0.36, + "grad_norm": 1.8241838649494904, + "learning_rate": 7.416936725417483e-06, + "loss": 0.6467, + "step": 3456 + }, + { + "epoch": 0.36, + "grad_norm": 1.9392896030044404, + "learning_rate": 7.415462914216551e-06, + "loss": 0.647, + "step": 3457 + }, + { + "epoch": 0.36, + "grad_norm": 1.9242618966552065, + "learning_rate": 7.41398882920913e-06, + "loss": 0.7629, + "step": 3458 + }, + { + "epoch": 0.36, + "grad_norm": 1.8239619089818286, + "learning_rate": 7.4125144705623155e-06, + "loss": 0.5769, + "step": 3459 + }, + { + "epoch": 0.36, + "grad_norm": 1.9414170395727977, + "learning_rate": 7.411039838443234e-06, + "loss": 0.7273, + "step": 3460 + }, + { + "epoch": 0.36, + "grad_norm": 1.796824341854242, + "learning_rate": 7.409564933019046e-06, + "loss": 0.6165, + "step": 3461 + }, + { + "epoch": 0.36, + "grad_norm": 1.834416215786427, + "learning_rate": 7.408089754456939e-06, + "loss": 0.5974, + "step": 3462 + }, + { + "epoch": 0.36, + "grad_norm": 1.8172898388225946, + "learning_rate": 7.406614302924131e-06, + "loss": 0.5895, + "step": 3463 + }, + { + "epoch": 0.36, + "grad_norm": 1.9176599567813366, + "learning_rate": 7.405138578587876e-06, + "loss": 0.6056, + "step": 3464 + }, + { + "epoch": 0.36, + "grad_norm": 1.892048274582568, + "learning_rate": 7.403662581615454e-06, + "loss": 0.7066, + "step": 3465 + }, + { + "epoch": 0.36, + "grad_norm": 1.8893025379528445, + "learning_rate": 7.402186312174177e-06, + "loss": 0.6479, + "step": 3466 + }, + { + "epoch": 0.36, + "grad_norm": 1.8654581009570594, + "learning_rate": 7.4007097704313894e-06, + "loss": 0.5893, + "step": 3467 + }, + { + "epoch": 0.36, + "grad_norm": 1.8272464271313358, + "learning_rate": 7.399232956554468e-06, + "loss": 0.7083, + "step": 3468 + }, + { + "epoch": 0.36, + "grad_norm": 1.810292265366264, + "learning_rate": 7.397755870710813e-06, + "loss": 0.6151, + "step": 3469 + }, + { + "epoch": 0.36, + "grad_norm": 1.8509332314834515, + "learning_rate": 7.396278513067865e-06, + "loss": 0.616, + "step": 3470 + }, + { + "epoch": 0.36, + "grad_norm": 1.6915564968768657, + "learning_rate": 7.394800883793087e-06, + "loss": 0.6099, + "step": 3471 + }, + { + "epoch": 0.36, + "grad_norm": 1.735592555743763, + "learning_rate": 7.393322983053982e-06, + "loss": 0.5545, + "step": 3472 + }, + { + "epoch": 0.36, + "grad_norm": 2.2703853679664783, + "learning_rate": 7.391844811018074e-06, + "loss": 0.6984, + "step": 3473 + }, + { + "epoch": 0.36, + "grad_norm": 2.0112032870742116, + "learning_rate": 7.390366367852923e-06, + "loss": 0.6332, + "step": 3474 + }, + { + "epoch": 0.36, + "grad_norm": 1.8616175779663644, + "learning_rate": 7.38888765372612e-06, + "loss": 0.5883, + "step": 3475 + }, + { + "epoch": 0.36, + "grad_norm": 1.712242784645238, + "learning_rate": 7.387408668805285e-06, + "loss": 0.7109, + "step": 3476 + }, + { + "epoch": 0.36, + "grad_norm": 1.6442545413702605, + "learning_rate": 7.38592941325807e-06, + "loss": 0.6177, + "step": 3477 + }, + { + "epoch": 0.36, + "grad_norm": 1.9352886207798174, + "learning_rate": 7.384449887252156e-06, + "loss": 0.7331, + "step": 3478 + }, + { + "epoch": 0.36, + "grad_norm": 1.918089359140376, + "learning_rate": 7.382970090955258e-06, + "loss": 0.607, + "step": 3479 + }, + { + "epoch": 0.36, + "grad_norm": 1.653803914611329, + "learning_rate": 7.381490024535117e-06, + "loss": 0.5688, + "step": 3480 + }, + { + "epoch": 0.36, + "grad_norm": 1.8646016700584298, + "learning_rate": 7.380009688159507e-06, + "loss": 0.7111, + "step": 3481 + }, + { + "epoch": 0.36, + "grad_norm": 1.9421559203462602, + "learning_rate": 7.378529081996233e-06, + "loss": 0.5779, + "step": 3482 + }, + { + "epoch": 0.36, + "grad_norm": 1.7791816060422943, + "learning_rate": 7.377048206213132e-06, + "loss": 0.5849, + "step": 3483 + }, + { + "epoch": 0.36, + "grad_norm": 1.9856777896158313, + "learning_rate": 7.375567060978067e-06, + "loss": 0.6638, + "step": 3484 + }, + { + "epoch": 0.36, + "grad_norm": 1.6344717716116572, + "learning_rate": 7.374085646458935e-06, + "loss": 0.6223, + "step": 3485 + }, + { + "epoch": 0.36, + "grad_norm": 1.7495251385174526, + "learning_rate": 7.372603962823664e-06, + "loss": 0.7742, + "step": 3486 + }, + { + "epoch": 0.36, + "grad_norm": 2.020767726437643, + "learning_rate": 7.3711220102402105e-06, + "loss": 0.5837, + "step": 3487 + }, + { + "epoch": 0.36, + "grad_norm": 2.0961729815094214, + "learning_rate": 7.369639788876561e-06, + "loss": 0.6203, + "step": 3488 + }, + { + "epoch": 0.36, + "grad_norm": 1.9022781368389061, + "learning_rate": 7.3681572989007365e-06, + "loss": 0.7023, + "step": 3489 + }, + { + "epoch": 0.36, + "grad_norm": 2.0866961157120456, + "learning_rate": 7.366674540480784e-06, + "loss": 0.7635, + "step": 3490 + }, + { + "epoch": 0.36, + "grad_norm": 2.507173647122414, + "learning_rate": 7.365191513784782e-06, + "loss": 0.5723, + "step": 3491 + }, + { + "epoch": 0.36, + "grad_norm": 2.0748742922266414, + "learning_rate": 7.363708218980841e-06, + "loss": 0.7909, + "step": 3492 + }, + { + "epoch": 0.36, + "grad_norm": 1.8333987770622944, + "learning_rate": 7.3622246562371e-06, + "loss": 0.6826, + "step": 3493 + }, + { + "epoch": 0.36, + "grad_norm": 1.70355206649613, + "learning_rate": 7.360740825721732e-06, + "loss": 0.5902, + "step": 3494 + }, + { + "epoch": 0.36, + "grad_norm": 1.9576190398806632, + "learning_rate": 7.3592567276029336e-06, + "loss": 0.6535, + "step": 3495 + }, + { + "epoch": 0.36, + "grad_norm": 1.8723793599589569, + "learning_rate": 7.357772362048939e-06, + "loss": 0.6424, + "step": 3496 + }, + { + "epoch": 0.36, + "grad_norm": 1.8059004571656276, + "learning_rate": 7.356287729228007e-06, + "loss": 0.6448, + "step": 3497 + }, + { + "epoch": 0.36, + "grad_norm": 1.7440998752846177, + "learning_rate": 7.354802829308432e-06, + "loss": 0.68, + "step": 3498 + }, + { + "epoch": 0.36, + "grad_norm": 1.9870844742740057, + "learning_rate": 7.353317662458532e-06, + "loss": 0.7523, + "step": 3499 + }, + { + "epoch": 0.36, + "grad_norm": 1.6846891263344534, + "learning_rate": 7.351832228846664e-06, + "loss": 0.5949, + "step": 3500 + }, + { + "epoch": 0.36, + "grad_norm": 2.146763984783295, + "learning_rate": 7.3503465286412064e-06, + "loss": 0.732, + "step": 3501 + }, + { + "epoch": 0.36, + "grad_norm": 1.9106542801719677, + "learning_rate": 7.348860562010574e-06, + "loss": 0.571, + "step": 3502 + }, + { + "epoch": 0.36, + "grad_norm": 1.7867908684838822, + "learning_rate": 7.347374329123209e-06, + "loss": 0.6187, + "step": 3503 + }, + { + "epoch": 0.36, + "grad_norm": 1.8933431553615583, + "learning_rate": 7.345887830147583e-06, + "loss": 0.5439, + "step": 3504 + }, + { + "epoch": 0.36, + "grad_norm": 2.1452897179479513, + "learning_rate": 7.344401065252203e-06, + "loss": 0.6209, + "step": 3505 + }, + { + "epoch": 0.36, + "grad_norm": 1.8240488714231755, + "learning_rate": 7.3429140346055975e-06, + "loss": 0.6676, + "step": 3506 + }, + { + "epoch": 0.36, + "grad_norm": 2.1529547922669914, + "learning_rate": 7.341426738376332e-06, + "loss": 0.5676, + "step": 3507 + }, + { + "epoch": 0.36, + "grad_norm": 2.0283041565520574, + "learning_rate": 7.339939176733e-06, + "loss": 0.7162, + "step": 3508 + }, + { + "epoch": 0.36, + "grad_norm": 2.1571072349133633, + "learning_rate": 7.338451349844225e-06, + "loss": 0.8388, + "step": 3509 + }, + { + "epoch": 0.36, + "grad_norm": 1.8719672067212767, + "learning_rate": 7.336963257878662e-06, + "loss": 0.7124, + "step": 3510 + }, + { + "epoch": 0.36, + "grad_norm": 1.877417040669118, + "learning_rate": 7.335474901004992e-06, + "loss": 0.7178, + "step": 3511 + }, + { + "epoch": 0.37, + "grad_norm": 1.8934927176612808, + "learning_rate": 7.33398627939193e-06, + "loss": 0.5567, + "step": 3512 + }, + { + "epoch": 0.37, + "grad_norm": 1.8796050148787842, + "learning_rate": 7.332497393208221e-06, + "loss": 0.6233, + "step": 3513 + }, + { + "epoch": 0.37, + "grad_norm": 1.83490148528102, + "learning_rate": 7.331008242622637e-06, + "loss": 0.6343, + "step": 3514 + }, + { + "epoch": 0.37, + "grad_norm": 1.8143441207248123, + "learning_rate": 7.329518827803983e-06, + "loss": 0.6808, + "step": 3515 + }, + { + "epoch": 0.37, + "grad_norm": 2.1087748223352754, + "learning_rate": 7.328029148921093e-06, + "loss": 0.5901, + "step": 3516 + }, + { + "epoch": 0.37, + "grad_norm": 1.7491095171764681, + "learning_rate": 7.326539206142829e-06, + "loss": 0.566, + "step": 3517 + }, + { + "epoch": 0.37, + "grad_norm": 1.8990983602988085, + "learning_rate": 7.325048999638083e-06, + "loss": 0.647, + "step": 3518 + }, + { + "epoch": 0.37, + "grad_norm": 1.7817325075047465, + "learning_rate": 7.323558529575783e-06, + "loss": 0.6294, + "step": 3519 + }, + { + "epoch": 0.37, + "grad_norm": 1.822167484873732, + "learning_rate": 7.32206779612488e-06, + "loss": 0.599, + "step": 3520 + }, + { + "epoch": 0.37, + "grad_norm": 1.910179885511139, + "learning_rate": 7.320576799454355e-06, + "loss": 0.6137, + "step": 3521 + }, + { + "epoch": 0.37, + "grad_norm": 1.9373630796040067, + "learning_rate": 7.319085539733225e-06, + "loss": 0.5921, + "step": 3522 + }, + { + "epoch": 0.37, + "grad_norm": 1.8558805706601034, + "learning_rate": 7.317594017130529e-06, + "loss": 0.6571, + "step": 3523 + }, + { + "epoch": 0.37, + "grad_norm": 1.7509386776872393, + "learning_rate": 7.316102231815343e-06, + "loss": 0.65, + "step": 3524 + }, + { + "epoch": 0.37, + "grad_norm": 1.87626594632814, + "learning_rate": 7.3146101839567665e-06, + "loss": 0.6148, + "step": 3525 + }, + { + "epoch": 0.37, + "grad_norm": 1.9780604464642324, + "learning_rate": 7.313117873723933e-06, + "loss": 0.5868, + "step": 3526 + }, + { + "epoch": 0.37, + "grad_norm": 1.7614477042809984, + "learning_rate": 7.311625301286005e-06, + "loss": 0.6357, + "step": 3527 + }, + { + "epoch": 0.37, + "grad_norm": 1.8327860152596689, + "learning_rate": 7.310132466812172e-06, + "loss": 0.5737, + "step": 3528 + }, + { + "epoch": 0.37, + "grad_norm": 1.8844138267432626, + "learning_rate": 7.308639370471658e-06, + "loss": 0.6322, + "step": 3529 + }, + { + "epoch": 0.37, + "grad_norm": 1.8674359027203937, + "learning_rate": 7.30714601243371e-06, + "loss": 0.5701, + "step": 3530 + }, + { + "epoch": 0.37, + "grad_norm": 1.8123042050551081, + "learning_rate": 7.3056523928676145e-06, + "loss": 0.6558, + "step": 3531 + }, + { + "epoch": 0.37, + "grad_norm": 1.7883251931061648, + "learning_rate": 7.304158511942676e-06, + "loss": 0.6277, + "step": 3532 + }, + { + "epoch": 0.37, + "grad_norm": 2.0045012224678205, + "learning_rate": 7.302664369828238e-06, + "loss": 0.7663, + "step": 3533 + }, + { + "epoch": 0.37, + "grad_norm": 1.9524543531778042, + "learning_rate": 7.3011699666936685e-06, + "loss": 0.7106, + "step": 3534 + }, + { + "epoch": 0.37, + "grad_norm": 1.9413034212258677, + "learning_rate": 7.299675302708368e-06, + "loss": 0.6501, + "step": 3535 + }, + { + "epoch": 0.37, + "grad_norm": 1.6892597011874646, + "learning_rate": 7.298180378041763e-06, + "loss": 0.5715, + "step": 3536 + }, + { + "epoch": 0.37, + "grad_norm": 1.9208949618622175, + "learning_rate": 7.296685192863313e-06, + "loss": 0.6828, + "step": 3537 + }, + { + "epoch": 0.37, + "grad_norm": 2.1550436825626966, + "learning_rate": 7.295189747342507e-06, + "loss": 0.6368, + "step": 3538 + }, + { + "epoch": 0.37, + "grad_norm": 1.6240302138516085, + "learning_rate": 7.29369404164886e-06, + "loss": 0.5729, + "step": 3539 + }, + { + "epoch": 0.37, + "grad_norm": 1.9753475779628318, + "learning_rate": 7.2921980759519195e-06, + "loss": 0.6921, + "step": 3540 + }, + { + "epoch": 0.37, + "grad_norm": 1.871059765624354, + "learning_rate": 7.290701850421263e-06, + "loss": 0.7101, + "step": 3541 + }, + { + "epoch": 0.37, + "grad_norm": 1.8488824199834615, + "learning_rate": 7.289205365226495e-06, + "loss": 0.7121, + "step": 3542 + }, + { + "epoch": 0.37, + "grad_norm": 1.979776721083964, + "learning_rate": 7.28770862053725e-06, + "loss": 0.6629, + "step": 3543 + }, + { + "epoch": 0.37, + "grad_norm": 1.7690159529643779, + "learning_rate": 7.286211616523193e-06, + "loss": 0.5671, + "step": 3544 + }, + { + "epoch": 0.37, + "grad_norm": 2.0002953479570516, + "learning_rate": 7.28471435335402e-06, + "loss": 0.5838, + "step": 3545 + }, + { + "epoch": 0.37, + "grad_norm": 1.8849684513390108, + "learning_rate": 7.2832168311994514e-06, + "loss": 0.6947, + "step": 3546 + }, + { + "epoch": 0.37, + "grad_norm": 1.8271816423627594, + "learning_rate": 7.281719050229241e-06, + "loss": 0.6798, + "step": 3547 + }, + { + "epoch": 0.37, + "grad_norm": 1.9154738481542926, + "learning_rate": 7.280221010613171e-06, + "loss": 0.5558, + "step": 3548 + }, + { + "epoch": 0.37, + "grad_norm": 1.7236896270573931, + "learning_rate": 7.278722712521054e-06, + "loss": 0.6129, + "step": 3549 + }, + { + "epoch": 0.37, + "grad_norm": 1.858416081191063, + "learning_rate": 7.277224156122728e-06, + "loss": 0.7925, + "step": 3550 + }, + { + "epoch": 0.37, + "grad_norm": 1.9349250790205446, + "learning_rate": 7.275725341588064e-06, + "loss": 0.7287, + "step": 3551 + }, + { + "epoch": 0.37, + "grad_norm": 2.0965449424230016, + "learning_rate": 7.2742262690869615e-06, + "loss": 0.7357, + "step": 3552 + }, + { + "epoch": 0.37, + "grad_norm": 1.9572073961053111, + "learning_rate": 7.272726938789348e-06, + "loss": 0.6857, + "step": 3553 + }, + { + "epoch": 0.37, + "grad_norm": 1.6355516242540316, + "learning_rate": 7.2712273508651834e-06, + "loss": 0.578, + "step": 3554 + }, + { + "epoch": 0.37, + "grad_norm": 1.8240038253760955, + "learning_rate": 7.269727505484452e-06, + "loss": 0.7063, + "step": 3555 + }, + { + "epoch": 0.37, + "grad_norm": 1.9522137436656415, + "learning_rate": 7.268227402817171e-06, + "loss": 0.6384, + "step": 3556 + }, + { + "epoch": 0.37, + "grad_norm": 1.8803868241469999, + "learning_rate": 7.266727043033386e-06, + "loss": 0.6292, + "step": 3557 + }, + { + "epoch": 0.37, + "grad_norm": 1.7416415007506427, + "learning_rate": 7.26522642630317e-06, + "loss": 0.5993, + "step": 3558 + }, + { + "epoch": 0.37, + "grad_norm": 1.7285779931370737, + "learning_rate": 7.263725552796628e-06, + "loss": 0.6292, + "step": 3559 + }, + { + "epoch": 0.37, + "grad_norm": 1.8865673304095096, + "learning_rate": 7.262224422683891e-06, + "loss": 0.6817, + "step": 3560 + }, + { + "epoch": 0.37, + "grad_norm": 1.8541330421002968, + "learning_rate": 7.260723036135122e-06, + "loss": 0.5544, + "step": 3561 + }, + { + "epoch": 0.37, + "grad_norm": 1.5749650639128956, + "learning_rate": 7.259221393320511e-06, + "loss": 0.637, + "step": 3562 + }, + { + "epoch": 0.37, + "grad_norm": 1.821273194460373, + "learning_rate": 7.257719494410278e-06, + "loss": 0.5897, + "step": 3563 + }, + { + "epoch": 0.37, + "grad_norm": 1.8149563401389326, + "learning_rate": 7.2562173395746725e-06, + "loss": 0.6168, + "step": 3564 + }, + { + "epoch": 0.37, + "grad_norm": 2.0211465063160863, + "learning_rate": 7.25471492898397e-06, + "loss": 0.6834, + "step": 3565 + }, + { + "epoch": 0.37, + "grad_norm": 1.6758100886152418, + "learning_rate": 7.25321226280848e-06, + "loss": 0.5116, + "step": 3566 + }, + { + "epoch": 0.37, + "grad_norm": 1.9332635703531122, + "learning_rate": 7.251709341218536e-06, + "loss": 0.7101, + "step": 3567 + }, + { + "epoch": 0.37, + "grad_norm": 1.9021202447821917, + "learning_rate": 7.250206164384506e-06, + "loss": 0.7104, + "step": 3568 + }, + { + "epoch": 0.37, + "grad_norm": 2.1677945215060594, + "learning_rate": 7.24870273247678e-06, + "loss": 0.7328, + "step": 3569 + }, + { + "epoch": 0.37, + "grad_norm": 1.8317538366672943, + "learning_rate": 7.247199045665781e-06, + "loss": 0.6999, + "step": 3570 + }, + { + "epoch": 0.37, + "grad_norm": 1.9648854126394588, + "learning_rate": 7.245695104121963e-06, + "loss": 0.6013, + "step": 3571 + }, + { + "epoch": 0.37, + "grad_norm": 1.7893874702530328, + "learning_rate": 7.244190908015805e-06, + "loss": 0.6356, + "step": 3572 + }, + { + "epoch": 0.37, + "grad_norm": 2.4006419796596727, + "learning_rate": 7.242686457517815e-06, + "loss": 0.7903, + "step": 3573 + }, + { + "epoch": 0.37, + "grad_norm": 1.8682533212867887, + "learning_rate": 7.241181752798534e-06, + "loss": 0.7252, + "step": 3574 + }, + { + "epoch": 0.37, + "grad_norm": 1.968275479562997, + "learning_rate": 7.239676794028526e-06, + "loss": 0.667, + "step": 3575 + }, + { + "epoch": 0.37, + "grad_norm": 1.5473586071098482, + "learning_rate": 7.238171581378388e-06, + "loss": 0.5816, + "step": 3576 + }, + { + "epoch": 0.37, + "grad_norm": 1.773785035800144, + "learning_rate": 7.236666115018744e-06, + "loss": 0.6212, + "step": 3577 + }, + { + "epoch": 0.37, + "grad_norm": 1.8332500658652024, + "learning_rate": 7.235160395120247e-06, + "loss": 0.6962, + "step": 3578 + }, + { + "epoch": 0.37, + "grad_norm": 1.5568737353280147, + "learning_rate": 7.2336544218535776e-06, + "loss": 0.559, + "step": 3579 + }, + { + "epoch": 0.37, + "grad_norm": 1.8913861420333955, + "learning_rate": 7.23214819538945e-06, + "loss": 0.553, + "step": 3580 + }, + { + "epoch": 0.37, + "grad_norm": 1.9256262438864884, + "learning_rate": 7.230641715898602e-06, + "loss": 0.7225, + "step": 3581 + }, + { + "epoch": 0.37, + "grad_norm": 1.9428757885250234, + "learning_rate": 7.2291349835518e-06, + "loss": 0.6232, + "step": 3582 + }, + { + "epoch": 0.37, + "grad_norm": 1.847661038967175, + "learning_rate": 7.227627998519843e-06, + "loss": 0.6888, + "step": 3583 + }, + { + "epoch": 0.37, + "grad_norm": 1.7968971064453072, + "learning_rate": 7.226120760973554e-06, + "loss": 0.6808, + "step": 3584 + }, + { + "epoch": 0.37, + "grad_norm": 1.8316340872317762, + "learning_rate": 7.224613271083789e-06, + "loss": 0.6496, + "step": 3585 + }, + { + "epoch": 0.37, + "grad_norm": 1.7751193674739747, + "learning_rate": 7.22310552902143e-06, + "loss": 0.6898, + "step": 3586 + }, + { + "epoch": 0.37, + "grad_norm": 2.0056699940699625, + "learning_rate": 7.221597534957389e-06, + "loss": 0.6359, + "step": 3587 + }, + { + "epoch": 0.37, + "grad_norm": 2.2114079963540183, + "learning_rate": 7.220089289062603e-06, + "loss": 0.7437, + "step": 3588 + }, + { + "epoch": 0.37, + "grad_norm": 1.8708371960322898, + "learning_rate": 7.218580791508043e-06, + "loss": 0.6704, + "step": 3589 + }, + { + "epoch": 0.37, + "grad_norm": 1.8610230872176512, + "learning_rate": 7.217072042464706e-06, + "loss": 0.5883, + "step": 3590 + }, + { + "epoch": 0.37, + "grad_norm": 1.7328612845093743, + "learning_rate": 7.215563042103614e-06, + "loss": 0.6694, + "step": 3591 + }, + { + "epoch": 0.37, + "grad_norm": 1.7591041086632069, + "learning_rate": 7.214053790595823e-06, + "loss": 0.7389, + "step": 3592 + }, + { + "epoch": 0.37, + "grad_norm": 1.9763016947888363, + "learning_rate": 7.212544288112415e-06, + "loss": 0.7703, + "step": 3593 + }, + { + "epoch": 0.37, + "grad_norm": 1.739904384320081, + "learning_rate": 7.211034534824503e-06, + "loss": 0.6461, + "step": 3594 + }, + { + "epoch": 0.37, + "grad_norm": 1.725500951731821, + "learning_rate": 7.209524530903223e-06, + "loss": 0.6184, + "step": 3595 + }, + { + "epoch": 0.37, + "grad_norm": 1.9718658533362245, + "learning_rate": 7.208014276519741e-06, + "loss": 0.6249, + "step": 3596 + }, + { + "epoch": 0.37, + "grad_norm": 1.7782536173137125, + "learning_rate": 7.206503771845259e-06, + "loss": 0.7426, + "step": 3597 + }, + { + "epoch": 0.37, + "grad_norm": 2.0669465358505135, + "learning_rate": 7.2049930170509965e-06, + "loss": 0.6984, + "step": 3598 + }, + { + "epoch": 0.37, + "grad_norm": 1.9149032831444073, + "learning_rate": 7.2034820123082075e-06, + "loss": 0.5994, + "step": 3599 + }, + { + "epoch": 0.37, + "grad_norm": 1.8824750319453147, + "learning_rate": 7.201970757788172e-06, + "loss": 0.5991, + "step": 3600 + }, + { + "epoch": 0.37, + "grad_norm": 1.897325064986783, + "learning_rate": 7.200459253662202e-06, + "loss": 0.6076, + "step": 3601 + }, + { + "epoch": 0.37, + "grad_norm": 1.9589204398556992, + "learning_rate": 7.198947500101632e-06, + "loss": 0.6531, + "step": 3602 + }, + { + "epoch": 0.37, + "grad_norm": 1.6587850379948488, + "learning_rate": 7.19743549727783e-06, + "loss": 0.6824, + "step": 3603 + }, + { + "epoch": 0.37, + "grad_norm": 1.9014993866573917, + "learning_rate": 7.195923245362188e-06, + "loss": 0.6682, + "step": 3604 + }, + { + "epoch": 0.37, + "grad_norm": 1.872178215470094, + "learning_rate": 7.194410744526132e-06, + "loss": 0.5496, + "step": 3605 + }, + { + "epoch": 0.37, + "grad_norm": 1.971682951834879, + "learning_rate": 7.192897994941111e-06, + "loss": 0.6729, + "step": 3606 + }, + { + "epoch": 0.37, + "grad_norm": 2.097197443870445, + "learning_rate": 7.191384996778601e-06, + "loss": 0.7267, + "step": 3607 + }, + { + "epoch": 0.38, + "grad_norm": 2.104883778259818, + "learning_rate": 7.189871750210111e-06, + "loss": 0.5755, + "step": 3608 + }, + { + "epoch": 0.38, + "grad_norm": 2.038992765805888, + "learning_rate": 7.1883582554071776e-06, + "loss": 0.596, + "step": 3609 + }, + { + "epoch": 0.38, + "grad_norm": 1.8255786200273927, + "learning_rate": 7.1868445125413625e-06, + "loss": 0.5536, + "step": 3610 + }, + { + "epoch": 0.38, + "grad_norm": 1.649347481136232, + "learning_rate": 7.1853305217842565e-06, + "loss": 0.6403, + "step": 3611 + }, + { + "epoch": 0.38, + "grad_norm": 1.7709514913740092, + "learning_rate": 7.183816283307481e-06, + "loss": 0.6644, + "step": 3612 + }, + { + "epoch": 0.38, + "grad_norm": 1.8054415691274552, + "learning_rate": 7.1823017972826815e-06, + "loss": 0.5759, + "step": 3613 + }, + { + "epoch": 0.38, + "grad_norm": 1.908794948479207, + "learning_rate": 7.180787063881534e-06, + "loss": 0.7358, + "step": 3614 + }, + { + "epoch": 0.38, + "grad_norm": 1.8067734916298452, + "learning_rate": 7.179272083275744e-06, + "loss": 0.6164, + "step": 3615 + }, + { + "epoch": 0.38, + "grad_norm": 2.1927195824585604, + "learning_rate": 7.177756855637042e-06, + "loss": 0.7013, + "step": 3616 + }, + { + "epoch": 0.38, + "grad_norm": 1.853731488800181, + "learning_rate": 7.1762413811371855e-06, + "loss": 0.6514, + "step": 3617 + }, + { + "epoch": 0.38, + "grad_norm": 1.6905411710563716, + "learning_rate": 7.174725659947966e-06, + "loss": 0.5478, + "step": 3618 + }, + { + "epoch": 0.38, + "grad_norm": 1.8778815442471106, + "learning_rate": 7.173209692241199e-06, + "loss": 0.6467, + "step": 3619 + }, + { + "epoch": 0.38, + "grad_norm": 2.066809489639149, + "learning_rate": 7.171693478188724e-06, + "loss": 0.7792, + "step": 3620 + }, + { + "epoch": 0.38, + "grad_norm": 2.182844071235007, + "learning_rate": 7.170177017962415e-06, + "loss": 0.6966, + "step": 3621 + }, + { + "epoch": 0.38, + "grad_norm": 1.9173244135871332, + "learning_rate": 7.168660311734173e-06, + "loss": 0.6599, + "step": 3622 + }, + { + "epoch": 0.38, + "grad_norm": 1.8071165425130356, + "learning_rate": 7.167143359675924e-06, + "loss": 0.5384, + "step": 3623 + }, + { + "epoch": 0.38, + "grad_norm": 1.818643370303639, + "learning_rate": 7.1656261619596205e-06, + "loss": 0.5815, + "step": 3624 + }, + { + "epoch": 0.38, + "grad_norm": 1.9981940575117898, + "learning_rate": 7.1641087187572485e-06, + "loss": 0.6087, + "step": 3625 + }, + { + "epoch": 0.38, + "grad_norm": 1.7155423758343529, + "learning_rate": 7.16259103024082e-06, + "loss": 0.5151, + "step": 3626 + }, + { + "epoch": 0.38, + "grad_norm": 1.8508271654042858, + "learning_rate": 7.161073096582371e-06, + "loss": 0.6903, + "step": 3627 + }, + { + "epoch": 0.38, + "grad_norm": 1.9064304886131862, + "learning_rate": 7.159554917953968e-06, + "loss": 0.6077, + "step": 3628 + }, + { + "epoch": 0.38, + "grad_norm": 1.9992795993108614, + "learning_rate": 7.158036494527707e-06, + "loss": 0.8044, + "step": 3629 + }, + { + "epoch": 0.38, + "grad_norm": 1.8289307895653182, + "learning_rate": 7.156517826475708e-06, + "loss": 0.6744, + "step": 3630 + }, + { + "epoch": 0.38, + "grad_norm": 1.9640255147580412, + "learning_rate": 7.154998913970124e-06, + "loss": 0.6093, + "step": 3631 + }, + { + "epoch": 0.38, + "grad_norm": 1.8704100255026406, + "learning_rate": 7.153479757183127e-06, + "loss": 0.5818, + "step": 3632 + }, + { + "epoch": 0.38, + "grad_norm": 1.8530421196673053, + "learning_rate": 7.1519603562869265e-06, + "loss": 0.5494, + "step": 3633 + }, + { + "epoch": 0.38, + "grad_norm": 1.859849961294324, + "learning_rate": 7.150440711453754e-06, + "loss": 0.6875, + "step": 3634 + }, + { + "epoch": 0.38, + "grad_norm": 1.9767165128139512, + "learning_rate": 7.148920822855869e-06, + "loss": 0.6612, + "step": 3635 + }, + { + "epoch": 0.38, + "grad_norm": 2.0058716132606684, + "learning_rate": 7.1474006906655605e-06, + "loss": 0.7287, + "step": 3636 + }, + { + "epoch": 0.38, + "grad_norm": 1.7254558229061596, + "learning_rate": 7.145880315055145e-06, + "loss": 0.6355, + "step": 3637 + }, + { + "epoch": 0.38, + "grad_norm": 1.5572686981014636, + "learning_rate": 7.144359696196964e-06, + "loss": 0.6584, + "step": 3638 + }, + { + "epoch": 0.38, + "grad_norm": 1.9052815448096254, + "learning_rate": 7.142838834263388e-06, + "loss": 0.6688, + "step": 3639 + }, + { + "epoch": 0.38, + "grad_norm": 1.6860169875157103, + "learning_rate": 7.141317729426817e-06, + "loss": 0.6196, + "step": 3640 + }, + { + "epoch": 0.38, + "grad_norm": 1.7721664305732985, + "learning_rate": 7.139796381859676e-06, + "loss": 0.6633, + "step": 3641 + }, + { + "epoch": 0.38, + "grad_norm": 1.7826626559988565, + "learning_rate": 7.138274791734421e-06, + "loss": 0.6367, + "step": 3642 + }, + { + "epoch": 0.38, + "grad_norm": 1.990311955470244, + "learning_rate": 7.136752959223527e-06, + "loss": 0.6497, + "step": 3643 + }, + { + "epoch": 0.38, + "grad_norm": 2.074898158189017, + "learning_rate": 7.1352308844995086e-06, + "loss": 0.6088, + "step": 3644 + }, + { + "epoch": 0.38, + "grad_norm": 1.9277929482740352, + "learning_rate": 7.133708567734898e-06, + "loss": 0.5938, + "step": 3645 + }, + { + "epoch": 0.38, + "grad_norm": 1.7516961666712483, + "learning_rate": 7.13218600910226e-06, + "loss": 0.7127, + "step": 3646 + }, + { + "epoch": 0.38, + "grad_norm": 1.889977062266395, + "learning_rate": 7.1306632087741844e-06, + "loss": 0.6515, + "step": 3647 + }, + { + "epoch": 0.38, + "grad_norm": 2.0117401818077862, + "learning_rate": 7.12914016692329e-06, + "loss": 0.708, + "step": 3648 + }, + { + "epoch": 0.38, + "grad_norm": 1.8988679344144204, + "learning_rate": 7.1276168837222215e-06, + "loss": 0.707, + "step": 3649 + }, + { + "epoch": 0.38, + "grad_norm": 1.992469048910512, + "learning_rate": 7.1260933593436535e-06, + "loss": 0.6132, + "step": 3650 + }, + { + "epoch": 0.38, + "grad_norm": 1.8079967436056992, + "learning_rate": 7.1245695939602834e-06, + "loss": 0.6561, + "step": 3651 + }, + { + "epoch": 0.38, + "grad_norm": 1.6363093178305284, + "learning_rate": 7.12304558774484e-06, + "loss": 0.5939, + "step": 3652 + }, + { + "epoch": 0.38, + "grad_norm": 1.7910381053908828, + "learning_rate": 7.121521340870079e-06, + "loss": 0.673, + "step": 3653 + }, + { + "epoch": 0.38, + "grad_norm": 1.9706296825523928, + "learning_rate": 7.119996853508781e-06, + "loss": 0.6262, + "step": 3654 + }, + { + "epoch": 0.38, + "grad_norm": 1.869840494562361, + "learning_rate": 7.1184721258337575e-06, + "loss": 0.6058, + "step": 3655 + }, + { + "epoch": 0.38, + "grad_norm": 1.926379615057285, + "learning_rate": 7.116947158017842e-06, + "loss": 0.7076, + "step": 3656 + }, + { + "epoch": 0.38, + "grad_norm": 1.7795167202195803, + "learning_rate": 7.115421950233902e-06, + "loss": 0.6051, + "step": 3657 + }, + { + "epoch": 0.38, + "grad_norm": 1.9578043275804096, + "learning_rate": 7.113896502654824e-06, + "loss": 0.605, + "step": 3658 + }, + { + "epoch": 0.38, + "grad_norm": 1.7369262214424697, + "learning_rate": 7.112370815453531e-06, + "loss": 0.5664, + "step": 3659 + }, + { + "epoch": 0.38, + "grad_norm": 2.0109937094073906, + "learning_rate": 7.110844888802966e-06, + "loss": 0.6973, + "step": 3660 + }, + { + "epoch": 0.38, + "grad_norm": 1.9212899018541152, + "learning_rate": 7.1093187228760995e-06, + "loss": 0.7831, + "step": 3661 + }, + { + "epoch": 0.38, + "grad_norm": 1.9675443765140932, + "learning_rate": 7.107792317845934e-06, + "loss": 0.7564, + "step": 3662 + }, + { + "epoch": 0.38, + "grad_norm": 2.5186964557046534, + "learning_rate": 7.106265673885494e-06, + "loss": 0.8091, + "step": 3663 + }, + { + "epoch": 0.38, + "grad_norm": 2.172007104728858, + "learning_rate": 7.104738791167837e-06, + "loss": 0.6913, + "step": 3664 + }, + { + "epoch": 0.38, + "grad_norm": 1.767126716740684, + "learning_rate": 7.103211669866039e-06, + "loss": 0.6973, + "step": 3665 + }, + { + "epoch": 0.38, + "grad_norm": 1.7754637022542625, + "learning_rate": 7.1016843101532115e-06, + "loss": 0.6218, + "step": 3666 + }, + { + "epoch": 0.38, + "grad_norm": 1.7831012329808487, + "learning_rate": 7.100156712202488e-06, + "loss": 0.685, + "step": 3667 + }, + { + "epoch": 0.38, + "grad_norm": 1.8985315383249024, + "learning_rate": 7.098628876187031e-06, + "loss": 0.6087, + "step": 3668 + }, + { + "epoch": 0.38, + "grad_norm": 1.9257692672338116, + "learning_rate": 7.0971008022800295e-06, + "loss": 0.6588, + "step": 3669 + }, + { + "epoch": 0.38, + "grad_norm": 2.048096408415564, + "learning_rate": 7.095572490654698e-06, + "loss": 0.7406, + "step": 3670 + }, + { + "epoch": 0.38, + "grad_norm": 1.80679778795583, + "learning_rate": 7.094043941484282e-06, + "loss": 0.6449, + "step": 3671 + }, + { + "epoch": 0.38, + "grad_norm": 1.924580745209938, + "learning_rate": 7.092515154942048e-06, + "loss": 0.7301, + "step": 3672 + }, + { + "epoch": 0.38, + "grad_norm": 1.7817683854261057, + "learning_rate": 7.090986131201294e-06, + "loss": 0.6471, + "step": 3673 + }, + { + "epoch": 0.38, + "grad_norm": 1.7928879478359547, + "learning_rate": 7.089456870435344e-06, + "loss": 0.6514, + "step": 3674 + }, + { + "epoch": 0.38, + "grad_norm": 2.0696609601855616, + "learning_rate": 7.087927372817549e-06, + "loss": 0.6171, + "step": 3675 + }, + { + "epoch": 0.38, + "grad_norm": 1.6884810491961613, + "learning_rate": 7.086397638521285e-06, + "loss": 0.6483, + "step": 3676 + }, + { + "epoch": 0.38, + "grad_norm": 1.7331759057428129, + "learning_rate": 7.084867667719957e-06, + "loss": 0.5589, + "step": 3677 + }, + { + "epoch": 0.38, + "grad_norm": 1.9600799820017218, + "learning_rate": 7.083337460586995e-06, + "loss": 0.7712, + "step": 3678 + }, + { + "epoch": 0.38, + "grad_norm": 1.9698872325167316, + "learning_rate": 7.0818070172958585e-06, + "loss": 0.6482, + "step": 3679 + }, + { + "epoch": 0.38, + "grad_norm": 1.772990633413805, + "learning_rate": 7.080276338020029e-06, + "loss": 0.6664, + "step": 3680 + }, + { + "epoch": 0.38, + "grad_norm": 1.9925905028374946, + "learning_rate": 7.07874542293302e-06, + "loss": 0.7052, + "step": 3681 + }, + { + "epoch": 0.38, + "grad_norm": 1.9775724860452595, + "learning_rate": 7.077214272208369e-06, + "loss": 0.6453, + "step": 3682 + }, + { + "epoch": 0.38, + "grad_norm": 2.101909580042845, + "learning_rate": 7.07568288601964e-06, + "loss": 0.7348, + "step": 3683 + }, + { + "epoch": 0.38, + "grad_norm": 1.7865633356521249, + "learning_rate": 7.074151264540425e-06, + "loss": 0.5937, + "step": 3684 + }, + { + "epoch": 0.38, + "grad_norm": 1.912028189475162, + "learning_rate": 7.072619407944343e-06, + "loss": 0.6418, + "step": 3685 + }, + { + "epoch": 0.38, + "grad_norm": 1.7555446608686078, + "learning_rate": 7.071087316405037e-06, + "loss": 0.7161, + "step": 3686 + }, + { + "epoch": 0.38, + "grad_norm": 1.7423318666131733, + "learning_rate": 7.069554990096178e-06, + "loss": 0.6073, + "step": 3687 + }, + { + "epoch": 0.38, + "grad_norm": 1.727743961687725, + "learning_rate": 7.068022429191465e-06, + "loss": 0.6346, + "step": 3688 + }, + { + "epoch": 0.38, + "grad_norm": 2.020998879626979, + "learning_rate": 7.066489633864624e-06, + "loss": 0.6432, + "step": 3689 + }, + { + "epoch": 0.38, + "grad_norm": 1.7904324410819448, + "learning_rate": 7.064956604289402e-06, + "loss": 0.5901, + "step": 3690 + }, + { + "epoch": 0.38, + "grad_norm": 1.6681103595815747, + "learning_rate": 7.0634233406395806e-06, + "loss": 0.6236, + "step": 3691 + }, + { + "epoch": 0.38, + "grad_norm": 2.019485959225672, + "learning_rate": 7.061889843088961e-06, + "loss": 0.6031, + "step": 3692 + }, + { + "epoch": 0.38, + "grad_norm": 1.6974985157799625, + "learning_rate": 7.060356111811376e-06, + "loss": 0.5825, + "step": 3693 + }, + { + "epoch": 0.38, + "grad_norm": 2.007966294692052, + "learning_rate": 7.058822146980684e-06, + "loss": 0.6565, + "step": 3694 + }, + { + "epoch": 0.38, + "grad_norm": 1.8677225337817733, + "learning_rate": 7.0572879487707645e-06, + "loss": 0.6306, + "step": 3695 + }, + { + "epoch": 0.38, + "grad_norm": 1.9268857672131279, + "learning_rate": 7.05575351735553e-06, + "loss": 0.6039, + "step": 3696 + }, + { + "epoch": 0.38, + "grad_norm": 1.8004826415621484, + "learning_rate": 7.054218852908918e-06, + "loss": 0.7229, + "step": 3697 + }, + { + "epoch": 0.38, + "grad_norm": 1.86814066122037, + "learning_rate": 7.05268395560489e-06, + "loss": 0.6712, + "step": 3698 + }, + { + "epoch": 0.38, + "grad_norm": 1.8861582510455044, + "learning_rate": 7.051148825617435e-06, + "loss": 0.6199, + "step": 3699 + }, + { + "epoch": 0.38, + "grad_norm": 2.0271375461938907, + "learning_rate": 7.0496134631205705e-06, + "loss": 0.6532, + "step": 3700 + }, + { + "epoch": 0.38, + "grad_norm": 1.7322689461414547, + "learning_rate": 7.048077868288338e-06, + "loss": 0.6175, + "step": 3701 + }, + { + "epoch": 0.38, + "grad_norm": 1.7427810559006416, + "learning_rate": 7.046542041294804e-06, + "loss": 0.624, + "step": 3702 + }, + { + "epoch": 0.38, + "grad_norm": 1.814216437196377, + "learning_rate": 7.045005982314065e-06, + "loss": 0.5836, + "step": 3703 + }, + { + "epoch": 0.39, + "grad_norm": 2.008142687023798, + "learning_rate": 7.0434696915202415e-06, + "loss": 0.7704, + "step": 3704 + }, + { + "epoch": 0.39, + "grad_norm": 1.7855230650520368, + "learning_rate": 7.041933169087482e-06, + "loss": 0.6452, + "step": 3705 + }, + { + "epoch": 0.39, + "grad_norm": 1.985668972035989, + "learning_rate": 7.040396415189959e-06, + "loss": 0.7503, + "step": 3706 + }, + { + "epoch": 0.39, + "grad_norm": 1.8844199217844504, + "learning_rate": 7.038859430001872e-06, + "loss": 0.6682, + "step": 3707 + }, + { + "epoch": 0.39, + "grad_norm": 1.6502541814904381, + "learning_rate": 7.037322213697448e-06, + "loss": 0.5502, + "step": 3708 + }, + { + "epoch": 0.39, + "grad_norm": 2.323083971904915, + "learning_rate": 7.035784766450938e-06, + "loss": 0.5438, + "step": 3709 + }, + { + "epoch": 0.39, + "grad_norm": 1.9612136420018857, + "learning_rate": 7.034247088436621e-06, + "loss": 0.7139, + "step": 3710 + }, + { + "epoch": 0.39, + "grad_norm": 1.8172381656336853, + "learning_rate": 7.032709179828803e-06, + "loss": 0.6152, + "step": 3711 + }, + { + "epoch": 0.39, + "grad_norm": 1.8903375116755927, + "learning_rate": 7.031171040801813e-06, + "loss": 0.6615, + "step": 3712 + }, + { + "epoch": 0.39, + "grad_norm": 2.0513450644693183, + "learning_rate": 7.029632671530008e-06, + "loss": 0.7287, + "step": 3713 + }, + { + "epoch": 0.39, + "grad_norm": 2.364069746083148, + "learning_rate": 7.02809407218777e-06, + "loss": 0.7201, + "step": 3714 + }, + { + "epoch": 0.39, + "grad_norm": 1.789762597807389, + "learning_rate": 7.026555242949511e-06, + "loss": 0.6513, + "step": 3715 + }, + { + "epoch": 0.39, + "grad_norm": 1.7415445101122051, + "learning_rate": 7.0250161839896636e-06, + "loss": 0.5658, + "step": 3716 + }, + { + "epoch": 0.39, + "grad_norm": 1.9838077381226866, + "learning_rate": 7.02347689548269e-06, + "loss": 0.643, + "step": 3717 + }, + { + "epoch": 0.39, + "grad_norm": 2.0244134936944933, + "learning_rate": 7.021937377603076e-06, + "loss": 0.7017, + "step": 3718 + }, + { + "epoch": 0.39, + "grad_norm": 1.9745168936105955, + "learning_rate": 7.020397630525336e-06, + "loss": 0.6745, + "step": 3719 + }, + { + "epoch": 0.39, + "grad_norm": 1.7289255604430116, + "learning_rate": 7.018857654424008e-06, + "loss": 0.6488, + "step": 3720 + }, + { + "epoch": 0.39, + "grad_norm": 1.9155400352019034, + "learning_rate": 7.017317449473658e-06, + "loss": 0.6324, + "step": 3721 + }, + { + "epoch": 0.39, + "grad_norm": 1.5674009208190292, + "learning_rate": 7.015777015848877e-06, + "loss": 0.6442, + "step": 3722 + }, + { + "epoch": 0.39, + "grad_norm": 1.876596679522037, + "learning_rate": 7.0142363537242815e-06, + "loss": 0.6725, + "step": 3723 + }, + { + "epoch": 0.39, + "grad_norm": 1.8965716533361012, + "learning_rate": 7.012695463274515e-06, + "loss": 0.6449, + "step": 3724 + }, + { + "epoch": 0.39, + "grad_norm": 1.8872516694937251, + "learning_rate": 7.0111543446742444e-06, + "loss": 0.6431, + "step": 3725 + }, + { + "epoch": 0.39, + "grad_norm": 2.1515953153188776, + "learning_rate": 7.0096129980981674e-06, + "loss": 0.6978, + "step": 3726 + }, + { + "epoch": 0.39, + "grad_norm": 2.0270766691639523, + "learning_rate": 7.008071423721004e-06, + "loss": 0.6044, + "step": 3727 + }, + { + "epoch": 0.39, + "grad_norm": 2.0184644859313225, + "learning_rate": 7.006529621717496e-06, + "loss": 0.696, + "step": 3728 + }, + { + "epoch": 0.39, + "grad_norm": 2.0626010069202447, + "learning_rate": 7.00498759226242e-06, + "loss": 0.6551, + "step": 3729 + }, + { + "epoch": 0.39, + "grad_norm": 2.016147799155721, + "learning_rate": 7.003445335530572e-06, + "loss": 0.641, + "step": 3730 + }, + { + "epoch": 0.39, + "grad_norm": 1.8731480687364606, + "learning_rate": 7.001902851696775e-06, + "loss": 0.5508, + "step": 3731 + }, + { + "epoch": 0.39, + "grad_norm": 1.7525210729280494, + "learning_rate": 7.000360140935881e-06, + "loss": 0.5715, + "step": 3732 + }, + { + "epoch": 0.39, + "grad_norm": 1.7791094074416964, + "learning_rate": 6.998817203422763e-06, + "loss": 0.6188, + "step": 3733 + }, + { + "epoch": 0.39, + "grad_norm": 1.7107178217967776, + "learning_rate": 6.997274039332323e-06, + "loss": 0.6057, + "step": 3734 + }, + { + "epoch": 0.39, + "grad_norm": 2.1689931360984835, + "learning_rate": 6.995730648839485e-06, + "loss": 0.6081, + "step": 3735 + }, + { + "epoch": 0.39, + "grad_norm": 1.9944082514802253, + "learning_rate": 6.9941870321192015e-06, + "loss": 0.6827, + "step": 3736 + }, + { + "epoch": 0.39, + "grad_norm": 1.9902720881288807, + "learning_rate": 6.992643189346453e-06, + "loss": 0.6835, + "step": 3737 + }, + { + "epoch": 0.39, + "grad_norm": 1.9676878976709407, + "learning_rate": 6.991099120696243e-06, + "loss": 0.6268, + "step": 3738 + }, + { + "epoch": 0.39, + "grad_norm": 1.9265199475254238, + "learning_rate": 6.989554826343597e-06, + "loss": 0.6599, + "step": 3739 + }, + { + "epoch": 0.39, + "grad_norm": 1.6498327098254237, + "learning_rate": 6.988010306463571e-06, + "loss": 0.5858, + "step": 3740 + }, + { + "epoch": 0.39, + "grad_norm": 1.8263087431164569, + "learning_rate": 6.986465561231246e-06, + "loss": 0.5895, + "step": 3741 + }, + { + "epoch": 0.39, + "grad_norm": 1.9555452689076078, + "learning_rate": 6.984920590821726e-06, + "loss": 0.6573, + "step": 3742 + }, + { + "epoch": 0.39, + "grad_norm": 2.0437770842120933, + "learning_rate": 6.983375395410146e-06, + "loss": 0.6526, + "step": 3743 + }, + { + "epoch": 0.39, + "grad_norm": 1.869433885945104, + "learning_rate": 6.981829975171658e-06, + "loss": 0.6379, + "step": 3744 + }, + { + "epoch": 0.39, + "grad_norm": 1.8994878040677412, + "learning_rate": 6.9802843302814475e-06, + "loss": 0.673, + "step": 3745 + }, + { + "epoch": 0.39, + "grad_norm": 1.8642847780082168, + "learning_rate": 6.97873846091472e-06, + "loss": 0.6918, + "step": 3746 + }, + { + "epoch": 0.39, + "grad_norm": 1.940150418676658, + "learning_rate": 6.977192367246709e-06, + "loss": 0.6778, + "step": 3747 + }, + { + "epoch": 0.39, + "grad_norm": 1.952659328164777, + "learning_rate": 6.975646049452673e-06, + "loss": 0.6567, + "step": 3748 + }, + { + "epoch": 0.39, + "grad_norm": 1.897858212581815, + "learning_rate": 6.9740995077079e-06, + "loss": 0.6589, + "step": 3749 + }, + { + "epoch": 0.39, + "grad_norm": 1.9798414395768777, + "learning_rate": 6.972552742187693e-06, + "loss": 0.6554, + "step": 3750 + }, + { + "epoch": 0.39, + "grad_norm": 1.947474102381345, + "learning_rate": 6.971005753067391e-06, + "loss": 0.6939, + "step": 3751 + }, + { + "epoch": 0.39, + "grad_norm": 1.8593020322548108, + "learning_rate": 6.96945854052235e-06, + "loss": 0.6908, + "step": 3752 + }, + { + "epoch": 0.39, + "grad_norm": 1.8790691411742364, + "learning_rate": 6.96791110472796e-06, + "loss": 0.6918, + "step": 3753 + }, + { + "epoch": 0.39, + "grad_norm": 2.009560441823809, + "learning_rate": 6.966363445859629e-06, + "loss": 0.701, + "step": 3754 + }, + { + "epoch": 0.39, + "grad_norm": 1.7247879670505806, + "learning_rate": 6.964815564092792e-06, + "loss": 0.5531, + "step": 3755 + }, + { + "epoch": 0.39, + "grad_norm": 1.9237024626907298, + "learning_rate": 6.9632674596029135e-06, + "loss": 0.7353, + "step": 3756 + }, + { + "epoch": 0.39, + "grad_norm": 1.8423072923305135, + "learning_rate": 6.9617191325654785e-06, + "loss": 0.6292, + "step": 3757 + }, + { + "epoch": 0.39, + "grad_norm": 2.095217792173662, + "learning_rate": 6.9601705831559985e-06, + "loss": 0.7027, + "step": 3758 + }, + { + "epoch": 0.39, + "grad_norm": 1.9172392464043688, + "learning_rate": 6.95862181155001e-06, + "loss": 0.5968, + "step": 3759 + }, + { + "epoch": 0.39, + "grad_norm": 1.7885688923040335, + "learning_rate": 6.957072817923074e-06, + "loss": 0.6571, + "step": 3760 + }, + { + "epoch": 0.39, + "grad_norm": 2.0189100338935813, + "learning_rate": 6.95552360245078e-06, + "loss": 0.6102, + "step": 3761 + }, + { + "epoch": 0.39, + "grad_norm": 2.176085342804834, + "learning_rate": 6.95397416530874e-06, + "loss": 0.7103, + "step": 3762 + }, + { + "epoch": 0.39, + "grad_norm": 1.8281330590058102, + "learning_rate": 6.95242450667259e-06, + "loss": 0.5678, + "step": 3763 + }, + { + "epoch": 0.39, + "grad_norm": 1.780282107248268, + "learning_rate": 6.950874626717996e-06, + "loss": 0.5708, + "step": 3764 + }, + { + "epoch": 0.39, + "grad_norm": 1.696269074894569, + "learning_rate": 6.949324525620642e-06, + "loss": 0.627, + "step": 3765 + }, + { + "epoch": 0.39, + "grad_norm": 2.028234243684498, + "learning_rate": 6.947774203556241e-06, + "loss": 0.6769, + "step": 3766 + }, + { + "epoch": 0.39, + "grad_norm": 1.8100157337051184, + "learning_rate": 6.946223660700535e-06, + "loss": 0.6096, + "step": 3767 + }, + { + "epoch": 0.39, + "grad_norm": 2.0304897261390935, + "learning_rate": 6.944672897229282e-06, + "loss": 0.7408, + "step": 3768 + }, + { + "epoch": 0.39, + "grad_norm": 1.8186687992645447, + "learning_rate": 6.943121913318272e-06, + "loss": 0.6727, + "step": 3769 + }, + { + "epoch": 0.39, + "grad_norm": 2.014696798801752, + "learning_rate": 6.941570709143317e-06, + "loss": 0.6679, + "step": 3770 + }, + { + "epoch": 0.39, + "grad_norm": 1.9573005847526657, + "learning_rate": 6.9400192848802575e-06, + "loss": 0.7203, + "step": 3771 + }, + { + "epoch": 0.39, + "grad_norm": 1.870931516660422, + "learning_rate": 6.938467640704953e-06, + "loss": 0.6407, + "step": 3772 + }, + { + "epoch": 0.39, + "grad_norm": 1.924348926934089, + "learning_rate": 6.936915776793293e-06, + "loss": 0.7739, + "step": 3773 + }, + { + "epoch": 0.39, + "grad_norm": 1.8219644545452585, + "learning_rate": 6.935363693321189e-06, + "loss": 0.7424, + "step": 3774 + }, + { + "epoch": 0.39, + "grad_norm": 1.8909820062964133, + "learning_rate": 6.93381139046458e-06, + "loss": 0.5874, + "step": 3775 + }, + { + "epoch": 0.39, + "grad_norm": 1.851095708702634, + "learning_rate": 6.932258868399426e-06, + "loss": 0.5811, + "step": 3776 + }, + { + "epoch": 0.39, + "grad_norm": 1.9626956710437433, + "learning_rate": 6.930706127301718e-06, + "loss": 0.6269, + "step": 3777 + }, + { + "epoch": 0.39, + "grad_norm": 1.8588515617089818, + "learning_rate": 6.9291531673474645e-06, + "loss": 0.6689, + "step": 3778 + }, + { + "epoch": 0.39, + "grad_norm": 2.0129296892502437, + "learning_rate": 6.9275999887127045e-06, + "loss": 0.692, + "step": 3779 + }, + { + "epoch": 0.39, + "grad_norm": 1.967573552691157, + "learning_rate": 6.926046591573498e-06, + "loss": 0.6498, + "step": 3780 + }, + { + "epoch": 0.39, + "grad_norm": 1.9177145711520847, + "learning_rate": 6.924492976105932e-06, + "loss": 0.6621, + "step": 3781 + }, + { + "epoch": 0.39, + "grad_norm": 1.8662266389846618, + "learning_rate": 6.922939142486118e-06, + "loss": 0.7561, + "step": 3782 + }, + { + "epoch": 0.39, + "grad_norm": 2.0519301872983853, + "learning_rate": 6.921385090890193e-06, + "loss": 0.613, + "step": 3783 + }, + { + "epoch": 0.39, + "grad_norm": 1.9506463097256144, + "learning_rate": 6.919830821494314e-06, + "loss": 0.771, + "step": 3784 + }, + { + "epoch": 0.39, + "grad_norm": 2.035175531222123, + "learning_rate": 6.918276334474671e-06, + "loss": 0.7823, + "step": 3785 + }, + { + "epoch": 0.39, + "grad_norm": 1.7749634341037155, + "learning_rate": 6.916721630007471e-06, + "loss": 0.6171, + "step": 3786 + }, + { + "epoch": 0.39, + "grad_norm": 2.135332931852134, + "learning_rate": 6.91516670826895e-06, + "loss": 0.5635, + "step": 3787 + }, + { + "epoch": 0.39, + "grad_norm": 1.7881778397885084, + "learning_rate": 6.913611569435366e-06, + "loss": 0.5461, + "step": 3788 + }, + { + "epoch": 0.39, + "grad_norm": 1.7450638856633924, + "learning_rate": 6.912056213683001e-06, + "loss": 0.6384, + "step": 3789 + }, + { + "epoch": 0.39, + "grad_norm": 1.9663931312471843, + "learning_rate": 6.9105006411881695e-06, + "loss": 0.6423, + "step": 3790 + }, + { + "epoch": 0.39, + "grad_norm": 2.012129630221285, + "learning_rate": 6.9089448521271995e-06, + "loss": 0.6986, + "step": 3791 + }, + { + "epoch": 0.39, + "grad_norm": 1.933878118917115, + "learning_rate": 6.9073888466764495e-06, + "loss": 0.6312, + "step": 3792 + }, + { + "epoch": 0.39, + "grad_norm": 1.8464628698639192, + "learning_rate": 6.905832625012301e-06, + "loss": 0.6321, + "step": 3793 + }, + { + "epoch": 0.39, + "grad_norm": 2.062510373103962, + "learning_rate": 6.904276187311163e-06, + "loss": 0.6971, + "step": 3794 + }, + { + "epoch": 0.39, + "grad_norm": 1.93374853425991, + "learning_rate": 6.9027195337494645e-06, + "loss": 0.5843, + "step": 3795 + }, + { + "epoch": 0.39, + "grad_norm": 2.1647078071719457, + "learning_rate": 6.901162664503662e-06, + "loss": 0.5992, + "step": 3796 + }, + { + "epoch": 0.39, + "grad_norm": 1.9599458350519412, + "learning_rate": 6.899605579750236e-06, + "loss": 0.6586, + "step": 3797 + }, + { + "epoch": 0.39, + "grad_norm": 2.2163360975899313, + "learning_rate": 6.898048279665689e-06, + "loss": 0.6925, + "step": 3798 + }, + { + "epoch": 0.39, + "grad_norm": 1.7824221315080275, + "learning_rate": 6.896490764426551e-06, + "loss": 0.6436, + "step": 3799 + }, + { + "epoch": 0.4, + "grad_norm": 1.9842651710271497, + "learning_rate": 6.8949330342093756e-06, + "loss": 0.7, + "step": 3800 + }, + { + "epoch": 0.4, + "grad_norm": 1.8505223212020494, + "learning_rate": 6.893375089190741e-06, + "loss": 0.6553, + "step": 3801 + }, + { + "epoch": 0.4, + "grad_norm": 1.8419752520577877, + "learning_rate": 6.891816929547247e-06, + "loss": 0.5906, + "step": 3802 + }, + { + "epoch": 0.4, + "grad_norm": 1.6868313861269602, + "learning_rate": 6.890258555455521e-06, + "loss": 0.6291, + "step": 3803 + }, + { + "epoch": 0.4, + "grad_norm": 1.906131986603101, + "learning_rate": 6.888699967092215e-06, + "loss": 0.6008, + "step": 3804 + }, + { + "epoch": 0.4, + "grad_norm": 1.756561257424716, + "learning_rate": 6.887141164634001e-06, + "loss": 0.5851, + "step": 3805 + }, + { + "epoch": 0.4, + "grad_norm": 1.8379054874503786, + "learning_rate": 6.885582148257579e-06, + "loss": 0.6615, + "step": 3806 + }, + { + "epoch": 0.4, + "grad_norm": 1.8589992016721577, + "learning_rate": 6.884022918139675e-06, + "loss": 0.5656, + "step": 3807 + }, + { + "epoch": 0.4, + "grad_norm": 1.7750855346733305, + "learning_rate": 6.882463474457034e-06, + "loss": 0.684, + "step": 3808 + }, + { + "epoch": 0.4, + "grad_norm": 1.7331464972792372, + "learning_rate": 6.8809038173864285e-06, + "loss": 0.5875, + "step": 3809 + }, + { + "epoch": 0.4, + "grad_norm": 1.8145592971627584, + "learning_rate": 6.879343947104653e-06, + "loss": 0.6518, + "step": 3810 + }, + { + "epoch": 0.4, + "grad_norm": 1.7186609313479941, + "learning_rate": 6.877783863788531e-06, + "loss": 0.6398, + "step": 3811 + }, + { + "epoch": 0.4, + "grad_norm": 2.0593901439717217, + "learning_rate": 6.876223567614904e-06, + "loss": 0.6199, + "step": 3812 + }, + { + "epoch": 0.4, + "grad_norm": 2.051921874821278, + "learning_rate": 6.874663058760642e-06, + "loss": 0.6652, + "step": 3813 + }, + { + "epoch": 0.4, + "grad_norm": 1.983268125178121, + "learning_rate": 6.873102337402637e-06, + "loss": 0.6742, + "step": 3814 + }, + { + "epoch": 0.4, + "grad_norm": 1.9250957085532272, + "learning_rate": 6.871541403717808e-06, + "loss": 0.5963, + "step": 3815 + }, + { + "epoch": 0.4, + "grad_norm": 1.8923565653388212, + "learning_rate": 6.86998025788309e-06, + "loss": 0.6559, + "step": 3816 + }, + { + "epoch": 0.4, + "grad_norm": 1.7717289543577657, + "learning_rate": 6.868418900075452e-06, + "loss": 0.6317, + "step": 3817 + }, + { + "epoch": 0.4, + "grad_norm": 1.8393903443302437, + "learning_rate": 6.866857330471882e-06, + "loss": 0.6645, + "step": 3818 + }, + { + "epoch": 0.4, + "grad_norm": 1.907056188925765, + "learning_rate": 6.8652955492493944e-06, + "loss": 0.6674, + "step": 3819 + }, + { + "epoch": 0.4, + "grad_norm": 1.7792952155871635, + "learning_rate": 6.863733556585023e-06, + "loss": 0.6211, + "step": 3820 + }, + { + "epoch": 0.4, + "grad_norm": 2.0070176175353485, + "learning_rate": 6.862171352655831e-06, + "loss": 0.5849, + "step": 3821 + }, + { + "epoch": 0.4, + "grad_norm": 1.927108955804964, + "learning_rate": 6.8606089376389006e-06, + "loss": 0.6479, + "step": 3822 + }, + { + "epoch": 0.4, + "grad_norm": 1.8876654284510865, + "learning_rate": 6.859046311711344e-06, + "loss": 0.6571, + "step": 3823 + }, + { + "epoch": 0.4, + "grad_norm": 1.9645805494724002, + "learning_rate": 6.85748347505029e-06, + "loss": 0.6443, + "step": 3824 + }, + { + "epoch": 0.4, + "grad_norm": 2.033078807453262, + "learning_rate": 6.855920427832898e-06, + "loss": 0.7693, + "step": 3825 + }, + { + "epoch": 0.4, + "grad_norm": 1.741023709894303, + "learning_rate": 6.854357170236346e-06, + "loss": 0.5633, + "step": 3826 + }, + { + "epoch": 0.4, + "grad_norm": 1.8327831597190622, + "learning_rate": 6.85279370243784e-06, + "loss": 0.6629, + "step": 3827 + }, + { + "epoch": 0.4, + "grad_norm": 2.138936128449846, + "learning_rate": 6.851230024614608e-06, + "loss": 0.6216, + "step": 3828 + }, + { + "epoch": 0.4, + "grad_norm": 1.9998323937030877, + "learning_rate": 6.8496661369439e-06, + "loss": 0.6371, + "step": 3829 + }, + { + "epoch": 0.4, + "grad_norm": 1.956674866863184, + "learning_rate": 6.848102039602993e-06, + "loss": 0.6175, + "step": 3830 + }, + { + "epoch": 0.4, + "grad_norm": 2.1315466856909713, + "learning_rate": 6.846537732769185e-06, + "loss": 0.7057, + "step": 3831 + }, + { + "epoch": 0.4, + "grad_norm": 1.8023054657266382, + "learning_rate": 6.844973216619801e-06, + "loss": 0.7285, + "step": 3832 + }, + { + "epoch": 0.4, + "grad_norm": 2.331237965000702, + "learning_rate": 6.843408491332186e-06, + "loss": 0.6631, + "step": 3833 + }, + { + "epoch": 0.4, + "grad_norm": 1.8747243292576776, + "learning_rate": 6.841843557083714e-06, + "loss": 0.623, + "step": 3834 + }, + { + "epoch": 0.4, + "grad_norm": 2.081104098621447, + "learning_rate": 6.840278414051774e-06, + "loss": 0.6736, + "step": 3835 + }, + { + "epoch": 0.4, + "grad_norm": 2.027824141424956, + "learning_rate": 6.838713062413788e-06, + "loss": 0.7356, + "step": 3836 + }, + { + "epoch": 0.4, + "grad_norm": 1.8448960739370412, + "learning_rate": 6.8371475023471945e-06, + "loss": 0.689, + "step": 3837 + }, + { + "epoch": 0.4, + "grad_norm": 1.9094274178849335, + "learning_rate": 6.835581734029462e-06, + "loss": 0.64, + "step": 3838 + }, + { + "epoch": 0.4, + "grad_norm": 1.8412279763358383, + "learning_rate": 6.834015757638076e-06, + "loss": 0.6305, + "step": 3839 + }, + { + "epoch": 0.4, + "grad_norm": 1.8963150898529104, + "learning_rate": 6.8324495733505515e-06, + "loss": 0.5916, + "step": 3840 + }, + { + "epoch": 0.4, + "grad_norm": 2.006814141934046, + "learning_rate": 6.830883181344423e-06, + "loss": 0.6003, + "step": 3841 + }, + { + "epoch": 0.4, + "grad_norm": 1.761995258515656, + "learning_rate": 6.829316581797249e-06, + "loss": 0.5449, + "step": 3842 + }, + { + "epoch": 0.4, + "grad_norm": 1.8751428173287201, + "learning_rate": 6.827749774886616e-06, + "loss": 0.6368, + "step": 3843 + }, + { + "epoch": 0.4, + "grad_norm": 1.8238649095624873, + "learning_rate": 6.826182760790127e-06, + "loss": 0.5485, + "step": 3844 + }, + { + "epoch": 0.4, + "grad_norm": 1.8050371465050767, + "learning_rate": 6.824615539685413e-06, + "loss": 0.6776, + "step": 3845 + }, + { + "epoch": 0.4, + "grad_norm": 1.8185148092018413, + "learning_rate": 6.823048111750128e-06, + "loss": 0.5611, + "step": 3846 + }, + { + "epoch": 0.4, + "grad_norm": 1.9137674036077919, + "learning_rate": 6.821480477161948e-06, + "loss": 0.7279, + "step": 3847 + }, + { + "epoch": 0.4, + "grad_norm": 2.278371530895591, + "learning_rate": 6.819912636098574e-06, + "loss": 0.6727, + "step": 3848 + }, + { + "epoch": 0.4, + "grad_norm": 1.9336374035262587, + "learning_rate": 6.81834458873773e-06, + "loss": 0.5828, + "step": 3849 + }, + { + "epoch": 0.4, + "grad_norm": 1.608424844887298, + "learning_rate": 6.816776335257162e-06, + "loss": 0.6486, + "step": 3850 + }, + { + "epoch": 0.4, + "grad_norm": 1.9733951460532413, + "learning_rate": 6.815207875834641e-06, + "loss": 0.7597, + "step": 3851 + }, + { + "epoch": 0.4, + "grad_norm": 1.8337771137234302, + "learning_rate": 6.8136392106479624e-06, + "loss": 0.6421, + "step": 3852 + }, + { + "epoch": 0.4, + "grad_norm": 1.88536398864315, + "learning_rate": 6.81207033987494e-06, + "loss": 0.5933, + "step": 3853 + }, + { + "epoch": 0.4, + "grad_norm": 1.8852332420912083, + "learning_rate": 6.810501263693416e-06, + "loss": 0.6265, + "step": 3854 + }, + { + "epoch": 0.4, + "grad_norm": 1.6270870662218326, + "learning_rate": 6.808931982281255e-06, + "loss": 0.5371, + "step": 3855 + }, + { + "epoch": 0.4, + "grad_norm": 1.9198351870779027, + "learning_rate": 6.807362495816344e-06, + "loss": 0.6244, + "step": 3856 + }, + { + "epoch": 0.4, + "grad_norm": 1.9626894008745388, + "learning_rate": 6.805792804476592e-06, + "loss": 0.6162, + "step": 3857 + }, + { + "epoch": 0.4, + "grad_norm": 1.847790506596514, + "learning_rate": 6.8042229084399325e-06, + "loss": 0.656, + "step": 3858 + }, + { + "epoch": 0.4, + "grad_norm": 1.9491524882255473, + "learning_rate": 6.802652807884322e-06, + "loss": 0.6687, + "step": 3859 + }, + { + "epoch": 0.4, + "grad_norm": 1.8982335002271724, + "learning_rate": 6.801082502987742e-06, + "loss": 0.6663, + "step": 3860 + }, + { + "epoch": 0.4, + "grad_norm": 2.0630552583208925, + "learning_rate": 6.799511993928195e-06, + "loss": 0.5751, + "step": 3861 + }, + { + "epoch": 0.4, + "grad_norm": 1.9407115444354701, + "learning_rate": 6.797941280883706e-06, + "loss": 0.6156, + "step": 3862 + }, + { + "epoch": 0.4, + "grad_norm": 1.855317202450553, + "learning_rate": 6.796370364032324e-06, + "loss": 0.6032, + "step": 3863 + }, + { + "epoch": 0.4, + "grad_norm": 1.8158488358917717, + "learning_rate": 6.794799243552123e-06, + "loss": 0.5607, + "step": 3864 + }, + { + "epoch": 0.4, + "grad_norm": 1.7489231484293017, + "learning_rate": 6.793227919621197e-06, + "loss": 0.7611, + "step": 3865 + }, + { + "epoch": 0.4, + "grad_norm": 2.0671408277409413, + "learning_rate": 6.791656392417666e-06, + "loss": 0.7821, + "step": 3866 + }, + { + "epoch": 0.4, + "grad_norm": 1.6697050142330605, + "learning_rate": 6.790084662119671e-06, + "loss": 0.6472, + "step": 3867 + }, + { + "epoch": 0.4, + "grad_norm": 1.9443126027690214, + "learning_rate": 6.7885127289053765e-06, + "loss": 0.6885, + "step": 3868 + }, + { + "epoch": 0.4, + "grad_norm": 1.7689567655002967, + "learning_rate": 6.78694059295297e-06, + "loss": 0.6111, + "step": 3869 + }, + { + "epoch": 0.4, + "grad_norm": 1.9659917977999166, + "learning_rate": 6.785368254440661e-06, + "loss": 0.5973, + "step": 3870 + }, + { + "epoch": 0.4, + "grad_norm": 1.9024907067927386, + "learning_rate": 6.783795713546686e-06, + "loss": 0.6616, + "step": 3871 + }, + { + "epoch": 0.4, + "grad_norm": 1.8879653406046633, + "learning_rate": 6.782222970449298e-06, + "loss": 0.6182, + "step": 3872 + }, + { + "epoch": 0.4, + "grad_norm": 1.8947196759807639, + "learning_rate": 6.780650025326778e-06, + "loss": 0.6899, + "step": 3873 + }, + { + "epoch": 0.4, + "grad_norm": 1.8450879931056883, + "learning_rate": 6.779076878357429e-06, + "loss": 0.5877, + "step": 3874 + }, + { + "epoch": 0.4, + "grad_norm": 1.8933404415566144, + "learning_rate": 6.777503529719576e-06, + "loss": 0.7101, + "step": 3875 + }, + { + "epoch": 0.4, + "grad_norm": 1.7558373826056832, + "learning_rate": 6.775929979591565e-06, + "loss": 0.6396, + "step": 3876 + }, + { + "epoch": 0.4, + "grad_norm": 2.0003562552284433, + "learning_rate": 6.774356228151768e-06, + "loss": 0.6069, + "step": 3877 + }, + { + "epoch": 0.4, + "grad_norm": 1.9255259994750547, + "learning_rate": 6.772782275578582e-06, + "loss": 0.5854, + "step": 3878 + }, + { + "epoch": 0.4, + "grad_norm": 1.8642977093526387, + "learning_rate": 6.771208122050418e-06, + "loss": 0.6715, + "step": 3879 + }, + { + "epoch": 0.4, + "grad_norm": 1.8022100159125873, + "learning_rate": 6.769633767745718e-06, + "loss": 0.6321, + "step": 3880 + }, + { + "epoch": 0.4, + "grad_norm": 1.8278453367976983, + "learning_rate": 6.768059212842944e-06, + "loss": 0.592, + "step": 3881 + }, + { + "epoch": 0.4, + "grad_norm": 1.8746751891331823, + "learning_rate": 6.7664844575205816e-06, + "loss": 0.6962, + "step": 3882 + }, + { + "epoch": 0.4, + "grad_norm": 1.999998865342274, + "learning_rate": 6.764909501957136e-06, + "loss": 0.665, + "step": 3883 + }, + { + "epoch": 0.4, + "grad_norm": 1.8866066346814458, + "learning_rate": 6.76333434633114e-06, + "loss": 0.7158, + "step": 3884 + }, + { + "epoch": 0.4, + "grad_norm": 1.854320769553393, + "learning_rate": 6.761758990821143e-06, + "loss": 0.6665, + "step": 3885 + }, + { + "epoch": 0.4, + "grad_norm": 2.0265802747703274, + "learning_rate": 6.760183435605725e-06, + "loss": 0.6098, + "step": 3886 + }, + { + "epoch": 0.4, + "grad_norm": 1.8563571894286828, + "learning_rate": 6.758607680863481e-06, + "loss": 0.6027, + "step": 3887 + }, + { + "epoch": 0.4, + "grad_norm": 1.8388299982466285, + "learning_rate": 6.757031726773033e-06, + "loss": 0.6205, + "step": 3888 + }, + { + "epoch": 0.4, + "grad_norm": 1.9306457408739781, + "learning_rate": 6.755455573513025e-06, + "loss": 0.6471, + "step": 3889 + }, + { + "epoch": 0.4, + "grad_norm": 1.8282397443953233, + "learning_rate": 6.75387922126212e-06, + "loss": 0.6413, + "step": 3890 + }, + { + "epoch": 0.4, + "grad_norm": 1.8013519821627202, + "learning_rate": 6.752302670199009e-06, + "loss": 0.5619, + "step": 3891 + }, + { + "epoch": 0.4, + "grad_norm": 2.279002945506568, + "learning_rate": 6.750725920502402e-06, + "loss": 0.7127, + "step": 3892 + }, + { + "epoch": 0.4, + "grad_norm": 2.0640115448453646, + "learning_rate": 6.749148972351034e-06, + "loss": 0.6407, + "step": 3893 + }, + { + "epoch": 0.4, + "grad_norm": 2.0687523081357693, + "learning_rate": 6.74757182592366e-06, + "loss": 0.6546, + "step": 3894 + }, + { + "epoch": 0.4, + "grad_norm": 2.116509052754373, + "learning_rate": 6.7459944813990585e-06, + "loss": 0.737, + "step": 3895 + }, + { + "epoch": 0.4, + "grad_norm": 2.0452835675623184, + "learning_rate": 6.744416938956031e-06, + "loss": 0.7485, + "step": 3896 + }, + { + "epoch": 0.41, + "grad_norm": 1.8635310962131926, + "learning_rate": 6.7428391987734e-06, + "loss": 0.639, + "step": 3897 + }, + { + "epoch": 0.41, + "grad_norm": 2.00514277899685, + "learning_rate": 6.741261261030013e-06, + "loss": 0.7099, + "step": 3898 + }, + { + "epoch": 0.41, + "grad_norm": 1.7909492085460657, + "learning_rate": 6.739683125904737e-06, + "loss": 0.7444, + "step": 3899 + }, + { + "epoch": 0.41, + "grad_norm": 1.9311294006580535, + "learning_rate": 6.7381047935764625e-06, + "loss": 0.6704, + "step": 3900 + }, + { + "epoch": 0.41, + "grad_norm": 2.066959770065368, + "learning_rate": 6.736526264224101e-06, + "loss": 0.6768, + "step": 3901 + }, + { + "epoch": 0.41, + "grad_norm": 1.7730917255734966, + "learning_rate": 6.7349475380265926e-06, + "loss": 0.6414, + "step": 3902 + }, + { + "epoch": 0.41, + "grad_norm": 1.7538215933569536, + "learning_rate": 6.7333686151628895e-06, + "loss": 0.7574, + "step": 3903 + }, + { + "epoch": 0.41, + "grad_norm": 1.8843523002908729, + "learning_rate": 6.731789495811975e-06, + "loss": 0.6446, + "step": 3904 + }, + { + "epoch": 0.41, + "grad_norm": 1.9711212147816433, + "learning_rate": 6.730210180152852e-06, + "loss": 0.6436, + "step": 3905 + }, + { + "epoch": 0.41, + "grad_norm": 1.7667975906547377, + "learning_rate": 6.728630668364541e-06, + "loss": 0.5743, + "step": 3906 + }, + { + "epoch": 0.41, + "grad_norm": 1.747514957971742, + "learning_rate": 6.7270509606260915e-06, + "loss": 0.5884, + "step": 3907 + }, + { + "epoch": 0.41, + "grad_norm": 1.8617280757448222, + "learning_rate": 6.725471057116573e-06, + "loss": 0.7538, + "step": 3908 + }, + { + "epoch": 0.41, + "grad_norm": 2.103545907282549, + "learning_rate": 6.7238909580150735e-06, + "loss": 0.6881, + "step": 3909 + }, + { + "epoch": 0.41, + "grad_norm": 1.8513840301905578, + "learning_rate": 6.7223106635007085e-06, + "loss": 0.671, + "step": 3910 + }, + { + "epoch": 0.41, + "grad_norm": 2.049540873747753, + "learning_rate": 6.720730173752613e-06, + "loss": 0.5919, + "step": 3911 + }, + { + "epoch": 0.41, + "grad_norm": 1.8190429314370908, + "learning_rate": 6.719149488949945e-06, + "loss": 0.5832, + "step": 3912 + }, + { + "epoch": 0.41, + "grad_norm": 1.8679025089120667, + "learning_rate": 6.717568609271883e-06, + "loss": 0.7133, + "step": 3913 + }, + { + "epoch": 0.41, + "grad_norm": 2.0178764743414006, + "learning_rate": 6.715987534897629e-06, + "loss": 0.7242, + "step": 3914 + }, + { + "epoch": 0.41, + "grad_norm": 1.9092744073317511, + "learning_rate": 6.714406266006408e-06, + "loss": 0.6669, + "step": 3915 + }, + { + "epoch": 0.41, + "grad_norm": 1.7023005026783422, + "learning_rate": 6.712824802777465e-06, + "loss": 0.6614, + "step": 3916 + }, + { + "epoch": 0.41, + "grad_norm": 2.077414840829166, + "learning_rate": 6.711243145390066e-06, + "loss": 0.7272, + "step": 3917 + }, + { + "epoch": 0.41, + "grad_norm": 1.5880707527072129, + "learning_rate": 6.709661294023504e-06, + "loss": 0.5592, + "step": 3918 + }, + { + "epoch": 0.41, + "grad_norm": 2.0116835193591376, + "learning_rate": 6.708079248857091e-06, + "loss": 0.7295, + "step": 3919 + }, + { + "epoch": 0.41, + "grad_norm": 1.9509358664824894, + "learning_rate": 6.706497010070157e-06, + "loss": 0.6443, + "step": 3920 + }, + { + "epoch": 0.41, + "grad_norm": 1.9452163843629293, + "learning_rate": 6.704914577842062e-06, + "loss": 0.6771, + "step": 3921 + }, + { + "epoch": 0.41, + "grad_norm": 1.7279799757953567, + "learning_rate": 6.703331952352181e-06, + "loss": 0.6676, + "step": 3922 + }, + { + "epoch": 0.41, + "grad_norm": 1.9451199834666655, + "learning_rate": 6.701749133779916e-06, + "loss": 0.5865, + "step": 3923 + }, + { + "epoch": 0.41, + "grad_norm": 1.813795054616006, + "learning_rate": 6.700166122304686e-06, + "loss": 0.6689, + "step": 3924 + }, + { + "epoch": 0.41, + "grad_norm": 1.8965308229743518, + "learning_rate": 6.698582918105934e-06, + "loss": 0.6256, + "step": 3925 + }, + { + "epoch": 0.41, + "grad_norm": 1.7884201565417062, + "learning_rate": 6.696999521363128e-06, + "loss": 0.6363, + "step": 3926 + }, + { + "epoch": 0.41, + "grad_norm": 1.9635557773440546, + "learning_rate": 6.695415932255753e-06, + "loss": 0.653, + "step": 3927 + }, + { + "epoch": 0.41, + "grad_norm": 2.0890326348091453, + "learning_rate": 6.69383215096332e-06, + "loss": 0.7069, + "step": 3928 + }, + { + "epoch": 0.41, + "grad_norm": 1.6538146640381228, + "learning_rate": 6.692248177665357e-06, + "loss": 0.5993, + "step": 3929 + }, + { + "epoch": 0.41, + "grad_norm": 1.9705437684955363, + "learning_rate": 6.690664012541418e-06, + "loss": 0.7516, + "step": 3930 + }, + { + "epoch": 0.41, + "grad_norm": 1.7444055802195977, + "learning_rate": 6.689079655771076e-06, + "loss": 0.6767, + "step": 3931 + }, + { + "epoch": 0.41, + "grad_norm": 1.8819144065805602, + "learning_rate": 6.687495107533928e-06, + "loss": 0.7314, + "step": 3932 + }, + { + "epoch": 0.41, + "grad_norm": 1.8479615838505195, + "learning_rate": 6.685910368009592e-06, + "loss": 0.6272, + "step": 3933 + }, + { + "epoch": 0.41, + "grad_norm": 2.0909081795401017, + "learning_rate": 6.684325437377704e-06, + "loss": 0.681, + "step": 3934 + }, + { + "epoch": 0.41, + "grad_norm": 1.9637392727671243, + "learning_rate": 6.682740315817929e-06, + "loss": 0.6803, + "step": 3935 + }, + { + "epoch": 0.41, + "grad_norm": 1.7789446449482587, + "learning_rate": 6.681155003509949e-06, + "loss": 0.6188, + "step": 3936 + }, + { + "epoch": 0.41, + "grad_norm": 1.8522898997130828, + "learning_rate": 6.679569500633466e-06, + "loss": 0.6477, + "step": 3937 + }, + { + "epoch": 0.41, + "grad_norm": 1.5571151829309526, + "learning_rate": 6.6779838073682066e-06, + "loss": 0.5387, + "step": 3938 + }, + { + "epoch": 0.41, + "grad_norm": 1.7670313948733032, + "learning_rate": 6.676397923893918e-06, + "loss": 0.6602, + "step": 3939 + }, + { + "epoch": 0.41, + "grad_norm": 2.027370857469276, + "learning_rate": 6.67481185039037e-06, + "loss": 0.6638, + "step": 3940 + }, + { + "epoch": 0.41, + "grad_norm": 1.9916936077066447, + "learning_rate": 6.673225587037354e-06, + "loss": 0.6213, + "step": 3941 + }, + { + "epoch": 0.41, + "grad_norm": 1.9133600244144617, + "learning_rate": 6.671639134014679e-06, + "loss": 0.5868, + "step": 3942 + }, + { + "epoch": 0.41, + "grad_norm": 1.8755522182462092, + "learning_rate": 6.670052491502182e-06, + "loss": 0.6124, + "step": 3943 + }, + { + "epoch": 0.41, + "grad_norm": 2.3078342997813386, + "learning_rate": 6.668465659679714e-06, + "loss": 0.7289, + "step": 3944 + }, + { + "epoch": 0.41, + "grad_norm": 1.9514380326839884, + "learning_rate": 6.666878638727154e-06, + "loss": 0.6594, + "step": 3945 + }, + { + "epoch": 0.41, + "grad_norm": 2.133201334464456, + "learning_rate": 6.6652914288243996e-06, + "loss": 0.5717, + "step": 3946 + }, + { + "epoch": 0.41, + "grad_norm": 1.9012313927156967, + "learning_rate": 6.66370403015137e-06, + "loss": 0.5791, + "step": 3947 + }, + { + "epoch": 0.41, + "grad_norm": 1.9722937856370126, + "learning_rate": 6.662116442888007e-06, + "loss": 0.6285, + "step": 3948 + }, + { + "epoch": 0.41, + "grad_norm": 1.903817678915892, + "learning_rate": 6.66052866721427e-06, + "loss": 0.6406, + "step": 3949 + }, + { + "epoch": 0.41, + "grad_norm": 1.9979888015586342, + "learning_rate": 6.6589407033101435e-06, + "loss": 0.7432, + "step": 3950 + }, + { + "epoch": 0.41, + "grad_norm": 1.6259230128595419, + "learning_rate": 6.657352551355634e-06, + "loss": 0.6007, + "step": 3951 + }, + { + "epoch": 0.41, + "grad_norm": 1.9194025746637722, + "learning_rate": 6.655764211530767e-06, + "loss": 0.7022, + "step": 3952 + }, + { + "epoch": 0.41, + "grad_norm": 1.735014946320013, + "learning_rate": 6.654175684015587e-06, + "loss": 0.6331, + "step": 3953 + }, + { + "epoch": 0.41, + "grad_norm": 1.9823415478074962, + "learning_rate": 6.652586968990164e-06, + "loss": 0.7171, + "step": 3954 + }, + { + "epoch": 0.41, + "grad_norm": 1.7583769540998349, + "learning_rate": 6.650998066634589e-06, + "loss": 0.6572, + "step": 3955 + }, + { + "epoch": 0.41, + "grad_norm": 1.8600867587835666, + "learning_rate": 6.649408977128975e-06, + "loss": 0.6734, + "step": 3956 + }, + { + "epoch": 0.41, + "grad_norm": 1.862698659637831, + "learning_rate": 6.64781970065345e-06, + "loss": 0.615, + "step": 3957 + }, + { + "epoch": 0.41, + "grad_norm": 1.9564006592363403, + "learning_rate": 6.646230237388172e-06, + "loss": 0.6748, + "step": 3958 + }, + { + "epoch": 0.41, + "grad_norm": 1.8968835571667682, + "learning_rate": 6.644640587513313e-06, + "loss": 0.6716, + "step": 3959 + }, + { + "epoch": 0.41, + "grad_norm": 1.869462572063707, + "learning_rate": 6.643050751209067e-06, + "loss": 0.7545, + "step": 3960 + }, + { + "epoch": 0.41, + "grad_norm": 1.8778880002152252, + "learning_rate": 6.641460728655654e-06, + "loss": 0.6778, + "step": 3961 + }, + { + "epoch": 0.41, + "grad_norm": 1.7399480890625565, + "learning_rate": 6.6398705200333125e-06, + "loss": 0.5642, + "step": 3962 + }, + { + "epoch": 0.41, + "grad_norm": 1.5578811285592717, + "learning_rate": 6.6382801255223e-06, + "loss": 0.4749, + "step": 3963 + }, + { + "epoch": 0.41, + "grad_norm": 1.6948492903335945, + "learning_rate": 6.636689545302898e-06, + "loss": 0.6076, + "step": 3964 + }, + { + "epoch": 0.41, + "grad_norm": 1.5766728927525278, + "learning_rate": 6.6350987795554056e-06, + "loss": 0.5974, + "step": 3965 + }, + { + "epoch": 0.41, + "grad_norm": 1.9685332103178712, + "learning_rate": 6.633507828460148e-06, + "loss": 0.712, + "step": 3966 + }, + { + "epoch": 0.41, + "grad_norm": 1.748252872817549, + "learning_rate": 6.631916692197466e-06, + "loss": 0.5779, + "step": 3967 + }, + { + "epoch": 0.41, + "grad_norm": 2.2011682557023478, + "learning_rate": 6.6303253709477276e-06, + "loss": 0.756, + "step": 3968 + }, + { + "epoch": 0.41, + "grad_norm": 1.870775828998848, + "learning_rate": 6.628733864891315e-06, + "loss": 0.6796, + "step": 3969 + }, + { + "epoch": 0.41, + "grad_norm": 1.9934558724921358, + "learning_rate": 6.627142174208634e-06, + "loss": 0.6594, + "step": 3970 + }, + { + "epoch": 0.41, + "grad_norm": 1.762238827723651, + "learning_rate": 6.625550299080115e-06, + "loss": 0.673, + "step": 3971 + }, + { + "epoch": 0.41, + "grad_norm": 1.7996931164327568, + "learning_rate": 6.623958239686204e-06, + "loss": 0.599, + "step": 3972 + }, + { + "epoch": 0.41, + "grad_norm": 1.7062603543348756, + "learning_rate": 6.622365996207368e-06, + "loss": 0.6358, + "step": 3973 + }, + { + "epoch": 0.41, + "grad_norm": 1.8352990023889209, + "learning_rate": 6.620773568824101e-06, + "loss": 0.5925, + "step": 3974 + }, + { + "epoch": 0.41, + "grad_norm": 1.777374114911824, + "learning_rate": 6.619180957716913e-06, + "loss": 0.5977, + "step": 3975 + }, + { + "epoch": 0.41, + "grad_norm": 1.8645416702797122, + "learning_rate": 6.617588163066333e-06, + "loss": 0.6726, + "step": 3976 + }, + { + "epoch": 0.41, + "grad_norm": 1.986094087122366, + "learning_rate": 6.615995185052915e-06, + "loss": 0.6859, + "step": 3977 + }, + { + "epoch": 0.41, + "grad_norm": 1.9671637070485966, + "learning_rate": 6.614402023857231e-06, + "loss": 0.6191, + "step": 3978 + }, + { + "epoch": 0.41, + "grad_norm": 1.992735241698695, + "learning_rate": 6.612808679659878e-06, + "loss": 0.7177, + "step": 3979 + }, + { + "epoch": 0.41, + "grad_norm": 1.8594009010630315, + "learning_rate": 6.611215152641466e-06, + "loss": 0.6817, + "step": 3980 + }, + { + "epoch": 0.41, + "grad_norm": 1.8366393363968638, + "learning_rate": 6.609621442982634e-06, + "loss": 0.7283, + "step": 3981 + }, + { + "epoch": 0.41, + "grad_norm": 1.8880200232029714, + "learning_rate": 6.608027550864038e-06, + "loss": 0.6786, + "step": 3982 + }, + { + "epoch": 0.41, + "grad_norm": 1.7713852962235712, + "learning_rate": 6.606433476466352e-06, + "loss": 0.6204, + "step": 3983 + }, + { + "epoch": 0.41, + "grad_norm": 1.951459700873205, + "learning_rate": 6.604839219970276e-06, + "loss": 0.5822, + "step": 3984 + }, + { + "epoch": 0.41, + "grad_norm": 1.941984778501092, + "learning_rate": 6.603244781556527e-06, + "loss": 0.816, + "step": 3985 + }, + { + "epoch": 0.41, + "grad_norm": 1.6869770005515161, + "learning_rate": 6.601650161405844e-06, + "loss": 0.6415, + "step": 3986 + }, + { + "epoch": 0.41, + "grad_norm": 1.7618809871558578, + "learning_rate": 6.600055359698984e-06, + "loss": 0.718, + "step": 3987 + }, + { + "epoch": 0.41, + "grad_norm": 1.8940876608861175, + "learning_rate": 6.598460376616731e-06, + "loss": 0.695, + "step": 3988 + }, + { + "epoch": 0.41, + "grad_norm": 1.582975947302647, + "learning_rate": 6.596865212339885e-06, + "loss": 0.5889, + "step": 3989 + }, + { + "epoch": 0.41, + "grad_norm": 2.0959689544385873, + "learning_rate": 6.595269867049262e-06, + "loss": 0.7246, + "step": 3990 + }, + { + "epoch": 0.41, + "grad_norm": 1.875274315991869, + "learning_rate": 6.5936743409257085e-06, + "loss": 0.6586, + "step": 3991 + }, + { + "epoch": 0.41, + "grad_norm": 1.952777633657918, + "learning_rate": 6.592078634150084e-06, + "loss": 0.6749, + "step": 3992 + }, + { + "epoch": 0.42, + "grad_norm": 2.0330590435053755, + "learning_rate": 6.590482746903273e-06, + "loss": 0.694, + "step": 3993 + }, + { + "epoch": 0.42, + "grad_norm": 2.040171030726033, + "learning_rate": 6.588886679366177e-06, + "loss": 0.8347, + "step": 3994 + }, + { + "epoch": 0.42, + "grad_norm": 1.9779967255959818, + "learning_rate": 6.587290431719718e-06, + "loss": 0.5998, + "step": 3995 + }, + { + "epoch": 0.42, + "grad_norm": 1.9489545122956449, + "learning_rate": 6.585694004144844e-06, + "loss": 0.622, + "step": 3996 + }, + { + "epoch": 0.42, + "grad_norm": 1.7578029357926648, + "learning_rate": 6.584097396822514e-06, + "loss": 0.6302, + "step": 3997 + }, + { + "epoch": 0.42, + "grad_norm": 1.851928626062996, + "learning_rate": 6.582500609933715e-06, + "loss": 0.6829, + "step": 3998 + }, + { + "epoch": 0.42, + "grad_norm": 1.7174238910726383, + "learning_rate": 6.580903643659453e-06, + "loss": 0.6014, + "step": 3999 + }, + { + "epoch": 0.42, + "grad_norm": 1.8121816554070762, + "learning_rate": 6.579306498180753e-06, + "loss": 0.6708, + "step": 4000 + }, + { + "epoch": 0.42, + "grad_norm": 1.8179688321936576, + "learning_rate": 6.577709173678658e-06, + "loss": 0.6709, + "step": 4001 + }, + { + "epoch": 0.42, + "grad_norm": 2.0852265758278596, + "learning_rate": 6.5761116703342365e-06, + "loss": 0.7801, + "step": 4002 + }, + { + "epoch": 0.42, + "grad_norm": 1.725498003504001, + "learning_rate": 6.574513988328572e-06, + "loss": 0.6907, + "step": 4003 + }, + { + "epoch": 0.42, + "grad_norm": 1.9877426312997266, + "learning_rate": 6.572916127842775e-06, + "loss": 0.5943, + "step": 4004 + }, + { + "epoch": 0.42, + "grad_norm": 1.7800607172380565, + "learning_rate": 6.5713180890579675e-06, + "loss": 0.5852, + "step": 4005 + }, + { + "epoch": 0.42, + "grad_norm": 2.0937490532286516, + "learning_rate": 6.569719872155299e-06, + "loss": 0.7188, + "step": 4006 + }, + { + "epoch": 0.42, + "grad_norm": 1.7241389489460623, + "learning_rate": 6.568121477315936e-06, + "loss": 0.4979, + "step": 4007 + }, + { + "epoch": 0.42, + "grad_norm": 2.007324352220353, + "learning_rate": 6.566522904721066e-06, + "loss": 0.6385, + "step": 4008 + }, + { + "epoch": 0.42, + "grad_norm": 2.2119954176168126, + "learning_rate": 6.564924154551895e-06, + "loss": 0.6259, + "step": 4009 + }, + { + "epoch": 0.42, + "grad_norm": 2.1210351984877147, + "learning_rate": 6.563325226989652e-06, + "loss": 0.707, + "step": 4010 + }, + { + "epoch": 0.42, + "grad_norm": 1.8997902827936242, + "learning_rate": 6.561726122215585e-06, + "loss": 0.7243, + "step": 4011 + }, + { + "epoch": 0.42, + "grad_norm": 1.6957497867748266, + "learning_rate": 6.560126840410958e-06, + "loss": 0.6136, + "step": 4012 + }, + { + "epoch": 0.42, + "grad_norm": 1.7441208168504188, + "learning_rate": 6.558527381757063e-06, + "loss": 0.6152, + "step": 4013 + }, + { + "epoch": 0.42, + "grad_norm": 1.790485685409849, + "learning_rate": 6.556927746435204e-06, + "loss": 0.6593, + "step": 4014 + }, + { + "epoch": 0.42, + "grad_norm": 2.0974652382452303, + "learning_rate": 6.555327934626714e-06, + "loss": 0.6737, + "step": 4015 + }, + { + "epoch": 0.42, + "grad_norm": 1.8269551144468505, + "learning_rate": 6.553727946512935e-06, + "loss": 0.6143, + "step": 4016 + }, + { + "epoch": 0.42, + "grad_norm": 1.7153314087472655, + "learning_rate": 6.552127782275238e-06, + "loss": 0.6397, + "step": 4017 + }, + { + "epoch": 0.42, + "grad_norm": 2.090028098890312, + "learning_rate": 6.55052744209501e-06, + "loss": 0.6303, + "step": 4018 + }, + { + "epoch": 0.42, + "grad_norm": 2.009734384358688, + "learning_rate": 6.548926926153659e-06, + "loss": 0.6369, + "step": 4019 + }, + { + "epoch": 0.42, + "grad_norm": 1.770723520702836, + "learning_rate": 6.5473262346326125e-06, + "loss": 0.5863, + "step": 4020 + }, + { + "epoch": 0.42, + "grad_norm": 2.0075361710627084, + "learning_rate": 6.545725367713317e-06, + "loss": 0.5489, + "step": 4021 + }, + { + "epoch": 0.42, + "grad_norm": 1.7245698235105078, + "learning_rate": 6.5441243255772426e-06, + "loss": 0.6108, + "step": 4022 + }, + { + "epoch": 0.42, + "grad_norm": 1.8035379353837957, + "learning_rate": 6.542523108405873e-06, + "loss": 0.8067, + "step": 4023 + }, + { + "epoch": 0.42, + "grad_norm": 1.8715746504239854, + "learning_rate": 6.540921716380717e-06, + "loss": 0.6967, + "step": 4024 + }, + { + "epoch": 0.42, + "grad_norm": 2.096205344372492, + "learning_rate": 6.539320149683302e-06, + "loss": 0.645, + "step": 4025 + }, + { + "epoch": 0.42, + "grad_norm": 1.7646255765648868, + "learning_rate": 6.537718408495174e-06, + "loss": 0.7169, + "step": 4026 + }, + { + "epoch": 0.42, + "grad_norm": 1.8536865089842989, + "learning_rate": 6.536116492997899e-06, + "loss": 0.5697, + "step": 4027 + }, + { + "epoch": 0.42, + "grad_norm": 2.0152237179046915, + "learning_rate": 6.534514403373064e-06, + "loss": 0.6279, + "step": 4028 + }, + { + "epoch": 0.42, + "grad_norm": 1.8954231608810324, + "learning_rate": 6.5329121398022756e-06, + "loss": 0.665, + "step": 4029 + }, + { + "epoch": 0.42, + "grad_norm": 1.8970903222431375, + "learning_rate": 6.531309702467159e-06, + "loss": 0.7078, + "step": 4030 + }, + { + "epoch": 0.42, + "grad_norm": 2.135112232684426, + "learning_rate": 6.52970709154936e-06, + "loss": 0.7513, + "step": 4031 + }, + { + "epoch": 0.42, + "grad_norm": 1.9549109974607481, + "learning_rate": 6.528104307230542e-06, + "loss": 0.6653, + "step": 4032 + }, + { + "epoch": 0.42, + "grad_norm": 1.7139953691811927, + "learning_rate": 6.526501349692392e-06, + "loss": 0.591, + "step": 4033 + }, + { + "epoch": 0.42, + "grad_norm": 1.6627286877312124, + "learning_rate": 6.524898219116612e-06, + "loss": 0.6157, + "step": 4034 + }, + { + "epoch": 0.42, + "grad_norm": 2.0560769425457464, + "learning_rate": 6.523294915684928e-06, + "loss": 0.634, + "step": 4035 + }, + { + "epoch": 0.42, + "grad_norm": 1.8113394568340626, + "learning_rate": 6.5216914395790806e-06, + "loss": 0.6815, + "step": 4036 + }, + { + "epoch": 0.42, + "grad_norm": 1.9270712852849294, + "learning_rate": 6.520087790980838e-06, + "loss": 0.6843, + "step": 4037 + }, + { + "epoch": 0.42, + "grad_norm": 1.9527943178109233, + "learning_rate": 6.51848397007198e-06, + "loss": 0.709, + "step": 4038 + }, + { + "epoch": 0.42, + "grad_norm": 2.023544759218534, + "learning_rate": 6.516879977034307e-06, + "loss": 0.7007, + "step": 4039 + }, + { + "epoch": 0.42, + "grad_norm": 2.0956614909800018, + "learning_rate": 6.515275812049644e-06, + "loss": 0.6164, + "step": 4040 + }, + { + "epoch": 0.42, + "grad_norm": 2.0323171139807004, + "learning_rate": 6.51367147529983e-06, + "loss": 0.706, + "step": 4041 + }, + { + "epoch": 0.42, + "grad_norm": 1.8683385638918015, + "learning_rate": 6.512066966966728e-06, + "loss": 0.6089, + "step": 4042 + }, + { + "epoch": 0.42, + "grad_norm": 2.02288354013809, + "learning_rate": 6.510462287232216e-06, + "loss": 0.7069, + "step": 4043 + }, + { + "epoch": 0.42, + "grad_norm": 1.8013988532696286, + "learning_rate": 6.508857436278195e-06, + "loss": 0.6228, + "step": 4044 + }, + { + "epoch": 0.42, + "grad_norm": 2.0158224191382668, + "learning_rate": 6.5072524142865825e-06, + "loss": 0.7294, + "step": 4045 + }, + { + "epoch": 0.42, + "grad_norm": 1.6978751958811078, + "learning_rate": 6.505647221439317e-06, + "loss": 0.527, + "step": 4046 + }, + { + "epoch": 0.42, + "grad_norm": 1.7626559524080982, + "learning_rate": 6.504041857918359e-06, + "loss": 0.5494, + "step": 4047 + }, + { + "epoch": 0.42, + "grad_norm": 2.052676030295689, + "learning_rate": 6.502436323905683e-06, + "loss": 0.7799, + "step": 4048 + }, + { + "epoch": 0.42, + "grad_norm": 1.9461214027379186, + "learning_rate": 6.500830619583286e-06, + "loss": 0.6377, + "step": 4049 + }, + { + "epoch": 0.42, + "grad_norm": 1.8652319862322915, + "learning_rate": 6.499224745133184e-06, + "loss": 0.6572, + "step": 4050 + }, + { + "epoch": 0.42, + "grad_norm": 2.077328808264135, + "learning_rate": 6.4976187007374116e-06, + "loss": 0.7387, + "step": 4051 + }, + { + "epoch": 0.42, + "grad_norm": 1.8648108669301284, + "learning_rate": 6.496012486578024e-06, + "loss": 0.6635, + "step": 4052 + }, + { + "epoch": 0.42, + "grad_norm": 1.535449841127006, + "learning_rate": 6.494406102837093e-06, + "loss": 0.573, + "step": 4053 + }, + { + "epoch": 0.42, + "grad_norm": 1.7169141635665444, + "learning_rate": 6.492799549696712e-06, + "loss": 0.5881, + "step": 4054 + }, + { + "epoch": 0.42, + "grad_norm": 1.8801509275305481, + "learning_rate": 6.4911928273389946e-06, + "loss": 0.7089, + "step": 4055 + }, + { + "epoch": 0.42, + "grad_norm": 1.6912175530324325, + "learning_rate": 6.4895859359460714e-06, + "loss": 0.6018, + "step": 4056 + }, + { + "epoch": 0.42, + "grad_norm": 1.8383688047454536, + "learning_rate": 6.487978875700091e-06, + "loss": 0.6198, + "step": 4057 + }, + { + "epoch": 0.42, + "grad_norm": 1.8638435936380096, + "learning_rate": 6.486371646783223e-06, + "loss": 0.6198, + "step": 4058 + }, + { + "epoch": 0.42, + "grad_norm": 1.7715201966443364, + "learning_rate": 6.4847642493776585e-06, + "loss": 0.6964, + "step": 4059 + }, + { + "epoch": 0.42, + "grad_norm": 1.7922800486547616, + "learning_rate": 6.4831566836656024e-06, + "loss": 0.5816, + "step": 4060 + }, + { + "epoch": 0.42, + "grad_norm": 1.7727987030469636, + "learning_rate": 6.481548949829282e-06, + "loss": 0.6143, + "step": 4061 + }, + { + "epoch": 0.42, + "grad_norm": 2.024752836474955, + "learning_rate": 6.479941048050944e-06, + "loss": 0.6124, + "step": 4062 + }, + { + "epoch": 0.42, + "grad_norm": 1.8250783973226659, + "learning_rate": 6.478332978512853e-06, + "loss": 0.6522, + "step": 4063 + }, + { + "epoch": 0.42, + "grad_norm": 1.9493846686603185, + "learning_rate": 6.476724741397293e-06, + "loss": 0.6507, + "step": 4064 + }, + { + "epoch": 0.42, + "grad_norm": 1.755309105876831, + "learning_rate": 6.4751163368865665e-06, + "loss": 0.6361, + "step": 4065 + }, + { + "epoch": 0.42, + "grad_norm": 1.810159349170945, + "learning_rate": 6.473507765162994e-06, + "loss": 0.6304, + "step": 4066 + }, + { + "epoch": 0.42, + "grad_norm": 1.8676885311500284, + "learning_rate": 6.47189902640892e-06, + "loss": 0.7234, + "step": 4067 + }, + { + "epoch": 0.42, + "grad_norm": 1.6933054623850488, + "learning_rate": 6.4702901208067e-06, + "loss": 0.6732, + "step": 4068 + }, + { + "epoch": 0.42, + "grad_norm": 1.7455580715939196, + "learning_rate": 6.468681048538715e-06, + "loss": 0.6288, + "step": 4069 + }, + { + "epoch": 0.42, + "grad_norm": 1.8551247654424352, + "learning_rate": 6.467071809787363e-06, + "loss": 0.6904, + "step": 4070 + }, + { + "epoch": 0.42, + "grad_norm": 2.2777286315308367, + "learning_rate": 6.4654624047350575e-06, + "loss": 0.7492, + "step": 4071 + }, + { + "epoch": 0.42, + "grad_norm": 1.6700799417004726, + "learning_rate": 6.463852833564236e-06, + "loss": 0.5802, + "step": 4072 + }, + { + "epoch": 0.42, + "grad_norm": 1.829777392822749, + "learning_rate": 6.462243096457352e-06, + "loss": 0.6539, + "step": 4073 + }, + { + "epoch": 0.42, + "grad_norm": 1.8592653629194054, + "learning_rate": 6.460633193596879e-06, + "loss": 0.5977, + "step": 4074 + }, + { + "epoch": 0.42, + "grad_norm": 1.9504686835427847, + "learning_rate": 6.459023125165308e-06, + "loss": 0.7809, + "step": 4075 + }, + { + "epoch": 0.42, + "grad_norm": 2.0166498080138373, + "learning_rate": 6.4574128913451495e-06, + "loss": 0.6218, + "step": 4076 + }, + { + "epoch": 0.42, + "grad_norm": 1.9365597515134503, + "learning_rate": 6.4558024923189336e-06, + "loss": 0.678, + "step": 4077 + }, + { + "epoch": 0.42, + "grad_norm": 1.7631181530060356, + "learning_rate": 6.454191928269207e-06, + "loss": 0.6319, + "step": 4078 + }, + { + "epoch": 0.42, + "grad_norm": 1.861501700664707, + "learning_rate": 6.452581199378536e-06, + "loss": 0.6315, + "step": 4079 + }, + { + "epoch": 0.42, + "grad_norm": 2.1103702750302067, + "learning_rate": 6.450970305829507e-06, + "loss": 0.7092, + "step": 4080 + }, + { + "epoch": 0.42, + "grad_norm": 1.740618758516033, + "learning_rate": 6.449359247804724e-06, + "loss": 0.6223, + "step": 4081 + }, + { + "epoch": 0.42, + "grad_norm": 1.8901679485332747, + "learning_rate": 6.447748025486809e-06, + "loss": 0.5887, + "step": 4082 + }, + { + "epoch": 0.42, + "grad_norm": 1.9819488844550182, + "learning_rate": 6.4461366390584025e-06, + "loss": 0.7952, + "step": 4083 + }, + { + "epoch": 0.42, + "grad_norm": 1.9219125754878177, + "learning_rate": 6.444525088702166e-06, + "loss": 0.8131, + "step": 4084 + }, + { + "epoch": 0.42, + "grad_norm": 1.7999306822008587, + "learning_rate": 6.442913374600778e-06, + "loss": 0.6919, + "step": 4085 + }, + { + "epoch": 0.42, + "grad_norm": 1.9952306260295138, + "learning_rate": 6.441301496936934e-06, + "loss": 0.6228, + "step": 4086 + }, + { + "epoch": 0.42, + "grad_norm": 1.6397261652327657, + "learning_rate": 6.4396894558933495e-06, + "loss": 0.5822, + "step": 4087 + }, + { + "epoch": 0.42, + "grad_norm": 2.0119623346471935, + "learning_rate": 6.438077251652759e-06, + "loss": 0.6775, + "step": 4088 + }, + { + "epoch": 0.43, + "grad_norm": 1.883160572533884, + "learning_rate": 6.436464884397917e-06, + "loss": 0.5655, + "step": 4089 + }, + { + "epoch": 0.43, + "grad_norm": 1.9595553513484605, + "learning_rate": 6.434852354311592e-06, + "loss": 0.6677, + "step": 4090 + }, + { + "epoch": 0.43, + "grad_norm": 1.8563543420640487, + "learning_rate": 6.433239661576574e-06, + "loss": 0.6959, + "step": 4091 + }, + { + "epoch": 0.43, + "grad_norm": 1.979965895251359, + "learning_rate": 6.431626806375671e-06, + "loss": 0.6135, + "step": 4092 + }, + { + "epoch": 0.43, + "grad_norm": 1.895092434982727, + "learning_rate": 6.4300137888917104e-06, + "loss": 0.5857, + "step": 4093 + }, + { + "epoch": 0.43, + "grad_norm": 1.8786371189307325, + "learning_rate": 6.428400609307535e-06, + "loss": 0.6763, + "step": 4094 + }, + { + "epoch": 0.43, + "grad_norm": 1.9249867638405949, + "learning_rate": 6.426787267806009e-06, + "loss": 0.5852, + "step": 4095 + }, + { + "epoch": 0.43, + "grad_norm": 2.076984042439822, + "learning_rate": 6.425173764570014e-06, + "loss": 0.5923, + "step": 4096 + }, + { + "epoch": 0.43, + "grad_norm": 1.9017582414587852, + "learning_rate": 6.42356009978245e-06, + "loss": 0.6385, + "step": 4097 + }, + { + "epoch": 0.43, + "grad_norm": 1.98427043016857, + "learning_rate": 6.421946273626234e-06, + "loss": 0.6316, + "step": 4098 + }, + { + "epoch": 0.43, + "grad_norm": 2.085670532511142, + "learning_rate": 6.420332286284303e-06, + "loss": 0.6466, + "step": 4099 + }, + { + "epoch": 0.43, + "grad_norm": 1.9356842581183769, + "learning_rate": 6.418718137939614e-06, + "loss": 0.6055, + "step": 4100 + }, + { + "epoch": 0.43, + "grad_norm": 2.099063254616092, + "learning_rate": 6.417103828775135e-06, + "loss": 0.6641, + "step": 4101 + }, + { + "epoch": 0.43, + "grad_norm": 2.2228334066963855, + "learning_rate": 6.41548935897386e-06, + "loss": 0.784, + "step": 4102 + }, + { + "epoch": 0.43, + "grad_norm": 1.7704647109970402, + "learning_rate": 6.4138747287187984e-06, + "loss": 0.6703, + "step": 4103 + }, + { + "epoch": 0.43, + "grad_norm": 1.9802068841142038, + "learning_rate": 6.412259938192978e-06, + "loss": 0.6845, + "step": 4104 + }, + { + "epoch": 0.43, + "grad_norm": 2.4689051914936173, + "learning_rate": 6.410644987579444e-06, + "loss": 0.594, + "step": 4105 + }, + { + "epoch": 0.43, + "grad_norm": 2.212128219379263, + "learning_rate": 6.409029877061259e-06, + "loss": 0.7405, + "step": 4106 + }, + { + "epoch": 0.43, + "grad_norm": 1.875314160139368, + "learning_rate": 6.407414606821507e-06, + "loss": 0.6231, + "step": 4107 + }, + { + "epoch": 0.43, + "grad_norm": 1.8218343066015268, + "learning_rate": 6.405799177043289e-06, + "loss": 0.6286, + "step": 4108 + }, + { + "epoch": 0.43, + "grad_norm": 2.0688539285382137, + "learning_rate": 6.4041835879097205e-06, + "loss": 0.7444, + "step": 4109 + }, + { + "epoch": 0.43, + "grad_norm": 1.7472797608134105, + "learning_rate": 6.402567839603937e-06, + "loss": 0.6215, + "step": 4110 + }, + { + "epoch": 0.43, + "grad_norm": 1.9131271318603962, + "learning_rate": 6.400951932309097e-06, + "loss": 0.6368, + "step": 4111 + }, + { + "epoch": 0.43, + "grad_norm": 2.028012533221227, + "learning_rate": 6.399335866208367e-06, + "loss": 0.6568, + "step": 4112 + }, + { + "epoch": 0.43, + "grad_norm": 1.8051750494724248, + "learning_rate": 6.397719641484943e-06, + "loss": 0.6733, + "step": 4113 + }, + { + "epoch": 0.43, + "grad_norm": 1.9604187629140273, + "learning_rate": 6.39610325832203e-06, + "loss": 0.6956, + "step": 4114 + }, + { + "epoch": 0.43, + "grad_norm": 1.7146717528468935, + "learning_rate": 6.394486716902857e-06, + "loss": 0.567, + "step": 4115 + }, + { + "epoch": 0.43, + "grad_norm": 1.8587330203323895, + "learning_rate": 6.392870017410665e-06, + "loss": 0.7145, + "step": 4116 + }, + { + "epoch": 0.43, + "grad_norm": 1.886070724526401, + "learning_rate": 6.3912531600287166e-06, + "loss": 0.6478, + "step": 4117 + }, + { + "epoch": 0.43, + "grad_norm": 1.8676316071674293, + "learning_rate": 6.389636144940294e-06, + "loss": 0.6376, + "step": 4118 + }, + { + "epoch": 0.43, + "grad_norm": 2.152777218544474, + "learning_rate": 6.388018972328693e-06, + "loss": 0.6583, + "step": 4119 + }, + { + "epoch": 0.43, + "grad_norm": 1.7736923671339342, + "learning_rate": 6.386401642377231e-06, + "loss": 0.6683, + "step": 4120 + }, + { + "epoch": 0.43, + "grad_norm": 1.7318840943278404, + "learning_rate": 6.384784155269239e-06, + "loss": 0.6245, + "step": 4121 + }, + { + "epoch": 0.43, + "grad_norm": 2.1659556826960933, + "learning_rate": 6.383166511188072e-06, + "loss": 0.7848, + "step": 4122 + }, + { + "epoch": 0.43, + "grad_norm": 1.8961148222331055, + "learning_rate": 6.381548710317096e-06, + "loss": 0.7207, + "step": 4123 + }, + { + "epoch": 0.43, + "grad_norm": 1.8419789827418658, + "learning_rate": 6.3799307528397e-06, + "loss": 0.7183, + "step": 4124 + }, + { + "epoch": 0.43, + "grad_norm": 1.8897939224972424, + "learning_rate": 6.378312638939286e-06, + "loss": 0.5796, + "step": 4125 + }, + { + "epoch": 0.43, + "grad_norm": 1.9689256228329488, + "learning_rate": 6.37669436879928e-06, + "loss": 0.7358, + "step": 4126 + }, + { + "epoch": 0.43, + "grad_norm": 1.8892196667301127, + "learning_rate": 6.375075942603119e-06, + "loss": 0.7024, + "step": 4127 + }, + { + "epoch": 0.43, + "grad_norm": 1.9259521778924813, + "learning_rate": 6.373457360534263e-06, + "loss": 0.6811, + "step": 4128 + }, + { + "epoch": 0.43, + "grad_norm": 1.8004120338996794, + "learning_rate": 6.371838622776187e-06, + "loss": 0.6131, + "step": 4129 + }, + { + "epoch": 0.43, + "grad_norm": 1.935184387430914, + "learning_rate": 6.370219729512383e-06, + "loss": 0.6478, + "step": 4130 + }, + { + "epoch": 0.43, + "grad_norm": 1.909323332437752, + "learning_rate": 6.368600680926364e-06, + "loss": 0.7664, + "step": 4131 + }, + { + "epoch": 0.43, + "grad_norm": 2.1700474542918915, + "learning_rate": 6.3669814772016555e-06, + "loss": 0.7379, + "step": 4132 + }, + { + "epoch": 0.43, + "grad_norm": 1.8763611045110336, + "learning_rate": 6.365362118521807e-06, + "loss": 0.6204, + "step": 4133 + }, + { + "epoch": 0.43, + "grad_norm": 1.7139413076250838, + "learning_rate": 6.363742605070379e-06, + "loss": 0.7147, + "step": 4134 + }, + { + "epoch": 0.43, + "grad_norm": 1.9044300533734067, + "learning_rate": 6.362122937030952e-06, + "loss": 0.6914, + "step": 4135 + }, + { + "epoch": 0.43, + "grad_norm": 1.9154252110669694, + "learning_rate": 6.360503114587129e-06, + "loss": 0.615, + "step": 4136 + }, + { + "epoch": 0.43, + "grad_norm": 1.739297540790688, + "learning_rate": 6.3588831379225226e-06, + "loss": 0.6519, + "step": 4137 + }, + { + "epoch": 0.43, + "grad_norm": 1.7433831357411251, + "learning_rate": 6.357263007220767e-06, + "loss": 0.6645, + "step": 4138 + }, + { + "epoch": 0.43, + "grad_norm": 1.9514132458981708, + "learning_rate": 6.355642722665512e-06, + "loss": 0.7732, + "step": 4139 + }, + { + "epoch": 0.43, + "grad_norm": 1.8361076741980893, + "learning_rate": 6.354022284440429e-06, + "loss": 0.6559, + "step": 4140 + }, + { + "epoch": 0.43, + "grad_norm": 1.9369982517217328, + "learning_rate": 6.352401692729202e-06, + "loss": 0.6168, + "step": 4141 + }, + { + "epoch": 0.43, + "grad_norm": 1.5500501779631404, + "learning_rate": 6.3507809477155335e-06, + "loss": 0.6486, + "step": 4142 + }, + { + "epoch": 0.43, + "grad_norm": 1.762058079092894, + "learning_rate": 6.349160049583146e-06, + "loss": 0.6379, + "step": 4143 + }, + { + "epoch": 0.43, + "grad_norm": 1.6547077639256238, + "learning_rate": 6.347538998515778e-06, + "loss": 0.6427, + "step": 4144 + }, + { + "epoch": 0.43, + "grad_norm": 1.863644426003244, + "learning_rate": 6.345917794697183e-06, + "loss": 0.682, + "step": 4145 + }, + { + "epoch": 0.43, + "grad_norm": 2.0289147358075525, + "learning_rate": 6.344296438311134e-06, + "loss": 0.6736, + "step": 4146 + }, + { + "epoch": 0.43, + "grad_norm": 2.0453181169725765, + "learning_rate": 6.342674929541424e-06, + "loss": 0.6646, + "step": 4147 + }, + { + "epoch": 0.43, + "grad_norm": 1.814620528092348, + "learning_rate": 6.341053268571855e-06, + "loss": 0.6626, + "step": 4148 + }, + { + "epoch": 0.43, + "grad_norm": 1.9840600275863907, + "learning_rate": 6.3394314555862545e-06, + "loss": 0.6618, + "step": 4149 + }, + { + "epoch": 0.43, + "grad_norm": 1.8379449832426429, + "learning_rate": 6.337809490768465e-06, + "loss": 0.6177, + "step": 4150 + }, + { + "epoch": 0.43, + "grad_norm": 1.8309354258877755, + "learning_rate": 6.336187374302344e-06, + "loss": 0.7409, + "step": 4151 + }, + { + "epoch": 0.43, + "grad_norm": 1.7755918403707267, + "learning_rate": 6.334565106371768e-06, + "loss": 0.6348, + "step": 4152 + }, + { + "epoch": 0.43, + "grad_norm": 1.8224093319923627, + "learning_rate": 6.332942687160632e-06, + "loss": 0.6293, + "step": 4153 + }, + { + "epoch": 0.43, + "grad_norm": 2.0566967543569863, + "learning_rate": 6.331320116852842e-06, + "loss": 0.6957, + "step": 4154 + }, + { + "epoch": 0.43, + "grad_norm": 1.8244220543225507, + "learning_rate": 6.329697395632332e-06, + "loss": 0.6038, + "step": 4155 + }, + { + "epoch": 0.43, + "grad_norm": 1.623371055393367, + "learning_rate": 6.328074523683041e-06, + "loss": 0.6673, + "step": 4156 + }, + { + "epoch": 0.43, + "grad_norm": 2.2657238349497706, + "learning_rate": 6.326451501188933e-06, + "loss": 0.6658, + "step": 4157 + }, + { + "epoch": 0.43, + "grad_norm": 1.9014605197412302, + "learning_rate": 6.324828328333986e-06, + "loss": 0.6146, + "step": 4158 + }, + { + "epoch": 0.43, + "grad_norm": 1.7894161936132815, + "learning_rate": 6.323205005302199e-06, + "loss": 0.6886, + "step": 4159 + }, + { + "epoch": 0.43, + "grad_norm": 1.8682449812414834, + "learning_rate": 6.321581532277581e-06, + "loss": 0.6771, + "step": 4160 + }, + { + "epoch": 0.43, + "grad_norm": 1.7046598099665882, + "learning_rate": 6.319957909444163e-06, + "loss": 0.6186, + "step": 4161 + }, + { + "epoch": 0.43, + "grad_norm": 1.963434024206442, + "learning_rate": 6.318334136985993e-06, + "loss": 0.6581, + "step": 4162 + }, + { + "epoch": 0.43, + "grad_norm": 1.9731526290449264, + "learning_rate": 6.316710215087136e-06, + "loss": 0.6845, + "step": 4163 + }, + { + "epoch": 0.43, + "grad_norm": 1.8947817321763945, + "learning_rate": 6.31508614393167e-06, + "loss": 0.71, + "step": 4164 + }, + { + "epoch": 0.43, + "grad_norm": 1.6776835190313186, + "learning_rate": 6.313461923703693e-06, + "loss": 0.5618, + "step": 4165 + }, + { + "epoch": 0.43, + "grad_norm": 2.0755723255709397, + "learning_rate": 6.311837554587322e-06, + "loss": 0.6423, + "step": 4166 + }, + { + "epoch": 0.43, + "grad_norm": 1.8678892690364675, + "learning_rate": 6.3102130367666855e-06, + "loss": 0.6015, + "step": 4167 + }, + { + "epoch": 0.43, + "grad_norm": 1.9535946234388366, + "learning_rate": 6.308588370425934e-06, + "loss": 0.6249, + "step": 4168 + }, + { + "epoch": 0.43, + "grad_norm": 1.9754689426313052, + "learning_rate": 6.306963555749231e-06, + "loss": 0.6902, + "step": 4169 + }, + { + "epoch": 0.43, + "grad_norm": 1.7827061123763464, + "learning_rate": 6.305338592920762e-06, + "loss": 0.6728, + "step": 4170 + }, + { + "epoch": 0.43, + "grad_norm": 1.737874266961729, + "learning_rate": 6.303713482124721e-06, + "loss": 0.6458, + "step": 4171 + }, + { + "epoch": 0.43, + "grad_norm": 1.8841336594214666, + "learning_rate": 6.302088223545327e-06, + "loss": 0.6342, + "step": 4172 + }, + { + "epoch": 0.43, + "grad_norm": 1.8336447518739336, + "learning_rate": 6.30046281736681e-06, + "loss": 0.5722, + "step": 4173 + }, + { + "epoch": 0.43, + "grad_norm": 1.9792552077265053, + "learning_rate": 6.298837263773423e-06, + "loss": 0.619, + "step": 4174 + }, + { + "epoch": 0.43, + "grad_norm": 1.8990343746392675, + "learning_rate": 6.297211562949427e-06, + "loss": 0.6661, + "step": 4175 + }, + { + "epoch": 0.43, + "grad_norm": 2.012978566826468, + "learning_rate": 6.2955857150791055e-06, + "loss": 0.6569, + "step": 4176 + }, + { + "epoch": 0.43, + "grad_norm": 1.988969857473836, + "learning_rate": 6.29395972034676e-06, + "loss": 0.6604, + "step": 4177 + }, + { + "epoch": 0.43, + "grad_norm": 1.7627911436862862, + "learning_rate": 6.2923335789367044e-06, + "loss": 0.5787, + "step": 4178 + }, + { + "epoch": 0.43, + "grad_norm": 1.9374367211852295, + "learning_rate": 6.290707291033272e-06, + "loss": 0.6177, + "step": 4179 + }, + { + "epoch": 0.43, + "grad_norm": 2.0083371897370674, + "learning_rate": 6.289080856820811e-06, + "loss": 0.6268, + "step": 4180 + }, + { + "epoch": 0.43, + "grad_norm": 2.0569173930723967, + "learning_rate": 6.287454276483687e-06, + "loss": 0.6635, + "step": 4181 + }, + { + "epoch": 0.43, + "grad_norm": 1.7807436006960025, + "learning_rate": 6.285827550206282e-06, + "loss": 0.535, + "step": 4182 + }, + { + "epoch": 0.43, + "grad_norm": 2.101638627215469, + "learning_rate": 6.284200678172997e-06, + "loss": 0.7426, + "step": 4183 + }, + { + "epoch": 0.43, + "grad_norm": 1.7072537972165742, + "learning_rate": 6.282573660568245e-06, + "loss": 0.7568, + "step": 4184 + }, + { + "epoch": 0.44, + "grad_norm": 1.9695668809384006, + "learning_rate": 6.2809464975764575e-06, + "loss": 0.5676, + "step": 4185 + }, + { + "epoch": 0.44, + "grad_norm": 1.767265477943679, + "learning_rate": 6.279319189382084e-06, + "loss": 0.601, + "step": 4186 + }, + { + "epoch": 0.44, + "grad_norm": 1.7094500604340592, + "learning_rate": 6.2776917361695876e-06, + "loss": 0.6147, + "step": 4187 + }, + { + "epoch": 0.44, + "grad_norm": 1.8402827980509162, + "learning_rate": 6.276064138123453e-06, + "loss": 0.732, + "step": 4188 + }, + { + "epoch": 0.44, + "grad_norm": 1.9402004321921997, + "learning_rate": 6.274436395428171e-06, + "loss": 0.6128, + "step": 4189 + }, + { + "epoch": 0.44, + "grad_norm": 1.9031106270161509, + "learning_rate": 6.272808508268262e-06, + "loss": 0.6311, + "step": 4190 + }, + { + "epoch": 0.44, + "grad_norm": 2.033607806685722, + "learning_rate": 6.2711804768282535e-06, + "loss": 0.5969, + "step": 4191 + }, + { + "epoch": 0.44, + "grad_norm": 1.800733372504521, + "learning_rate": 6.269552301292693e-06, + "loss": 0.5638, + "step": 4192 + }, + { + "epoch": 0.44, + "grad_norm": 1.639474658639473, + "learning_rate": 6.267923981846141e-06, + "loss": 0.5257, + "step": 4193 + }, + { + "epoch": 0.44, + "grad_norm": 2.023559643318534, + "learning_rate": 6.26629551867318e-06, + "loss": 0.5674, + "step": 4194 + }, + { + "epoch": 0.44, + "grad_norm": 1.874292710750846, + "learning_rate": 6.264666911958404e-06, + "loss": 0.6333, + "step": 4195 + }, + { + "epoch": 0.44, + "grad_norm": 1.8563287999328968, + "learning_rate": 6.263038161886426e-06, + "loss": 0.7358, + "step": 4196 + }, + { + "epoch": 0.44, + "grad_norm": 1.7453380961988425, + "learning_rate": 6.261409268641872e-06, + "loss": 0.6227, + "step": 4197 + }, + { + "epoch": 0.44, + "grad_norm": 1.751372462067731, + "learning_rate": 6.259780232409389e-06, + "loss": 0.6074, + "step": 4198 + }, + { + "epoch": 0.44, + "grad_norm": 1.880312499572035, + "learning_rate": 6.2581510533736346e-06, + "loss": 0.6836, + "step": 4199 + }, + { + "epoch": 0.44, + "grad_norm": 1.866018226892596, + "learning_rate": 6.25652173171929e-06, + "loss": 0.6002, + "step": 4200 + }, + { + "epoch": 0.44, + "grad_norm": 1.8337619090346275, + "learning_rate": 6.254892267631042e-06, + "loss": 0.6665, + "step": 4201 + }, + { + "epoch": 0.44, + "grad_norm": 1.821660317307866, + "learning_rate": 6.2532626612936035e-06, + "loss": 0.6837, + "step": 4202 + }, + { + "epoch": 0.44, + "grad_norm": 1.852300675093971, + "learning_rate": 6.2516329128917e-06, + "loss": 0.6855, + "step": 4203 + }, + { + "epoch": 0.44, + "grad_norm": 2.067378628400363, + "learning_rate": 6.250003022610071e-06, + "loss": 0.684, + "step": 4204 + }, + { + "epoch": 0.44, + "grad_norm": 1.766568461268017, + "learning_rate": 6.248372990633475e-06, + "loss": 0.6828, + "step": 4205 + }, + { + "epoch": 0.44, + "grad_norm": 2.121014004372851, + "learning_rate": 6.246742817146684e-06, + "loss": 0.6887, + "step": 4206 + }, + { + "epoch": 0.44, + "grad_norm": 1.916239882668666, + "learning_rate": 6.2451125023344895e-06, + "loss": 0.7135, + "step": 4207 + }, + { + "epoch": 0.44, + "grad_norm": 2.0778830065986345, + "learning_rate": 6.243482046381696e-06, + "loss": 0.6779, + "step": 4208 + }, + { + "epoch": 0.44, + "grad_norm": 1.869012297808762, + "learning_rate": 6.2418514494731245e-06, + "loss": 0.6217, + "step": 4209 + }, + { + "epoch": 0.44, + "grad_norm": 1.650570523552766, + "learning_rate": 6.240220711793612e-06, + "loss": 0.6305, + "step": 4210 + }, + { + "epoch": 0.44, + "grad_norm": 1.775718554562567, + "learning_rate": 6.238589833528015e-06, + "loss": 0.6848, + "step": 4211 + }, + { + "epoch": 0.44, + "grad_norm": 2.010000193787045, + "learning_rate": 6.236958814861199e-06, + "loss": 0.6825, + "step": 4212 + }, + { + "epoch": 0.44, + "grad_norm": 1.7651378794347117, + "learning_rate": 6.2353276559780515e-06, + "loss": 0.5645, + "step": 4213 + }, + { + "epoch": 0.44, + "grad_norm": 1.722675513011501, + "learning_rate": 6.233696357063472e-06, + "loss": 0.668, + "step": 4214 + }, + { + "epoch": 0.44, + "grad_norm": 2.1181437150465214, + "learning_rate": 6.23206491830238e-06, + "loss": 0.7165, + "step": 4215 + }, + { + "epoch": 0.44, + "grad_norm": 1.9627449801841879, + "learning_rate": 6.230433339879706e-06, + "loss": 0.6373, + "step": 4216 + }, + { + "epoch": 0.44, + "grad_norm": 1.747089518779566, + "learning_rate": 6.2288016219804e-06, + "loss": 0.5995, + "step": 4217 + }, + { + "epoch": 0.44, + "grad_norm": 1.6960776760193497, + "learning_rate": 6.2271697647894265e-06, + "loss": 0.6389, + "step": 4218 + }, + { + "epoch": 0.44, + "grad_norm": 1.9381104103996185, + "learning_rate": 6.225537768491766e-06, + "loss": 0.6122, + "step": 4219 + }, + { + "epoch": 0.44, + "grad_norm": 1.9867789174214119, + "learning_rate": 6.223905633272414e-06, + "loss": 0.6503, + "step": 4220 + }, + { + "epoch": 0.44, + "grad_norm": 2.2355853167261124, + "learning_rate": 6.2222733593163805e-06, + "loss": 0.7641, + "step": 4221 + }, + { + "epoch": 0.44, + "grad_norm": 1.796152824199846, + "learning_rate": 6.220640946808697e-06, + "loss": 0.5726, + "step": 4222 + }, + { + "epoch": 0.44, + "grad_norm": 1.8490238088076443, + "learning_rate": 6.219008395934405e-06, + "loss": 0.5832, + "step": 4223 + }, + { + "epoch": 0.44, + "grad_norm": 1.734880478046965, + "learning_rate": 6.217375706878561e-06, + "loss": 0.6244, + "step": 4224 + }, + { + "epoch": 0.44, + "grad_norm": 1.8760334967705816, + "learning_rate": 6.215742879826244e-06, + "loss": 0.6718, + "step": 4225 + }, + { + "epoch": 0.44, + "grad_norm": 1.8641079257282516, + "learning_rate": 6.214109914962542e-06, + "loss": 0.6378, + "step": 4226 + }, + { + "epoch": 0.44, + "grad_norm": 1.9736071636844352, + "learning_rate": 6.21247681247256e-06, + "loss": 0.7905, + "step": 4227 + }, + { + "epoch": 0.44, + "grad_norm": 1.657253663363964, + "learning_rate": 6.210843572541421e-06, + "loss": 0.6086, + "step": 4228 + }, + { + "epoch": 0.44, + "grad_norm": 1.9013689343551632, + "learning_rate": 6.20921019535426e-06, + "loss": 0.6301, + "step": 4229 + }, + { + "epoch": 0.44, + "grad_norm": 1.9593538236517356, + "learning_rate": 6.207576681096233e-06, + "loss": 0.7359, + "step": 4230 + }, + { + "epoch": 0.44, + "grad_norm": 1.9607098259709497, + "learning_rate": 6.205943029952505e-06, + "loss": 0.6411, + "step": 4231 + }, + { + "epoch": 0.44, + "grad_norm": 1.917909444411163, + "learning_rate": 6.204309242108262e-06, + "loss": 0.6877, + "step": 4232 + }, + { + "epoch": 0.44, + "grad_norm": 1.9283331737073521, + "learning_rate": 6.202675317748702e-06, + "loss": 0.6473, + "step": 4233 + }, + { + "epoch": 0.44, + "grad_norm": 1.820140476133725, + "learning_rate": 6.201041257059039e-06, + "loss": 0.623, + "step": 4234 + }, + { + "epoch": 0.44, + "grad_norm": 1.8303703060759442, + "learning_rate": 6.199407060224503e-06, + "loss": 0.6681, + "step": 4235 + }, + { + "epoch": 0.44, + "grad_norm": 2.1784835352215315, + "learning_rate": 6.197772727430341e-06, + "loss": 0.6553, + "step": 4236 + }, + { + "epoch": 0.44, + "grad_norm": 2.057402259459153, + "learning_rate": 6.196138258861815e-06, + "loss": 0.6533, + "step": 4237 + }, + { + "epoch": 0.44, + "grad_norm": 1.9346080271003194, + "learning_rate": 6.194503654704198e-06, + "loss": 0.6091, + "step": 4238 + }, + { + "epoch": 0.44, + "grad_norm": 1.7055354507122449, + "learning_rate": 6.192868915142782e-06, + "loss": 0.6281, + "step": 4239 + }, + { + "epoch": 0.44, + "grad_norm": 1.9143105967875074, + "learning_rate": 6.191234040362879e-06, + "loss": 0.5833, + "step": 4240 + }, + { + "epoch": 0.44, + "grad_norm": 1.7670799000079251, + "learning_rate": 6.189599030549804e-06, + "loss": 0.6947, + "step": 4241 + }, + { + "epoch": 0.44, + "grad_norm": 1.8540914584065187, + "learning_rate": 6.1879638858889e-06, + "loss": 0.7129, + "step": 4242 + }, + { + "epoch": 0.44, + "grad_norm": 1.7533761314777891, + "learning_rate": 6.186328606565518e-06, + "loss": 0.5931, + "step": 4243 + }, + { + "epoch": 0.44, + "grad_norm": 1.8074134710317675, + "learning_rate": 6.184693192765028e-06, + "loss": 0.7204, + "step": 4244 + }, + { + "epoch": 0.44, + "grad_norm": 2.0892333656674364, + "learning_rate": 6.18305764467281e-06, + "loss": 0.7689, + "step": 4245 + }, + { + "epoch": 0.44, + "grad_norm": 1.903938283307967, + "learning_rate": 6.181421962474267e-06, + "loss": 0.6832, + "step": 4246 + }, + { + "epoch": 0.44, + "grad_norm": 1.9806689141914189, + "learning_rate": 6.179786146354808e-06, + "loss": 0.6931, + "step": 4247 + }, + { + "epoch": 0.44, + "grad_norm": 1.977935900264373, + "learning_rate": 6.178150196499868e-06, + "loss": 0.6945, + "step": 4248 + }, + { + "epoch": 0.44, + "grad_norm": 1.8313727726369613, + "learning_rate": 6.176514113094885e-06, + "loss": 0.5794, + "step": 4249 + }, + { + "epoch": 0.44, + "grad_norm": 2.081898696849267, + "learning_rate": 6.174877896325323e-06, + "loss": 0.6605, + "step": 4250 + }, + { + "epoch": 0.44, + "grad_norm": 1.9063195075574626, + "learning_rate": 6.173241546376654e-06, + "loss": 0.6784, + "step": 4251 + }, + { + "epoch": 0.44, + "grad_norm": 1.8626522818570608, + "learning_rate": 6.171605063434368e-06, + "loss": 0.6452, + "step": 4252 + }, + { + "epoch": 0.44, + "grad_norm": 1.8826580292136437, + "learning_rate": 6.169968447683971e-06, + "loss": 0.6847, + "step": 4253 + }, + { + "epoch": 0.44, + "grad_norm": 1.7157387012544394, + "learning_rate": 6.168331699310982e-06, + "loss": 0.5853, + "step": 4254 + }, + { + "epoch": 0.44, + "grad_norm": 1.9401064441075455, + "learning_rate": 6.1666948185009355e-06, + "loss": 0.6954, + "step": 4255 + }, + { + "epoch": 0.44, + "grad_norm": 2.0024822246974736, + "learning_rate": 6.165057805439382e-06, + "loss": 0.6232, + "step": 4256 + }, + { + "epoch": 0.44, + "grad_norm": 2.171084207492589, + "learning_rate": 6.1634206603118844e-06, + "loss": 0.672, + "step": 4257 + }, + { + "epoch": 0.44, + "grad_norm": 1.8833760760024472, + "learning_rate": 6.161783383304024e-06, + "loss": 0.6084, + "step": 4258 + }, + { + "epoch": 0.44, + "grad_norm": 1.8417561591740161, + "learning_rate": 6.160145974601397e-06, + "loss": 0.6039, + "step": 4259 + }, + { + "epoch": 0.44, + "grad_norm": 2.0709052955398577, + "learning_rate": 6.158508434389608e-06, + "loss": 0.6919, + "step": 4260 + }, + { + "epoch": 0.44, + "grad_norm": 1.8776488381311174, + "learning_rate": 6.156870762854287e-06, + "loss": 0.7309, + "step": 4261 + }, + { + "epoch": 0.44, + "grad_norm": 1.873816021513749, + "learning_rate": 6.155232960181071e-06, + "loss": 0.6691, + "step": 4262 + }, + { + "epoch": 0.44, + "grad_norm": 1.7901395019323316, + "learning_rate": 6.153595026555613e-06, + "loss": 0.6377, + "step": 4263 + }, + { + "epoch": 0.44, + "grad_norm": 1.75387062711662, + "learning_rate": 6.151956962163584e-06, + "loss": 0.6408, + "step": 4264 + }, + { + "epoch": 0.44, + "grad_norm": 1.770309730929508, + "learning_rate": 6.150318767190668e-06, + "loss": 0.6725, + "step": 4265 + }, + { + "epoch": 0.44, + "grad_norm": 1.919943910487086, + "learning_rate": 6.148680441822563e-06, + "loss": 0.5916, + "step": 4266 + }, + { + "epoch": 0.44, + "grad_norm": 1.8486164691741107, + "learning_rate": 6.1470419862449825e-06, + "loss": 0.5883, + "step": 4267 + }, + { + "epoch": 0.44, + "grad_norm": 1.9032761553068864, + "learning_rate": 6.1454034006436545e-06, + "loss": 0.5558, + "step": 4268 + }, + { + "epoch": 0.44, + "grad_norm": 1.9283442009335932, + "learning_rate": 6.143764685204323e-06, + "loss": 0.6445, + "step": 4269 + }, + { + "epoch": 0.44, + "grad_norm": 1.7842124977670573, + "learning_rate": 6.142125840112746e-06, + "loss": 0.6604, + "step": 4270 + }, + { + "epoch": 0.44, + "grad_norm": 1.6599561789243407, + "learning_rate": 6.140486865554693e-06, + "loss": 0.6057, + "step": 4271 + }, + { + "epoch": 0.44, + "grad_norm": 1.7994386291779092, + "learning_rate": 6.138847761715955e-06, + "loss": 0.5795, + "step": 4272 + }, + { + "epoch": 0.44, + "grad_norm": 2.1473723111235663, + "learning_rate": 6.137208528782331e-06, + "loss": 0.6959, + "step": 4273 + }, + { + "epoch": 0.44, + "grad_norm": 1.725204692644165, + "learning_rate": 6.1355691669396386e-06, + "loss": 0.5396, + "step": 4274 + }, + { + "epoch": 0.44, + "grad_norm": 2.0090637781380067, + "learning_rate": 6.133929676373709e-06, + "loss": 0.6082, + "step": 4275 + }, + { + "epoch": 0.44, + "grad_norm": 1.9772578144450519, + "learning_rate": 6.132290057270387e-06, + "loss": 0.6552, + "step": 4276 + }, + { + "epoch": 0.44, + "grad_norm": 2.0108173014531148, + "learning_rate": 6.130650309815535e-06, + "loss": 0.6897, + "step": 4277 + }, + { + "epoch": 0.44, + "grad_norm": 1.823894686140687, + "learning_rate": 6.129010434195023e-06, + "loss": 0.6536, + "step": 4278 + }, + { + "epoch": 0.44, + "grad_norm": 1.6203568412684168, + "learning_rate": 6.127370430594745e-06, + "loss": 0.5311, + "step": 4279 + }, + { + "epoch": 0.44, + "grad_norm": 2.014024329591531, + "learning_rate": 6.125730299200601e-06, + "loss": 0.6031, + "step": 4280 + }, + { + "epoch": 0.45, + "grad_norm": 1.944148231455446, + "learning_rate": 6.124090040198514e-06, + "loss": 0.667, + "step": 4281 + }, + { + "epoch": 0.45, + "grad_norm": 1.960939325536016, + "learning_rate": 6.122449653774411e-06, + "loss": 0.7211, + "step": 4282 + }, + { + "epoch": 0.45, + "grad_norm": 1.9195307639129888, + "learning_rate": 6.120809140114243e-06, + "loss": 0.6355, + "step": 4283 + }, + { + "epoch": 0.45, + "grad_norm": 1.7758071657350973, + "learning_rate": 6.119168499403971e-06, + "loss": 0.6558, + "step": 4284 + }, + { + "epoch": 0.45, + "grad_norm": 1.9774042812835415, + "learning_rate": 6.11752773182957e-06, + "loss": 0.6619, + "step": 4285 + }, + { + "epoch": 0.45, + "grad_norm": 1.8617259628540834, + "learning_rate": 6.115886837577031e-06, + "loss": 0.5504, + "step": 4286 + }, + { + "epoch": 0.45, + "grad_norm": 1.8015457930515442, + "learning_rate": 6.114245816832359e-06, + "loss": 0.5942, + "step": 4287 + }, + { + "epoch": 0.45, + "grad_norm": 1.8809551738481627, + "learning_rate": 6.112604669781572e-06, + "loss": 0.6016, + "step": 4288 + }, + { + "epoch": 0.45, + "grad_norm": 1.8090247840030873, + "learning_rate": 6.110963396610705e-06, + "loss": 0.583, + "step": 4289 + }, + { + "epoch": 0.45, + "grad_norm": 2.000768037728457, + "learning_rate": 6.109321997505804e-06, + "loss": 0.6731, + "step": 4290 + }, + { + "epoch": 0.45, + "grad_norm": 1.9743242871047149, + "learning_rate": 6.107680472652931e-06, + "loss": 0.715, + "step": 4291 + }, + { + "epoch": 0.45, + "grad_norm": 1.9282462151001922, + "learning_rate": 6.106038822238165e-06, + "loss": 0.6739, + "step": 4292 + }, + { + "epoch": 0.45, + "grad_norm": 2.1015185797276303, + "learning_rate": 6.104397046447593e-06, + "loss": 0.708, + "step": 4293 + }, + { + "epoch": 0.45, + "grad_norm": 1.748887075305813, + "learning_rate": 6.1027551454673204e-06, + "loss": 0.7053, + "step": 4294 + }, + { + "epoch": 0.45, + "grad_norm": 1.8395064781208474, + "learning_rate": 6.1011131194834675e-06, + "loss": 0.6444, + "step": 4295 + }, + { + "epoch": 0.45, + "grad_norm": 1.7803548664428446, + "learning_rate": 6.099470968682168e-06, + "loss": 0.5716, + "step": 4296 + }, + { + "epoch": 0.45, + "grad_norm": 1.6009153734353425, + "learning_rate": 6.097828693249565e-06, + "loss": 0.6457, + "step": 4297 + }, + { + "epoch": 0.45, + "grad_norm": 1.647971075805321, + "learning_rate": 6.0961862933718215e-06, + "loss": 0.5746, + "step": 4298 + }, + { + "epoch": 0.45, + "grad_norm": 1.8482386722821733, + "learning_rate": 6.0945437692351166e-06, + "loss": 0.6898, + "step": 4299 + }, + { + "epoch": 0.45, + "grad_norm": 1.8232321333867145, + "learning_rate": 6.092901121025634e-06, + "loss": 0.6632, + "step": 4300 + }, + { + "epoch": 0.45, + "grad_norm": 1.881039170540099, + "learning_rate": 6.091258348929581e-06, + "loss": 0.6537, + "step": 4301 + }, + { + "epoch": 0.45, + "grad_norm": 2.0396835214722606, + "learning_rate": 6.089615453133173e-06, + "loss": 0.6914, + "step": 4302 + }, + { + "epoch": 0.45, + "grad_norm": 1.8316939185491101, + "learning_rate": 6.0879724338226454e-06, + "loss": 0.5984, + "step": 4303 + }, + { + "epoch": 0.45, + "grad_norm": 1.7744519568370833, + "learning_rate": 6.086329291184238e-06, + "loss": 0.5976, + "step": 4304 + }, + { + "epoch": 0.45, + "grad_norm": 1.945342822305884, + "learning_rate": 6.084686025404216e-06, + "loss": 0.5689, + "step": 4305 + }, + { + "epoch": 0.45, + "grad_norm": 1.660347128085138, + "learning_rate": 6.08304263666885e-06, + "loss": 0.5238, + "step": 4306 + }, + { + "epoch": 0.45, + "grad_norm": 1.9805416996449299, + "learning_rate": 6.081399125164429e-06, + "loss": 0.7111, + "step": 4307 + }, + { + "epoch": 0.45, + "grad_norm": 1.8201579374736838, + "learning_rate": 6.079755491077251e-06, + "loss": 0.62, + "step": 4308 + }, + { + "epoch": 0.45, + "grad_norm": 1.671229966988917, + "learning_rate": 6.0781117345936345e-06, + "loss": 0.5775, + "step": 4309 + }, + { + "epoch": 0.45, + "grad_norm": 1.8676293114126867, + "learning_rate": 6.07646785589991e-06, + "loss": 0.7568, + "step": 4310 + }, + { + "epoch": 0.45, + "grad_norm": 1.8390499419653883, + "learning_rate": 6.074823855182416e-06, + "loss": 0.6798, + "step": 4311 + }, + { + "epoch": 0.45, + "grad_norm": 1.7786191474157553, + "learning_rate": 6.073179732627512e-06, + "loss": 0.5254, + "step": 4312 + }, + { + "epoch": 0.45, + "grad_norm": 1.7017606975225539, + "learning_rate": 6.0715354884215685e-06, + "loss": 0.5916, + "step": 4313 + }, + { + "epoch": 0.45, + "grad_norm": 1.8331971488804444, + "learning_rate": 6.069891122750971e-06, + "loss": 0.6283, + "step": 4314 + }, + { + "epoch": 0.45, + "grad_norm": 2.0707708625988377, + "learning_rate": 6.068246635802115e-06, + "loss": 0.6521, + "step": 4315 + }, + { + "epoch": 0.45, + "grad_norm": 2.1080846184243596, + "learning_rate": 6.066602027761414e-06, + "loss": 0.6613, + "step": 4316 + }, + { + "epoch": 0.45, + "grad_norm": 1.8541610045993289, + "learning_rate": 6.064957298815295e-06, + "loss": 0.7219, + "step": 4317 + }, + { + "epoch": 0.45, + "grad_norm": 1.7233308318810572, + "learning_rate": 6.063312449150196e-06, + "loss": 0.7256, + "step": 4318 + }, + { + "epoch": 0.45, + "grad_norm": 1.7786958252668963, + "learning_rate": 6.06166747895257e-06, + "loss": 0.6196, + "step": 4319 + }, + { + "epoch": 0.45, + "grad_norm": 1.8992649487983346, + "learning_rate": 6.060022388408883e-06, + "loss": 0.6449, + "step": 4320 + }, + { + "epoch": 0.45, + "grad_norm": 1.8783947566207382, + "learning_rate": 6.0583771777056166e-06, + "loss": 0.6849, + "step": 4321 + }, + { + "epoch": 0.45, + "grad_norm": 1.947749299526801, + "learning_rate": 6.056731847029265e-06, + "loss": 0.7143, + "step": 4322 + }, + { + "epoch": 0.45, + "grad_norm": 1.9019855284634992, + "learning_rate": 6.055086396566334e-06, + "loss": 0.5855, + "step": 4323 + }, + { + "epoch": 0.45, + "grad_norm": 1.831529691156291, + "learning_rate": 6.0534408265033485e-06, + "loss": 0.6443, + "step": 4324 + }, + { + "epoch": 0.45, + "grad_norm": 1.8942202831397477, + "learning_rate": 6.05179513702684e-06, + "loss": 0.5609, + "step": 4325 + }, + { + "epoch": 0.45, + "grad_norm": 1.9975511737435045, + "learning_rate": 6.050149328323358e-06, + "loss": 0.6611, + "step": 4326 + }, + { + "epoch": 0.45, + "grad_norm": 1.970690832774122, + "learning_rate": 6.048503400579463e-06, + "loss": 0.657, + "step": 4327 + }, + { + "epoch": 0.45, + "grad_norm": 1.903442299604656, + "learning_rate": 6.046857353981732e-06, + "loss": 0.6878, + "step": 4328 + }, + { + "epoch": 0.45, + "grad_norm": 1.9033342278105545, + "learning_rate": 6.045211188716753e-06, + "loss": 0.6339, + "step": 4329 + }, + { + "epoch": 0.45, + "grad_norm": 1.9358946001912258, + "learning_rate": 6.043564904971129e-06, + "loss": 0.7204, + "step": 4330 + }, + { + "epoch": 0.45, + "grad_norm": 1.562197508869273, + "learning_rate": 6.041918502931473e-06, + "loss": 0.5725, + "step": 4331 + }, + { + "epoch": 0.45, + "grad_norm": 1.7487153609771282, + "learning_rate": 6.040271982784417e-06, + "loss": 0.6848, + "step": 4332 + }, + { + "epoch": 0.45, + "grad_norm": 2.0537934311353845, + "learning_rate": 6.038625344716603e-06, + "loss": 0.7616, + "step": 4333 + }, + { + "epoch": 0.45, + "grad_norm": 1.880860377128153, + "learning_rate": 6.036978588914684e-06, + "loss": 0.6721, + "step": 4334 + }, + { + "epoch": 0.45, + "grad_norm": 1.7029662258523208, + "learning_rate": 6.035331715565333e-06, + "loss": 0.6612, + "step": 4335 + }, + { + "epoch": 0.45, + "grad_norm": 2.0649190844874443, + "learning_rate": 6.0336847248552335e-06, + "loss": 0.7124, + "step": 4336 + }, + { + "epoch": 0.45, + "grad_norm": 1.8863676505857916, + "learning_rate": 6.032037616971075e-06, + "loss": 0.6383, + "step": 4337 + }, + { + "epoch": 0.45, + "grad_norm": 1.6913686311949407, + "learning_rate": 6.030390392099571e-06, + "loss": 0.5815, + "step": 4338 + }, + { + "epoch": 0.45, + "grad_norm": 1.6854186885544427, + "learning_rate": 6.028743050427442e-06, + "loss": 0.5173, + "step": 4339 + }, + { + "epoch": 0.45, + "grad_norm": 2.0603592121426995, + "learning_rate": 6.027095592141428e-06, + "loss": 0.65, + "step": 4340 + }, + { + "epoch": 0.45, + "grad_norm": 1.8627823283033291, + "learning_rate": 6.025448017428272e-06, + "loss": 0.5187, + "step": 4341 + }, + { + "epoch": 0.45, + "grad_norm": 1.8550463706984868, + "learning_rate": 6.023800326474738e-06, + "loss": 0.6554, + "step": 4342 + }, + { + "epoch": 0.45, + "grad_norm": 2.01413399369854, + "learning_rate": 6.022152519467601e-06, + "loss": 0.5523, + "step": 4343 + }, + { + "epoch": 0.45, + "grad_norm": 1.871011023204209, + "learning_rate": 6.020504596593652e-06, + "loss": 0.5976, + "step": 4344 + }, + { + "epoch": 0.45, + "grad_norm": 1.9291226273099116, + "learning_rate": 6.018856558039689e-06, + "loss": 0.5815, + "step": 4345 + }, + { + "epoch": 0.45, + "grad_norm": 2.1033708436285283, + "learning_rate": 6.017208403992527e-06, + "loss": 0.6804, + "step": 4346 + }, + { + "epoch": 0.45, + "grad_norm": 1.7746131482935668, + "learning_rate": 6.015560134638997e-06, + "loss": 0.6312, + "step": 4347 + }, + { + "epoch": 0.45, + "grad_norm": 1.736794612224939, + "learning_rate": 6.013911750165935e-06, + "loss": 0.688, + "step": 4348 + }, + { + "epoch": 0.45, + "grad_norm": 1.8071088364104275, + "learning_rate": 6.012263250760199e-06, + "loss": 0.6993, + "step": 4349 + }, + { + "epoch": 0.45, + "grad_norm": 1.767310249059785, + "learning_rate": 6.0106146366086514e-06, + "loss": 0.5741, + "step": 4350 + }, + { + "epoch": 0.45, + "grad_norm": 2.0526654951756416, + "learning_rate": 6.0089659078981765e-06, + "loss": 0.6669, + "step": 4351 + }, + { + "epoch": 0.45, + "grad_norm": 1.6899837975267995, + "learning_rate": 6.007317064815664e-06, + "loss": 0.5892, + "step": 4352 + }, + { + "epoch": 0.45, + "grad_norm": 1.9256094251339038, + "learning_rate": 6.0056681075480206e-06, + "loss": 0.6298, + "step": 4353 + }, + { + "epoch": 0.45, + "grad_norm": 2.018125474079877, + "learning_rate": 6.004019036282165e-06, + "loss": 0.687, + "step": 4354 + }, + { + "epoch": 0.45, + "grad_norm": 1.8216632857004333, + "learning_rate": 6.002369851205029e-06, + "loss": 0.6037, + "step": 4355 + }, + { + "epoch": 0.45, + "grad_norm": 1.934654363397389, + "learning_rate": 6.000720552503557e-06, + "loss": 0.6146, + "step": 4356 + }, + { + "epoch": 0.45, + "grad_norm": 1.816258512462237, + "learning_rate": 5.999071140364708e-06, + "loss": 0.6523, + "step": 4357 + }, + { + "epoch": 0.45, + "grad_norm": 1.8239092670075012, + "learning_rate": 5.997421614975449e-06, + "loss": 0.6458, + "step": 4358 + }, + { + "epoch": 0.45, + "grad_norm": 1.9149484450077814, + "learning_rate": 5.995771976522765e-06, + "loss": 0.7412, + "step": 4359 + }, + { + "epoch": 0.45, + "grad_norm": 1.9121047542571723, + "learning_rate": 5.9941222251936525e-06, + "loss": 0.6096, + "step": 4360 + }, + { + "epoch": 0.45, + "grad_norm": 1.8481439735718255, + "learning_rate": 5.992472361175118e-06, + "loss": 0.6237, + "step": 4361 + }, + { + "epoch": 0.45, + "grad_norm": 2.0655990944889124, + "learning_rate": 5.990822384654187e-06, + "loss": 0.6851, + "step": 4362 + }, + { + "epoch": 0.45, + "grad_norm": 1.8831730913470377, + "learning_rate": 5.989172295817889e-06, + "loss": 0.618, + "step": 4363 + }, + { + "epoch": 0.45, + "grad_norm": 1.8724322214686773, + "learning_rate": 5.9875220948532745e-06, + "loss": 0.6682, + "step": 4364 + }, + { + "epoch": 0.45, + "grad_norm": 1.905634382989326, + "learning_rate": 5.9858717819474e-06, + "loss": 0.647, + "step": 4365 + }, + { + "epoch": 0.45, + "grad_norm": 2.0386048289605534, + "learning_rate": 5.984221357287342e-06, + "loss": 0.6118, + "step": 4366 + }, + { + "epoch": 0.45, + "grad_norm": 1.8630627415405536, + "learning_rate": 5.982570821060182e-06, + "loss": 0.6513, + "step": 4367 + }, + { + "epoch": 0.45, + "grad_norm": 2.146377572209457, + "learning_rate": 5.980920173453019e-06, + "loss": 0.6575, + "step": 4368 + }, + { + "epoch": 0.45, + "grad_norm": 1.8648330548708527, + "learning_rate": 5.979269414652964e-06, + "loss": 0.5981, + "step": 4369 + }, + { + "epoch": 0.45, + "grad_norm": 1.7479863045414308, + "learning_rate": 5.977618544847139e-06, + "loss": 0.6314, + "step": 4370 + }, + { + "epoch": 0.45, + "grad_norm": 1.9122892283559052, + "learning_rate": 5.975967564222679e-06, + "loss": 0.6404, + "step": 4371 + }, + { + "epoch": 0.45, + "grad_norm": 1.7702998645102086, + "learning_rate": 5.974316472966732e-06, + "loss": 0.6091, + "step": 4372 + }, + { + "epoch": 0.45, + "grad_norm": 1.9419264167413117, + "learning_rate": 5.9726652712664625e-06, + "loss": 0.6424, + "step": 4373 + }, + { + "epoch": 0.45, + "grad_norm": 1.924232841922094, + "learning_rate": 5.971013959309038e-06, + "loss": 0.6804, + "step": 4374 + }, + { + "epoch": 0.45, + "grad_norm": 2.0329707067511555, + "learning_rate": 5.969362537281647e-06, + "loss": 0.7667, + "step": 4375 + }, + { + "epoch": 0.45, + "grad_norm": 1.8286541545437696, + "learning_rate": 5.967711005371487e-06, + "loss": 0.6649, + "step": 4376 + }, + { + "epoch": 0.45, + "grad_norm": 2.036373560381382, + "learning_rate": 5.966059363765771e-06, + "loss": 0.6073, + "step": 4377 + }, + { + "epoch": 0.46, + "grad_norm": 1.8738880734455083, + "learning_rate": 5.9644076126517166e-06, + "loss": 0.6915, + "step": 4378 + }, + { + "epoch": 0.46, + "grad_norm": 2.049257494525163, + "learning_rate": 5.962755752216564e-06, + "loss": 0.6387, + "step": 4379 + }, + { + "epoch": 0.46, + "grad_norm": 1.693094678717967, + "learning_rate": 5.961103782647558e-06, + "loss": 0.4983, + "step": 4380 + }, + { + "epoch": 0.46, + "grad_norm": 1.9267764438897257, + "learning_rate": 5.959451704131962e-06, + "loss": 0.594, + "step": 4381 + }, + { + "epoch": 0.46, + "grad_norm": 1.966784263635863, + "learning_rate": 5.957799516857046e-06, + "loss": 0.6696, + "step": 4382 + }, + { + "epoch": 0.46, + "grad_norm": 1.917751357367465, + "learning_rate": 5.9561472210100955e-06, + "loss": 0.616, + "step": 4383 + }, + { + "epoch": 0.46, + "grad_norm": 1.9782860809483898, + "learning_rate": 5.954494816778408e-06, + "loss": 0.719, + "step": 4384 + }, + { + "epoch": 0.46, + "grad_norm": 1.8408410543098181, + "learning_rate": 5.952842304349291e-06, + "loss": 0.6837, + "step": 4385 + }, + { + "epoch": 0.46, + "grad_norm": 2.0510629750021216, + "learning_rate": 5.951189683910069e-06, + "loss": 0.6411, + "step": 4386 + }, + { + "epoch": 0.46, + "grad_norm": 1.7191965103067213, + "learning_rate": 5.949536955648074e-06, + "loss": 0.5532, + "step": 4387 + }, + { + "epoch": 0.46, + "grad_norm": 1.7432450197461355, + "learning_rate": 5.947884119750656e-06, + "loss": 0.652, + "step": 4388 + }, + { + "epoch": 0.46, + "grad_norm": 1.7602986319167306, + "learning_rate": 5.946231176405166e-06, + "loss": 0.5931, + "step": 4389 + }, + { + "epoch": 0.46, + "grad_norm": 1.9640168883565872, + "learning_rate": 5.944578125798981e-06, + "loss": 0.6641, + "step": 4390 + }, + { + "epoch": 0.46, + "grad_norm": 2.2045820879343583, + "learning_rate": 5.94292496811948e-06, + "loss": 0.6991, + "step": 4391 + }, + { + "epoch": 0.46, + "grad_norm": 1.6992537195980884, + "learning_rate": 5.94127170355406e-06, + "loss": 0.5314, + "step": 4392 + }, + { + "epoch": 0.46, + "grad_norm": 1.7760267943580061, + "learning_rate": 5.939618332290128e-06, + "loss": 0.6334, + "step": 4393 + }, + { + "epoch": 0.46, + "grad_norm": 2.0270949968085743, + "learning_rate": 5.937964854515101e-06, + "loss": 0.7455, + "step": 4394 + }, + { + "epoch": 0.46, + "grad_norm": 1.853300870630623, + "learning_rate": 5.936311270416415e-06, + "loss": 0.5298, + "step": 4395 + }, + { + "epoch": 0.46, + "grad_norm": 2.094047846726863, + "learning_rate": 5.9346575801815064e-06, + "loss": 0.6803, + "step": 4396 + }, + { + "epoch": 0.46, + "grad_norm": 1.5348679190305892, + "learning_rate": 5.933003783997835e-06, + "loss": 0.5962, + "step": 4397 + }, + { + "epoch": 0.46, + "grad_norm": 1.7849133821203453, + "learning_rate": 5.931349882052866e-06, + "loss": 0.6919, + "step": 4398 + }, + { + "epoch": 0.46, + "grad_norm": 1.7708118222363871, + "learning_rate": 5.929695874534081e-06, + "loss": 0.701, + "step": 4399 + }, + { + "epoch": 0.46, + "grad_norm": 1.9364917331436853, + "learning_rate": 5.928041761628968e-06, + "loss": 0.614, + "step": 4400 + }, + { + "epoch": 0.46, + "grad_norm": 1.81362935636992, + "learning_rate": 5.926387543525031e-06, + "loss": 0.7095, + "step": 4401 + }, + { + "epoch": 0.46, + "grad_norm": 1.9855548797846845, + "learning_rate": 5.924733220409786e-06, + "loss": 0.6378, + "step": 4402 + }, + { + "epoch": 0.46, + "grad_norm": 1.9860147094040368, + "learning_rate": 5.9230787924707625e-06, + "loss": 0.717, + "step": 4403 + }, + { + "epoch": 0.46, + "grad_norm": 1.799813809623052, + "learning_rate": 5.921424259895493e-06, + "loss": 0.6095, + "step": 4404 + }, + { + "epoch": 0.46, + "grad_norm": 1.854974605347049, + "learning_rate": 5.919769622871533e-06, + "loss": 0.614, + "step": 4405 + }, + { + "epoch": 0.46, + "grad_norm": 1.8929793362499432, + "learning_rate": 5.918114881586444e-06, + "loss": 0.6377, + "step": 4406 + }, + { + "epoch": 0.46, + "grad_norm": 1.8735159541726778, + "learning_rate": 5.9164600362278005e-06, + "loss": 0.5564, + "step": 4407 + }, + { + "epoch": 0.46, + "grad_norm": 1.9266281340327402, + "learning_rate": 5.914805086983187e-06, + "loss": 0.6363, + "step": 4408 + }, + { + "epoch": 0.46, + "grad_norm": 1.91652283894417, + "learning_rate": 5.913150034040203e-06, + "loss": 0.5966, + "step": 4409 + }, + { + "epoch": 0.46, + "grad_norm": 1.8967692186120793, + "learning_rate": 5.9114948775864585e-06, + "loss": 0.5763, + "step": 4410 + }, + { + "epoch": 0.46, + "grad_norm": 1.868068960782959, + "learning_rate": 5.909839617809574e-06, + "loss": 0.5677, + "step": 4411 + }, + { + "epoch": 0.46, + "grad_norm": 1.949627880291475, + "learning_rate": 5.908184254897183e-06, + "loss": 0.6456, + "step": 4412 + }, + { + "epoch": 0.46, + "grad_norm": 1.8952854730038935, + "learning_rate": 5.906528789036929e-06, + "loss": 0.7244, + "step": 4413 + }, + { + "epoch": 0.46, + "grad_norm": 1.88180194047267, + "learning_rate": 5.904873220416472e-06, + "loss": 0.6549, + "step": 4414 + }, + { + "epoch": 0.46, + "grad_norm": 1.9535874023387245, + "learning_rate": 5.903217549223477e-06, + "loss": 0.6613, + "step": 4415 + }, + { + "epoch": 0.46, + "grad_norm": 1.8438707598961703, + "learning_rate": 5.901561775645623e-06, + "loss": 0.5778, + "step": 4416 + }, + { + "epoch": 0.46, + "grad_norm": 2.0575987277313716, + "learning_rate": 5.8999058998706046e-06, + "loss": 0.656, + "step": 4417 + }, + { + "epoch": 0.46, + "grad_norm": 1.8539834505786594, + "learning_rate": 5.898249922086123e-06, + "loss": 0.7293, + "step": 4418 + }, + { + "epoch": 0.46, + "grad_norm": 1.8979032405655898, + "learning_rate": 5.896593842479893e-06, + "loss": 0.7049, + "step": 4419 + }, + { + "epoch": 0.46, + "grad_norm": 2.0946306162111976, + "learning_rate": 5.89493766123964e-06, + "loss": 0.6983, + "step": 4420 + }, + { + "epoch": 0.46, + "grad_norm": 2.1908352500873205, + "learning_rate": 5.893281378553104e-06, + "loss": 0.7129, + "step": 4421 + }, + { + "epoch": 0.46, + "grad_norm": 2.0150748552490465, + "learning_rate": 5.891624994608029e-06, + "loss": 0.6083, + "step": 4422 + }, + { + "epoch": 0.46, + "grad_norm": 1.7215755278309917, + "learning_rate": 5.8899685095921814e-06, + "loss": 0.656, + "step": 4423 + }, + { + "epoch": 0.46, + "grad_norm": 1.9343024267813007, + "learning_rate": 5.888311923693328e-06, + "loss": 0.5971, + "step": 4424 + }, + { + "epoch": 0.46, + "grad_norm": 1.8744245976889982, + "learning_rate": 5.886655237099257e-06, + "loss": 0.6771, + "step": 4425 + }, + { + "epoch": 0.46, + "grad_norm": 1.9031240731966292, + "learning_rate": 5.88499844999776e-06, + "loss": 0.6218, + "step": 4426 + }, + { + "epoch": 0.46, + "grad_norm": 1.9729786683428097, + "learning_rate": 5.8833415625766455e-06, + "loss": 0.5538, + "step": 4427 + }, + { + "epoch": 0.46, + "grad_norm": 1.834573070869529, + "learning_rate": 5.881684575023729e-06, + "loss": 0.6215, + "step": 4428 + }, + { + "epoch": 0.46, + "grad_norm": 1.760291997270579, + "learning_rate": 5.880027487526842e-06, + "loss": 0.5801, + "step": 4429 + }, + { + "epoch": 0.46, + "grad_norm": 1.9670809366698832, + "learning_rate": 5.878370300273821e-06, + "loss": 0.6537, + "step": 4430 + }, + { + "epoch": 0.46, + "grad_norm": 2.011201336258629, + "learning_rate": 5.876713013452521e-06, + "loss": 0.7965, + "step": 4431 + }, + { + "epoch": 0.46, + "grad_norm": 2.0312721214511242, + "learning_rate": 5.875055627250804e-06, + "loss": 0.6555, + "step": 4432 + }, + { + "epoch": 0.46, + "grad_norm": 1.7256318681000062, + "learning_rate": 5.873398141856545e-06, + "loss": 0.7285, + "step": 4433 + }, + { + "epoch": 0.46, + "grad_norm": 1.7330021830336968, + "learning_rate": 5.871740557457626e-06, + "loss": 0.5818, + "step": 4434 + }, + { + "epoch": 0.46, + "grad_norm": 1.7225502241433897, + "learning_rate": 5.870082874241947e-06, + "loss": 0.5089, + "step": 4435 + }, + { + "epoch": 0.46, + "grad_norm": 1.9286229634377934, + "learning_rate": 5.868425092397416e-06, + "loss": 0.7606, + "step": 4436 + }, + { + "epoch": 0.46, + "grad_norm": 1.9414863379591762, + "learning_rate": 5.86676721211195e-06, + "loss": 0.6719, + "step": 4437 + }, + { + "epoch": 0.46, + "grad_norm": 1.8080730220213328, + "learning_rate": 5.86510923357348e-06, + "loss": 0.6629, + "step": 4438 + }, + { + "epoch": 0.46, + "grad_norm": 1.7384087585471595, + "learning_rate": 5.8634511569699486e-06, + "loss": 0.728, + "step": 4439 + }, + { + "epoch": 0.46, + "grad_norm": 1.8723704862477661, + "learning_rate": 5.861792982489306e-06, + "loss": 0.6252, + "step": 4440 + }, + { + "epoch": 0.46, + "grad_norm": 1.7642246326221103, + "learning_rate": 5.860134710319517e-06, + "loss": 0.6327, + "step": 4441 + }, + { + "epoch": 0.46, + "grad_norm": 1.8926388005449055, + "learning_rate": 5.858476340648555e-06, + "loss": 0.6858, + "step": 4442 + }, + { + "epoch": 0.46, + "grad_norm": 1.8708903747298244, + "learning_rate": 5.856817873664409e-06, + "loss": 0.7199, + "step": 4443 + }, + { + "epoch": 0.46, + "grad_norm": 1.6894652296859378, + "learning_rate": 5.855159309555072e-06, + "loss": 0.5358, + "step": 4444 + }, + { + "epoch": 0.46, + "grad_norm": 1.8180878241005396, + "learning_rate": 5.853500648508552e-06, + "loss": 0.6665, + "step": 4445 + }, + { + "epoch": 0.46, + "grad_norm": 1.9056202180080377, + "learning_rate": 5.8518418907128694e-06, + "loss": 0.6349, + "step": 4446 + }, + { + "epoch": 0.46, + "grad_norm": 2.0279056733244483, + "learning_rate": 5.850183036356054e-06, + "loss": 0.6681, + "step": 4447 + }, + { + "epoch": 0.46, + "grad_norm": 1.7326930647206062, + "learning_rate": 5.8485240856261446e-06, + "loss": 0.5012, + "step": 4448 + }, + { + "epoch": 0.46, + "grad_norm": 1.7860636370758964, + "learning_rate": 5.846865038711194e-06, + "loss": 0.7176, + "step": 4449 + }, + { + "epoch": 0.46, + "grad_norm": 1.9843512953094944, + "learning_rate": 5.845205895799264e-06, + "loss": 0.7526, + "step": 4450 + }, + { + "epoch": 0.46, + "grad_norm": 2.010199635494966, + "learning_rate": 5.8435466570784295e-06, + "loss": 0.6164, + "step": 4451 + }, + { + "epoch": 0.46, + "grad_norm": 1.892725992106848, + "learning_rate": 5.8418873227367724e-06, + "loss": 0.6222, + "step": 4452 + }, + { + "epoch": 0.46, + "grad_norm": 2.0294869818788763, + "learning_rate": 5.840227892962388e-06, + "loss": 0.6154, + "step": 4453 + }, + { + "epoch": 0.46, + "grad_norm": 2.0869333452841192, + "learning_rate": 5.838568367943383e-06, + "loss": 0.6628, + "step": 4454 + }, + { + "epoch": 0.46, + "grad_norm": 1.6602804318639692, + "learning_rate": 5.8369087478678755e-06, + "loss": 0.6215, + "step": 4455 + }, + { + "epoch": 0.46, + "grad_norm": 1.8406790377489561, + "learning_rate": 5.835249032923989e-06, + "loss": 0.6173, + "step": 4456 + }, + { + "epoch": 0.46, + "grad_norm": 1.6276517300331053, + "learning_rate": 5.833589223299865e-06, + "loss": 0.6091, + "step": 4457 + }, + { + "epoch": 0.46, + "grad_norm": 1.820277411526944, + "learning_rate": 5.831929319183651e-06, + "loss": 0.6295, + "step": 4458 + }, + { + "epoch": 0.46, + "grad_norm": 1.9676714418497274, + "learning_rate": 5.830269320763507e-06, + "loss": 0.7288, + "step": 4459 + }, + { + "epoch": 0.46, + "grad_norm": 1.8329203395615346, + "learning_rate": 5.828609228227603e-06, + "loss": 0.7023, + "step": 4460 + }, + { + "epoch": 0.46, + "grad_norm": 1.8305837504641471, + "learning_rate": 5.82694904176412e-06, + "loss": 0.6172, + "step": 4461 + }, + { + "epoch": 0.46, + "grad_norm": 2.1586280747536115, + "learning_rate": 5.825288761561248e-06, + "loss": 0.7142, + "step": 4462 + }, + { + "epoch": 0.46, + "grad_norm": 1.8557818338452512, + "learning_rate": 5.823628387807193e-06, + "loss": 0.5918, + "step": 4463 + }, + { + "epoch": 0.46, + "grad_norm": 2.0083257509743015, + "learning_rate": 5.821967920690165e-06, + "loss": 0.6542, + "step": 4464 + }, + { + "epoch": 0.46, + "grad_norm": 1.7232403898211963, + "learning_rate": 5.82030736039839e-06, + "loss": 0.5959, + "step": 4465 + }, + { + "epoch": 0.46, + "grad_norm": 1.690743499181069, + "learning_rate": 5.818646707120098e-06, + "loss": 0.6244, + "step": 4466 + }, + { + "epoch": 0.46, + "grad_norm": 1.946693473563528, + "learning_rate": 5.8169859610435355e-06, + "loss": 0.5974, + "step": 4467 + }, + { + "epoch": 0.46, + "grad_norm": 1.8758207732410697, + "learning_rate": 5.815325122356959e-06, + "loss": 0.6168, + "step": 4468 + }, + { + "epoch": 0.46, + "grad_norm": 1.847333041662102, + "learning_rate": 5.813664191248631e-06, + "loss": 0.702, + "step": 4469 + }, + { + "epoch": 0.46, + "grad_norm": 1.8080621215350756, + "learning_rate": 5.8120031679068315e-06, + "loss": 0.5091, + "step": 4470 + }, + { + "epoch": 0.46, + "grad_norm": 1.7877506559716392, + "learning_rate": 5.810342052519842e-06, + "loss": 0.6393, + "step": 4471 + }, + { + "epoch": 0.46, + "grad_norm": 1.8901858594534553, + "learning_rate": 5.808680845275963e-06, + "loss": 0.5721, + "step": 4472 + }, + { + "epoch": 0.46, + "grad_norm": 1.873495548260194, + "learning_rate": 5.8070195463635025e-06, + "loss": 0.6208, + "step": 4473 + }, + { + "epoch": 0.47, + "grad_norm": 1.9721690840833237, + "learning_rate": 5.8053581559707754e-06, + "loss": 0.6252, + "step": 4474 + }, + { + "epoch": 0.47, + "grad_norm": 1.8923298290697836, + "learning_rate": 5.80369667428611e-06, + "loss": 0.6648, + "step": 4475 + }, + { + "epoch": 0.47, + "grad_norm": 2.0497196950258973, + "learning_rate": 5.802035101497846e-06, + "loss": 0.6719, + "step": 4476 + }, + { + "epoch": 0.47, + "grad_norm": 1.7156531826874144, + "learning_rate": 5.800373437794334e-06, + "loss": 0.6103, + "step": 4477 + }, + { + "epoch": 0.47, + "grad_norm": 1.8561943349164527, + "learning_rate": 5.798711683363929e-06, + "loss": 0.6015, + "step": 4478 + }, + { + "epoch": 0.47, + "grad_norm": 1.722452492222899, + "learning_rate": 5.797049838395001e-06, + "loss": 0.5597, + "step": 4479 + }, + { + "epoch": 0.47, + "grad_norm": 1.8749034479522824, + "learning_rate": 5.795387903075933e-06, + "loss": 0.6504, + "step": 4480 + }, + { + "epoch": 0.47, + "grad_norm": 1.7635470279845804, + "learning_rate": 5.7937258775951125e-06, + "loss": 0.5879, + "step": 4481 + }, + { + "epoch": 0.47, + "grad_norm": 1.8571087504016002, + "learning_rate": 5.792063762140938e-06, + "loss": 0.6682, + "step": 4482 + }, + { + "epoch": 0.47, + "grad_norm": 1.8506275346708279, + "learning_rate": 5.790401556901822e-06, + "loss": 0.7062, + "step": 4483 + }, + { + "epoch": 0.47, + "grad_norm": 1.9054042143788954, + "learning_rate": 5.788739262066185e-06, + "loss": 0.5929, + "step": 4484 + }, + { + "epoch": 0.47, + "grad_norm": 1.745527884049558, + "learning_rate": 5.787076877822457e-06, + "loss": 0.6085, + "step": 4485 + }, + { + "epoch": 0.47, + "grad_norm": 1.8825036097416747, + "learning_rate": 5.7854144043590775e-06, + "loss": 0.6847, + "step": 4486 + }, + { + "epoch": 0.47, + "grad_norm": 2.023138211383369, + "learning_rate": 5.7837518418645e-06, + "loss": 0.6308, + "step": 4487 + }, + { + "epoch": 0.47, + "grad_norm": 1.9547099231146705, + "learning_rate": 5.782089190527185e-06, + "loss": 0.6312, + "step": 4488 + }, + { + "epoch": 0.47, + "grad_norm": 1.643185636278992, + "learning_rate": 5.7804264505356e-06, + "loss": 0.6085, + "step": 4489 + }, + { + "epoch": 0.47, + "grad_norm": 2.0414113888224117, + "learning_rate": 5.7787636220782294e-06, + "loss": 0.6321, + "step": 4490 + }, + { + "epoch": 0.47, + "grad_norm": 2.2067355496136902, + "learning_rate": 5.777100705343565e-06, + "loss": 0.7242, + "step": 4491 + }, + { + "epoch": 0.47, + "grad_norm": 1.9603021200117243, + "learning_rate": 5.775437700520103e-06, + "loss": 0.59, + "step": 4492 + }, + { + "epoch": 0.47, + "grad_norm": 1.9097724088822532, + "learning_rate": 5.7737746077963605e-06, + "loss": 0.714, + "step": 4493 + }, + { + "epoch": 0.47, + "grad_norm": 1.6876366699927718, + "learning_rate": 5.772111427360855e-06, + "loss": 0.5481, + "step": 4494 + }, + { + "epoch": 0.47, + "grad_norm": 1.8767910335072366, + "learning_rate": 5.770448159402118e-06, + "loss": 0.6646, + "step": 4495 + }, + { + "epoch": 0.47, + "grad_norm": 2.0294806073429137, + "learning_rate": 5.7687848041086905e-06, + "loss": 0.6843, + "step": 4496 + }, + { + "epoch": 0.47, + "grad_norm": 1.9588128637096367, + "learning_rate": 5.767121361669125e-06, + "loss": 0.7105, + "step": 4497 + }, + { + "epoch": 0.47, + "grad_norm": 1.9716151486499722, + "learning_rate": 5.765457832271979e-06, + "loss": 0.6373, + "step": 4498 + }, + { + "epoch": 0.47, + "grad_norm": 2.080359918608944, + "learning_rate": 5.763794216105826e-06, + "loss": 0.7501, + "step": 4499 + }, + { + "epoch": 0.47, + "grad_norm": 1.8964238274940508, + "learning_rate": 5.762130513359244e-06, + "loss": 0.5701, + "step": 4500 + }, + { + "epoch": 0.47, + "grad_norm": 1.8669777706044983, + "learning_rate": 5.760466724220824e-06, + "loss": 0.5573, + "step": 4501 + }, + { + "epoch": 0.47, + "grad_norm": 1.9046577228447057, + "learning_rate": 5.758802848879169e-06, + "loss": 0.6396, + "step": 4502 + }, + { + "epoch": 0.47, + "grad_norm": 2.2300958145019742, + "learning_rate": 5.757138887522884e-06, + "loss": 0.7542, + "step": 4503 + }, + { + "epoch": 0.47, + "grad_norm": 1.8800972518796621, + "learning_rate": 5.75547484034059e-06, + "loss": 0.5727, + "step": 4504 + }, + { + "epoch": 0.47, + "grad_norm": 2.2108595105155944, + "learning_rate": 5.753810707520918e-06, + "loss": 0.5708, + "step": 4505 + }, + { + "epoch": 0.47, + "grad_norm": 1.9557531259523118, + "learning_rate": 5.7521464892525055e-06, + "loss": 0.668, + "step": 4506 + }, + { + "epoch": 0.47, + "grad_norm": 1.6753004989326905, + "learning_rate": 5.750482185724001e-06, + "loss": 0.5802, + "step": 4507 + }, + { + "epoch": 0.47, + "grad_norm": 1.8277555372897838, + "learning_rate": 5.748817797124063e-06, + "loss": 0.5803, + "step": 4508 + }, + { + "epoch": 0.47, + "grad_norm": 1.9809336995417062, + "learning_rate": 5.74715332364136e-06, + "loss": 0.6485, + "step": 4509 + }, + { + "epoch": 0.47, + "grad_norm": 1.944164497840986, + "learning_rate": 5.7454887654645706e-06, + "loss": 0.554, + "step": 4510 + }, + { + "epoch": 0.47, + "grad_norm": 1.9367604540968901, + "learning_rate": 5.743824122782379e-06, + "loss": 0.7028, + "step": 4511 + }, + { + "epoch": 0.47, + "grad_norm": 2.0476021643490068, + "learning_rate": 5.7421593957834835e-06, + "loss": 0.5873, + "step": 4512 + }, + { + "epoch": 0.47, + "grad_norm": 1.8303359306005593, + "learning_rate": 5.74049458465659e-06, + "loss": 0.5952, + "step": 4513 + }, + { + "epoch": 0.47, + "grad_norm": 1.6641282597478058, + "learning_rate": 5.738829689590415e-06, + "loss": 0.6916, + "step": 4514 + }, + { + "epoch": 0.47, + "grad_norm": 1.9218716826790592, + "learning_rate": 5.7371647107736824e-06, + "loss": 0.5552, + "step": 4515 + }, + { + "epoch": 0.47, + "grad_norm": 1.8021875591091798, + "learning_rate": 5.73549964839513e-06, + "loss": 0.6477, + "step": 4516 + }, + { + "epoch": 0.47, + "grad_norm": 1.874696675514852, + "learning_rate": 5.7338345026434995e-06, + "loss": 0.6552, + "step": 4517 + }, + { + "epoch": 0.47, + "grad_norm": 1.7124112887912268, + "learning_rate": 5.732169273707545e-06, + "loss": 0.6077, + "step": 4518 + }, + { + "epoch": 0.47, + "grad_norm": 2.036393377024413, + "learning_rate": 5.73050396177603e-06, + "loss": 0.7247, + "step": 4519 + }, + { + "epoch": 0.47, + "grad_norm": 1.7701392918915468, + "learning_rate": 5.728838567037728e-06, + "loss": 0.7127, + "step": 4520 + }, + { + "epoch": 0.47, + "grad_norm": 1.9231725547933194, + "learning_rate": 5.72717308968142e-06, + "loss": 0.6524, + "step": 4521 + }, + { + "epoch": 0.47, + "grad_norm": 2.080539493901293, + "learning_rate": 5.725507529895898e-06, + "loss": 0.6405, + "step": 4522 + }, + { + "epoch": 0.47, + "grad_norm": 1.8921086679701191, + "learning_rate": 5.723841887869961e-06, + "loss": 0.6963, + "step": 4523 + }, + { + "epoch": 0.47, + "grad_norm": 1.982435709140776, + "learning_rate": 5.72217616379242e-06, + "loss": 0.553, + "step": 4524 + }, + { + "epoch": 0.47, + "grad_norm": 1.869479487988279, + "learning_rate": 5.7205103578520956e-06, + "loss": 0.5267, + "step": 4525 + }, + { + "epoch": 0.47, + "grad_norm": 1.8967605276922301, + "learning_rate": 5.7188444702378155e-06, + "loss": 0.6433, + "step": 4526 + }, + { + "epoch": 0.47, + "grad_norm": 2.1663730345064023, + "learning_rate": 5.717178501138416e-06, + "loss": 0.7024, + "step": 4527 + }, + { + "epoch": 0.47, + "grad_norm": 1.6711389674607338, + "learning_rate": 5.715512450742749e-06, + "loss": 0.6199, + "step": 4528 + }, + { + "epoch": 0.47, + "grad_norm": 2.037140945002469, + "learning_rate": 5.713846319239664e-06, + "loss": 0.6665, + "step": 4529 + }, + { + "epoch": 0.47, + "grad_norm": 2.022157294672067, + "learning_rate": 5.71218010681803e-06, + "loss": 0.7589, + "step": 4530 + }, + { + "epoch": 0.47, + "grad_norm": 1.7878692251576156, + "learning_rate": 5.710513813666722e-06, + "loss": 0.6059, + "step": 4531 + }, + { + "epoch": 0.47, + "grad_norm": 1.9980205127983506, + "learning_rate": 5.708847439974625e-06, + "loss": 0.6536, + "step": 4532 + }, + { + "epoch": 0.47, + "grad_norm": 2.109109182112321, + "learning_rate": 5.707180985930629e-06, + "loss": 0.6149, + "step": 4533 + }, + { + "epoch": 0.47, + "grad_norm": 2.035240334490125, + "learning_rate": 5.705514451723637e-06, + "loss": 0.6236, + "step": 4534 + }, + { + "epoch": 0.47, + "grad_norm": 1.991215297512859, + "learning_rate": 5.703847837542562e-06, + "loss": 0.6123, + "step": 4535 + }, + { + "epoch": 0.47, + "grad_norm": 1.8754198782296225, + "learning_rate": 5.702181143576323e-06, + "loss": 0.5805, + "step": 4536 + }, + { + "epoch": 0.47, + "grad_norm": 1.8767609705655672, + "learning_rate": 5.7005143700138474e-06, + "loss": 0.5803, + "step": 4537 + }, + { + "epoch": 0.47, + "grad_norm": 1.8788940690815126, + "learning_rate": 5.698847517044076e-06, + "loss": 0.6342, + "step": 4538 + }, + { + "epoch": 0.47, + "grad_norm": 2.046203066451224, + "learning_rate": 5.697180584855957e-06, + "loss": 0.7154, + "step": 4539 + }, + { + "epoch": 0.47, + "grad_norm": 2.147823782012038, + "learning_rate": 5.6955135736384425e-06, + "loss": 0.6689, + "step": 4540 + }, + { + "epoch": 0.47, + "grad_norm": 1.9211938256886534, + "learning_rate": 5.693846483580501e-06, + "loss": 0.7068, + "step": 4541 + }, + { + "epoch": 0.47, + "grad_norm": 1.7172972276987455, + "learning_rate": 5.692179314871104e-06, + "loss": 0.5695, + "step": 4542 + }, + { + "epoch": 0.47, + "grad_norm": 1.9061594156772637, + "learning_rate": 5.69051206769924e-06, + "loss": 0.6323, + "step": 4543 + }, + { + "epoch": 0.47, + "grad_norm": 1.9402592924801827, + "learning_rate": 5.688844742253895e-06, + "loss": 0.7288, + "step": 4544 + }, + { + "epoch": 0.47, + "grad_norm": 1.9548934582721385, + "learning_rate": 5.687177338724073e-06, + "loss": 0.6359, + "step": 4545 + }, + { + "epoch": 0.47, + "grad_norm": 1.9058935634603749, + "learning_rate": 5.685509857298781e-06, + "loss": 0.6532, + "step": 4546 + }, + { + "epoch": 0.47, + "grad_norm": 1.8694790048885155, + "learning_rate": 5.683842298167041e-06, + "loss": 0.6986, + "step": 4547 + }, + { + "epoch": 0.47, + "grad_norm": 2.0055822862463293, + "learning_rate": 5.68217466151788e-06, + "loss": 0.5503, + "step": 4548 + }, + { + "epoch": 0.47, + "grad_norm": 1.8668045735488943, + "learning_rate": 5.680506947540331e-06, + "loss": 0.7325, + "step": 4549 + }, + { + "epoch": 0.47, + "grad_norm": 1.6947983432539857, + "learning_rate": 5.67883915642344e-06, + "loss": 0.692, + "step": 4550 + }, + { + "epoch": 0.47, + "grad_norm": 1.8895950172582314, + "learning_rate": 5.677171288356263e-06, + "loss": 0.7109, + "step": 4551 + }, + { + "epoch": 0.47, + "grad_norm": 1.971379110045142, + "learning_rate": 5.675503343527861e-06, + "loss": 0.623, + "step": 4552 + }, + { + "epoch": 0.47, + "grad_norm": 2.030884339212526, + "learning_rate": 5.673835322127304e-06, + "loss": 0.6353, + "step": 4553 + }, + { + "epoch": 0.47, + "grad_norm": 1.928163277407566, + "learning_rate": 5.672167224343673e-06, + "loss": 0.6159, + "step": 4554 + }, + { + "epoch": 0.47, + "grad_norm": 1.908799846546338, + "learning_rate": 5.670499050366055e-06, + "loss": 0.5873, + "step": 4555 + }, + { + "epoch": 0.47, + "grad_norm": 2.141916280219293, + "learning_rate": 5.668830800383548e-06, + "loss": 0.8722, + "step": 4556 + }, + { + "epoch": 0.47, + "grad_norm": 1.9235665075383108, + "learning_rate": 5.667162474585258e-06, + "loss": 0.6594, + "step": 4557 + }, + { + "epoch": 0.47, + "grad_norm": 1.9022698171629637, + "learning_rate": 5.6654940731602995e-06, + "loss": 0.674, + "step": 4558 + }, + { + "epoch": 0.47, + "grad_norm": 2.0388621559832467, + "learning_rate": 5.663825596297794e-06, + "loss": 0.6344, + "step": 4559 + }, + { + "epoch": 0.47, + "grad_norm": 1.689495512348284, + "learning_rate": 5.662157044186873e-06, + "loss": 0.5544, + "step": 4560 + }, + { + "epoch": 0.47, + "grad_norm": 2.0045306909764857, + "learning_rate": 5.6604884170166765e-06, + "loss": 0.673, + "step": 4561 + }, + { + "epoch": 0.47, + "grad_norm": 1.9111890412837886, + "learning_rate": 5.658819714976355e-06, + "loss": 0.6942, + "step": 4562 + }, + { + "epoch": 0.47, + "grad_norm": 1.9998776939328162, + "learning_rate": 5.657150938255062e-06, + "loss": 0.701, + "step": 4563 + }, + { + "epoch": 0.47, + "grad_norm": 2.0427375075903735, + "learning_rate": 5.655482087041965e-06, + "loss": 0.6718, + "step": 4564 + }, + { + "epoch": 0.47, + "grad_norm": 2.0273043034155704, + "learning_rate": 5.653813161526237e-06, + "loss": 0.6799, + "step": 4565 + }, + { + "epoch": 0.47, + "grad_norm": 1.9518930981995333, + "learning_rate": 5.6521441618970605e-06, + "loss": 0.5647, + "step": 4566 + }, + { + "epoch": 0.47, + "grad_norm": 1.7441388879377546, + "learning_rate": 5.6504750883436275e-06, + "loss": 0.5699, + "step": 4567 + }, + { + "epoch": 0.47, + "grad_norm": 1.890032316812201, + "learning_rate": 5.648805941055135e-06, + "loss": 0.6521, + "step": 4568 + }, + { + "epoch": 0.47, + "grad_norm": 1.880305597341674, + "learning_rate": 5.647136720220791e-06, + "loss": 0.6335, + "step": 4569 + }, + { + "epoch": 0.48, + "grad_norm": 1.8075253023263849, + "learning_rate": 5.64546742602981e-06, + "loss": 0.6353, + "step": 4570 + }, + { + "epoch": 0.48, + "grad_norm": 1.7588936557722914, + "learning_rate": 5.643798058671418e-06, + "loss": 0.6952, + "step": 4571 + }, + { + "epoch": 0.48, + "grad_norm": 1.6995840290285382, + "learning_rate": 5.6421286183348465e-06, + "loss": 0.6305, + "step": 4572 + }, + { + "epoch": 0.48, + "grad_norm": 2.0948395121790315, + "learning_rate": 5.640459105209337e-06, + "loss": 0.6411, + "step": 4573 + }, + { + "epoch": 0.48, + "grad_norm": 1.9736848664004365, + "learning_rate": 5.638789519484137e-06, + "loss": 0.6777, + "step": 4574 + }, + { + "epoch": 0.48, + "grad_norm": 2.0435556431950803, + "learning_rate": 5.637119861348504e-06, + "loss": 0.7337, + "step": 4575 + }, + { + "epoch": 0.48, + "grad_norm": 2.0847525177688038, + "learning_rate": 5.6354501309917034e-06, + "loss": 0.5759, + "step": 4576 + }, + { + "epoch": 0.48, + "grad_norm": 1.659038326232906, + "learning_rate": 5.633780328603008e-06, + "loss": 0.5909, + "step": 4577 + }, + { + "epoch": 0.48, + "grad_norm": 1.9345623802471068, + "learning_rate": 5.6321104543717e-06, + "loss": 0.6019, + "step": 4578 + }, + { + "epoch": 0.48, + "grad_norm": 1.963218170933064, + "learning_rate": 5.630440508487068e-06, + "loss": 0.6093, + "step": 4579 + }, + { + "epoch": 0.48, + "grad_norm": 1.7722605156571707, + "learning_rate": 5.628770491138414e-06, + "loss": 0.5577, + "step": 4580 + }, + { + "epoch": 0.48, + "grad_norm": 1.9424797388437025, + "learning_rate": 5.627100402515038e-06, + "loss": 0.6364, + "step": 4581 + }, + { + "epoch": 0.48, + "grad_norm": 1.9240402046319094, + "learning_rate": 5.625430242806258e-06, + "loss": 0.7009, + "step": 4582 + }, + { + "epoch": 0.48, + "grad_norm": 1.721892694944862, + "learning_rate": 5.623760012201394e-06, + "loss": 0.6482, + "step": 4583 + }, + { + "epoch": 0.48, + "grad_norm": 1.977992478992338, + "learning_rate": 5.62208971088978e-06, + "loss": 0.7306, + "step": 4584 + }, + { + "epoch": 0.48, + "grad_norm": 1.847701541976097, + "learning_rate": 5.62041933906075e-06, + "loss": 0.6842, + "step": 4585 + }, + { + "epoch": 0.48, + "grad_norm": 1.8640134483922797, + "learning_rate": 5.618748896903652e-06, + "loss": 0.6163, + "step": 4586 + }, + { + "epoch": 0.48, + "grad_norm": 1.8814485760535749, + "learning_rate": 5.617078384607839e-06, + "loss": 0.7321, + "step": 4587 + }, + { + "epoch": 0.48, + "grad_norm": 1.6377353046992302, + "learning_rate": 5.615407802362675e-06, + "loss": 0.6036, + "step": 4588 + }, + { + "epoch": 0.48, + "grad_norm": 1.8481119867884892, + "learning_rate": 5.613737150357528e-06, + "loss": 0.629, + "step": 4589 + }, + { + "epoch": 0.48, + "grad_norm": 1.892284220082062, + "learning_rate": 5.6120664287817765e-06, + "loss": 0.6487, + "step": 4590 + }, + { + "epoch": 0.48, + "grad_norm": 2.021796552026886, + "learning_rate": 5.610395637824808e-06, + "loss": 0.7287, + "step": 4591 + }, + { + "epoch": 0.48, + "grad_norm": 1.8874399763646057, + "learning_rate": 5.608724777676013e-06, + "loss": 0.6372, + "step": 4592 + }, + { + "epoch": 0.48, + "grad_norm": 1.7794562570973433, + "learning_rate": 5.607053848524796e-06, + "loss": 0.6554, + "step": 4593 + }, + { + "epoch": 0.48, + "grad_norm": 1.741221722198036, + "learning_rate": 5.605382850560565e-06, + "loss": 0.6421, + "step": 4594 + }, + { + "epoch": 0.48, + "grad_norm": 1.7314815048933372, + "learning_rate": 5.603711783972738e-06, + "loss": 0.5944, + "step": 4595 + }, + { + "epoch": 0.48, + "grad_norm": 1.8385199882160486, + "learning_rate": 5.6020406489507385e-06, + "loss": 0.6879, + "step": 4596 + }, + { + "epoch": 0.48, + "grad_norm": 1.9110860400813556, + "learning_rate": 5.600369445683999e-06, + "loss": 0.555, + "step": 4597 + }, + { + "epoch": 0.48, + "grad_norm": 1.9249828682309444, + "learning_rate": 5.5986981743619615e-06, + "loss": 0.6981, + "step": 4598 + }, + { + "epoch": 0.48, + "grad_norm": 1.949901847401595, + "learning_rate": 5.597026835174075e-06, + "loss": 0.6773, + "step": 4599 + }, + { + "epoch": 0.48, + "grad_norm": 1.8957329701386099, + "learning_rate": 5.5953554283097925e-06, + "loss": 0.7059, + "step": 4600 + }, + { + "epoch": 0.48, + "grad_norm": 1.9662603076387533, + "learning_rate": 5.593683953958579e-06, + "loss": 0.5705, + "step": 4601 + }, + { + "epoch": 0.48, + "grad_norm": 1.9760343575599348, + "learning_rate": 5.592012412309905e-06, + "loss": 0.5734, + "step": 4602 + }, + { + "epoch": 0.48, + "grad_norm": 1.9314288640416741, + "learning_rate": 5.590340803553249e-06, + "loss": 0.5705, + "step": 4603 + }, + { + "epoch": 0.48, + "grad_norm": 1.7749475103591477, + "learning_rate": 5.5886691278780995e-06, + "loss": 0.5766, + "step": 4604 + }, + { + "epoch": 0.48, + "grad_norm": 1.8301652296483235, + "learning_rate": 5.586997385473949e-06, + "loss": 0.5995, + "step": 4605 + }, + { + "epoch": 0.48, + "grad_norm": 2.0224492387711708, + "learning_rate": 5.5853255765302995e-06, + "loss": 0.7307, + "step": 4606 + }, + { + "epoch": 0.48, + "grad_norm": 1.7800813580039285, + "learning_rate": 5.583653701236658e-06, + "loss": 0.5758, + "step": 4607 + }, + { + "epoch": 0.48, + "grad_norm": 1.7387567730906435, + "learning_rate": 5.581981759782543e-06, + "loss": 0.6738, + "step": 4608 + }, + { + "epoch": 0.48, + "grad_norm": 1.8039219026028164, + "learning_rate": 5.580309752357479e-06, + "loss": 0.582, + "step": 4609 + }, + { + "epoch": 0.48, + "grad_norm": 1.9569486267401757, + "learning_rate": 5.578637679150997e-06, + "loss": 0.6614, + "step": 4610 + }, + { + "epoch": 0.48, + "grad_norm": 1.7888601680101468, + "learning_rate": 5.576965540352637e-06, + "loss": 0.5691, + "step": 4611 + }, + { + "epoch": 0.48, + "grad_norm": 1.7861531175452023, + "learning_rate": 5.575293336151943e-06, + "loss": 0.5806, + "step": 4612 + }, + { + "epoch": 0.48, + "grad_norm": 1.9083526025670454, + "learning_rate": 5.573621066738471e-06, + "loss": 0.6261, + "step": 4613 + }, + { + "epoch": 0.48, + "grad_norm": 1.737846510421099, + "learning_rate": 5.571948732301781e-06, + "loss": 0.6108, + "step": 4614 + }, + { + "epoch": 0.48, + "grad_norm": 2.0423564349394105, + "learning_rate": 5.570276333031441e-06, + "loss": 0.6206, + "step": 4615 + }, + { + "epoch": 0.48, + "grad_norm": 2.125783143784693, + "learning_rate": 5.568603869117029e-06, + "loss": 0.7213, + "step": 4616 + }, + { + "epoch": 0.48, + "grad_norm": 1.8570347144204054, + "learning_rate": 5.56693134074813e-06, + "loss": 0.6892, + "step": 4617 + }, + { + "epoch": 0.48, + "grad_norm": 1.7588279865564456, + "learning_rate": 5.56525874811433e-06, + "loss": 0.618, + "step": 4618 + }, + { + "epoch": 0.48, + "grad_norm": 1.7432706073652564, + "learning_rate": 5.563586091405229e-06, + "loss": 0.5623, + "step": 4619 + }, + { + "epoch": 0.48, + "grad_norm": 1.847040133653717, + "learning_rate": 5.561913370810432e-06, + "loss": 0.6304, + "step": 4620 + }, + { + "epoch": 0.48, + "grad_norm": 2.0289255167643967, + "learning_rate": 5.560240586519553e-06, + "loss": 0.5503, + "step": 4621 + }, + { + "epoch": 0.48, + "grad_norm": 1.781866236186172, + "learning_rate": 5.558567738722208e-06, + "loss": 0.5261, + "step": 4622 + }, + { + "epoch": 0.48, + "grad_norm": 2.270523192504574, + "learning_rate": 5.556894827608027e-06, + "loss": 0.7636, + "step": 4623 + }, + { + "epoch": 0.48, + "grad_norm": 1.925337493063739, + "learning_rate": 5.555221853366644e-06, + "loss": 0.503, + "step": 4624 + }, + { + "epoch": 0.48, + "grad_norm": 1.7839496985845051, + "learning_rate": 5.5535488161876994e-06, + "loss": 0.612, + "step": 4625 + }, + { + "epoch": 0.48, + "grad_norm": 1.6814669083880103, + "learning_rate": 5.55187571626084e-06, + "loss": 0.6839, + "step": 4626 + }, + { + "epoch": 0.48, + "grad_norm": 1.6368160551509423, + "learning_rate": 5.550202553775723e-06, + "loss": 0.6327, + "step": 4627 + }, + { + "epoch": 0.48, + "grad_norm": 1.7793440986251825, + "learning_rate": 5.548529328922012e-06, + "loss": 0.5759, + "step": 4628 + }, + { + "epoch": 0.48, + "grad_norm": 1.9763629323931438, + "learning_rate": 5.546856041889374e-06, + "loss": 0.6881, + "step": 4629 + }, + { + "epoch": 0.48, + "grad_norm": 1.9111816163433664, + "learning_rate": 5.545182692867486e-06, + "loss": 0.5799, + "step": 4630 + }, + { + "epoch": 0.48, + "grad_norm": 1.825419245447919, + "learning_rate": 5.543509282046031e-06, + "loss": 0.6236, + "step": 4631 + }, + { + "epoch": 0.48, + "grad_norm": 1.8821953354276433, + "learning_rate": 5.541835809614704e-06, + "loss": 0.6645, + "step": 4632 + }, + { + "epoch": 0.48, + "grad_norm": 1.800286495436237, + "learning_rate": 5.540162275763198e-06, + "loss": 0.5669, + "step": 4633 + }, + { + "epoch": 0.48, + "grad_norm": 1.9310568135270056, + "learning_rate": 5.53848868068122e-06, + "loss": 0.6105, + "step": 4634 + }, + { + "epoch": 0.48, + "grad_norm": 1.894980337439149, + "learning_rate": 5.53681502455848e-06, + "loss": 0.6786, + "step": 4635 + }, + { + "epoch": 0.48, + "grad_norm": 1.966074480215953, + "learning_rate": 5.535141307584697e-06, + "loss": 0.6413, + "step": 4636 + }, + { + "epoch": 0.48, + "grad_norm": 1.8636160131590067, + "learning_rate": 5.5334675299495975e-06, + "loss": 0.5534, + "step": 4637 + }, + { + "epoch": 0.48, + "grad_norm": 1.9035811073752469, + "learning_rate": 5.531793691842912e-06, + "loss": 0.7617, + "step": 4638 + }, + { + "epoch": 0.48, + "grad_norm": 1.9211832192257259, + "learning_rate": 5.530119793454381e-06, + "loss": 0.6682, + "step": 4639 + }, + { + "epoch": 0.48, + "grad_norm": 1.990268702208091, + "learning_rate": 5.52844583497375e-06, + "loss": 0.6658, + "step": 4640 + }, + { + "epoch": 0.48, + "grad_norm": 2.048537657034261, + "learning_rate": 5.52677181659077e-06, + "loss": 0.7124, + "step": 4641 + }, + { + "epoch": 0.48, + "grad_norm": 2.241227644645234, + "learning_rate": 5.525097738495204e-06, + "loss": 0.6851, + "step": 4642 + }, + { + "epoch": 0.48, + "grad_norm": 1.7727776823270593, + "learning_rate": 5.523423600876816e-06, + "loss": 0.5747, + "step": 4643 + }, + { + "epoch": 0.48, + "grad_norm": 1.9624937546211163, + "learning_rate": 5.521749403925379e-06, + "loss": 0.6018, + "step": 4644 + }, + { + "epoch": 0.48, + "grad_norm": 1.8723540282901674, + "learning_rate": 5.520075147830674e-06, + "loss": 0.6188, + "step": 4645 + }, + { + "epoch": 0.48, + "grad_norm": 1.8822938715140052, + "learning_rate": 5.518400832782485e-06, + "loss": 0.5811, + "step": 4646 + }, + { + "epoch": 0.48, + "grad_norm": 1.9055839171728313, + "learning_rate": 5.516726458970608e-06, + "loss": 0.5566, + "step": 4647 + }, + { + "epoch": 0.48, + "grad_norm": 1.805471104267127, + "learning_rate": 5.515052026584842e-06, + "loss": 0.6422, + "step": 4648 + }, + { + "epoch": 0.48, + "grad_norm": 1.8946969960370972, + "learning_rate": 5.513377535814992e-06, + "loss": 0.7232, + "step": 4649 + }, + { + "epoch": 0.48, + "grad_norm": 1.996885687942003, + "learning_rate": 5.511702986850873e-06, + "loss": 0.5903, + "step": 4650 + }, + { + "epoch": 0.48, + "grad_norm": 1.9418199161319338, + "learning_rate": 5.510028379882304e-06, + "loss": 0.6816, + "step": 4651 + }, + { + "epoch": 0.48, + "grad_norm": 1.9209458855987755, + "learning_rate": 5.508353715099111e-06, + "loss": 0.7325, + "step": 4652 + }, + { + "epoch": 0.48, + "grad_norm": 1.737261300068071, + "learning_rate": 5.506678992691126e-06, + "loss": 0.578, + "step": 4653 + }, + { + "epoch": 0.48, + "grad_norm": 1.887091236693906, + "learning_rate": 5.50500421284819e-06, + "loss": 0.648, + "step": 4654 + }, + { + "epoch": 0.48, + "grad_norm": 1.9141217370293513, + "learning_rate": 5.503329375760148e-06, + "loss": 0.5891, + "step": 4655 + }, + { + "epoch": 0.48, + "grad_norm": 2.240905822821042, + "learning_rate": 5.5016544816168515e-06, + "loss": 0.6495, + "step": 4656 + }, + { + "epoch": 0.48, + "grad_norm": 2.008628318504785, + "learning_rate": 5.49997953060816e-06, + "loss": 0.6581, + "step": 4657 + }, + { + "epoch": 0.48, + "grad_norm": 1.848748059234952, + "learning_rate": 5.498304522923941e-06, + "loss": 0.6991, + "step": 4658 + }, + { + "epoch": 0.48, + "grad_norm": 1.9226179328864823, + "learning_rate": 5.4966294587540626e-06, + "loss": 0.62, + "step": 4659 + }, + { + "epoch": 0.48, + "grad_norm": 1.731765557271535, + "learning_rate": 5.494954338288404e-06, + "loss": 0.5619, + "step": 4660 + }, + { + "epoch": 0.48, + "grad_norm": 1.9932290535373656, + "learning_rate": 5.493279161716851e-06, + "loss": 0.6834, + "step": 4661 + }, + { + "epoch": 0.48, + "grad_norm": 1.7461404202900574, + "learning_rate": 5.491603929229293e-06, + "loss": 0.6477, + "step": 4662 + }, + { + "epoch": 0.48, + "grad_norm": 1.9588893964565277, + "learning_rate": 5.4899286410156275e-06, + "loss": 0.7275, + "step": 4663 + }, + { + "epoch": 0.48, + "grad_norm": 1.888923777184811, + "learning_rate": 5.488253297265757e-06, + "loss": 0.6459, + "step": 4664 + }, + { + "epoch": 0.48, + "grad_norm": 1.990188644636464, + "learning_rate": 5.486577898169595e-06, + "loss": 0.6675, + "step": 4665 + }, + { + "epoch": 0.49, + "grad_norm": 1.8803794705216603, + "learning_rate": 5.484902443917053e-06, + "loss": 0.6864, + "step": 4666 + }, + { + "epoch": 0.49, + "grad_norm": 2.0001158625188173, + "learning_rate": 5.483226934698055e-06, + "loss": 0.6569, + "step": 4667 + }, + { + "epoch": 0.49, + "grad_norm": 1.8830150639111904, + "learning_rate": 5.4815513707025306e-06, + "loss": 0.6279, + "step": 4668 + }, + { + "epoch": 0.49, + "grad_norm": 2.023732196650425, + "learning_rate": 5.479875752120414e-06, + "loss": 0.6609, + "step": 4669 + }, + { + "epoch": 0.49, + "grad_norm": 2.0625613008373773, + "learning_rate": 5.478200079141644e-06, + "loss": 0.643, + "step": 4670 + }, + { + "epoch": 0.49, + "grad_norm": 1.9591030994917111, + "learning_rate": 5.47652435195617e-06, + "loss": 0.6557, + "step": 4671 + }, + { + "epoch": 0.49, + "grad_norm": 1.7978853519603617, + "learning_rate": 5.4748485707539435e-06, + "loss": 0.5881, + "step": 4672 + }, + { + "epoch": 0.49, + "grad_norm": 1.9473749824242983, + "learning_rate": 5.473172735724927e-06, + "loss": 0.6415, + "step": 4673 + }, + { + "epoch": 0.49, + "grad_norm": 2.058316827182397, + "learning_rate": 5.471496847059082e-06, + "loss": 0.7541, + "step": 4674 + }, + { + "epoch": 0.49, + "grad_norm": 2.0498700867540416, + "learning_rate": 5.469820904946383e-06, + "loss": 0.6762, + "step": 4675 + }, + { + "epoch": 0.49, + "grad_norm": 1.9141503530490744, + "learning_rate": 5.468144909576807e-06, + "loss": 0.5577, + "step": 4676 + }, + { + "epoch": 0.49, + "grad_norm": 1.797991597019518, + "learning_rate": 5.466468861140337e-06, + "loss": 0.5654, + "step": 4677 + }, + { + "epoch": 0.49, + "grad_norm": 1.8613963899363541, + "learning_rate": 5.464792759826962e-06, + "loss": 0.6946, + "step": 4678 + }, + { + "epoch": 0.49, + "grad_norm": 1.8335011635462246, + "learning_rate": 5.463116605826678e-06, + "loss": 0.678, + "step": 4679 + }, + { + "epoch": 0.49, + "grad_norm": 2.0050714504137965, + "learning_rate": 5.4614403993294895e-06, + "loss": 0.6334, + "step": 4680 + }, + { + "epoch": 0.49, + "grad_norm": 2.032874636356343, + "learning_rate": 5.4597641405254e-06, + "loss": 0.6813, + "step": 4681 + }, + { + "epoch": 0.49, + "grad_norm": 1.790986980802673, + "learning_rate": 5.458087829604423e-06, + "loss": 0.6778, + "step": 4682 + }, + { + "epoch": 0.49, + "grad_norm": 1.751145689736165, + "learning_rate": 5.456411466756584e-06, + "loss": 0.5752, + "step": 4683 + }, + { + "epoch": 0.49, + "grad_norm": 1.697096974282548, + "learning_rate": 5.4547350521719e-06, + "loss": 0.567, + "step": 4684 + }, + { + "epoch": 0.49, + "grad_norm": 2.1498813486938393, + "learning_rate": 5.453058586040406e-06, + "loss": 0.6605, + "step": 4685 + }, + { + "epoch": 0.49, + "grad_norm": 1.9907803657150827, + "learning_rate": 5.45138206855214e-06, + "loss": 0.6133, + "step": 4686 + }, + { + "epoch": 0.49, + "grad_norm": 2.1637345099530534, + "learning_rate": 5.4497054998971445e-06, + "loss": 0.6557, + "step": 4687 + }, + { + "epoch": 0.49, + "grad_norm": 1.8252761461235034, + "learning_rate": 5.448028880265467e-06, + "loss": 0.6198, + "step": 4688 + }, + { + "epoch": 0.49, + "grad_norm": 1.82343633197036, + "learning_rate": 5.446352209847161e-06, + "loss": 0.7194, + "step": 4689 + }, + { + "epoch": 0.49, + "grad_norm": 2.0614527120208, + "learning_rate": 5.444675488832288e-06, + "loss": 0.6723, + "step": 4690 + }, + { + "epoch": 0.49, + "grad_norm": 2.1820987619351175, + "learning_rate": 5.442998717410916e-06, + "loss": 0.778, + "step": 4691 + }, + { + "epoch": 0.49, + "grad_norm": 2.0025828276893134, + "learning_rate": 5.441321895773112e-06, + "loss": 0.6472, + "step": 4692 + }, + { + "epoch": 0.49, + "grad_norm": 1.9803099147219958, + "learning_rate": 5.439645024108956e-06, + "loss": 0.6677, + "step": 4693 + }, + { + "epoch": 0.49, + "grad_norm": 1.8356847480376939, + "learning_rate": 5.4379681026085305e-06, + "loss": 0.5864, + "step": 4694 + }, + { + "epoch": 0.49, + "grad_norm": 1.9702510121981012, + "learning_rate": 5.436291131461926e-06, + "loss": 0.643, + "step": 4695 + }, + { + "epoch": 0.49, + "grad_norm": 2.259676272993004, + "learning_rate": 5.434614110859233e-06, + "loss": 0.7192, + "step": 4696 + }, + { + "epoch": 0.49, + "grad_norm": 1.8027623450713939, + "learning_rate": 5.432937040990553e-06, + "loss": 0.6741, + "step": 4697 + }, + { + "epoch": 0.49, + "grad_norm": 2.2863564801237377, + "learning_rate": 5.431259922045995e-06, + "loss": 0.7341, + "step": 4698 + }, + { + "epoch": 0.49, + "grad_norm": 1.8466916983922386, + "learning_rate": 5.429582754215664e-06, + "loss": 0.5891, + "step": 4699 + }, + { + "epoch": 0.49, + "grad_norm": 1.7862194556644164, + "learning_rate": 5.427905537689679e-06, + "loss": 0.638, + "step": 4700 + }, + { + "epoch": 0.49, + "grad_norm": 1.8250276402626973, + "learning_rate": 5.426228272658163e-06, + "loss": 0.6587, + "step": 4701 + }, + { + "epoch": 0.49, + "grad_norm": 1.9226029242682035, + "learning_rate": 5.424550959311244e-06, + "loss": 0.6189, + "step": 4702 + }, + { + "epoch": 0.49, + "grad_norm": 1.778075048144127, + "learning_rate": 5.422873597839052e-06, + "loss": 0.5834, + "step": 4703 + }, + { + "epoch": 0.49, + "grad_norm": 1.8704703373919398, + "learning_rate": 5.4211961884317285e-06, + "loss": 0.609, + "step": 4704 + }, + { + "epoch": 0.49, + "grad_norm": 1.9872381122558112, + "learning_rate": 5.4195187312794165e-06, + "loss": 0.6613, + "step": 4705 + }, + { + "epoch": 0.49, + "grad_norm": 2.2208206315817356, + "learning_rate": 5.417841226572263e-06, + "loss": 0.6202, + "step": 4706 + }, + { + "epoch": 0.49, + "grad_norm": 1.8030691134210728, + "learning_rate": 5.416163674500429e-06, + "loss": 0.5852, + "step": 4707 + }, + { + "epoch": 0.49, + "grad_norm": 1.9087645300174885, + "learning_rate": 5.4144860752540675e-06, + "loss": 0.5521, + "step": 4708 + }, + { + "epoch": 0.49, + "grad_norm": 2.003121434948416, + "learning_rate": 5.412808429023346e-06, + "loss": 0.6437, + "step": 4709 + }, + { + "epoch": 0.49, + "grad_norm": 1.805314904231961, + "learning_rate": 5.4111307359984375e-06, + "loss": 0.6137, + "step": 4710 + }, + { + "epoch": 0.49, + "grad_norm": 1.8556671218808498, + "learning_rate": 5.409452996369517e-06, + "loss": 0.5666, + "step": 4711 + }, + { + "epoch": 0.49, + "grad_norm": 1.913852383240618, + "learning_rate": 5.407775210326765e-06, + "loss": 0.593, + "step": 4712 + }, + { + "epoch": 0.49, + "grad_norm": 1.8579997869434959, + "learning_rate": 5.40609737806037e-06, + "loss": 0.6143, + "step": 4713 + }, + { + "epoch": 0.49, + "grad_norm": 1.955230915460718, + "learning_rate": 5.404419499760521e-06, + "loss": 0.6543, + "step": 4714 + }, + { + "epoch": 0.49, + "grad_norm": 1.918928720060036, + "learning_rate": 5.402741575617417e-06, + "loss": 0.5767, + "step": 4715 + }, + { + "epoch": 0.49, + "grad_norm": 1.9450222578852545, + "learning_rate": 5.401063605821259e-06, + "loss": 0.599, + "step": 4716 + }, + { + "epoch": 0.49, + "grad_norm": 1.830693102223202, + "learning_rate": 5.399385590562257e-06, + "loss": 0.6338, + "step": 4717 + }, + { + "epoch": 0.49, + "grad_norm": 1.86583491311485, + "learning_rate": 5.397707530030621e-06, + "loss": 0.5858, + "step": 4718 + }, + { + "epoch": 0.49, + "grad_norm": 1.9159187468657468, + "learning_rate": 5.3960294244165705e-06, + "loss": 0.66, + "step": 4719 + }, + { + "epoch": 0.49, + "grad_norm": 1.9568612561515344, + "learning_rate": 5.394351273910327e-06, + "loss": 0.6735, + "step": 4720 + }, + { + "epoch": 0.49, + "grad_norm": 1.8762483550864508, + "learning_rate": 5.392673078702118e-06, + "loss": 0.6049, + "step": 4721 + }, + { + "epoch": 0.49, + "grad_norm": 1.7436999227726706, + "learning_rate": 5.390994838982178e-06, + "loss": 0.5753, + "step": 4722 + }, + { + "epoch": 0.49, + "grad_norm": 2.012137021393326, + "learning_rate": 5.3893165549407435e-06, + "loss": 0.6212, + "step": 4723 + }, + { + "epoch": 0.49, + "grad_norm": 1.8836151547134943, + "learning_rate": 5.38763822676806e-06, + "loss": 0.6909, + "step": 4724 + }, + { + "epoch": 0.49, + "grad_norm": 1.804096332876596, + "learning_rate": 5.385959854654374e-06, + "loss": 0.5916, + "step": 4725 + }, + { + "epoch": 0.49, + "grad_norm": 1.737330588597606, + "learning_rate": 5.384281438789937e-06, + "loss": 0.6128, + "step": 4726 + }, + { + "epoch": 0.49, + "grad_norm": 1.8569231685201164, + "learning_rate": 5.382602979365009e-06, + "loss": 0.6701, + "step": 4727 + }, + { + "epoch": 0.49, + "grad_norm": 1.7276227842031793, + "learning_rate": 5.380924476569854e-06, + "loss": 0.5959, + "step": 4728 + }, + { + "epoch": 0.49, + "grad_norm": 1.8705463387699754, + "learning_rate": 5.379245930594738e-06, + "loss": 0.6266, + "step": 4729 + }, + { + "epoch": 0.49, + "grad_norm": 1.9820668249263957, + "learning_rate": 5.3775673416299325e-06, + "loss": 0.7306, + "step": 4730 + }, + { + "epoch": 0.49, + "grad_norm": 2.1991728959408166, + "learning_rate": 5.375888709865718e-06, + "loss": 0.6499, + "step": 4731 + }, + { + "epoch": 0.49, + "grad_norm": 1.9049944223482733, + "learning_rate": 5.374210035492375e-06, + "loss": 0.6072, + "step": 4732 + }, + { + "epoch": 0.49, + "grad_norm": 1.7630256763054708, + "learning_rate": 5.372531318700192e-06, + "loss": 0.5893, + "step": 4733 + }, + { + "epoch": 0.49, + "grad_norm": 1.8797816070120186, + "learning_rate": 5.370852559679461e-06, + "loss": 0.6232, + "step": 4734 + }, + { + "epoch": 0.49, + "grad_norm": 1.9335377315595026, + "learning_rate": 5.36917375862048e-06, + "loss": 0.6255, + "step": 4735 + }, + { + "epoch": 0.49, + "grad_norm": 1.8935577231729566, + "learning_rate": 5.367494915713547e-06, + "loss": 0.6538, + "step": 4736 + }, + { + "epoch": 0.49, + "grad_norm": 2.159238536605417, + "learning_rate": 5.365816031148971e-06, + "loss": 0.6783, + "step": 4737 + }, + { + "epoch": 0.49, + "grad_norm": 2.2691128833223444, + "learning_rate": 5.364137105117062e-06, + "loss": 0.6657, + "step": 4738 + }, + { + "epoch": 0.49, + "grad_norm": 1.9293045676431337, + "learning_rate": 5.362458137808139e-06, + "loss": 0.6729, + "step": 4739 + }, + { + "epoch": 0.49, + "grad_norm": 1.9461395972368574, + "learning_rate": 5.360779129412519e-06, + "loss": 0.5758, + "step": 4740 + }, + { + "epoch": 0.49, + "grad_norm": 1.7469143360040644, + "learning_rate": 5.359100080120527e-06, + "loss": 0.5882, + "step": 4741 + }, + { + "epoch": 0.49, + "grad_norm": 1.525512152522639, + "learning_rate": 5.357420990122495e-06, + "loss": 0.5615, + "step": 4742 + }, + { + "epoch": 0.49, + "grad_norm": 1.8314068903897658, + "learning_rate": 5.355741859608756e-06, + "loss": 0.6769, + "step": 4743 + }, + { + "epoch": 0.49, + "grad_norm": 1.8789868416667825, + "learning_rate": 5.35406268876965e-06, + "loss": 0.5629, + "step": 4744 + }, + { + "epoch": 0.49, + "grad_norm": 1.9337216447416474, + "learning_rate": 5.352383477795522e-06, + "loss": 0.6372, + "step": 4745 + }, + { + "epoch": 0.49, + "grad_norm": 1.991741251184157, + "learning_rate": 5.3507042268767165e-06, + "loss": 0.7707, + "step": 4746 + }, + { + "epoch": 0.49, + "grad_norm": 1.7991416686125543, + "learning_rate": 5.3490249362035875e-06, + "loss": 0.6465, + "step": 4747 + }, + { + "epoch": 0.49, + "grad_norm": 2.2012059216577566, + "learning_rate": 5.347345605966493e-06, + "loss": 0.7182, + "step": 4748 + }, + { + "epoch": 0.49, + "grad_norm": 1.9724821199819178, + "learning_rate": 5.345666236355794e-06, + "loss": 0.6756, + "step": 4749 + }, + { + "epoch": 0.49, + "grad_norm": 1.8347424754598582, + "learning_rate": 5.343986827561859e-06, + "loss": 0.6407, + "step": 4750 + }, + { + "epoch": 0.49, + "grad_norm": 1.8192154867135653, + "learning_rate": 5.342307379775053e-06, + "loss": 0.6398, + "step": 4751 + }, + { + "epoch": 0.49, + "grad_norm": 1.9939590161819776, + "learning_rate": 5.340627893185757e-06, + "loss": 0.5802, + "step": 4752 + }, + { + "epoch": 0.49, + "grad_norm": 2.1746888356632277, + "learning_rate": 5.338948367984347e-06, + "loss": 0.7248, + "step": 4753 + }, + { + "epoch": 0.49, + "grad_norm": 1.7532309609679348, + "learning_rate": 5.337268804361208e-06, + "loss": 0.5655, + "step": 4754 + }, + { + "epoch": 0.49, + "grad_norm": 1.727385613972203, + "learning_rate": 5.335589202506727e-06, + "loss": 0.5939, + "step": 4755 + }, + { + "epoch": 0.49, + "grad_norm": 2.0887581011841823, + "learning_rate": 5.3339095626112965e-06, + "loss": 0.6019, + "step": 4756 + }, + { + "epoch": 0.49, + "grad_norm": 1.8319707154087208, + "learning_rate": 5.332229884865316e-06, + "loss": 0.6287, + "step": 4757 + }, + { + "epoch": 0.49, + "grad_norm": 1.935043766822851, + "learning_rate": 5.3305501694591836e-06, + "loss": 0.6483, + "step": 4758 + }, + { + "epoch": 0.49, + "grad_norm": 1.8084928100607975, + "learning_rate": 5.3288704165833035e-06, + "loss": 0.5682, + "step": 4759 + }, + { + "epoch": 0.49, + "grad_norm": 1.884803293181149, + "learning_rate": 5.327190626428089e-06, + "loss": 0.6029, + "step": 4760 + }, + { + "epoch": 0.49, + "grad_norm": 1.932522766945742, + "learning_rate": 5.325510799183953e-06, + "loss": 0.6771, + "step": 4761 + }, + { + "epoch": 0.5, + "grad_norm": 1.9866449112920452, + "learning_rate": 5.32383093504131e-06, + "loss": 0.5524, + "step": 4762 + }, + { + "epoch": 0.5, + "grad_norm": 1.8167555076320083, + "learning_rate": 5.3221510341905855e-06, + "loss": 0.6437, + "step": 4763 + }, + { + "epoch": 0.5, + "grad_norm": 1.8329283982720728, + "learning_rate": 5.320471096822206e-06, + "loss": 0.5299, + "step": 4764 + }, + { + "epoch": 0.5, + "grad_norm": 1.9138426461164462, + "learning_rate": 5.318791123126601e-06, + "loss": 0.62, + "step": 4765 + }, + { + "epoch": 0.5, + "grad_norm": 2.1010868001824097, + "learning_rate": 5.3171111132942045e-06, + "loss": 0.658, + "step": 4766 + }, + { + "epoch": 0.5, + "grad_norm": 1.9813575064514743, + "learning_rate": 5.315431067515456e-06, + "loss": 0.7796, + "step": 4767 + }, + { + "epoch": 0.5, + "grad_norm": 1.9144527346597267, + "learning_rate": 5.313750985980799e-06, + "loss": 0.6166, + "step": 4768 + }, + { + "epoch": 0.5, + "grad_norm": 1.9394591482862733, + "learning_rate": 5.312070868880678e-06, + "loss": 0.6915, + "step": 4769 + }, + { + "epoch": 0.5, + "grad_norm": 1.7787970809473537, + "learning_rate": 5.310390716405546e-06, + "loss": 0.5507, + "step": 4770 + }, + { + "epoch": 0.5, + "grad_norm": 1.9257353515354803, + "learning_rate": 5.308710528745856e-06, + "loss": 0.5736, + "step": 4771 + }, + { + "epoch": 0.5, + "grad_norm": 1.899196278729454, + "learning_rate": 5.3070303060920706e-06, + "loss": 0.6948, + "step": 4772 + }, + { + "epoch": 0.5, + "grad_norm": 1.8548166587030892, + "learning_rate": 5.305350048634648e-06, + "loss": 0.6213, + "step": 4773 + }, + { + "epoch": 0.5, + "grad_norm": 1.72877940583896, + "learning_rate": 5.303669756564057e-06, + "loss": 0.5737, + "step": 4774 + }, + { + "epoch": 0.5, + "grad_norm": 1.81760933387422, + "learning_rate": 5.301989430070767e-06, + "loss": 0.5618, + "step": 4775 + }, + { + "epoch": 0.5, + "grad_norm": 1.9269910883598707, + "learning_rate": 5.300309069345257e-06, + "loss": 0.6532, + "step": 4776 + }, + { + "epoch": 0.5, + "grad_norm": 1.9463546960698885, + "learning_rate": 5.298628674578e-06, + "loss": 0.6358, + "step": 4777 + }, + { + "epoch": 0.5, + "grad_norm": 1.8442409478011446, + "learning_rate": 5.296948245959481e-06, + "loss": 0.6798, + "step": 4778 + }, + { + "epoch": 0.5, + "grad_norm": 1.6872881787647978, + "learning_rate": 5.295267783680186e-06, + "loss": 0.615, + "step": 4779 + }, + { + "epoch": 0.5, + "grad_norm": 2.020457824524539, + "learning_rate": 5.293587287930605e-06, + "loss": 0.7365, + "step": 4780 + }, + { + "epoch": 0.5, + "grad_norm": 1.9419792797429891, + "learning_rate": 5.291906758901231e-06, + "loss": 0.6422, + "step": 4781 + }, + { + "epoch": 0.5, + "grad_norm": 1.9961829611233421, + "learning_rate": 5.290226196782562e-06, + "loss": 0.6491, + "step": 4782 + }, + { + "epoch": 0.5, + "grad_norm": 1.8346912090258714, + "learning_rate": 5.2885456017651e-06, + "loss": 0.6669, + "step": 4783 + }, + { + "epoch": 0.5, + "grad_norm": 1.9113822633234845, + "learning_rate": 5.286864974039349e-06, + "loss": 0.6546, + "step": 4784 + }, + { + "epoch": 0.5, + "grad_norm": 1.9088972328489655, + "learning_rate": 5.285184313795818e-06, + "loss": 0.6967, + "step": 4785 + }, + { + "epoch": 0.5, + "grad_norm": 1.9770384150188363, + "learning_rate": 5.28350362122502e-06, + "loss": 0.6551, + "step": 4786 + }, + { + "epoch": 0.5, + "grad_norm": 1.983698674791684, + "learning_rate": 5.281822896517471e-06, + "loss": 0.6672, + "step": 4787 + }, + { + "epoch": 0.5, + "grad_norm": 1.9723552036785195, + "learning_rate": 5.280142139863689e-06, + "loss": 0.6123, + "step": 4788 + }, + { + "epoch": 0.5, + "grad_norm": 1.7502841190125016, + "learning_rate": 5.278461351454199e-06, + "loss": 0.6269, + "step": 4789 + }, + { + "epoch": 0.5, + "grad_norm": 1.8380073595702189, + "learning_rate": 5.276780531479528e-06, + "loss": 0.629, + "step": 4790 + }, + { + "epoch": 0.5, + "grad_norm": 1.9008197624107583, + "learning_rate": 5.275099680130207e-06, + "loss": 0.686, + "step": 4791 + }, + { + "epoch": 0.5, + "grad_norm": 1.6690452997820755, + "learning_rate": 5.273418797596769e-06, + "loss": 0.541, + "step": 4792 + }, + { + "epoch": 0.5, + "grad_norm": 1.7704221672792175, + "learning_rate": 5.271737884069751e-06, + "loss": 0.7384, + "step": 4793 + }, + { + "epoch": 0.5, + "grad_norm": 1.8289870533935189, + "learning_rate": 5.270056939739695e-06, + "loss": 0.7028, + "step": 4794 + }, + { + "epoch": 0.5, + "grad_norm": 1.9957521734241734, + "learning_rate": 5.268375964797147e-06, + "loss": 0.8097, + "step": 4795 + }, + { + "epoch": 0.5, + "grad_norm": 1.8245717485844726, + "learning_rate": 5.266694959432651e-06, + "loss": 0.5797, + "step": 4796 + }, + { + "epoch": 0.5, + "grad_norm": 1.6513891472143982, + "learning_rate": 5.265013923836763e-06, + "loss": 0.5582, + "step": 4797 + }, + { + "epoch": 0.5, + "grad_norm": 1.8439248589354222, + "learning_rate": 5.263332858200037e-06, + "loss": 0.5971, + "step": 4798 + }, + { + "epoch": 0.5, + "grad_norm": 1.7693969050117528, + "learning_rate": 5.261651762713029e-06, + "loss": 0.6627, + "step": 4799 + }, + { + "epoch": 0.5, + "grad_norm": 1.8799400076854698, + "learning_rate": 5.259970637566303e-06, + "loss": 0.6186, + "step": 4800 + }, + { + "epoch": 0.5, + "grad_norm": 1.810317566636923, + "learning_rate": 5.2582894829504225e-06, + "loss": 0.6289, + "step": 4801 + }, + { + "epoch": 0.5, + "grad_norm": 2.1064888897228604, + "learning_rate": 5.256608299055959e-06, + "loss": 0.7368, + "step": 4802 + }, + { + "epoch": 0.5, + "grad_norm": 1.7934611876160385, + "learning_rate": 5.254927086073481e-06, + "loss": 0.5756, + "step": 4803 + }, + { + "epoch": 0.5, + "grad_norm": 1.918347419758823, + "learning_rate": 5.253245844193564e-06, + "loss": 0.6471, + "step": 4804 + }, + { + "epoch": 0.5, + "grad_norm": 1.921103053022868, + "learning_rate": 5.251564573606789e-06, + "loss": 0.6237, + "step": 4805 + }, + { + "epoch": 0.5, + "grad_norm": 1.881055921289424, + "learning_rate": 5.249883274503734e-06, + "loss": 0.655, + "step": 4806 + }, + { + "epoch": 0.5, + "grad_norm": 2.1128203412785695, + "learning_rate": 5.248201947074986e-06, + "loss": 0.6767, + "step": 4807 + }, + { + "epoch": 0.5, + "grad_norm": 1.8473802354895168, + "learning_rate": 5.246520591511133e-06, + "loss": 0.7472, + "step": 4808 + }, + { + "epoch": 0.5, + "grad_norm": 1.939482551753464, + "learning_rate": 5.244839208002766e-06, + "loss": 0.7308, + "step": 4809 + }, + { + "epoch": 0.5, + "grad_norm": 1.924260276947548, + "learning_rate": 5.243157796740478e-06, + "loss": 0.6115, + "step": 4810 + }, + { + "epoch": 0.5, + "grad_norm": 2.0874541041869157, + "learning_rate": 5.241476357914869e-06, + "loss": 0.7236, + "step": 4811 + }, + { + "epoch": 0.5, + "grad_norm": 1.837733357562263, + "learning_rate": 5.239794891716538e-06, + "loss": 0.6035, + "step": 4812 + }, + { + "epoch": 0.5, + "grad_norm": 1.8910288208809416, + "learning_rate": 5.238113398336089e-06, + "loss": 0.6271, + "step": 4813 + }, + { + "epoch": 0.5, + "grad_norm": 2.0685585225934435, + "learning_rate": 5.236431877964129e-06, + "loss": 0.7462, + "step": 4814 + }, + { + "epoch": 0.5, + "grad_norm": 1.901940101007261, + "learning_rate": 5.234750330791268e-06, + "loss": 0.6331, + "step": 4815 + }, + { + "epoch": 0.5, + "grad_norm": 1.7745538373385725, + "learning_rate": 5.23306875700812e-06, + "loss": 0.5965, + "step": 4816 + }, + { + "epoch": 0.5, + "grad_norm": 1.914938178129142, + "learning_rate": 5.231387156805299e-06, + "loss": 0.6152, + "step": 4817 + }, + { + "epoch": 0.5, + "grad_norm": 1.9602214851083848, + "learning_rate": 5.229705530373424e-06, + "loss": 0.619, + "step": 4818 + }, + { + "epoch": 0.5, + "grad_norm": 1.8208173429792096, + "learning_rate": 5.228023877903119e-06, + "loss": 0.5953, + "step": 4819 + }, + { + "epoch": 0.5, + "grad_norm": 1.8886106395780093, + "learning_rate": 5.22634219958501e-06, + "loss": 0.6042, + "step": 4820 + }, + { + "epoch": 0.5, + "grad_norm": 1.8052470224663513, + "learning_rate": 5.224660495609719e-06, + "loss": 0.5851, + "step": 4821 + }, + { + "epoch": 0.5, + "grad_norm": 1.8864420425653845, + "learning_rate": 5.222978766167881e-06, + "loss": 0.6513, + "step": 4822 + }, + { + "epoch": 0.5, + "grad_norm": 1.8881947704523596, + "learning_rate": 5.221297011450129e-06, + "loss": 0.6889, + "step": 4823 + }, + { + "epoch": 0.5, + "grad_norm": 1.7349782274775427, + "learning_rate": 5.219615231647102e-06, + "loss": 0.7054, + "step": 4824 + }, + { + "epoch": 0.5, + "grad_norm": 1.876831428970342, + "learning_rate": 5.2179334269494345e-06, + "loss": 0.698, + "step": 4825 + }, + { + "epoch": 0.5, + "grad_norm": 1.7635223691101098, + "learning_rate": 5.21625159754777e-06, + "loss": 0.6371, + "step": 4826 + }, + { + "epoch": 0.5, + "grad_norm": 1.7059481236334688, + "learning_rate": 5.214569743632756e-06, + "loss": 0.5452, + "step": 4827 + }, + { + "epoch": 0.5, + "grad_norm": 1.5553605927294225, + "learning_rate": 5.212887865395038e-06, + "loss": 0.566, + "step": 4828 + }, + { + "epoch": 0.5, + "grad_norm": 1.9616001578735676, + "learning_rate": 5.211205963025268e-06, + "loss": 0.7019, + "step": 4829 + }, + { + "epoch": 0.5, + "grad_norm": 1.8648642120464367, + "learning_rate": 5.209524036714096e-06, + "loss": 0.67, + "step": 4830 + }, + { + "epoch": 0.5, + "grad_norm": 2.0163559085077853, + "learning_rate": 5.207842086652183e-06, + "loss": 0.6602, + "step": 4831 + }, + { + "epoch": 0.5, + "grad_norm": 2.035789772992233, + "learning_rate": 5.206160113030182e-06, + "loss": 0.6346, + "step": 4832 + }, + { + "epoch": 0.5, + "grad_norm": 1.9600216866470923, + "learning_rate": 5.204478116038758e-06, + "loss": 0.6541, + "step": 4833 + }, + { + "epoch": 0.5, + "grad_norm": 1.7741794956531685, + "learning_rate": 5.202796095868574e-06, + "loss": 0.5933, + "step": 4834 + }, + { + "epoch": 0.5, + "grad_norm": 1.7335196713309575, + "learning_rate": 5.201114052710299e-06, + "loss": 0.6089, + "step": 4835 + }, + { + "epoch": 0.5, + "grad_norm": 1.9376194087329808, + "learning_rate": 5.1994319867545974e-06, + "loss": 0.7502, + "step": 4836 + }, + { + "epoch": 0.5, + "grad_norm": 1.8325179415301216, + "learning_rate": 5.197749898192144e-06, + "loss": 0.6688, + "step": 4837 + }, + { + "epoch": 0.5, + "grad_norm": 1.9692153336399745, + "learning_rate": 5.196067787213611e-06, + "loss": 0.6632, + "step": 4838 + }, + { + "epoch": 0.5, + "grad_norm": 1.800237625965685, + "learning_rate": 5.1943856540096795e-06, + "loss": 0.648, + "step": 4839 + }, + { + "epoch": 0.5, + "grad_norm": 1.795377760515686, + "learning_rate": 5.1927034987710245e-06, + "loss": 0.6032, + "step": 4840 + }, + { + "epoch": 0.5, + "grad_norm": 1.895454260117348, + "learning_rate": 5.19102132168833e-06, + "loss": 0.6731, + "step": 4841 + }, + { + "epoch": 0.5, + "grad_norm": 2.039687589229877, + "learning_rate": 5.189339122952281e-06, + "loss": 0.6824, + "step": 4842 + }, + { + "epoch": 0.5, + "grad_norm": 1.8781273228678457, + "learning_rate": 5.18765690275356e-06, + "loss": 0.6945, + "step": 4843 + }, + { + "epoch": 0.5, + "grad_norm": 1.8013673164406765, + "learning_rate": 5.185974661282862e-06, + "loss": 0.6765, + "step": 4844 + }, + { + "epoch": 0.5, + "grad_norm": 1.98407236347405, + "learning_rate": 5.184292398730876e-06, + "loss": 0.7248, + "step": 4845 + }, + { + "epoch": 0.5, + "grad_norm": 2.0038873123117757, + "learning_rate": 5.182610115288296e-06, + "loss": 0.6836, + "step": 4846 + }, + { + "epoch": 0.5, + "grad_norm": 1.660770204955728, + "learning_rate": 5.180927811145818e-06, + "loss": 0.56, + "step": 4847 + }, + { + "epoch": 0.5, + "grad_norm": 2.17998760571231, + "learning_rate": 5.179245486494141e-06, + "loss": 0.7484, + "step": 4848 + }, + { + "epoch": 0.5, + "grad_norm": 1.9112108041835019, + "learning_rate": 5.177563141523967e-06, + "loss": 0.7095, + "step": 4849 + }, + { + "epoch": 0.5, + "grad_norm": 2.160618119718771, + "learning_rate": 5.175880776425999e-06, + "loss": 0.7162, + "step": 4850 + }, + { + "epoch": 0.5, + "grad_norm": 1.85331993962675, + "learning_rate": 5.174198391390942e-06, + "loss": 0.6401, + "step": 4851 + }, + { + "epoch": 0.5, + "grad_norm": 1.939094194390486, + "learning_rate": 5.172515986609504e-06, + "loss": 0.6593, + "step": 4852 + }, + { + "epoch": 0.5, + "grad_norm": 1.9149904541209817, + "learning_rate": 5.170833562272398e-06, + "loss": 0.6141, + "step": 4853 + }, + { + "epoch": 0.5, + "grad_norm": 1.9222041139959078, + "learning_rate": 5.169151118570332e-06, + "loss": 0.6764, + "step": 4854 + }, + { + "epoch": 0.5, + "grad_norm": 1.7669182377758037, + "learning_rate": 5.167468655694022e-06, + "loss": 0.538, + "step": 4855 + }, + { + "epoch": 0.5, + "grad_norm": 2.1867857975643434, + "learning_rate": 5.165786173834187e-06, + "loss": 0.656, + "step": 4856 + }, + { + "epoch": 0.5, + "grad_norm": 1.6534862157740111, + "learning_rate": 5.164103673181544e-06, + "loss": 0.6398, + "step": 4857 + }, + { + "epoch": 0.5, + "grad_norm": 2.1437201670617037, + "learning_rate": 5.162421153926814e-06, + "loss": 0.7164, + "step": 4858 + }, + { + "epoch": 0.51, + "grad_norm": 1.6553379755707762, + "learning_rate": 5.16073861626072e-06, + "loss": 0.6298, + "step": 4859 + }, + { + "epoch": 0.51, + "grad_norm": 1.8648222159713144, + "learning_rate": 5.1590560603739885e-06, + "loss": 0.697, + "step": 4860 + }, + { + "epoch": 0.51, + "grad_norm": 1.961783728362329, + "learning_rate": 5.157373486457346e-06, + "loss": 0.6445, + "step": 4861 + }, + { + "epoch": 0.51, + "grad_norm": 1.7476062727074282, + "learning_rate": 5.15569089470152e-06, + "loss": 0.4982, + "step": 4862 + }, + { + "epoch": 0.51, + "grad_norm": 2.0169770654335872, + "learning_rate": 5.1540082852972455e-06, + "loss": 0.6628, + "step": 4863 + }, + { + "epoch": 0.51, + "grad_norm": 1.7595848868953443, + "learning_rate": 5.152325658435254e-06, + "loss": 0.6338, + "step": 4864 + }, + { + "epoch": 0.51, + "grad_norm": 1.6235278140692069, + "learning_rate": 5.15064301430628e-06, + "loss": 0.5293, + "step": 4865 + }, + { + "epoch": 0.51, + "grad_norm": 1.8762127911073818, + "learning_rate": 5.148960353101063e-06, + "loss": 0.5885, + "step": 4866 + }, + { + "epoch": 0.51, + "grad_norm": 2.155003322145285, + "learning_rate": 5.147277675010339e-06, + "loss": 0.749, + "step": 4867 + }, + { + "epoch": 0.51, + "grad_norm": 1.9059516874727496, + "learning_rate": 5.145594980224853e-06, + "loss": 0.6446, + "step": 4868 + }, + { + "epoch": 0.51, + "grad_norm": 1.7276881296721822, + "learning_rate": 5.143912268935345e-06, + "loss": 0.5343, + "step": 4869 + }, + { + "epoch": 0.51, + "grad_norm": 2.069942567716001, + "learning_rate": 5.14222954133256e-06, + "loss": 0.6633, + "step": 4870 + }, + { + "epoch": 0.51, + "grad_norm": 2.0174980514678205, + "learning_rate": 5.140546797607248e-06, + "loss": 0.6722, + "step": 4871 + }, + { + "epoch": 0.51, + "grad_norm": 1.9082251784968078, + "learning_rate": 5.138864037950155e-06, + "loss": 0.5613, + "step": 4872 + }, + { + "epoch": 0.51, + "grad_norm": 1.8732227894779176, + "learning_rate": 5.137181262552031e-06, + "loss": 0.6477, + "step": 4873 + }, + { + "epoch": 0.51, + "grad_norm": 1.6832285896137942, + "learning_rate": 5.135498471603629e-06, + "loss": 0.6062, + "step": 4874 + }, + { + "epoch": 0.51, + "grad_norm": 1.6344271952482194, + "learning_rate": 5.133815665295704e-06, + "loss": 0.676, + "step": 4875 + }, + { + "epoch": 0.51, + "grad_norm": 1.7922029487358937, + "learning_rate": 5.13213284381901e-06, + "loss": 0.571, + "step": 4876 + }, + { + "epoch": 0.51, + "grad_norm": 1.8548110893994498, + "learning_rate": 5.1304500073643045e-06, + "loss": 0.5944, + "step": 4877 + }, + { + "epoch": 0.51, + "grad_norm": 6.780740593676751, + "learning_rate": 5.128767156122347e-06, + "loss": 0.5777, + "step": 4878 + }, + { + "epoch": 0.51, + "grad_norm": 2.247295270187268, + "learning_rate": 5.1270842902839e-06, + "loss": 0.7154, + "step": 4879 + }, + { + "epoch": 0.51, + "grad_norm": 1.9584193373117975, + "learning_rate": 5.125401410039723e-06, + "loss": 0.6757, + "step": 4880 + }, + { + "epoch": 0.51, + "grad_norm": 2.095672743950007, + "learning_rate": 5.123718515580581e-06, + "loss": 0.6605, + "step": 4881 + }, + { + "epoch": 0.51, + "grad_norm": 1.8048789293075735, + "learning_rate": 5.1220356070972414e-06, + "loss": 0.7379, + "step": 4882 + }, + { + "epoch": 0.51, + "grad_norm": 1.9146554643863576, + "learning_rate": 5.120352684780469e-06, + "loss": 0.5925, + "step": 4883 + }, + { + "epoch": 0.51, + "grad_norm": 2.1568243392901105, + "learning_rate": 5.118669748821034e-06, + "loss": 0.7402, + "step": 4884 + }, + { + "epoch": 0.51, + "grad_norm": 1.9630846444451566, + "learning_rate": 5.116986799409708e-06, + "loss": 0.6983, + "step": 4885 + }, + { + "epoch": 0.51, + "grad_norm": 1.939417972921571, + "learning_rate": 5.11530383673726e-06, + "loss": 0.6694, + "step": 4886 + }, + { + "epoch": 0.51, + "grad_norm": 2.15676437427674, + "learning_rate": 5.1136208609944644e-06, + "loss": 0.6987, + "step": 4887 + }, + { + "epoch": 0.51, + "grad_norm": 1.9490400227157634, + "learning_rate": 5.111937872372097e-06, + "loss": 0.6834, + "step": 4888 + }, + { + "epoch": 0.51, + "grad_norm": 1.89411260769405, + "learning_rate": 5.110254871060933e-06, + "loss": 0.6891, + "step": 4889 + }, + { + "epoch": 0.51, + "grad_norm": 1.8278879530072518, + "learning_rate": 5.108571857251754e-06, + "loss": 0.5864, + "step": 4890 + }, + { + "epoch": 0.51, + "grad_norm": 1.959630964015898, + "learning_rate": 5.106888831135334e-06, + "loss": 0.5718, + "step": 4891 + }, + { + "epoch": 0.51, + "grad_norm": 1.887053393441401, + "learning_rate": 5.105205792902456e-06, + "loss": 0.5991, + "step": 4892 + }, + { + "epoch": 0.51, + "grad_norm": 1.8419676864917576, + "learning_rate": 5.103522742743901e-06, + "loss": 0.6203, + "step": 4893 + }, + { + "epoch": 0.51, + "grad_norm": 1.9813179222203285, + "learning_rate": 5.101839680850454e-06, + "loss": 0.7061, + "step": 4894 + }, + { + "epoch": 0.51, + "grad_norm": 1.9051838482841017, + "learning_rate": 5.100156607412899e-06, + "loss": 0.663, + "step": 4895 + }, + { + "epoch": 0.51, + "grad_norm": 1.7946750412220063, + "learning_rate": 5.09847352262202e-06, + "loss": 0.6395, + "step": 4896 + }, + { + "epoch": 0.51, + "grad_norm": 1.692629416537992, + "learning_rate": 5.096790426668608e-06, + "loss": 0.6797, + "step": 4897 + }, + { + "epoch": 0.51, + "grad_norm": 1.6992150985739385, + "learning_rate": 5.095107319743449e-06, + "loss": 0.6609, + "step": 4898 + }, + { + "epoch": 0.51, + "grad_norm": 2.0328131001769885, + "learning_rate": 5.093424202037333e-06, + "loss": 0.6063, + "step": 4899 + }, + { + "epoch": 0.51, + "grad_norm": 1.8965582288309775, + "learning_rate": 5.09174107374105e-06, + "loss": 0.7277, + "step": 4900 + }, + { + "epoch": 0.51, + "grad_norm": 1.9789015178424736, + "learning_rate": 5.090057935045395e-06, + "loss": 0.7337, + "step": 4901 + }, + { + "epoch": 0.51, + "grad_norm": 2.0892636946117795, + "learning_rate": 5.088374786141159e-06, + "loss": 0.5095, + "step": 4902 + }, + { + "epoch": 0.51, + "grad_norm": 1.9916605863411267, + "learning_rate": 5.086691627219137e-06, + "loss": 0.6281, + "step": 4903 + }, + { + "epoch": 0.51, + "grad_norm": 1.8199559774598433, + "learning_rate": 5.085008458470126e-06, + "loss": 0.6696, + "step": 4904 + }, + { + "epoch": 0.51, + "grad_norm": 2.2905345302233284, + "learning_rate": 5.0833252800849205e-06, + "loss": 0.723, + "step": 4905 + }, + { + "epoch": 0.51, + "grad_norm": 1.8128746657582677, + "learning_rate": 5.0816420922543195e-06, + "loss": 0.6756, + "step": 4906 + }, + { + "epoch": 0.51, + "grad_norm": 1.6940473207507076, + "learning_rate": 5.079958895169122e-06, + "loss": 0.6189, + "step": 4907 + }, + { + "epoch": 0.51, + "grad_norm": 2.099894774247856, + "learning_rate": 5.078275689020129e-06, + "loss": 0.6214, + "step": 4908 + }, + { + "epoch": 0.51, + "grad_norm": 1.923333567399948, + "learning_rate": 5.076592473998141e-06, + "loss": 0.5972, + "step": 4909 + }, + { + "epoch": 0.51, + "grad_norm": 1.742185055247871, + "learning_rate": 5.0749092502939575e-06, + "loss": 0.6876, + "step": 4910 + }, + { + "epoch": 0.51, + "grad_norm": 1.7921342848541095, + "learning_rate": 5.073226018098385e-06, + "loss": 0.5127, + "step": 4911 + }, + { + "epoch": 0.51, + "grad_norm": 1.9020236365196561, + "learning_rate": 5.071542777602225e-06, + "loss": 0.5513, + "step": 4912 + }, + { + "epoch": 0.51, + "grad_norm": 1.8385169703695514, + "learning_rate": 5.0698595289962845e-06, + "loss": 0.6028, + "step": 4913 + }, + { + "epoch": 0.51, + "grad_norm": 1.868450962690883, + "learning_rate": 5.068176272471368e-06, + "loss": 0.676, + "step": 4914 + }, + { + "epoch": 0.51, + "grad_norm": 1.8850082304527058, + "learning_rate": 5.066493008218282e-06, + "loss": 0.6854, + "step": 4915 + }, + { + "epoch": 0.51, + "grad_norm": 1.9577619267509585, + "learning_rate": 5.064809736427835e-06, + "loss": 0.6569, + "step": 4916 + }, + { + "epoch": 0.51, + "grad_norm": 1.9076916716005377, + "learning_rate": 5.0631264572908334e-06, + "loss": 0.5663, + "step": 4917 + }, + { + "epoch": 0.51, + "grad_norm": 2.2592632992616415, + "learning_rate": 5.0614431709980895e-06, + "loss": 0.669, + "step": 4918 + }, + { + "epoch": 0.51, + "grad_norm": 1.6810611347190536, + "learning_rate": 5.059759877740411e-06, + "loss": 0.5439, + "step": 4919 + }, + { + "epoch": 0.51, + "grad_norm": 2.2498791112061545, + "learning_rate": 5.058076577708611e-06, + "loss": 0.7829, + "step": 4920 + }, + { + "epoch": 0.51, + "grad_norm": 1.7291877715984016, + "learning_rate": 5.056393271093498e-06, + "loss": 0.6395, + "step": 4921 + }, + { + "epoch": 0.51, + "grad_norm": 1.6289988487754408, + "learning_rate": 5.0547099580858874e-06, + "loss": 0.5376, + "step": 4922 + }, + { + "epoch": 0.51, + "grad_norm": 2.213277225739156, + "learning_rate": 5.053026638876591e-06, + "loss": 0.7141, + "step": 4923 + }, + { + "epoch": 0.51, + "grad_norm": 2.141769948061226, + "learning_rate": 5.0513433136564236e-06, + "loss": 0.6407, + "step": 4924 + }, + { + "epoch": 0.51, + "grad_norm": 1.804589027364479, + "learning_rate": 5.049659982616199e-06, + "loss": 0.5956, + "step": 4925 + }, + { + "epoch": 0.51, + "grad_norm": 1.9554108740288227, + "learning_rate": 5.047976645946732e-06, + "loss": 0.6802, + "step": 4926 + }, + { + "epoch": 0.51, + "grad_norm": 1.6593553009678443, + "learning_rate": 5.046293303838838e-06, + "loss": 0.6175, + "step": 4927 + }, + { + "epoch": 0.51, + "grad_norm": 2.25531640266805, + "learning_rate": 5.044609956483335e-06, + "loss": 0.7248, + "step": 4928 + }, + { + "epoch": 0.51, + "grad_norm": 1.8342676675179805, + "learning_rate": 5.042926604071039e-06, + "loss": 0.567, + "step": 4929 + }, + { + "epoch": 0.51, + "grad_norm": 1.983517953815852, + "learning_rate": 5.0412432467927674e-06, + "loss": 0.728, + "step": 4930 + }, + { + "epoch": 0.51, + "grad_norm": 2.0957696190309636, + "learning_rate": 5.039559884839339e-06, + "loss": 0.6731, + "step": 4931 + }, + { + "epoch": 0.51, + "grad_norm": 1.7798860206636986, + "learning_rate": 5.037876518401572e-06, + "loss": 0.6234, + "step": 4932 + }, + { + "epoch": 0.51, + "grad_norm": 1.8782106374167673, + "learning_rate": 5.036193147670286e-06, + "loss": 0.5433, + "step": 4933 + }, + { + "epoch": 0.51, + "grad_norm": 1.965876390350256, + "learning_rate": 5.0345097728363e-06, + "loss": 0.7375, + "step": 4934 + }, + { + "epoch": 0.51, + "grad_norm": 1.6555281848960024, + "learning_rate": 5.032826394090435e-06, + "loss": 0.5369, + "step": 4935 + }, + { + "epoch": 0.51, + "grad_norm": 2.0343517692919293, + "learning_rate": 5.031143011623511e-06, + "loss": 0.6064, + "step": 4936 + }, + { + "epoch": 0.51, + "grad_norm": 1.7335127141852276, + "learning_rate": 5.02945962562635e-06, + "loss": 0.6594, + "step": 4937 + }, + { + "epoch": 0.51, + "grad_norm": 1.7992246411216586, + "learning_rate": 5.027776236289772e-06, + "loss": 0.7669, + "step": 4938 + }, + { + "epoch": 0.51, + "grad_norm": 1.7969757038163299, + "learning_rate": 5.026092843804599e-06, + "loss": 0.5572, + "step": 4939 + }, + { + "epoch": 0.51, + "grad_norm": 2.2179435808357195, + "learning_rate": 5.024409448361653e-06, + "loss": 0.6424, + "step": 4940 + }, + { + "epoch": 0.51, + "grad_norm": 1.7330945557046764, + "learning_rate": 5.022726050151756e-06, + "loss": 0.6363, + "step": 4941 + }, + { + "epoch": 0.51, + "grad_norm": 1.6773497431028328, + "learning_rate": 5.0210426493657335e-06, + "loss": 0.6059, + "step": 4942 + }, + { + "epoch": 0.51, + "grad_norm": 1.7029285789828024, + "learning_rate": 5.019359246194406e-06, + "loss": 0.5937, + "step": 4943 + }, + { + "epoch": 0.51, + "grad_norm": 1.7350035183219865, + "learning_rate": 5.017675840828597e-06, + "loss": 0.5811, + "step": 4944 + }, + { + "epoch": 0.51, + "grad_norm": 1.8633238467146478, + "learning_rate": 5.0159924334591316e-06, + "loss": 0.6168, + "step": 4945 + }, + { + "epoch": 0.51, + "grad_norm": 2.0472909051472272, + "learning_rate": 5.014309024276833e-06, + "loss": 0.6218, + "step": 4946 + }, + { + "epoch": 0.51, + "grad_norm": 1.889431669608345, + "learning_rate": 5.012625613472525e-06, + "loss": 0.6791, + "step": 4947 + }, + { + "epoch": 0.51, + "grad_norm": 1.7828852466791096, + "learning_rate": 5.010942201237031e-06, + "loss": 0.5815, + "step": 4948 + }, + { + "epoch": 0.51, + "grad_norm": 1.7605094733233677, + "learning_rate": 5.009258787761178e-06, + "loss": 0.6806, + "step": 4949 + }, + { + "epoch": 0.51, + "grad_norm": 2.106865906855424, + "learning_rate": 5.007575373235786e-06, + "loss": 0.5734, + "step": 4950 + }, + { + "epoch": 0.51, + "grad_norm": 2.069021850878164, + "learning_rate": 5.005891957851683e-06, + "loss": 0.6051, + "step": 4951 + }, + { + "epoch": 0.51, + "grad_norm": 1.953305074899751, + "learning_rate": 5.004208541799693e-06, + "loss": 0.6234, + "step": 4952 + }, + { + "epoch": 0.51, + "grad_norm": 1.6461163901567595, + "learning_rate": 5.002525125270641e-06, + "loss": 0.7198, + "step": 4953 + }, + { + "epoch": 0.51, + "grad_norm": 1.9211725095625716, + "learning_rate": 5.000841708455351e-06, + "loss": 0.5863, + "step": 4954 + }, + { + "epoch": 0.52, + "grad_norm": 1.8963853637765036, + "learning_rate": 4.99915829154465e-06, + "loss": 0.6632, + "step": 4955 + }, + { + "epoch": 0.52, + "grad_norm": 1.9670876231350454, + "learning_rate": 4.997474874729361e-06, + "loss": 0.7981, + "step": 4956 + }, + { + "epoch": 0.52, + "grad_norm": 1.7891379226397384, + "learning_rate": 4.995791458200309e-06, + "loss": 0.5285, + "step": 4957 + }, + { + "epoch": 0.52, + "grad_norm": 2.176966656664999, + "learning_rate": 4.994108042148318e-06, + "loss": 0.702, + "step": 4958 + }, + { + "epoch": 0.52, + "grad_norm": 1.9299774614223446, + "learning_rate": 4.992424626764216e-06, + "loss": 0.6392, + "step": 4959 + }, + { + "epoch": 0.52, + "grad_norm": 1.699908367772429, + "learning_rate": 4.990741212238825e-06, + "loss": 0.5981, + "step": 4960 + }, + { + "epoch": 0.52, + "grad_norm": 1.6255346707577691, + "learning_rate": 4.98905779876297e-06, + "loss": 0.5514, + "step": 4961 + }, + { + "epoch": 0.52, + "grad_norm": 1.8368275633671942, + "learning_rate": 4.987374386527478e-06, + "loss": 0.6648, + "step": 4962 + }, + { + "epoch": 0.52, + "grad_norm": 1.8212502460777635, + "learning_rate": 4.985690975723168e-06, + "loss": 0.6234, + "step": 4963 + }, + { + "epoch": 0.52, + "grad_norm": 1.5672021052986176, + "learning_rate": 4.984007566540869e-06, + "loss": 0.5612, + "step": 4964 + }, + { + "epoch": 0.52, + "grad_norm": 1.8762480041490517, + "learning_rate": 4.982324159171404e-06, + "loss": 0.6543, + "step": 4965 + }, + { + "epoch": 0.52, + "grad_norm": 1.867185684724656, + "learning_rate": 4.980640753805595e-06, + "loss": 0.6743, + "step": 4966 + }, + { + "epoch": 0.52, + "grad_norm": 1.979608168978776, + "learning_rate": 4.978957350634267e-06, + "loss": 0.6139, + "step": 4967 + }, + { + "epoch": 0.52, + "grad_norm": 2.0889995333117413, + "learning_rate": 4.977273949848244e-06, + "loss": 0.6246, + "step": 4968 + }, + { + "epoch": 0.52, + "grad_norm": 2.0304842602058533, + "learning_rate": 4.975590551638348e-06, + "loss": 0.6666, + "step": 4969 + }, + { + "epoch": 0.52, + "grad_norm": 2.0355864818291303, + "learning_rate": 4.973907156195405e-06, + "loss": 0.637, + "step": 4970 + }, + { + "epoch": 0.52, + "grad_norm": 1.8969754159542844, + "learning_rate": 4.972223763710231e-06, + "loss": 0.6201, + "step": 4971 + }, + { + "epoch": 0.52, + "grad_norm": 2.2601088567596532, + "learning_rate": 4.970540374373653e-06, + "loss": 0.6495, + "step": 4972 + }, + { + "epoch": 0.52, + "grad_norm": 1.86528365828505, + "learning_rate": 4.96885698837649e-06, + "loss": 0.5782, + "step": 4973 + }, + { + "epoch": 0.52, + "grad_norm": 1.9862084426505078, + "learning_rate": 4.967173605909566e-06, + "loss": 0.6409, + "step": 4974 + }, + { + "epoch": 0.52, + "grad_norm": 1.7833941900740438, + "learning_rate": 4.9654902271637005e-06, + "loss": 0.7328, + "step": 4975 + }, + { + "epoch": 0.52, + "grad_norm": 1.8010266943749187, + "learning_rate": 4.963806852329715e-06, + "loss": 0.6425, + "step": 4976 + }, + { + "epoch": 0.52, + "grad_norm": 1.9009609692301752, + "learning_rate": 4.962123481598431e-06, + "loss": 0.6324, + "step": 4977 + }, + { + "epoch": 0.52, + "grad_norm": 1.8052576908809401, + "learning_rate": 4.9604401151606626e-06, + "loss": 0.6518, + "step": 4978 + }, + { + "epoch": 0.52, + "grad_norm": 1.868428200480606, + "learning_rate": 4.958756753207234e-06, + "loss": 0.5987, + "step": 4979 + }, + { + "epoch": 0.52, + "grad_norm": 1.9367859434196406, + "learning_rate": 4.957073395928963e-06, + "loss": 0.7087, + "step": 4980 + }, + { + "epoch": 0.52, + "grad_norm": 1.7339870392195045, + "learning_rate": 4.955390043516666e-06, + "loss": 0.5802, + "step": 4981 + }, + { + "epoch": 0.52, + "grad_norm": 2.0035904262119333, + "learning_rate": 4.953706696161163e-06, + "loss": 0.7026, + "step": 4982 + }, + { + "epoch": 0.52, + "grad_norm": 1.961898238315474, + "learning_rate": 4.952023354053269e-06, + "loss": 0.6831, + "step": 4983 + }, + { + "epoch": 0.52, + "grad_norm": 1.8107631912130409, + "learning_rate": 4.950340017383802e-06, + "loss": 0.6986, + "step": 4984 + }, + { + "epoch": 0.52, + "grad_norm": 1.7360843176805136, + "learning_rate": 4.948656686343577e-06, + "loss": 0.7014, + "step": 4985 + }, + { + "epoch": 0.52, + "grad_norm": 1.880010839932374, + "learning_rate": 4.946973361123411e-06, + "loss": 0.6949, + "step": 4986 + }, + { + "epoch": 0.52, + "grad_norm": 2.0900365683376148, + "learning_rate": 4.945290041914114e-06, + "loss": 0.7737, + "step": 4987 + }, + { + "epoch": 0.52, + "grad_norm": 1.7786822013502936, + "learning_rate": 4.943606728906503e-06, + "loss": 0.5823, + "step": 4988 + }, + { + "epoch": 0.52, + "grad_norm": 2.00799604136966, + "learning_rate": 4.941923422291392e-06, + "loss": 0.6422, + "step": 4989 + }, + { + "epoch": 0.52, + "grad_norm": 2.0586128906008607, + "learning_rate": 4.94024012225959e-06, + "loss": 0.7, + "step": 4990 + }, + { + "epoch": 0.52, + "grad_norm": 1.7841514369236677, + "learning_rate": 4.938556829001912e-06, + "loss": 0.5756, + "step": 4991 + }, + { + "epoch": 0.52, + "grad_norm": 2.306439783095156, + "learning_rate": 4.936873542709168e-06, + "loss": 0.5948, + "step": 4992 + }, + { + "epoch": 0.52, + "grad_norm": 2.615959858500367, + "learning_rate": 4.935190263572168e-06, + "loss": 0.5653, + "step": 4993 + }, + { + "epoch": 0.52, + "grad_norm": 1.7733694335394845, + "learning_rate": 4.93350699178172e-06, + "loss": 0.5998, + "step": 4994 + }, + { + "epoch": 0.52, + "grad_norm": 1.7428928640514691, + "learning_rate": 4.931823727528634e-06, + "loss": 0.6163, + "step": 4995 + }, + { + "epoch": 0.52, + "grad_norm": 1.8644342122695163, + "learning_rate": 4.930140471003716e-06, + "loss": 0.7548, + "step": 4996 + }, + { + "epoch": 0.52, + "grad_norm": 2.0175758276054747, + "learning_rate": 4.9284572223977755e-06, + "loss": 0.6279, + "step": 4997 + }, + { + "epoch": 0.52, + "grad_norm": 2.0749950976688205, + "learning_rate": 4.926773981901616e-06, + "loss": 0.6444, + "step": 4998 + }, + { + "epoch": 0.52, + "grad_norm": 2.124837336083513, + "learning_rate": 4.925090749706045e-06, + "loss": 0.719, + "step": 4999 + }, + { + "epoch": 0.52, + "grad_norm": 1.993800913838882, + "learning_rate": 4.9234075260018615e-06, + "loss": 0.6828, + "step": 5000 + }, + { + "epoch": 0.52, + "grad_norm": 1.8242495509149763, + "learning_rate": 4.921724310979872e-06, + "loss": 0.6616, + "step": 5001 + }, + { + "epoch": 0.52, + "grad_norm": 1.8393977819228315, + "learning_rate": 4.920041104830879e-06, + "loss": 0.5354, + "step": 5002 + }, + { + "epoch": 0.52, + "grad_norm": 1.890344553733854, + "learning_rate": 4.918357907745681e-06, + "loss": 0.5357, + "step": 5003 + }, + { + "epoch": 0.52, + "grad_norm": 1.8790617991665943, + "learning_rate": 4.91667471991508e-06, + "loss": 0.5759, + "step": 5004 + }, + { + "epoch": 0.52, + "grad_norm": 1.8795858588418088, + "learning_rate": 4.914991541529875e-06, + "loss": 0.6199, + "step": 5005 + }, + { + "epoch": 0.52, + "grad_norm": 1.7329455799381723, + "learning_rate": 4.913308372780863e-06, + "loss": 0.6488, + "step": 5006 + }, + { + "epoch": 0.52, + "grad_norm": 1.9222191221821798, + "learning_rate": 4.9116252138588435e-06, + "loss": 0.6239, + "step": 5007 + }, + { + "epoch": 0.52, + "grad_norm": 1.8479397466575056, + "learning_rate": 4.909942064954607e-06, + "loss": 0.6731, + "step": 5008 + }, + { + "epoch": 0.52, + "grad_norm": 1.9196852172679921, + "learning_rate": 4.908258926258951e-06, + "loss": 0.607, + "step": 5009 + }, + { + "epoch": 0.52, + "grad_norm": 1.8166403880883872, + "learning_rate": 4.906575797962669e-06, + "loss": 0.6431, + "step": 5010 + }, + { + "epoch": 0.52, + "grad_norm": 1.7465182013878113, + "learning_rate": 4.904892680256553e-06, + "loss": 0.6119, + "step": 5011 + }, + { + "epoch": 0.52, + "grad_norm": 1.7704646382888727, + "learning_rate": 4.903209573331393e-06, + "loss": 0.6068, + "step": 5012 + }, + { + "epoch": 0.52, + "grad_norm": 1.9521348314630844, + "learning_rate": 4.90152647737798e-06, + "loss": 0.5722, + "step": 5013 + }, + { + "epoch": 0.52, + "grad_norm": 1.8961939935911354, + "learning_rate": 4.899843392587104e-06, + "loss": 0.6284, + "step": 5014 + }, + { + "epoch": 0.52, + "grad_norm": 2.06014596993545, + "learning_rate": 4.8981603191495484e-06, + "loss": 0.6333, + "step": 5015 + }, + { + "epoch": 0.52, + "grad_norm": 1.8220721139660931, + "learning_rate": 4.8964772572561e-06, + "loss": 0.5629, + "step": 5016 + }, + { + "epoch": 0.52, + "grad_norm": 1.9131734154680402, + "learning_rate": 4.894794207097546e-06, + "loss": 0.6336, + "step": 5017 + }, + { + "epoch": 0.52, + "grad_norm": 2.0006845737302243, + "learning_rate": 4.893111168864668e-06, + "loss": 0.6024, + "step": 5018 + }, + { + "epoch": 0.52, + "grad_norm": 1.835189048852703, + "learning_rate": 4.891428142748247e-06, + "loss": 0.5964, + "step": 5019 + }, + { + "epoch": 0.52, + "grad_norm": 1.9859963215195038, + "learning_rate": 4.889745128939067e-06, + "loss": 0.7307, + "step": 5020 + }, + { + "epoch": 0.52, + "grad_norm": 2.0406766351110996, + "learning_rate": 4.888062127627904e-06, + "loss": 0.6828, + "step": 5021 + }, + { + "epoch": 0.52, + "grad_norm": 1.9519571650177585, + "learning_rate": 4.886379139005537e-06, + "loss": 0.6814, + "step": 5022 + }, + { + "epoch": 0.52, + "grad_norm": 1.8257334568373909, + "learning_rate": 4.884696163262742e-06, + "loss": 0.5058, + "step": 5023 + }, + { + "epoch": 0.52, + "grad_norm": 1.7128904695532874, + "learning_rate": 4.883013200590294e-06, + "loss": 0.5868, + "step": 5024 + }, + { + "epoch": 0.52, + "grad_norm": 1.7085841774434567, + "learning_rate": 4.881330251178968e-06, + "loss": 0.5638, + "step": 5025 + }, + { + "epoch": 0.52, + "grad_norm": 1.908140564370563, + "learning_rate": 4.879647315219533e-06, + "loss": 0.5806, + "step": 5026 + }, + { + "epoch": 0.52, + "grad_norm": 1.8237970784787472, + "learning_rate": 4.87796439290276e-06, + "loss": 0.6137, + "step": 5027 + }, + { + "epoch": 0.52, + "grad_norm": 1.884695749390379, + "learning_rate": 4.87628148441942e-06, + "loss": 0.6748, + "step": 5028 + }, + { + "epoch": 0.52, + "grad_norm": 1.8186368146388783, + "learning_rate": 4.874598589960279e-06, + "loss": 0.695, + "step": 5029 + }, + { + "epoch": 0.52, + "grad_norm": 1.7967346368568278, + "learning_rate": 4.8729157097161025e-06, + "loss": 0.6156, + "step": 5030 + }, + { + "epoch": 0.52, + "grad_norm": 1.786633454376535, + "learning_rate": 4.871232843877654e-06, + "loss": 0.562, + "step": 5031 + }, + { + "epoch": 0.52, + "grad_norm": 1.9850275764992054, + "learning_rate": 4.869549992635697e-06, + "loss": 0.5847, + "step": 5032 + }, + { + "epoch": 0.52, + "grad_norm": 1.8984219117890422, + "learning_rate": 4.867867156180992e-06, + "loss": 0.622, + "step": 5033 + }, + { + "epoch": 0.52, + "grad_norm": 1.7075309788258288, + "learning_rate": 4.866184334704297e-06, + "loss": 0.5809, + "step": 5034 + }, + { + "epoch": 0.52, + "grad_norm": 1.8264943957080289, + "learning_rate": 4.864501528396371e-06, + "loss": 0.6548, + "step": 5035 + }, + { + "epoch": 0.52, + "grad_norm": 2.025846349325443, + "learning_rate": 4.862818737447971e-06, + "loss": 0.6318, + "step": 5036 + }, + { + "epoch": 0.52, + "grad_norm": 1.7219540198512069, + "learning_rate": 4.861135962049847e-06, + "loss": 0.6695, + "step": 5037 + }, + { + "epoch": 0.52, + "grad_norm": 2.077354289814275, + "learning_rate": 4.859453202392753e-06, + "loss": 0.6169, + "step": 5038 + }, + { + "epoch": 0.52, + "grad_norm": 1.9060446882946842, + "learning_rate": 4.8577704586674405e-06, + "loss": 0.5871, + "step": 5039 + }, + { + "epoch": 0.52, + "grad_norm": 2.2842872317930714, + "learning_rate": 4.856087731064656e-06, + "loss": 0.6755, + "step": 5040 + }, + { + "epoch": 0.52, + "grad_norm": 1.8767079908441326, + "learning_rate": 4.854405019775148e-06, + "loss": 0.6126, + "step": 5041 + }, + { + "epoch": 0.52, + "grad_norm": 1.7312471324735106, + "learning_rate": 4.852722324989661e-06, + "loss": 0.6224, + "step": 5042 + }, + { + "epoch": 0.52, + "grad_norm": 1.9361134166550507, + "learning_rate": 4.851039646898938e-06, + "loss": 0.6556, + "step": 5043 + }, + { + "epoch": 0.52, + "grad_norm": 1.7027530285331263, + "learning_rate": 4.8493569856937215e-06, + "loss": 0.6058, + "step": 5044 + }, + { + "epoch": 0.52, + "grad_norm": 2.1245655764901317, + "learning_rate": 4.847674341564748e-06, + "loss": 0.7028, + "step": 5045 + }, + { + "epoch": 0.52, + "grad_norm": 1.9143911466692012, + "learning_rate": 4.845991714702755e-06, + "loss": 0.6948, + "step": 5046 + }, + { + "epoch": 0.52, + "grad_norm": 1.941124018073597, + "learning_rate": 4.844309105298481e-06, + "loss": 0.7231, + "step": 5047 + }, + { + "epoch": 0.52, + "grad_norm": 1.9438368169759088, + "learning_rate": 4.842626513542656e-06, + "loss": 0.6155, + "step": 5048 + }, + { + "epoch": 0.52, + "grad_norm": 1.828675457167432, + "learning_rate": 4.840943939626012e-06, + "loss": 0.6513, + "step": 5049 + }, + { + "epoch": 0.52, + "grad_norm": 1.8271797659739548, + "learning_rate": 4.83926138373928e-06, + "loss": 0.5988, + "step": 5050 + }, + { + "epoch": 0.53, + "grad_norm": 3.9452851427531885, + "learning_rate": 4.8375788460731885e-06, + "loss": 0.6387, + "step": 5051 + }, + { + "epoch": 0.53, + "grad_norm": 1.9841860268134885, + "learning_rate": 4.8358963268184585e-06, + "loss": 0.6832, + "step": 5052 + }, + { + "epoch": 0.53, + "grad_norm": 1.8792424593951718, + "learning_rate": 4.8342138261658145e-06, + "loss": 0.5336, + "step": 5053 + }, + { + "epoch": 0.53, + "grad_norm": 1.9310634981620276, + "learning_rate": 4.832531344305979e-06, + "loss": 0.6749, + "step": 5054 + }, + { + "epoch": 0.53, + "grad_norm": 2.02272191555057, + "learning_rate": 4.8308488814296695e-06, + "loss": 0.7115, + "step": 5055 + }, + { + "epoch": 0.53, + "grad_norm": 1.7549744426219025, + "learning_rate": 4.829166437727603e-06, + "loss": 0.6443, + "step": 5056 + }, + { + "epoch": 0.53, + "grad_norm": 2.4305012850264225, + "learning_rate": 4.827484013390496e-06, + "loss": 0.5472, + "step": 5057 + }, + { + "epoch": 0.53, + "grad_norm": 2.1868137796135327, + "learning_rate": 4.825801608609059e-06, + "loss": 0.6223, + "step": 5058 + }, + { + "epoch": 0.53, + "grad_norm": 1.853626000392213, + "learning_rate": 4.824119223574002e-06, + "loss": 0.6385, + "step": 5059 + }, + { + "epoch": 0.53, + "grad_norm": 1.8454461648434337, + "learning_rate": 4.8224368584760345e-06, + "loss": 0.6323, + "step": 5060 + }, + { + "epoch": 0.53, + "grad_norm": 1.735519333587952, + "learning_rate": 4.82075451350586e-06, + "loss": 0.6135, + "step": 5061 + }, + { + "epoch": 0.53, + "grad_norm": 1.858344450733772, + "learning_rate": 4.819072188854183e-06, + "loss": 0.535, + "step": 5062 + }, + { + "epoch": 0.53, + "grad_norm": 2.0553850757925702, + "learning_rate": 4.817389884711706e-06, + "loss": 0.652, + "step": 5063 + }, + { + "epoch": 0.53, + "grad_norm": 1.9772353357592571, + "learning_rate": 4.815707601269126e-06, + "loss": 0.662, + "step": 5064 + }, + { + "epoch": 0.53, + "grad_norm": 1.7625156674615081, + "learning_rate": 4.814025338717139e-06, + "loss": 0.6364, + "step": 5065 + }, + { + "epoch": 0.53, + "grad_norm": 1.9316812486984454, + "learning_rate": 4.812343097246442e-06, + "loss": 0.6193, + "step": 5066 + }, + { + "epoch": 0.53, + "grad_norm": 1.4902859699492574, + "learning_rate": 4.8106608770477225e-06, + "loss": 0.4888, + "step": 5067 + }, + { + "epoch": 0.53, + "grad_norm": 2.191617491974796, + "learning_rate": 4.808978678311672e-06, + "loss": 0.5908, + "step": 5068 + }, + { + "epoch": 0.53, + "grad_norm": 1.9555631768420776, + "learning_rate": 4.807296501228977e-06, + "loss": 0.6095, + "step": 5069 + }, + { + "epoch": 0.53, + "grad_norm": 1.9063575635693246, + "learning_rate": 4.805614345990322e-06, + "loss": 0.6828, + "step": 5070 + }, + { + "epoch": 0.53, + "grad_norm": 1.7436840319728293, + "learning_rate": 4.803932212786389e-06, + "loss": 0.7226, + "step": 5071 + }, + { + "epoch": 0.53, + "grad_norm": 2.2328858534263847, + "learning_rate": 4.802250101807857e-06, + "loss": 0.6088, + "step": 5072 + }, + { + "epoch": 0.53, + "grad_norm": 1.6911436875533297, + "learning_rate": 4.800568013245405e-06, + "loss": 0.6307, + "step": 5073 + }, + { + "epoch": 0.53, + "grad_norm": 1.9214187162207514, + "learning_rate": 4.798885947289705e-06, + "loss": 0.6821, + "step": 5074 + }, + { + "epoch": 0.53, + "grad_norm": 1.7311494622263512, + "learning_rate": 4.797203904131427e-06, + "loss": 0.6244, + "step": 5075 + }, + { + "epoch": 0.53, + "grad_norm": 2.00546387408505, + "learning_rate": 4.795521883961243e-06, + "loss": 0.6301, + "step": 5076 + }, + { + "epoch": 0.53, + "grad_norm": 2.7067685876166543, + "learning_rate": 4.793839886969819e-06, + "loss": 0.6525, + "step": 5077 + }, + { + "epoch": 0.53, + "grad_norm": 1.9479052680870534, + "learning_rate": 4.792157913347819e-06, + "loss": 0.6248, + "step": 5078 + }, + { + "epoch": 0.53, + "grad_norm": 1.9878951234618571, + "learning_rate": 4.790475963285904e-06, + "loss": 0.6752, + "step": 5079 + }, + { + "epoch": 0.53, + "grad_norm": 1.8307605231277921, + "learning_rate": 4.788794036974733e-06, + "loss": 0.6728, + "step": 5080 + }, + { + "epoch": 0.53, + "grad_norm": 2.007850691507178, + "learning_rate": 4.787112134604964e-06, + "loss": 0.6615, + "step": 5081 + }, + { + "epoch": 0.53, + "grad_norm": 1.9682209210971686, + "learning_rate": 4.785430256367246e-06, + "loss": 0.6212, + "step": 5082 + }, + { + "epoch": 0.53, + "grad_norm": 1.9539068327837195, + "learning_rate": 4.783748402452231e-06, + "loss": 0.6382, + "step": 5083 + }, + { + "epoch": 0.53, + "grad_norm": 1.8254711972644915, + "learning_rate": 4.782066573050567e-06, + "loss": 0.5879, + "step": 5084 + }, + { + "epoch": 0.53, + "grad_norm": 1.9498763509941295, + "learning_rate": 4.7803847683529e-06, + "loss": 0.6181, + "step": 5085 + }, + { + "epoch": 0.53, + "grad_norm": 1.8874003778516166, + "learning_rate": 4.77870298854987e-06, + "loss": 0.6335, + "step": 5086 + }, + { + "epoch": 0.53, + "grad_norm": 1.8312048896987314, + "learning_rate": 4.777021233832119e-06, + "loss": 0.5927, + "step": 5087 + }, + { + "epoch": 0.53, + "grad_norm": 1.9016075310030311, + "learning_rate": 4.775339504390283e-06, + "loss": 0.6854, + "step": 5088 + }, + { + "epoch": 0.53, + "grad_norm": 2.0648643513408897, + "learning_rate": 4.7736578004149936e-06, + "loss": 0.585, + "step": 5089 + }, + { + "epoch": 0.53, + "grad_norm": 1.7637557948369473, + "learning_rate": 4.771976122096882e-06, + "loss": 0.638, + "step": 5090 + }, + { + "epoch": 0.53, + "grad_norm": 2.2250046913316366, + "learning_rate": 4.7702944696265766e-06, + "loss": 0.6797, + "step": 5091 + }, + { + "epoch": 0.53, + "grad_norm": 1.8031799845090113, + "learning_rate": 4.768612843194703e-06, + "loss": 0.6511, + "step": 5092 + }, + { + "epoch": 0.53, + "grad_norm": 1.8677023618333652, + "learning_rate": 4.766931242991882e-06, + "loss": 0.555, + "step": 5093 + }, + { + "epoch": 0.53, + "grad_norm": 1.9985670391581052, + "learning_rate": 4.765249669208733e-06, + "loss": 0.6718, + "step": 5094 + }, + { + "epoch": 0.53, + "grad_norm": 1.7495206295185197, + "learning_rate": 4.763568122035873e-06, + "loss": 0.7245, + "step": 5095 + }, + { + "epoch": 0.53, + "grad_norm": 1.8561316308119256, + "learning_rate": 4.761886601663913e-06, + "loss": 0.7281, + "step": 5096 + }, + { + "epoch": 0.53, + "grad_norm": 1.8464447577024359, + "learning_rate": 4.760205108283465e-06, + "loss": 0.7014, + "step": 5097 + }, + { + "epoch": 0.53, + "grad_norm": 1.912072520084057, + "learning_rate": 4.758523642085133e-06, + "loss": 0.683, + "step": 5098 + }, + { + "epoch": 0.53, + "grad_norm": 1.9941678964102907, + "learning_rate": 4.756842203259523e-06, + "loss": 0.5755, + "step": 5099 + }, + { + "epoch": 0.53, + "grad_norm": 1.951575529813139, + "learning_rate": 4.755160791997235e-06, + "loss": 0.7646, + "step": 5100 + }, + { + "epoch": 0.53, + "grad_norm": 1.8620977956553646, + "learning_rate": 4.753479408488868e-06, + "loss": 0.6415, + "step": 5101 + }, + { + "epoch": 0.53, + "grad_norm": 1.9029340613448509, + "learning_rate": 4.751798052925015e-06, + "loss": 0.6514, + "step": 5102 + }, + { + "epoch": 0.53, + "grad_norm": 1.8930213798910867, + "learning_rate": 4.750116725496268e-06, + "loss": 0.6322, + "step": 5103 + }, + { + "epoch": 0.53, + "grad_norm": 1.804024019566647, + "learning_rate": 4.7484354263932136e-06, + "loss": 0.6197, + "step": 5104 + }, + { + "epoch": 0.53, + "grad_norm": 1.7759620876156021, + "learning_rate": 4.746754155806437e-06, + "loss": 0.5931, + "step": 5105 + }, + { + "epoch": 0.53, + "grad_norm": 1.728832946061556, + "learning_rate": 4.74507291392652e-06, + "loss": 0.5501, + "step": 5106 + }, + { + "epoch": 0.53, + "grad_norm": 2.116357919100813, + "learning_rate": 4.743391700944042e-06, + "loss": 0.6326, + "step": 5107 + }, + { + "epoch": 0.53, + "grad_norm": 1.841309504995033, + "learning_rate": 4.7417105170495775e-06, + "loss": 0.6341, + "step": 5108 + }, + { + "epoch": 0.53, + "grad_norm": 1.9423787604865643, + "learning_rate": 4.740029362433698e-06, + "loss": 0.6879, + "step": 5109 + }, + { + "epoch": 0.53, + "grad_norm": 1.9253807388055482, + "learning_rate": 4.7383482372869735e-06, + "loss": 0.5657, + "step": 5110 + }, + { + "epoch": 0.53, + "grad_norm": 1.9821619032453928, + "learning_rate": 4.736667141799965e-06, + "loss": 0.7289, + "step": 5111 + }, + { + "epoch": 0.53, + "grad_norm": 1.908684744520375, + "learning_rate": 4.734986076163238e-06, + "loss": 0.7003, + "step": 5112 + }, + { + "epoch": 0.53, + "grad_norm": 1.9596783586093547, + "learning_rate": 4.73330504056735e-06, + "loss": 0.6458, + "step": 5113 + }, + { + "epoch": 0.53, + "grad_norm": 1.9224228062346809, + "learning_rate": 4.731624035202856e-06, + "loss": 0.7468, + "step": 5114 + }, + { + "epoch": 0.53, + "grad_norm": 1.8680844649352413, + "learning_rate": 4.729943060260306e-06, + "loss": 0.6674, + "step": 5115 + }, + { + "epoch": 0.53, + "grad_norm": 2.024935667135311, + "learning_rate": 4.72826211593025e-06, + "loss": 0.7308, + "step": 5116 + }, + { + "epoch": 0.53, + "grad_norm": 1.8095294446492074, + "learning_rate": 4.726581202403232e-06, + "loss": 0.7611, + "step": 5117 + }, + { + "epoch": 0.53, + "grad_norm": 2.0807128692653345, + "learning_rate": 4.7249003198697955e-06, + "loss": 0.6249, + "step": 5118 + }, + { + "epoch": 0.53, + "grad_norm": 2.0181745603197236, + "learning_rate": 4.723219468520474e-06, + "loss": 0.6167, + "step": 5119 + }, + { + "epoch": 0.53, + "grad_norm": 1.7391558203953414, + "learning_rate": 4.721538648545802e-06, + "loss": 0.6373, + "step": 5120 + }, + { + "epoch": 0.53, + "grad_norm": 1.7577092949947608, + "learning_rate": 4.719857860136312e-06, + "loss": 0.6041, + "step": 5121 + }, + { + "epoch": 0.53, + "grad_norm": 1.9735884394796157, + "learning_rate": 4.7181771034825306e-06, + "loss": 0.6166, + "step": 5122 + }, + { + "epoch": 0.53, + "grad_norm": 1.9220027882098172, + "learning_rate": 4.716496378774981e-06, + "loss": 0.524, + "step": 5123 + }, + { + "epoch": 0.53, + "grad_norm": 1.9092831897320568, + "learning_rate": 4.714815686204182e-06, + "loss": 0.5647, + "step": 5124 + }, + { + "epoch": 0.53, + "grad_norm": 1.9953171291519414, + "learning_rate": 4.713135025960652e-06, + "loss": 0.7217, + "step": 5125 + }, + { + "epoch": 0.53, + "grad_norm": 1.6846329284575117, + "learning_rate": 4.711454398234902e-06, + "loss": 0.5023, + "step": 5126 + }, + { + "epoch": 0.53, + "grad_norm": 1.8337482270315908, + "learning_rate": 4.7097738032174394e-06, + "loss": 0.6636, + "step": 5127 + }, + { + "epoch": 0.53, + "grad_norm": 1.7405246564961698, + "learning_rate": 4.708093241098771e-06, + "loss": 0.6347, + "step": 5128 + }, + { + "epoch": 0.53, + "grad_norm": 1.7954045338519318, + "learning_rate": 4.7064127120693965e-06, + "loss": 0.6376, + "step": 5129 + }, + { + "epoch": 0.53, + "grad_norm": 1.8846517693769833, + "learning_rate": 4.704732216319815e-06, + "loss": 0.636, + "step": 5130 + }, + { + "epoch": 0.53, + "grad_norm": 1.8830831159833605, + "learning_rate": 4.7030517540405195e-06, + "loss": 0.6288, + "step": 5131 + }, + { + "epoch": 0.53, + "grad_norm": 1.8156056135854486, + "learning_rate": 4.701371325422002e-06, + "loss": 0.6129, + "step": 5132 + }, + { + "epoch": 0.53, + "grad_norm": 2.0101280335971166, + "learning_rate": 4.6996909306547455e-06, + "loss": 0.7395, + "step": 5133 + }, + { + "epoch": 0.53, + "grad_norm": 2.321181377226723, + "learning_rate": 4.6980105699292335e-06, + "loss": 0.6699, + "step": 5134 + }, + { + "epoch": 0.53, + "grad_norm": 2.226576509137887, + "learning_rate": 4.696330243435945e-06, + "loss": 0.687, + "step": 5135 + }, + { + "epoch": 0.53, + "grad_norm": 1.9204092632555354, + "learning_rate": 4.694649951365354e-06, + "loss": 0.5934, + "step": 5136 + }, + { + "epoch": 0.53, + "grad_norm": 2.0429760454559283, + "learning_rate": 4.692969693907931e-06, + "loss": 0.6716, + "step": 5137 + }, + { + "epoch": 0.53, + "grad_norm": 1.9277314108498504, + "learning_rate": 4.691289471254144e-06, + "loss": 0.6667, + "step": 5138 + }, + { + "epoch": 0.53, + "grad_norm": 1.988347172280978, + "learning_rate": 4.689609283594454e-06, + "loss": 0.6952, + "step": 5139 + }, + { + "epoch": 0.53, + "grad_norm": 1.952429859380367, + "learning_rate": 4.6879291311193244e-06, + "loss": 0.579, + "step": 5140 + }, + { + "epoch": 0.53, + "grad_norm": 1.804754920642748, + "learning_rate": 4.686249014019203e-06, + "loss": 0.6225, + "step": 5141 + }, + { + "epoch": 0.53, + "grad_norm": 1.989089474261446, + "learning_rate": 4.684568932484546e-06, + "loss": 0.6693, + "step": 5142 + }, + { + "epoch": 0.53, + "grad_norm": 1.983466565946788, + "learning_rate": 4.682888886705797e-06, + "loss": 0.6234, + "step": 5143 + }, + { + "epoch": 0.53, + "grad_norm": 1.7356926775014248, + "learning_rate": 4.6812088768734e-06, + "loss": 0.5841, + "step": 5144 + }, + { + "epoch": 0.53, + "grad_norm": 1.9370988809301124, + "learning_rate": 4.679528903177795e-06, + "loss": 0.6462, + "step": 5145 + }, + { + "epoch": 0.53, + "grad_norm": 1.9192368651984106, + "learning_rate": 4.6778489658094145e-06, + "loss": 0.6877, + "step": 5146 + }, + { + "epoch": 0.54, + "grad_norm": 1.82458653830636, + "learning_rate": 4.676169064958692e-06, + "loss": 0.6435, + "step": 5147 + }, + { + "epoch": 0.54, + "grad_norm": 2.0128828499828373, + "learning_rate": 4.674489200816051e-06, + "loss": 0.6844, + "step": 5148 + }, + { + "epoch": 0.54, + "grad_norm": 2.079968396447985, + "learning_rate": 4.672809373571913e-06, + "loss": 0.712, + "step": 5149 + }, + { + "epoch": 0.54, + "grad_norm": 1.9981225791183472, + "learning_rate": 4.671129583416697e-06, + "loss": 0.6063, + "step": 5150 + }, + { + "epoch": 0.54, + "grad_norm": 1.800390445160367, + "learning_rate": 4.669449830540819e-06, + "loss": 0.6671, + "step": 5151 + }, + { + "epoch": 0.54, + "grad_norm": 1.7464047867362262, + "learning_rate": 4.6677701151346856e-06, + "loss": 0.6108, + "step": 5152 + }, + { + "epoch": 0.54, + "grad_norm": 1.8788397288269676, + "learning_rate": 4.6660904373887035e-06, + "loss": 0.6478, + "step": 5153 + }, + { + "epoch": 0.54, + "grad_norm": 1.8506751756270443, + "learning_rate": 4.664410797493275e-06, + "loss": 0.6116, + "step": 5154 + }, + { + "epoch": 0.54, + "grad_norm": 1.8466483682885082, + "learning_rate": 4.662731195638794e-06, + "loss": 0.5673, + "step": 5155 + }, + { + "epoch": 0.54, + "grad_norm": 1.9893512470431758, + "learning_rate": 4.661051632015655e-06, + "loss": 0.5976, + "step": 5156 + }, + { + "epoch": 0.54, + "grad_norm": 1.8473962326091087, + "learning_rate": 4.659372106814244e-06, + "loss": 0.7179, + "step": 5157 + }, + { + "epoch": 0.54, + "grad_norm": 1.8880905203942298, + "learning_rate": 4.657692620224948e-06, + "loss": 0.6614, + "step": 5158 + }, + { + "epoch": 0.54, + "grad_norm": 1.8210595211398692, + "learning_rate": 4.656013172438143e-06, + "loss": 0.6492, + "step": 5159 + }, + { + "epoch": 0.54, + "grad_norm": 1.8642394980355337, + "learning_rate": 4.654333763644206e-06, + "loss": 0.6072, + "step": 5160 + }, + { + "epoch": 0.54, + "grad_norm": 2.113480454514094, + "learning_rate": 4.652654394033508e-06, + "loss": 0.7173, + "step": 5161 + }, + { + "epoch": 0.54, + "grad_norm": 1.7522034431581206, + "learning_rate": 4.650975063796414e-06, + "loss": 0.6015, + "step": 5162 + }, + { + "epoch": 0.54, + "grad_norm": 2.2933826515210276, + "learning_rate": 4.649295773123285e-06, + "loss": 0.5954, + "step": 5163 + }, + { + "epoch": 0.54, + "grad_norm": 1.5870787892330132, + "learning_rate": 4.6476165222044795e-06, + "loss": 0.6106, + "step": 5164 + }, + { + "epoch": 0.54, + "grad_norm": 1.897198998438171, + "learning_rate": 4.645937311230351e-06, + "loss": 0.581, + "step": 5165 + }, + { + "epoch": 0.54, + "grad_norm": 1.9011534788596396, + "learning_rate": 4.6442581403912444e-06, + "loss": 0.7063, + "step": 5166 + }, + { + "epoch": 0.54, + "grad_norm": 1.8255651882129653, + "learning_rate": 4.6425790098775064e-06, + "loss": 0.7078, + "step": 5167 + }, + { + "epoch": 0.54, + "grad_norm": 2.103934341812746, + "learning_rate": 4.6408999198794744e-06, + "loss": 0.7187, + "step": 5168 + }, + { + "epoch": 0.54, + "grad_norm": 1.8022669695183133, + "learning_rate": 4.639220870587484e-06, + "loss": 0.6677, + "step": 5169 + }, + { + "epoch": 0.54, + "grad_norm": 2.0815425124291265, + "learning_rate": 4.637541862191864e-06, + "loss": 0.7668, + "step": 5170 + }, + { + "epoch": 0.54, + "grad_norm": 1.6168920667058806, + "learning_rate": 4.635862894882939e-06, + "loss": 0.6407, + "step": 5171 + }, + { + "epoch": 0.54, + "grad_norm": 1.8448181111407145, + "learning_rate": 4.634183968851031e-06, + "loss": 0.5558, + "step": 5172 + }, + { + "epoch": 0.54, + "grad_norm": 1.8625072882231783, + "learning_rate": 4.632505084286455e-06, + "loss": 0.5622, + "step": 5173 + }, + { + "epoch": 0.54, + "grad_norm": 2.02673876945062, + "learning_rate": 4.630826241379522e-06, + "loss": 0.6759, + "step": 5174 + }, + { + "epoch": 0.54, + "grad_norm": 2.001843218175361, + "learning_rate": 4.629147440320539e-06, + "loss": 0.7657, + "step": 5175 + }, + { + "epoch": 0.54, + "grad_norm": 1.9718566782218825, + "learning_rate": 4.627468681299808e-06, + "loss": 0.6772, + "step": 5176 + }, + { + "epoch": 0.54, + "grad_norm": 1.8470501784821054, + "learning_rate": 4.625789964507626e-06, + "loss": 0.6576, + "step": 5177 + }, + { + "epoch": 0.54, + "grad_norm": 1.9660622527248055, + "learning_rate": 4.624111290134284e-06, + "loss": 0.7013, + "step": 5178 + }, + { + "epoch": 0.54, + "grad_norm": 1.8012877386792032, + "learning_rate": 4.622432658370069e-06, + "loss": 0.7023, + "step": 5179 + }, + { + "epoch": 0.54, + "grad_norm": 1.8651094026382105, + "learning_rate": 4.620754069405265e-06, + "loss": 0.7387, + "step": 5180 + }, + { + "epoch": 0.54, + "grad_norm": 1.984823336261961, + "learning_rate": 4.619075523430147e-06, + "loss": 0.6433, + "step": 5181 + }, + { + "epoch": 0.54, + "grad_norm": 1.8084018306588718, + "learning_rate": 4.617397020634991e-06, + "loss": 0.4955, + "step": 5182 + }, + { + "epoch": 0.54, + "grad_norm": 1.7058235501308092, + "learning_rate": 4.615718561210063e-06, + "loss": 0.4806, + "step": 5183 + }, + { + "epoch": 0.54, + "grad_norm": 2.135606445952955, + "learning_rate": 4.614040145345629e-06, + "loss": 0.6489, + "step": 5184 + }, + { + "epoch": 0.54, + "grad_norm": 1.9003261832974596, + "learning_rate": 4.612361773231942e-06, + "loss": 0.6782, + "step": 5185 + }, + { + "epoch": 0.54, + "grad_norm": 1.766062587324516, + "learning_rate": 4.610683445059257e-06, + "loss": 0.6328, + "step": 5186 + }, + { + "epoch": 0.54, + "grad_norm": 1.772434598760773, + "learning_rate": 4.609005161017824e-06, + "loss": 0.5695, + "step": 5187 + }, + { + "epoch": 0.54, + "grad_norm": 1.9399750167465266, + "learning_rate": 4.607326921297883e-06, + "loss": 0.7608, + "step": 5188 + }, + { + "epoch": 0.54, + "grad_norm": 1.834185074488284, + "learning_rate": 4.605648726089674e-06, + "loss": 0.6278, + "step": 5189 + }, + { + "epoch": 0.54, + "grad_norm": 1.9981329160306083, + "learning_rate": 4.60397057558343e-06, + "loss": 0.6317, + "step": 5190 + }, + { + "epoch": 0.54, + "grad_norm": 2.075228145121904, + "learning_rate": 4.602292469969381e-06, + "loss": 0.627, + "step": 5191 + }, + { + "epoch": 0.54, + "grad_norm": 1.7500511714057594, + "learning_rate": 4.600614409437745e-06, + "loss": 0.6107, + "step": 5192 + }, + { + "epoch": 0.54, + "grad_norm": 2.0761176947995206, + "learning_rate": 4.5989363941787415e-06, + "loss": 0.6552, + "step": 5193 + }, + { + "epoch": 0.54, + "grad_norm": 1.788070090950307, + "learning_rate": 4.597258424382584e-06, + "loss": 0.5552, + "step": 5194 + }, + { + "epoch": 0.54, + "grad_norm": 1.8921091625857311, + "learning_rate": 4.5955805002394804e-06, + "loss": 0.6449, + "step": 5195 + }, + { + "epoch": 0.54, + "grad_norm": 1.8761192118955454, + "learning_rate": 4.593902621939632e-06, + "loss": 0.609, + "step": 5196 + }, + { + "epoch": 0.54, + "grad_norm": 2.054485578286654, + "learning_rate": 4.592224789673235e-06, + "loss": 0.5932, + "step": 5197 + }, + { + "epoch": 0.54, + "grad_norm": 1.9350066838823927, + "learning_rate": 4.590547003630484e-06, + "loss": 0.691, + "step": 5198 + }, + { + "epoch": 0.54, + "grad_norm": 1.9608958757814123, + "learning_rate": 4.588869264001563e-06, + "loss": 0.5844, + "step": 5199 + }, + { + "epoch": 0.54, + "grad_norm": 1.9773425293593354, + "learning_rate": 4.587191570976655e-06, + "loss": 0.669, + "step": 5200 + }, + { + "epoch": 0.54, + "grad_norm": 1.9251634580282069, + "learning_rate": 4.585513924745935e-06, + "loss": 0.6388, + "step": 5201 + }, + { + "epoch": 0.54, + "grad_norm": 1.7757867948720212, + "learning_rate": 4.583836325499573e-06, + "loss": 0.5827, + "step": 5202 + }, + { + "epoch": 0.54, + "grad_norm": 2.085264127591141, + "learning_rate": 4.5821587734277374e-06, + "loss": 0.532, + "step": 5203 + }, + { + "epoch": 0.54, + "grad_norm": 1.75056064511947, + "learning_rate": 4.580481268720585e-06, + "loss": 0.538, + "step": 5204 + }, + { + "epoch": 0.54, + "grad_norm": 1.8790372630466574, + "learning_rate": 4.578803811568272e-06, + "loss": 0.5644, + "step": 5205 + }, + { + "epoch": 0.54, + "grad_norm": 1.9390235343364985, + "learning_rate": 4.5771264021609494e-06, + "loss": 0.5906, + "step": 5206 + }, + { + "epoch": 0.54, + "grad_norm": 1.924353396395898, + "learning_rate": 4.575449040688758e-06, + "loss": 0.5973, + "step": 5207 + }, + { + "epoch": 0.54, + "grad_norm": 1.7020159742103511, + "learning_rate": 4.573771727341838e-06, + "loss": 0.5145, + "step": 5208 + }, + { + "epoch": 0.54, + "grad_norm": 1.7708593482880979, + "learning_rate": 4.572094462310322e-06, + "loss": 0.6289, + "step": 5209 + }, + { + "epoch": 0.54, + "grad_norm": 2.166457630399841, + "learning_rate": 4.570417245784337e-06, + "loss": 0.6675, + "step": 5210 + }, + { + "epoch": 0.54, + "grad_norm": 1.877828575644666, + "learning_rate": 4.568740077954007e-06, + "loss": 0.6011, + "step": 5211 + }, + { + "epoch": 0.54, + "grad_norm": 1.911475306416678, + "learning_rate": 4.567062959009446e-06, + "loss": 0.569, + "step": 5212 + }, + { + "epoch": 0.54, + "grad_norm": 1.8913396335881767, + "learning_rate": 4.565385889140767e-06, + "loss": 0.6125, + "step": 5213 + }, + { + "epoch": 0.54, + "grad_norm": 2.08561786360724, + "learning_rate": 4.563708868538077e-06, + "loss": 0.7155, + "step": 5214 + }, + { + "epoch": 0.54, + "grad_norm": 2.154811342394879, + "learning_rate": 4.56203189739147e-06, + "loss": 0.5944, + "step": 5215 + }, + { + "epoch": 0.54, + "grad_norm": 2.0035311683983323, + "learning_rate": 4.560354975891045e-06, + "loss": 0.6358, + "step": 5216 + }, + { + "epoch": 0.54, + "grad_norm": 1.9237950319146873, + "learning_rate": 4.55867810422689e-06, + "loss": 0.612, + "step": 5217 + }, + { + "epoch": 0.54, + "grad_norm": 1.891746378991884, + "learning_rate": 4.557001282589086e-06, + "loss": 0.6303, + "step": 5218 + }, + { + "epoch": 0.54, + "grad_norm": 1.7132159636798743, + "learning_rate": 4.5553245111677124e-06, + "loss": 0.5618, + "step": 5219 + }, + { + "epoch": 0.54, + "grad_norm": 1.9582069007610816, + "learning_rate": 4.55364779015284e-06, + "loss": 0.7596, + "step": 5220 + }, + { + "epoch": 0.54, + "grad_norm": 1.8416862170002277, + "learning_rate": 4.5519711197345365e-06, + "loss": 0.6514, + "step": 5221 + }, + { + "epoch": 0.54, + "grad_norm": 1.7109777078724329, + "learning_rate": 4.550294500102858e-06, + "loss": 0.6504, + "step": 5222 + }, + { + "epoch": 0.54, + "grad_norm": 1.7965413196363942, + "learning_rate": 4.548617931447861e-06, + "loss": 0.6029, + "step": 5223 + }, + { + "epoch": 0.54, + "grad_norm": 1.7730256764319179, + "learning_rate": 4.546941413959595e-06, + "loss": 0.6525, + "step": 5224 + }, + { + "epoch": 0.54, + "grad_norm": 2.003273238543562, + "learning_rate": 4.545264947828101e-06, + "loss": 0.6805, + "step": 5225 + }, + { + "epoch": 0.54, + "grad_norm": 2.010674257758387, + "learning_rate": 4.5435885332434184e-06, + "loss": 0.5269, + "step": 5226 + }, + { + "epoch": 0.54, + "grad_norm": 1.8445652402675543, + "learning_rate": 4.541912170395576e-06, + "loss": 0.6639, + "step": 5227 + }, + { + "epoch": 0.54, + "grad_norm": 1.937635194164857, + "learning_rate": 4.540235859474603e-06, + "loss": 0.5969, + "step": 5228 + }, + { + "epoch": 0.54, + "grad_norm": 1.7232427319669361, + "learning_rate": 4.538559600670513e-06, + "loss": 0.575, + "step": 5229 + }, + { + "epoch": 0.54, + "grad_norm": 1.9439853705055208, + "learning_rate": 4.5368833941733235e-06, + "loss": 0.5975, + "step": 5230 + }, + { + "epoch": 0.54, + "grad_norm": 1.8694910106701905, + "learning_rate": 4.53520724017304e-06, + "loss": 0.5624, + "step": 5231 + }, + { + "epoch": 0.54, + "grad_norm": 2.012545765815874, + "learning_rate": 4.533531138859665e-06, + "loss": 0.5813, + "step": 5232 + }, + { + "epoch": 0.54, + "grad_norm": 1.8435415628000271, + "learning_rate": 4.531855090423195e-06, + "loss": 0.606, + "step": 5233 + }, + { + "epoch": 0.54, + "grad_norm": 1.9556583134206118, + "learning_rate": 4.5301790950536175e-06, + "loss": 0.6116, + "step": 5234 + }, + { + "epoch": 0.54, + "grad_norm": 2.0787950978231047, + "learning_rate": 4.5285031529409184e-06, + "loss": 0.6888, + "step": 5235 + }, + { + "epoch": 0.54, + "grad_norm": 1.853465607687972, + "learning_rate": 4.526827264275076e-06, + "loss": 0.5916, + "step": 5236 + }, + { + "epoch": 0.54, + "grad_norm": 1.7968352682356779, + "learning_rate": 4.525151429246057e-06, + "loss": 0.6473, + "step": 5237 + }, + { + "epoch": 0.54, + "grad_norm": 2.2194788816263666, + "learning_rate": 4.523475648043832e-06, + "loss": 0.6571, + "step": 5238 + }, + { + "epoch": 0.54, + "grad_norm": 1.7577666502701064, + "learning_rate": 4.521799920858358e-06, + "loss": 0.6082, + "step": 5239 + }, + { + "epoch": 0.54, + "grad_norm": 1.9041366214064464, + "learning_rate": 4.520124247879588e-06, + "loss": 0.678, + "step": 5240 + }, + { + "epoch": 0.54, + "grad_norm": 1.865248415982124, + "learning_rate": 4.51844862929747e-06, + "loss": 0.6431, + "step": 5241 + }, + { + "epoch": 0.54, + "grad_norm": 1.8755663337410842, + "learning_rate": 4.516773065301946e-06, + "loss": 0.6337, + "step": 5242 + }, + { + "epoch": 0.55, + "grad_norm": 1.8953438735645665, + "learning_rate": 4.515097556082949e-06, + "loss": 0.5888, + "step": 5243 + }, + { + "epoch": 0.55, + "grad_norm": 1.803506322598369, + "learning_rate": 4.513422101830408e-06, + "loss": 0.5827, + "step": 5244 + }, + { + "epoch": 0.55, + "grad_norm": 1.9563878164296573, + "learning_rate": 4.5117467027342435e-06, + "loss": 0.5976, + "step": 5245 + }, + { + "epoch": 0.55, + "grad_norm": 1.7276638684043466, + "learning_rate": 4.510071358984373e-06, + "loss": 0.6353, + "step": 5246 + }, + { + "epoch": 0.55, + "grad_norm": 1.9039495600414362, + "learning_rate": 4.508396070770708e-06, + "loss": 0.6116, + "step": 5247 + }, + { + "epoch": 0.55, + "grad_norm": 1.930911155262903, + "learning_rate": 4.50672083828315e-06, + "loss": 0.6507, + "step": 5248 + }, + { + "epoch": 0.55, + "grad_norm": 1.7348548611102907, + "learning_rate": 4.505045661711596e-06, + "loss": 0.633, + "step": 5249 + }, + { + "epoch": 0.55, + "grad_norm": 2.0192760027137764, + "learning_rate": 4.503370541245937e-06, + "loss": 0.6852, + "step": 5250 + }, + { + "epoch": 0.55, + "grad_norm": 2.291914253911129, + "learning_rate": 4.501695477076062e-06, + "loss": 0.7381, + "step": 5251 + }, + { + "epoch": 0.55, + "grad_norm": 1.8818408471589059, + "learning_rate": 4.5000204693918405e-06, + "loss": 0.6618, + "step": 5252 + }, + { + "epoch": 0.55, + "grad_norm": 1.6450990278976414, + "learning_rate": 4.49834551838315e-06, + "loss": 0.7165, + "step": 5253 + }, + { + "epoch": 0.55, + "grad_norm": 1.8655688952363387, + "learning_rate": 4.496670624239854e-06, + "loss": 0.5881, + "step": 5254 + }, + { + "epoch": 0.55, + "grad_norm": 1.5881419810270465, + "learning_rate": 4.494995787151811e-06, + "loss": 0.5591, + "step": 5255 + }, + { + "epoch": 0.55, + "grad_norm": 1.9868107004461213, + "learning_rate": 4.493321007308875e-06, + "loss": 0.6265, + "step": 5256 + }, + { + "epoch": 0.55, + "grad_norm": 1.748853627472903, + "learning_rate": 4.49164628490089e-06, + "loss": 0.6645, + "step": 5257 + }, + { + "epoch": 0.55, + "grad_norm": 1.6825341367268272, + "learning_rate": 4.489971620117699e-06, + "loss": 0.5963, + "step": 5258 + }, + { + "epoch": 0.55, + "grad_norm": 1.7378224177710997, + "learning_rate": 4.4882970131491286e-06, + "loss": 0.6074, + "step": 5259 + }, + { + "epoch": 0.55, + "grad_norm": 1.7043117434948243, + "learning_rate": 4.4866224641850095e-06, + "loss": 0.5871, + "step": 5260 + }, + { + "epoch": 0.55, + "grad_norm": 1.873293864202997, + "learning_rate": 4.48494797341516e-06, + "loss": 0.6175, + "step": 5261 + }, + { + "epoch": 0.55, + "grad_norm": 1.8950760665680089, + "learning_rate": 4.4832735410293924e-06, + "loss": 0.6416, + "step": 5262 + }, + { + "epoch": 0.55, + "grad_norm": 1.825762291742463, + "learning_rate": 4.481599167217515e-06, + "loss": 0.6834, + "step": 5263 + }, + { + "epoch": 0.55, + "grad_norm": 1.7127983027636853, + "learning_rate": 4.479924852169327e-06, + "loss": 0.6113, + "step": 5264 + }, + { + "epoch": 0.55, + "grad_norm": 2.2666902600585157, + "learning_rate": 4.478250596074623e-06, + "loss": 0.5987, + "step": 5265 + }, + { + "epoch": 0.55, + "grad_norm": 2.0911717000348458, + "learning_rate": 4.476576399123187e-06, + "loss": 0.5852, + "step": 5266 + }, + { + "epoch": 0.55, + "grad_norm": 2.0671663566535403, + "learning_rate": 4.4749022615047975e-06, + "loss": 0.6781, + "step": 5267 + }, + { + "epoch": 0.55, + "grad_norm": 1.7386282025860975, + "learning_rate": 4.4732281834092305e-06, + "loss": 0.6234, + "step": 5268 + }, + { + "epoch": 0.55, + "grad_norm": 2.031240480955804, + "learning_rate": 4.471554165026251e-06, + "loss": 0.6807, + "step": 5269 + }, + { + "epoch": 0.55, + "grad_norm": 1.9289143528508108, + "learning_rate": 4.46988020654562e-06, + "loss": 0.6729, + "step": 5270 + }, + { + "epoch": 0.55, + "grad_norm": 2.0153197310458943, + "learning_rate": 4.4682063081570885e-06, + "loss": 0.631, + "step": 5271 + }, + { + "epoch": 0.55, + "grad_norm": 1.8926835051803474, + "learning_rate": 4.466532470050403e-06, + "loss": 0.6079, + "step": 5272 + }, + { + "epoch": 0.55, + "grad_norm": 1.9793089069024377, + "learning_rate": 4.464858692415304e-06, + "loss": 0.6527, + "step": 5273 + }, + { + "epoch": 0.55, + "grad_norm": 1.9025404029621105, + "learning_rate": 4.463184975441522e-06, + "loss": 0.6627, + "step": 5274 + }, + { + "epoch": 0.55, + "grad_norm": 1.9089159496126418, + "learning_rate": 4.461511319318782e-06, + "loss": 0.6441, + "step": 5275 + }, + { + "epoch": 0.55, + "grad_norm": 1.904444956313747, + "learning_rate": 4.4598377242368026e-06, + "loss": 0.6431, + "step": 5276 + }, + { + "epoch": 0.55, + "grad_norm": 1.9622376326034894, + "learning_rate": 4.458164190385297e-06, + "loss": 0.6903, + "step": 5277 + }, + { + "epoch": 0.55, + "grad_norm": 1.8203493701420281, + "learning_rate": 4.456490717953969e-06, + "loss": 0.5699, + "step": 5278 + }, + { + "epoch": 0.55, + "grad_norm": 1.7996933125103711, + "learning_rate": 4.454817307132515e-06, + "loss": 0.5578, + "step": 5279 + }, + { + "epoch": 0.55, + "grad_norm": 1.8862630623229697, + "learning_rate": 4.4531439581106295e-06, + "loss": 0.6777, + "step": 5280 + }, + { + "epoch": 0.55, + "grad_norm": 1.9189441415408823, + "learning_rate": 4.451470671077991e-06, + "loss": 0.6002, + "step": 5281 + }, + { + "epoch": 0.55, + "grad_norm": 1.7339129039063892, + "learning_rate": 4.449797446224279e-06, + "loss": 0.6599, + "step": 5282 + }, + { + "epoch": 0.55, + "grad_norm": 2.141150906787522, + "learning_rate": 4.4481242837391615e-06, + "loss": 0.6243, + "step": 5283 + }, + { + "epoch": 0.55, + "grad_norm": 2.006503116981974, + "learning_rate": 4.446451183812302e-06, + "loss": 0.5795, + "step": 5284 + }, + { + "epoch": 0.55, + "grad_norm": 2.045139071894576, + "learning_rate": 4.4447781466333565e-06, + "loss": 0.5821, + "step": 5285 + }, + { + "epoch": 0.55, + "grad_norm": 1.8897413641576013, + "learning_rate": 4.4431051723919725e-06, + "loss": 0.4936, + "step": 5286 + }, + { + "epoch": 0.55, + "grad_norm": 1.8631944101350673, + "learning_rate": 4.441432261277794e-06, + "loss": 0.5666, + "step": 5287 + }, + { + "epoch": 0.55, + "grad_norm": 1.8750818481890121, + "learning_rate": 4.43975941348045e-06, + "loss": 0.6188, + "step": 5288 + }, + { + "epoch": 0.55, + "grad_norm": 1.7830177570624304, + "learning_rate": 4.43808662918957e-06, + "loss": 0.5248, + "step": 5289 + }, + { + "epoch": 0.55, + "grad_norm": 1.9469663022421002, + "learning_rate": 4.436413908594773e-06, + "loss": 0.699, + "step": 5290 + }, + { + "epoch": 0.55, + "grad_norm": 1.9735300322889926, + "learning_rate": 4.434741251885671e-06, + "loss": 0.6604, + "step": 5291 + }, + { + "epoch": 0.55, + "grad_norm": 1.7706573081264123, + "learning_rate": 4.433068659251871e-06, + "loss": 0.8055, + "step": 5292 + }, + { + "epoch": 0.55, + "grad_norm": 2.1555746130188567, + "learning_rate": 4.43139613088297e-06, + "loss": 0.6699, + "step": 5293 + }, + { + "epoch": 0.55, + "grad_norm": 1.9824503792487942, + "learning_rate": 4.429723666968559e-06, + "loss": 0.7092, + "step": 5294 + }, + { + "epoch": 0.55, + "grad_norm": 2.1311700772834556, + "learning_rate": 4.4280512676982215e-06, + "loss": 0.7339, + "step": 5295 + }, + { + "epoch": 0.55, + "grad_norm": 1.7811696362624156, + "learning_rate": 4.426378933261532e-06, + "loss": 0.6008, + "step": 5296 + }, + { + "epoch": 0.55, + "grad_norm": 1.7760256644912769, + "learning_rate": 4.424706663848059e-06, + "loss": 0.6531, + "step": 5297 + }, + { + "epoch": 0.55, + "grad_norm": 1.7122666917208644, + "learning_rate": 4.423034459647365e-06, + "loss": 0.6768, + "step": 5298 + }, + { + "epoch": 0.55, + "grad_norm": 2.1077740469187316, + "learning_rate": 4.421362320849003e-06, + "loss": 0.7002, + "step": 5299 + }, + { + "epoch": 0.55, + "grad_norm": 2.1373646086172955, + "learning_rate": 4.419690247642521e-06, + "loss": 0.5956, + "step": 5300 + }, + { + "epoch": 0.55, + "grad_norm": 1.8105651786972892, + "learning_rate": 4.418018240217457e-06, + "loss": 0.6231, + "step": 5301 + }, + { + "epoch": 0.55, + "grad_norm": 1.7954524837112216, + "learning_rate": 4.416346298763343e-06, + "loss": 0.5767, + "step": 5302 + }, + { + "epoch": 0.55, + "grad_norm": 1.797535057274918, + "learning_rate": 4.414674423469702e-06, + "loss": 0.5235, + "step": 5303 + }, + { + "epoch": 0.55, + "grad_norm": 1.8580491980634726, + "learning_rate": 4.413002614526052e-06, + "loss": 0.6158, + "step": 5304 + }, + { + "epoch": 0.55, + "grad_norm": 2.1384624398890035, + "learning_rate": 4.411330872121901e-06, + "loss": 0.638, + "step": 5305 + }, + { + "epoch": 0.55, + "grad_norm": 1.838188572332752, + "learning_rate": 4.4096591964467515e-06, + "loss": 0.5328, + "step": 5306 + }, + { + "epoch": 0.55, + "grad_norm": 2.00835612910002, + "learning_rate": 4.407987587690097e-06, + "loss": 0.6725, + "step": 5307 + }, + { + "epoch": 0.55, + "grad_norm": 1.9969651486276907, + "learning_rate": 4.406316046041423e-06, + "loss": 0.7185, + "step": 5308 + }, + { + "epoch": 0.55, + "grad_norm": 2.1341862742091475, + "learning_rate": 4.404644571690208e-06, + "loss": 0.713, + "step": 5309 + }, + { + "epoch": 0.55, + "grad_norm": 1.9766606231136952, + "learning_rate": 4.402973164825927e-06, + "loss": 0.5984, + "step": 5310 + }, + { + "epoch": 0.55, + "grad_norm": 1.8695986242532783, + "learning_rate": 4.401301825638039e-06, + "loss": 0.6824, + "step": 5311 + }, + { + "epoch": 0.55, + "grad_norm": 2.025287723277109, + "learning_rate": 4.399630554316002e-06, + "loss": 0.5779, + "step": 5312 + }, + { + "epoch": 0.55, + "grad_norm": 1.843453109578057, + "learning_rate": 4.397959351049263e-06, + "loss": 0.5695, + "step": 5313 + }, + { + "epoch": 0.55, + "grad_norm": 1.9383448410386213, + "learning_rate": 4.396288216027264e-06, + "loss": 0.5474, + "step": 5314 + }, + { + "epoch": 0.55, + "grad_norm": 1.8430720816288715, + "learning_rate": 4.394617149439435e-06, + "loss": 0.6376, + "step": 5315 + }, + { + "epoch": 0.55, + "grad_norm": 1.8636058676938005, + "learning_rate": 4.392946151475204e-06, + "loss": 0.6708, + "step": 5316 + }, + { + "epoch": 0.55, + "grad_norm": 1.8075379505938336, + "learning_rate": 4.391275222323989e-06, + "loss": 0.623, + "step": 5317 + }, + { + "epoch": 0.55, + "grad_norm": 3.282958892491627, + "learning_rate": 4.389604362175194e-06, + "loss": 0.6761, + "step": 5318 + }, + { + "epoch": 0.55, + "grad_norm": 1.9314202760968853, + "learning_rate": 4.387933571218224e-06, + "loss": 0.6157, + "step": 5319 + }, + { + "epoch": 0.55, + "grad_norm": 1.8084471261587793, + "learning_rate": 4.386262849642474e-06, + "loss": 0.5921, + "step": 5320 + }, + { + "epoch": 0.55, + "grad_norm": 1.7878979830871655, + "learning_rate": 4.384592197637327e-06, + "loss": 0.6585, + "step": 5321 + }, + { + "epoch": 0.55, + "grad_norm": 1.7272057096771558, + "learning_rate": 4.382921615392162e-06, + "loss": 0.6154, + "step": 5322 + }, + { + "epoch": 0.55, + "grad_norm": 1.7715100059111006, + "learning_rate": 4.381251103096349e-06, + "loss": 0.5159, + "step": 5323 + }, + { + "epoch": 0.55, + "grad_norm": 1.8160415729278552, + "learning_rate": 4.379580660939253e-06, + "loss": 0.6188, + "step": 5324 + }, + { + "epoch": 0.55, + "grad_norm": 2.0714128617370844, + "learning_rate": 4.377910289110222e-06, + "loss": 0.6339, + "step": 5325 + }, + { + "epoch": 0.55, + "grad_norm": 1.7709774757228993, + "learning_rate": 4.376239987798606e-06, + "loss": 0.5522, + "step": 5326 + }, + { + "epoch": 0.55, + "grad_norm": 1.6953184185859074, + "learning_rate": 4.3745697571937434e-06, + "loss": 0.6141, + "step": 5327 + }, + { + "epoch": 0.55, + "grad_norm": 2.0788554843615246, + "learning_rate": 4.372899597484963e-06, + "loss": 0.6108, + "step": 5328 + }, + { + "epoch": 0.55, + "grad_norm": 1.8961589961491185, + "learning_rate": 4.371229508861588e-06, + "loss": 0.6662, + "step": 5329 + }, + { + "epoch": 0.55, + "grad_norm": 1.7425613651137846, + "learning_rate": 4.369559491512932e-06, + "loss": 0.5696, + "step": 5330 + }, + { + "epoch": 0.55, + "grad_norm": 1.7939534103840276, + "learning_rate": 4.367889545628301e-06, + "loss": 0.5928, + "step": 5331 + }, + { + "epoch": 0.55, + "grad_norm": 1.8404672607265415, + "learning_rate": 4.366219671396995e-06, + "loss": 0.607, + "step": 5332 + }, + { + "epoch": 0.55, + "grad_norm": 1.9584221320985873, + "learning_rate": 4.364549869008299e-06, + "loss": 0.6284, + "step": 5333 + }, + { + "epoch": 0.55, + "grad_norm": 1.9320337232975733, + "learning_rate": 4.362880138651498e-06, + "loss": 0.7023, + "step": 5334 + }, + { + "epoch": 0.55, + "grad_norm": 1.8876641008267565, + "learning_rate": 4.361210480515865e-06, + "loss": 0.61, + "step": 5335 + }, + { + "epoch": 0.55, + "grad_norm": 1.7938844774124776, + "learning_rate": 4.3595408947906644e-06, + "loss": 0.7137, + "step": 5336 + }, + { + "epoch": 0.55, + "grad_norm": 1.8127042217334888, + "learning_rate": 4.3578713816651535e-06, + "loss": 0.5189, + "step": 5337 + }, + { + "epoch": 0.55, + "grad_norm": 1.7419189776304915, + "learning_rate": 4.356201941328582e-06, + "loss": 0.6613, + "step": 5338 + }, + { + "epoch": 0.55, + "grad_norm": 1.8832170874848826, + "learning_rate": 4.354532573970191e-06, + "loss": 0.6425, + "step": 5339 + }, + { + "epoch": 0.56, + "grad_norm": 1.821648531778137, + "learning_rate": 4.352863279779211e-06, + "loss": 0.6567, + "step": 5340 + }, + { + "epoch": 0.56, + "grad_norm": 1.8233645260795328, + "learning_rate": 4.351194058944866e-06, + "loss": 0.5911, + "step": 5341 + }, + { + "epoch": 0.56, + "grad_norm": 1.9133087513735505, + "learning_rate": 4.349524911656373e-06, + "loss": 0.6188, + "step": 5342 + }, + { + "epoch": 0.56, + "grad_norm": 1.672726443042582, + "learning_rate": 4.34785583810294e-06, + "loss": 0.5733, + "step": 5343 + }, + { + "epoch": 0.56, + "grad_norm": 1.9222023216966218, + "learning_rate": 4.346186838473764e-06, + "loss": 0.6429, + "step": 5344 + }, + { + "epoch": 0.56, + "grad_norm": 1.757050865618816, + "learning_rate": 4.344517912958036e-06, + "loss": 0.6731, + "step": 5345 + }, + { + "epoch": 0.56, + "grad_norm": 1.7741802896340304, + "learning_rate": 4.342849061744939e-06, + "loss": 0.6015, + "step": 5346 + }, + { + "epoch": 0.56, + "grad_norm": 1.7665348586924297, + "learning_rate": 4.341180285023648e-06, + "loss": 0.5387, + "step": 5347 + }, + { + "epoch": 0.56, + "grad_norm": 1.9006331306865378, + "learning_rate": 4.339511582983325e-06, + "loss": 0.6501, + "step": 5348 + }, + { + "epoch": 0.56, + "grad_norm": 2.171585780899604, + "learning_rate": 4.337842955813129e-06, + "loss": 0.6288, + "step": 5349 + }, + { + "epoch": 0.56, + "grad_norm": 1.7377998975322286, + "learning_rate": 4.336174403702208e-06, + "loss": 0.6438, + "step": 5350 + }, + { + "epoch": 0.56, + "grad_norm": 1.9683352192208223, + "learning_rate": 4.334505926839702e-06, + "loss": 0.6363, + "step": 5351 + }, + { + "epoch": 0.56, + "grad_norm": 1.8374943205843048, + "learning_rate": 4.332837525414743e-06, + "loss": 0.5948, + "step": 5352 + }, + { + "epoch": 0.56, + "grad_norm": 1.7886524450076886, + "learning_rate": 4.331169199616453e-06, + "loss": 0.6511, + "step": 5353 + }, + { + "epoch": 0.56, + "grad_norm": 2.0533622511419374, + "learning_rate": 4.329500949633947e-06, + "loss": 0.6977, + "step": 5354 + }, + { + "epoch": 0.56, + "grad_norm": 2.0519868425621772, + "learning_rate": 4.32783277565633e-06, + "loss": 0.7681, + "step": 5355 + }, + { + "epoch": 0.56, + "grad_norm": 1.9748725112796992, + "learning_rate": 4.326164677872698e-06, + "loss": 0.626, + "step": 5356 + }, + { + "epoch": 0.56, + "grad_norm": 1.8229150751926968, + "learning_rate": 4.324496656472141e-06, + "loss": 0.6979, + "step": 5357 + }, + { + "epoch": 0.56, + "grad_norm": 1.8616048764016284, + "learning_rate": 4.322828711643737e-06, + "loss": 0.5955, + "step": 5358 + }, + { + "epoch": 0.56, + "grad_norm": 1.7219793227650209, + "learning_rate": 4.32116084357656e-06, + "loss": 0.6187, + "step": 5359 + }, + { + "epoch": 0.56, + "grad_norm": 1.7727210332779848, + "learning_rate": 4.31949305245967e-06, + "loss": 0.6097, + "step": 5360 + }, + { + "epoch": 0.56, + "grad_norm": 1.7854736896677759, + "learning_rate": 4.317825338482123e-06, + "loss": 0.667, + "step": 5361 + }, + { + "epoch": 0.56, + "grad_norm": 2.0495980515372945, + "learning_rate": 4.31615770183296e-06, + "loss": 0.631, + "step": 5362 + }, + { + "epoch": 0.56, + "grad_norm": 2.0096774454763766, + "learning_rate": 4.31449014270122e-06, + "loss": 0.6482, + "step": 5363 + }, + { + "epoch": 0.56, + "grad_norm": 2.2165036324005603, + "learning_rate": 4.312822661275929e-06, + "loss": 0.7564, + "step": 5364 + }, + { + "epoch": 0.56, + "grad_norm": 1.857956424239191, + "learning_rate": 4.3111552577461066e-06, + "loss": 0.592, + "step": 5365 + }, + { + "epoch": 0.56, + "grad_norm": 2.0795780405956905, + "learning_rate": 4.309487932300762e-06, + "loss": 0.7031, + "step": 5366 + }, + { + "epoch": 0.56, + "grad_norm": 1.9276173353067232, + "learning_rate": 4.307820685128896e-06, + "loss": 0.6087, + "step": 5367 + }, + { + "epoch": 0.56, + "grad_norm": 1.6763284047956102, + "learning_rate": 4.3061535164195e-06, + "loss": 0.7254, + "step": 5368 + }, + { + "epoch": 0.56, + "grad_norm": 2.0186762339298694, + "learning_rate": 4.304486426361561e-06, + "loss": 0.6859, + "step": 5369 + }, + { + "epoch": 0.56, + "grad_norm": 1.8808991283500813, + "learning_rate": 4.302819415144046e-06, + "loss": 0.5656, + "step": 5370 + }, + { + "epoch": 0.56, + "grad_norm": 1.894815287220838, + "learning_rate": 4.301152482955926e-06, + "loss": 0.6015, + "step": 5371 + }, + { + "epoch": 0.56, + "grad_norm": 1.781652807228387, + "learning_rate": 4.299485629986153e-06, + "loss": 0.5619, + "step": 5372 + }, + { + "epoch": 0.56, + "grad_norm": 1.9419047981480841, + "learning_rate": 4.297818856423679e-06, + "loss": 0.6505, + "step": 5373 + }, + { + "epoch": 0.56, + "grad_norm": 2.0120179327646777, + "learning_rate": 4.296152162457439e-06, + "loss": 0.6558, + "step": 5374 + }, + { + "epoch": 0.56, + "grad_norm": 1.8836238693944019, + "learning_rate": 4.294485548276363e-06, + "loss": 0.6515, + "step": 5375 + }, + { + "epoch": 0.56, + "grad_norm": 1.8504429409168075, + "learning_rate": 4.292819014069372e-06, + "loss": 0.7023, + "step": 5376 + }, + { + "epoch": 0.56, + "grad_norm": 2.0390665006750743, + "learning_rate": 4.291152560025377e-06, + "loss": 0.7807, + "step": 5377 + }, + { + "epoch": 0.56, + "grad_norm": 1.7922684370026463, + "learning_rate": 4.2894861863332785e-06, + "loss": 0.6747, + "step": 5378 + }, + { + "epoch": 0.56, + "grad_norm": 1.9449512984578232, + "learning_rate": 4.2878198931819705e-06, + "loss": 0.6697, + "step": 5379 + }, + { + "epoch": 0.56, + "grad_norm": 2.0122903659992213, + "learning_rate": 4.286153680760338e-06, + "loss": 0.7135, + "step": 5380 + }, + { + "epoch": 0.56, + "grad_norm": 1.9004201271206218, + "learning_rate": 4.284487549257254e-06, + "loss": 0.4542, + "step": 5381 + }, + { + "epoch": 0.56, + "grad_norm": 1.672893473626159, + "learning_rate": 4.282821498861585e-06, + "loss": 0.5413, + "step": 5382 + }, + { + "epoch": 0.56, + "grad_norm": 1.8436366187307252, + "learning_rate": 4.281155529762186e-06, + "loss": 0.6247, + "step": 5383 + }, + { + "epoch": 0.56, + "grad_norm": 1.676623195970577, + "learning_rate": 4.279489642147906e-06, + "loss": 0.6366, + "step": 5384 + }, + { + "epoch": 0.56, + "grad_norm": 1.8849736907682577, + "learning_rate": 4.277823836207581e-06, + "loss": 0.6423, + "step": 5385 + }, + { + "epoch": 0.56, + "grad_norm": 1.6504346249070367, + "learning_rate": 4.276158112130041e-06, + "loss": 0.6051, + "step": 5386 + }, + { + "epoch": 0.56, + "grad_norm": 1.900270675572181, + "learning_rate": 4.274492470104105e-06, + "loss": 0.7088, + "step": 5387 + }, + { + "epoch": 0.56, + "grad_norm": 2.0141651357876653, + "learning_rate": 4.272826910318581e-06, + "loss": 0.6249, + "step": 5388 + }, + { + "epoch": 0.56, + "grad_norm": 1.9112805090759801, + "learning_rate": 4.271161432962273e-06, + "loss": 0.6067, + "step": 5389 + }, + { + "epoch": 0.56, + "grad_norm": 2.034156020188733, + "learning_rate": 4.2694960382239705e-06, + "loss": 0.644, + "step": 5390 + }, + { + "epoch": 0.56, + "grad_norm": 1.9930605463073325, + "learning_rate": 4.2678307262924575e-06, + "loss": 0.6368, + "step": 5391 + }, + { + "epoch": 0.56, + "grad_norm": 1.906858020595567, + "learning_rate": 4.266165497356503e-06, + "loss": 0.5987, + "step": 5392 + }, + { + "epoch": 0.56, + "grad_norm": 1.8406121682236543, + "learning_rate": 4.264500351604872e-06, + "loss": 0.6536, + "step": 5393 + }, + { + "epoch": 0.56, + "grad_norm": 1.8254428663126403, + "learning_rate": 4.262835289226318e-06, + "loss": 0.5699, + "step": 5394 + }, + { + "epoch": 0.56, + "grad_norm": 1.9270412949134141, + "learning_rate": 4.261170310409586e-06, + "loss": 0.7475, + "step": 5395 + }, + { + "epoch": 0.56, + "grad_norm": 1.8066282234062603, + "learning_rate": 4.259505415343411e-06, + "loss": 0.6369, + "step": 5396 + }, + { + "epoch": 0.56, + "grad_norm": 1.758284076012403, + "learning_rate": 4.257840604216517e-06, + "loss": 0.6009, + "step": 5397 + }, + { + "epoch": 0.56, + "grad_norm": 1.6749754336124096, + "learning_rate": 4.2561758772176244e-06, + "loss": 0.6205, + "step": 5398 + }, + { + "epoch": 0.56, + "grad_norm": 1.9275539454621302, + "learning_rate": 4.254511234535432e-06, + "loss": 0.5659, + "step": 5399 + }, + { + "epoch": 0.56, + "grad_norm": 1.9011018789471603, + "learning_rate": 4.252846676358641e-06, + "loss": 0.6691, + "step": 5400 + }, + { + "epoch": 0.56, + "grad_norm": 2.0596941216693483, + "learning_rate": 4.251182202875938e-06, + "loss": 0.6916, + "step": 5401 + }, + { + "epoch": 0.56, + "grad_norm": 1.857656244516316, + "learning_rate": 4.249517814276e-06, + "loss": 0.6174, + "step": 5402 + }, + { + "epoch": 0.56, + "grad_norm": 1.9254466566790163, + "learning_rate": 4.247853510747495e-06, + "loss": 0.586, + "step": 5403 + }, + { + "epoch": 0.56, + "grad_norm": 1.7917004971343924, + "learning_rate": 4.2461892924790825e-06, + "loss": 0.5538, + "step": 5404 + }, + { + "epoch": 0.56, + "grad_norm": 1.9172145911881517, + "learning_rate": 4.244525159659409e-06, + "loss": 0.6444, + "step": 5405 + }, + { + "epoch": 0.56, + "grad_norm": 2.0655398744581377, + "learning_rate": 4.2428611124771184e-06, + "loss": 0.6404, + "step": 5406 + }, + { + "epoch": 0.56, + "grad_norm": 1.8304720049482348, + "learning_rate": 4.241197151120834e-06, + "loss": 0.6175, + "step": 5407 + }, + { + "epoch": 0.56, + "grad_norm": 1.81762951481684, + "learning_rate": 4.239533275779177e-06, + "loss": 0.6268, + "step": 5408 + }, + { + "epoch": 0.56, + "grad_norm": 1.9408357676082384, + "learning_rate": 4.237869486640758e-06, + "loss": 0.6111, + "step": 5409 + }, + { + "epoch": 0.56, + "grad_norm": 2.486652228429193, + "learning_rate": 4.236205783894176e-06, + "loss": 0.6691, + "step": 5410 + }, + { + "epoch": 0.56, + "grad_norm": 1.6509217437191293, + "learning_rate": 4.2345421677280215e-06, + "loss": 0.6332, + "step": 5411 + }, + { + "epoch": 0.56, + "grad_norm": 1.8758194011046228, + "learning_rate": 4.232878638330877e-06, + "loss": 0.6156, + "step": 5412 + }, + { + "epoch": 0.56, + "grad_norm": 1.9285754226908345, + "learning_rate": 4.231215195891311e-06, + "loss": 0.6819, + "step": 5413 + }, + { + "epoch": 0.56, + "grad_norm": 2.248208337442681, + "learning_rate": 4.229551840597884e-06, + "loss": 0.6568, + "step": 5414 + }, + { + "epoch": 0.56, + "grad_norm": 1.7753810739067684, + "learning_rate": 4.227888572639148e-06, + "loss": 0.5761, + "step": 5415 + }, + { + "epoch": 0.56, + "grad_norm": 2.0888773685296074, + "learning_rate": 4.226225392203641e-06, + "loss": 0.7153, + "step": 5416 + }, + { + "epoch": 0.56, + "grad_norm": 1.7249346313721021, + "learning_rate": 4.224562299479897e-06, + "loss": 0.547, + "step": 5417 + }, + { + "epoch": 0.56, + "grad_norm": 1.711135526070868, + "learning_rate": 4.222899294656437e-06, + "loss": 0.6079, + "step": 5418 + }, + { + "epoch": 0.56, + "grad_norm": 1.9324287494379544, + "learning_rate": 4.221236377921771e-06, + "loss": 0.5704, + "step": 5419 + }, + { + "epoch": 0.56, + "grad_norm": 1.7926007338434826, + "learning_rate": 4.219573549464403e-06, + "loss": 0.5625, + "step": 5420 + }, + { + "epoch": 0.56, + "grad_norm": 1.9062785913816724, + "learning_rate": 4.2179108094728185e-06, + "loss": 0.6407, + "step": 5421 + }, + { + "epoch": 0.56, + "grad_norm": 1.7912130192025735, + "learning_rate": 4.2162481581355015e-06, + "loss": 0.5941, + "step": 5422 + }, + { + "epoch": 0.56, + "grad_norm": 1.919288046883698, + "learning_rate": 4.214585595640923e-06, + "loss": 0.6401, + "step": 5423 + }, + { + "epoch": 0.56, + "grad_norm": 1.8434935010857791, + "learning_rate": 4.212923122177545e-06, + "loss": 0.5237, + "step": 5424 + }, + { + "epoch": 0.56, + "grad_norm": 2.0484794450464237, + "learning_rate": 4.211260737933816e-06, + "loss": 0.6333, + "step": 5425 + }, + { + "epoch": 0.56, + "grad_norm": 2.0744597871080273, + "learning_rate": 4.209598443098179e-06, + "loss": 0.572, + "step": 5426 + }, + { + "epoch": 0.56, + "grad_norm": 1.912927529947007, + "learning_rate": 4.2079362378590625e-06, + "loss": 0.6074, + "step": 5427 + }, + { + "epoch": 0.56, + "grad_norm": 1.815141363610265, + "learning_rate": 4.206274122404891e-06, + "loss": 0.5915, + "step": 5428 + }, + { + "epoch": 0.56, + "grad_norm": 1.7372331606973053, + "learning_rate": 4.204612096924069e-06, + "loss": 0.4789, + "step": 5429 + }, + { + "epoch": 0.56, + "grad_norm": 1.8298977661303382, + "learning_rate": 4.202950161605e-06, + "loss": 0.6594, + "step": 5430 + }, + { + "epoch": 0.56, + "grad_norm": 1.9098142487861431, + "learning_rate": 4.201288316636073e-06, + "loss": 0.6554, + "step": 5431 + }, + { + "epoch": 0.56, + "grad_norm": 2.0474500584686406, + "learning_rate": 4.199626562205668e-06, + "loss": 0.6762, + "step": 5432 + }, + { + "epoch": 0.56, + "grad_norm": 1.64859334265512, + "learning_rate": 4.197964898502154e-06, + "loss": 0.6535, + "step": 5433 + }, + { + "epoch": 0.56, + "grad_norm": 2.0230013593164036, + "learning_rate": 4.1963033257138904e-06, + "loss": 0.5574, + "step": 5434 + }, + { + "epoch": 0.56, + "grad_norm": 1.7973657989903222, + "learning_rate": 4.194641844029227e-06, + "loss": 0.6348, + "step": 5435 + }, + { + "epoch": 0.57, + "grad_norm": 1.949821896226258, + "learning_rate": 4.1929804536365e-06, + "loss": 0.7132, + "step": 5436 + }, + { + "epoch": 0.57, + "grad_norm": 1.8046228407634757, + "learning_rate": 4.191319154724038e-06, + "loss": 0.7059, + "step": 5437 + }, + { + "epoch": 0.57, + "grad_norm": 1.8969165232037644, + "learning_rate": 4.189657947480159e-06, + "loss": 0.5655, + "step": 5438 + }, + { + "epoch": 0.57, + "grad_norm": 1.803066198351578, + "learning_rate": 4.187996832093171e-06, + "loss": 0.5827, + "step": 5439 + }, + { + "epoch": 0.57, + "grad_norm": 1.9186713994335214, + "learning_rate": 4.186335808751369e-06, + "loss": 0.6451, + "step": 5440 + }, + { + "epoch": 0.57, + "grad_norm": 1.9254356727843782, + "learning_rate": 4.184674877643042e-06, + "loss": 0.6686, + "step": 5441 + }, + { + "epoch": 0.57, + "grad_norm": 2.0999174784887145, + "learning_rate": 4.183014038956465e-06, + "loss": 0.6833, + "step": 5442 + }, + { + "epoch": 0.57, + "grad_norm": 2.037687236980084, + "learning_rate": 4.181353292879904e-06, + "loss": 0.7428, + "step": 5443 + }, + { + "epoch": 0.57, + "grad_norm": 1.7644928803447903, + "learning_rate": 4.179692639601612e-06, + "loss": 0.7074, + "step": 5444 + }, + { + "epoch": 0.57, + "grad_norm": 1.9200170976748951, + "learning_rate": 4.178032079309836e-06, + "loss": 0.684, + "step": 5445 + }, + { + "epoch": 0.57, + "grad_norm": 1.9507059714006074, + "learning_rate": 4.176371612192808e-06, + "loss": 0.6933, + "step": 5446 + }, + { + "epoch": 0.57, + "grad_norm": 1.7315185417192958, + "learning_rate": 4.174711238438752e-06, + "loss": 0.6032, + "step": 5447 + }, + { + "epoch": 0.57, + "grad_norm": 1.9013248224515067, + "learning_rate": 4.173050958235882e-06, + "loss": 0.6588, + "step": 5448 + }, + { + "epoch": 0.57, + "grad_norm": 1.8963272377797982, + "learning_rate": 4.171390771772399e-06, + "loss": 0.5986, + "step": 5449 + }, + { + "epoch": 0.57, + "grad_norm": 1.9082415935624142, + "learning_rate": 4.169730679236496e-06, + "loss": 0.6572, + "step": 5450 + }, + { + "epoch": 0.57, + "grad_norm": 1.937334868978178, + "learning_rate": 4.168070680816351e-06, + "loss": 0.6632, + "step": 5451 + }, + { + "epoch": 0.57, + "grad_norm": 1.8696062932881234, + "learning_rate": 4.166410776700137e-06, + "loss": 0.7399, + "step": 5452 + }, + { + "epoch": 0.57, + "grad_norm": 2.1429005075050083, + "learning_rate": 4.164750967076012e-06, + "loss": 0.7346, + "step": 5453 + }, + { + "epoch": 0.57, + "grad_norm": 1.743702093443798, + "learning_rate": 4.163091252132126e-06, + "loss": 0.6522, + "step": 5454 + }, + { + "epoch": 0.57, + "grad_norm": 1.8587073840092287, + "learning_rate": 4.1614316320566174e-06, + "loss": 0.6181, + "step": 5455 + }, + { + "epoch": 0.57, + "grad_norm": 1.8675162521113273, + "learning_rate": 4.159772107037613e-06, + "loss": 0.6114, + "step": 5456 + }, + { + "epoch": 0.57, + "grad_norm": 1.912758383634301, + "learning_rate": 4.158112677263231e-06, + "loss": 0.6577, + "step": 5457 + }, + { + "epoch": 0.57, + "grad_norm": 1.836405720086498, + "learning_rate": 4.156453342921573e-06, + "loss": 0.7016, + "step": 5458 + }, + { + "epoch": 0.57, + "grad_norm": 1.9065901889070593, + "learning_rate": 4.154794104200737e-06, + "loss": 0.6624, + "step": 5459 + }, + { + "epoch": 0.57, + "grad_norm": 1.9075230574920552, + "learning_rate": 4.153134961288807e-06, + "loss": 0.6226, + "step": 5460 + }, + { + "epoch": 0.57, + "grad_norm": 1.8711562433698041, + "learning_rate": 4.151475914373856e-06, + "loss": 0.6264, + "step": 5461 + }, + { + "epoch": 0.57, + "grad_norm": 1.8048208435526953, + "learning_rate": 4.149816963643947e-06, + "loss": 0.593, + "step": 5462 + }, + { + "epoch": 0.57, + "grad_norm": 1.6991033271060485, + "learning_rate": 4.1481581092871305e-06, + "loss": 0.5303, + "step": 5463 + }, + { + "epoch": 0.57, + "grad_norm": 1.7456484367991891, + "learning_rate": 4.146499351491448e-06, + "loss": 0.5549, + "step": 5464 + }, + { + "epoch": 0.57, + "grad_norm": 1.7845115021251257, + "learning_rate": 4.144840690444931e-06, + "loss": 0.6408, + "step": 5465 + }, + { + "epoch": 0.57, + "grad_norm": 2.07574862295293, + "learning_rate": 4.143182126335594e-06, + "loss": 0.6681, + "step": 5466 + }, + { + "epoch": 0.57, + "grad_norm": 1.843927894099637, + "learning_rate": 4.1415236593514454e-06, + "loss": 0.5944, + "step": 5467 + }, + { + "epoch": 0.57, + "grad_norm": 1.7210896919710381, + "learning_rate": 4.139865289680485e-06, + "loss": 0.6904, + "step": 5468 + }, + { + "epoch": 0.57, + "grad_norm": 2.1143978026959203, + "learning_rate": 4.138207017510696e-06, + "loss": 0.7319, + "step": 5469 + }, + { + "epoch": 0.57, + "grad_norm": 1.9674427272555244, + "learning_rate": 4.136548843030052e-06, + "loss": 0.6494, + "step": 5470 + }, + { + "epoch": 0.57, + "grad_norm": 1.753820948119458, + "learning_rate": 4.134890766426521e-06, + "loss": 0.6728, + "step": 5471 + }, + { + "epoch": 0.57, + "grad_norm": 1.6638752593393304, + "learning_rate": 4.133232787888052e-06, + "loss": 0.5565, + "step": 5472 + }, + { + "epoch": 0.57, + "grad_norm": 2.1632980049827677, + "learning_rate": 4.131574907602586e-06, + "loss": 0.6462, + "step": 5473 + }, + { + "epoch": 0.57, + "grad_norm": 2.0407862239585697, + "learning_rate": 4.129917125758054e-06, + "loss": 0.746, + "step": 5474 + }, + { + "epoch": 0.57, + "grad_norm": 1.854988687102971, + "learning_rate": 4.128259442542375e-06, + "loss": 0.666, + "step": 5475 + }, + { + "epoch": 0.57, + "grad_norm": 2.0598051706252316, + "learning_rate": 4.126601858143457e-06, + "loss": 0.6856, + "step": 5476 + }, + { + "epoch": 0.57, + "grad_norm": 2.2416076777807854, + "learning_rate": 4.124944372749197e-06, + "loss": 0.7149, + "step": 5477 + }, + { + "epoch": 0.57, + "grad_norm": 1.9940483790212515, + "learning_rate": 4.12328698654748e-06, + "loss": 0.678, + "step": 5478 + }, + { + "epoch": 0.57, + "grad_norm": 1.9448515255469294, + "learning_rate": 4.12162969972618e-06, + "loss": 0.6333, + "step": 5479 + }, + { + "epoch": 0.57, + "grad_norm": 1.7150709469269039, + "learning_rate": 4.119972512473161e-06, + "loss": 0.6189, + "step": 5480 + }, + { + "epoch": 0.57, + "grad_norm": 1.8341509373262923, + "learning_rate": 4.118315424976272e-06, + "loss": 0.5073, + "step": 5481 + }, + { + "epoch": 0.57, + "grad_norm": 1.9476966860099154, + "learning_rate": 4.116658437423355e-06, + "loss": 0.5557, + "step": 5482 + }, + { + "epoch": 0.57, + "grad_norm": 1.6553886987138464, + "learning_rate": 4.115001550002241e-06, + "loss": 0.5275, + "step": 5483 + }, + { + "epoch": 0.57, + "grad_norm": 1.9821949785533268, + "learning_rate": 4.113344762900745e-06, + "loss": 0.6504, + "step": 5484 + }, + { + "epoch": 0.57, + "grad_norm": 1.7156562932709354, + "learning_rate": 4.111688076306673e-06, + "loss": 0.5742, + "step": 5485 + }, + { + "epoch": 0.57, + "grad_norm": 2.0232269740630366, + "learning_rate": 4.11003149040782e-06, + "loss": 0.6493, + "step": 5486 + }, + { + "epoch": 0.57, + "grad_norm": 1.8519773132537132, + "learning_rate": 4.108375005391972e-06, + "loss": 0.6184, + "step": 5487 + }, + { + "epoch": 0.57, + "grad_norm": 1.793916358582051, + "learning_rate": 4.106718621446899e-06, + "loss": 0.5546, + "step": 5488 + }, + { + "epoch": 0.57, + "grad_norm": 2.036030057119493, + "learning_rate": 4.105062338760361e-06, + "loss": 0.5802, + "step": 5489 + }, + { + "epoch": 0.57, + "grad_norm": 1.7927973737970935, + "learning_rate": 4.103406157520108e-06, + "loss": 0.6194, + "step": 5490 + }, + { + "epoch": 0.57, + "grad_norm": 1.6799135980861009, + "learning_rate": 4.101750077913878e-06, + "loss": 0.5536, + "step": 5491 + }, + { + "epoch": 0.57, + "grad_norm": 2.009000457580272, + "learning_rate": 4.100094100129396e-06, + "loss": 0.7814, + "step": 5492 + }, + { + "epoch": 0.57, + "grad_norm": 2.2549031104223727, + "learning_rate": 4.098438224354377e-06, + "loss": 0.628, + "step": 5493 + }, + { + "epoch": 0.57, + "grad_norm": 1.7853953706906511, + "learning_rate": 4.096782450776526e-06, + "loss": 0.6338, + "step": 5494 + }, + { + "epoch": 0.57, + "grad_norm": 1.9185594643941042, + "learning_rate": 4.09512677958353e-06, + "loss": 0.5703, + "step": 5495 + }, + { + "epoch": 0.57, + "grad_norm": 1.8102601216846244, + "learning_rate": 4.093471210963072e-06, + "loss": 0.5525, + "step": 5496 + }, + { + "epoch": 0.57, + "grad_norm": 1.9988743835333525, + "learning_rate": 4.091815745102818e-06, + "loss": 0.6161, + "step": 5497 + }, + { + "epoch": 0.57, + "grad_norm": 1.9896802093368413, + "learning_rate": 4.0901603821904264e-06, + "loss": 0.6264, + "step": 5498 + }, + { + "epoch": 0.57, + "grad_norm": 1.7811960844124857, + "learning_rate": 4.088505122413542e-06, + "loss": 0.6219, + "step": 5499 + }, + { + "epoch": 0.57, + "grad_norm": 1.642234193822947, + "learning_rate": 4.086849965959797e-06, + "loss": 0.5674, + "step": 5500 + }, + { + "epoch": 0.57, + "grad_norm": 2.1360764455341803, + "learning_rate": 4.0851949130168135e-06, + "loss": 0.763, + "step": 5501 + }, + { + "epoch": 0.57, + "grad_norm": 1.8119817360590291, + "learning_rate": 4.083539963772202e-06, + "loss": 0.6736, + "step": 5502 + }, + { + "epoch": 0.57, + "grad_norm": 2.0075224162284337, + "learning_rate": 4.081885118413557e-06, + "loss": 0.6061, + "step": 5503 + }, + { + "epoch": 0.57, + "grad_norm": 1.9394585405022449, + "learning_rate": 4.0802303771284685e-06, + "loss": 0.6544, + "step": 5504 + }, + { + "epoch": 0.57, + "grad_norm": 1.9226282044737457, + "learning_rate": 4.078575740104508e-06, + "loss": 0.6074, + "step": 5505 + }, + { + "epoch": 0.57, + "grad_norm": 1.8428113935987287, + "learning_rate": 4.07692120752924e-06, + "loss": 0.5472, + "step": 5506 + }, + { + "epoch": 0.57, + "grad_norm": 1.944596305847222, + "learning_rate": 4.075266779590214e-06, + "loss": 0.7045, + "step": 5507 + }, + { + "epoch": 0.57, + "grad_norm": 2.164562137124973, + "learning_rate": 4.073612456474969e-06, + "loss": 0.5906, + "step": 5508 + }, + { + "epoch": 0.57, + "grad_norm": 1.632993502982212, + "learning_rate": 4.071958238371036e-06, + "loss": 0.5673, + "step": 5509 + }, + { + "epoch": 0.57, + "grad_norm": 1.9024162945461678, + "learning_rate": 4.070304125465922e-06, + "loss": 0.5853, + "step": 5510 + }, + { + "epoch": 0.57, + "grad_norm": 1.9007911838792717, + "learning_rate": 4.068650117947135e-06, + "loss": 0.6144, + "step": 5511 + }, + { + "epoch": 0.57, + "grad_norm": 1.9514097246456006, + "learning_rate": 4.066996216002167e-06, + "loss": 0.6627, + "step": 5512 + }, + { + "epoch": 0.57, + "grad_norm": 2.018105134211878, + "learning_rate": 4.065342419818495e-06, + "loss": 0.6744, + "step": 5513 + }, + { + "epoch": 0.57, + "grad_norm": 1.885304977006811, + "learning_rate": 4.063688729583587e-06, + "loss": 0.7094, + "step": 5514 + }, + { + "epoch": 0.57, + "grad_norm": 2.0783420987217838, + "learning_rate": 4.0620351454848986e-06, + "loss": 0.586, + "step": 5515 + }, + { + "epoch": 0.57, + "grad_norm": 2.088674812109934, + "learning_rate": 4.0603816677098735e-06, + "loss": 0.648, + "step": 5516 + }, + { + "epoch": 0.57, + "grad_norm": 2.0529187492775622, + "learning_rate": 4.058728296445941e-06, + "loss": 0.688, + "step": 5517 + }, + { + "epoch": 0.57, + "grad_norm": 2.07021175838815, + "learning_rate": 4.057075031880521e-06, + "loss": 0.6682, + "step": 5518 + }, + { + "epoch": 0.57, + "grad_norm": 1.9584875967230313, + "learning_rate": 4.055421874201021e-06, + "loss": 0.6779, + "step": 5519 + }, + { + "epoch": 0.57, + "grad_norm": 1.9478184359951374, + "learning_rate": 4.053768823594835e-06, + "loss": 0.6618, + "step": 5520 + }, + { + "epoch": 0.57, + "grad_norm": 1.9949873298042147, + "learning_rate": 4.052115880249347e-06, + "loss": 0.5795, + "step": 5521 + }, + { + "epoch": 0.57, + "grad_norm": 1.9743169662323066, + "learning_rate": 4.050463044351927e-06, + "loss": 0.5705, + "step": 5522 + }, + { + "epoch": 0.57, + "grad_norm": 1.9749688882641023, + "learning_rate": 4.048810316089932e-06, + "loss": 0.6088, + "step": 5523 + }, + { + "epoch": 0.57, + "grad_norm": 1.8137222978800382, + "learning_rate": 4.0471576956507106e-06, + "loss": 0.6511, + "step": 5524 + }, + { + "epoch": 0.57, + "grad_norm": 2.0162638592127324, + "learning_rate": 4.045505183221594e-06, + "loss": 0.6694, + "step": 5525 + }, + { + "epoch": 0.57, + "grad_norm": 2.231644995017, + "learning_rate": 4.043852778989906e-06, + "loss": 0.7455, + "step": 5526 + }, + { + "epoch": 0.57, + "grad_norm": 2.1407694111031104, + "learning_rate": 4.042200483142955e-06, + "loss": 0.574, + "step": 5527 + }, + { + "epoch": 0.57, + "grad_norm": 1.9598142070636422, + "learning_rate": 4.040548295868039e-06, + "loss": 0.5821, + "step": 5528 + }, + { + "epoch": 0.57, + "grad_norm": 1.9740464430151743, + "learning_rate": 4.038896217352442e-06, + "loss": 0.6735, + "step": 5529 + }, + { + "epoch": 0.57, + "grad_norm": 1.6941300405779327, + "learning_rate": 4.037244247783437e-06, + "loss": 0.6544, + "step": 5530 + }, + { + "epoch": 0.57, + "grad_norm": 1.8565441269875411, + "learning_rate": 4.035592387348285e-06, + "loss": 0.6343, + "step": 5531 + }, + { + "epoch": 0.58, + "grad_norm": 2.0685840879545663, + "learning_rate": 4.033940636234233e-06, + "loss": 0.6154, + "step": 5532 + }, + { + "epoch": 0.58, + "grad_norm": 1.8535815659947346, + "learning_rate": 4.0322889946285146e-06, + "loss": 0.5411, + "step": 5533 + }, + { + "epoch": 0.58, + "grad_norm": 1.8589830851032711, + "learning_rate": 4.030637462718354e-06, + "loss": 0.5412, + "step": 5534 + }, + { + "epoch": 0.58, + "grad_norm": 1.7381406224431815, + "learning_rate": 4.028986040690963e-06, + "loss": 0.5865, + "step": 5535 + }, + { + "epoch": 0.58, + "grad_norm": 1.8576683960163518, + "learning_rate": 4.027334728733539e-06, + "loss": 0.6374, + "step": 5536 + }, + { + "epoch": 0.58, + "grad_norm": 1.9290621874870129, + "learning_rate": 4.0256835270332676e-06, + "loss": 0.6227, + "step": 5537 + }, + { + "epoch": 0.58, + "grad_norm": 1.9821450643649243, + "learning_rate": 4.024032435777321e-06, + "loss": 0.5536, + "step": 5538 + }, + { + "epoch": 0.58, + "grad_norm": 1.7135125156331292, + "learning_rate": 4.022381455152863e-06, + "loss": 0.6002, + "step": 5539 + }, + { + "epoch": 0.58, + "grad_norm": 1.9281196949391677, + "learning_rate": 4.020730585347038e-06, + "loss": 0.7845, + "step": 5540 + }, + { + "epoch": 0.58, + "grad_norm": 2.013273341855397, + "learning_rate": 4.019079826546982e-06, + "loss": 0.6069, + "step": 5541 + }, + { + "epoch": 0.58, + "grad_norm": 1.8903020483922408, + "learning_rate": 4.017429178939819e-06, + "loss": 0.6474, + "step": 5542 + }, + { + "epoch": 0.58, + "grad_norm": 1.980021206707033, + "learning_rate": 4.01577864271266e-06, + "loss": 0.6513, + "step": 5543 + }, + { + "epoch": 0.58, + "grad_norm": 1.8662321684702983, + "learning_rate": 4.0141282180526e-06, + "loss": 0.5716, + "step": 5544 + }, + { + "epoch": 0.58, + "grad_norm": 1.9032406858177675, + "learning_rate": 4.012477905146726e-06, + "loss": 0.6737, + "step": 5545 + }, + { + "epoch": 0.58, + "grad_norm": 1.951128633499513, + "learning_rate": 4.010827704182113e-06, + "loss": 0.663, + "step": 5546 + }, + { + "epoch": 0.58, + "grad_norm": 1.8364003831985076, + "learning_rate": 4.009177615345816e-06, + "loss": 0.6535, + "step": 5547 + }, + { + "epoch": 0.58, + "grad_norm": 1.7446521841903204, + "learning_rate": 4.007527638824883e-06, + "loss": 0.5733, + "step": 5548 + }, + { + "epoch": 0.58, + "grad_norm": 2.1231342278478005, + "learning_rate": 4.005877774806349e-06, + "loss": 0.6509, + "step": 5549 + }, + { + "epoch": 0.58, + "grad_norm": 1.8927153794231493, + "learning_rate": 4.004228023477236e-06, + "loss": 0.6556, + "step": 5550 + }, + { + "epoch": 0.58, + "grad_norm": 1.867765315475591, + "learning_rate": 4.002578385024552e-06, + "loss": 0.6168, + "step": 5551 + }, + { + "epoch": 0.58, + "grad_norm": 1.8285478053225521, + "learning_rate": 4.000928859635293e-06, + "loss": 0.6157, + "step": 5552 + }, + { + "epoch": 0.58, + "grad_norm": 1.7997858856785665, + "learning_rate": 3.999279447496444e-06, + "loss": 0.6565, + "step": 5553 + }, + { + "epoch": 0.58, + "grad_norm": 2.0598783545296144, + "learning_rate": 3.997630148794972e-06, + "loss": 0.579, + "step": 5554 + }, + { + "epoch": 0.58, + "grad_norm": 1.809587176377569, + "learning_rate": 3.995980963717836e-06, + "loss": 0.6207, + "step": 5555 + }, + { + "epoch": 0.58, + "grad_norm": 1.921765056383696, + "learning_rate": 3.994331892451981e-06, + "loss": 0.6272, + "step": 5556 + }, + { + "epoch": 0.58, + "grad_norm": 1.9168894037206115, + "learning_rate": 3.992682935184338e-06, + "loss": 0.552, + "step": 5557 + }, + { + "epoch": 0.58, + "grad_norm": 1.9846547269975716, + "learning_rate": 3.991034092101825e-06, + "loss": 0.6359, + "step": 5558 + }, + { + "epoch": 0.58, + "grad_norm": 1.9654747327693463, + "learning_rate": 3.9893853633913485e-06, + "loss": 0.7966, + "step": 5559 + }, + { + "epoch": 0.58, + "grad_norm": 1.8248422292882072, + "learning_rate": 3.987736749239804e-06, + "loss": 0.694, + "step": 5560 + }, + { + "epoch": 0.58, + "grad_norm": 1.7068304249889794, + "learning_rate": 3.986088249834067e-06, + "loss": 0.5808, + "step": 5561 + }, + { + "epoch": 0.58, + "grad_norm": 1.9279785384262589, + "learning_rate": 3.984439865361006e-06, + "loss": 0.6398, + "step": 5562 + }, + { + "epoch": 0.58, + "grad_norm": 1.8561935533701657, + "learning_rate": 3.982791596007474e-06, + "loss": 0.6948, + "step": 5563 + }, + { + "epoch": 0.58, + "grad_norm": 2.037470040126886, + "learning_rate": 3.981143441960312e-06, + "loss": 0.6098, + "step": 5564 + }, + { + "epoch": 0.58, + "grad_norm": 1.7159903024585526, + "learning_rate": 3.979495403406349e-06, + "loss": 0.6738, + "step": 5565 + }, + { + "epoch": 0.58, + "grad_norm": 1.9450556424814869, + "learning_rate": 3.977847480532399e-06, + "loss": 0.6524, + "step": 5566 + }, + { + "epoch": 0.58, + "grad_norm": 1.8435450586070443, + "learning_rate": 3.976199673525263e-06, + "loss": 0.6249, + "step": 5567 + }, + { + "epoch": 0.58, + "grad_norm": 1.9349741783559788, + "learning_rate": 3.974551982571732e-06, + "loss": 0.7301, + "step": 5568 + }, + { + "epoch": 0.58, + "grad_norm": 2.1214284817615923, + "learning_rate": 3.9729044078585756e-06, + "loss": 0.6747, + "step": 5569 + }, + { + "epoch": 0.58, + "grad_norm": 1.9991903575884222, + "learning_rate": 3.971256949572559e-06, + "loss": 0.6091, + "step": 5570 + }, + { + "epoch": 0.58, + "grad_norm": 1.962993073595112, + "learning_rate": 3.969609607900431e-06, + "loss": 0.5745, + "step": 5571 + }, + { + "epoch": 0.58, + "grad_norm": 2.0994401626194543, + "learning_rate": 3.967962383028927e-06, + "loss": 0.5469, + "step": 5572 + }, + { + "epoch": 0.58, + "grad_norm": 1.8638963329996499, + "learning_rate": 3.966315275144769e-06, + "loss": 0.6496, + "step": 5573 + }, + { + "epoch": 0.58, + "grad_norm": 2.040176832504217, + "learning_rate": 3.964668284434666e-06, + "loss": 0.6288, + "step": 5574 + }, + { + "epoch": 0.58, + "grad_norm": 2.022872445806577, + "learning_rate": 3.963021411085315e-06, + "loss": 0.6499, + "step": 5575 + }, + { + "epoch": 0.58, + "grad_norm": 1.9155250142977756, + "learning_rate": 3.9613746552834e-06, + "loss": 0.5525, + "step": 5576 + }, + { + "epoch": 0.58, + "grad_norm": 2.0230746897832876, + "learning_rate": 3.959728017215585e-06, + "loss": 0.5354, + "step": 5577 + }, + { + "epoch": 0.58, + "grad_norm": 2.1162250972308643, + "learning_rate": 3.958081497068528e-06, + "loss": 0.6553, + "step": 5578 + }, + { + "epoch": 0.58, + "grad_norm": 1.7692582229440683, + "learning_rate": 3.9564350950288735e-06, + "loss": 0.6553, + "step": 5579 + }, + { + "epoch": 0.58, + "grad_norm": 1.8739200142352377, + "learning_rate": 3.954788811283249e-06, + "loss": 0.6072, + "step": 5580 + }, + { + "epoch": 0.58, + "grad_norm": 1.7011178415660841, + "learning_rate": 3.953142646018269e-06, + "loss": 0.5354, + "step": 5581 + }, + { + "epoch": 0.58, + "grad_norm": 1.8492767816456872, + "learning_rate": 3.951496599420538e-06, + "loss": 0.7356, + "step": 5582 + }, + { + "epoch": 0.58, + "grad_norm": 2.0115047324799478, + "learning_rate": 3.949850671676644e-06, + "loss": 0.6773, + "step": 5583 + }, + { + "epoch": 0.58, + "grad_norm": 2.068672760742573, + "learning_rate": 3.948204862973161e-06, + "loss": 0.6645, + "step": 5584 + }, + { + "epoch": 0.58, + "grad_norm": 1.9316885249397973, + "learning_rate": 3.946559173496653e-06, + "loss": 0.7602, + "step": 5585 + }, + { + "epoch": 0.58, + "grad_norm": 1.813002548695951, + "learning_rate": 3.944913603433667e-06, + "loss": 0.6286, + "step": 5586 + }, + { + "epoch": 0.58, + "grad_norm": 1.7041134204449735, + "learning_rate": 3.943268152970737e-06, + "loss": 0.6044, + "step": 5587 + }, + { + "epoch": 0.58, + "grad_norm": 1.9429926106788047, + "learning_rate": 3.941622822294385e-06, + "loss": 0.7283, + "step": 5588 + }, + { + "epoch": 0.58, + "grad_norm": 1.9588914791860121, + "learning_rate": 3.939977611591119e-06, + "loss": 0.5775, + "step": 5589 + }, + { + "epoch": 0.58, + "grad_norm": 1.8812887191683487, + "learning_rate": 3.938332521047434e-06, + "loss": 0.6613, + "step": 5590 + }, + { + "epoch": 0.58, + "grad_norm": 1.9635352056032394, + "learning_rate": 3.936687550849807e-06, + "loss": 0.6232, + "step": 5591 + }, + { + "epoch": 0.58, + "grad_norm": 2.0963605977584754, + "learning_rate": 3.935042701184707e-06, + "loss": 0.661, + "step": 5592 + }, + { + "epoch": 0.58, + "grad_norm": 1.8334345997394716, + "learning_rate": 3.9333979722385865e-06, + "loss": 0.6448, + "step": 5593 + }, + { + "epoch": 0.58, + "grad_norm": 1.802952264613462, + "learning_rate": 3.931753364197886e-06, + "loss": 0.6125, + "step": 5594 + }, + { + "epoch": 0.58, + "grad_norm": 2.0955138413023127, + "learning_rate": 3.93010887724903e-06, + "loss": 0.6208, + "step": 5595 + }, + { + "epoch": 0.58, + "grad_norm": 1.9727861271093967, + "learning_rate": 3.928464511578432e-06, + "loss": 0.6802, + "step": 5596 + }, + { + "epoch": 0.58, + "grad_norm": 1.7483375199702063, + "learning_rate": 3.926820267372489e-06, + "loss": 0.5174, + "step": 5597 + }, + { + "epoch": 0.58, + "grad_norm": 2.079593531305218, + "learning_rate": 3.925176144817587e-06, + "loss": 0.6895, + "step": 5598 + }, + { + "epoch": 0.58, + "grad_norm": 2.115779088659057, + "learning_rate": 3.9235321441000925e-06, + "loss": 0.5738, + "step": 5599 + }, + { + "epoch": 0.58, + "grad_norm": 2.0251026439895456, + "learning_rate": 3.921888265406366e-06, + "loss": 0.6485, + "step": 5600 + }, + { + "epoch": 0.58, + "grad_norm": 1.7558793985516596, + "learning_rate": 3.92024450892275e-06, + "loss": 0.5925, + "step": 5601 + }, + { + "epoch": 0.58, + "grad_norm": 1.860407325614981, + "learning_rate": 3.918600874835573e-06, + "loss": 0.5922, + "step": 5602 + }, + { + "epoch": 0.58, + "grad_norm": 1.981471706438414, + "learning_rate": 3.9169573633311505e-06, + "loss": 0.6904, + "step": 5603 + }, + { + "epoch": 0.58, + "grad_norm": 1.8167178665462902, + "learning_rate": 3.915313974595784e-06, + "loss": 0.5259, + "step": 5604 + }, + { + "epoch": 0.58, + "grad_norm": 2.487307591563184, + "learning_rate": 3.913670708815763e-06, + "loss": 0.5648, + "step": 5605 + }, + { + "epoch": 0.58, + "grad_norm": 1.6661233435057807, + "learning_rate": 3.912027566177358e-06, + "loss": 0.5738, + "step": 5606 + }, + { + "epoch": 0.58, + "grad_norm": 2.049360076678344, + "learning_rate": 3.910384546866828e-06, + "loss": 0.6835, + "step": 5607 + }, + { + "epoch": 0.58, + "grad_norm": 2.0422476557675906, + "learning_rate": 3.9087416510704206e-06, + "loss": 0.7541, + "step": 5608 + }, + { + "epoch": 0.58, + "grad_norm": 2.0434482018310782, + "learning_rate": 3.907098878974367e-06, + "loss": 0.6639, + "step": 5609 + }, + { + "epoch": 0.58, + "grad_norm": 2.1404651279397346, + "learning_rate": 3.905456230764886e-06, + "loss": 0.6493, + "step": 5610 + }, + { + "epoch": 0.58, + "grad_norm": 2.1477378402307594, + "learning_rate": 3.903813706628179e-06, + "loss": 0.6731, + "step": 5611 + }, + { + "epoch": 0.58, + "grad_norm": 1.6555033642042376, + "learning_rate": 3.902171306750436e-06, + "loss": 0.6208, + "step": 5612 + }, + { + "epoch": 0.58, + "grad_norm": 1.9267343051568924, + "learning_rate": 3.900529031317836e-06, + "loss": 0.5961, + "step": 5613 + }, + { + "epoch": 0.58, + "grad_norm": 2.0221313434196153, + "learning_rate": 3.898886880516533e-06, + "loss": 0.5812, + "step": 5614 + }, + { + "epoch": 0.58, + "grad_norm": 1.9915391709592674, + "learning_rate": 3.89724485453268e-06, + "loss": 0.5604, + "step": 5615 + }, + { + "epoch": 0.58, + "grad_norm": 1.9024492756973865, + "learning_rate": 3.895602953552408e-06, + "loss": 0.6102, + "step": 5616 + }, + { + "epoch": 0.58, + "grad_norm": 2.0749917123549704, + "learning_rate": 3.893961177761835e-06, + "loss": 0.6328, + "step": 5617 + }, + { + "epoch": 0.58, + "grad_norm": 1.6907876354852427, + "learning_rate": 3.8923195273470686e-06, + "loss": 0.6116, + "step": 5618 + }, + { + "epoch": 0.58, + "grad_norm": 1.9039400354395462, + "learning_rate": 3.890678002494196e-06, + "loss": 0.693, + "step": 5619 + }, + { + "epoch": 0.58, + "grad_norm": 2.109421413941969, + "learning_rate": 3.889036603389297e-06, + "loss": 0.6047, + "step": 5620 + }, + { + "epoch": 0.58, + "grad_norm": 1.7020168441413386, + "learning_rate": 3.887395330218429e-06, + "loss": 0.6034, + "step": 5621 + }, + { + "epoch": 0.58, + "grad_norm": 1.9585934375817815, + "learning_rate": 3.885754183167642e-06, + "loss": 0.5768, + "step": 5622 + }, + { + "epoch": 0.58, + "grad_norm": 2.21563878724902, + "learning_rate": 3.884113162422971e-06, + "loss": 0.6703, + "step": 5623 + }, + { + "epoch": 0.58, + "grad_norm": 1.8662332887713378, + "learning_rate": 3.882472268170432e-06, + "loss": 0.5811, + "step": 5624 + }, + { + "epoch": 0.58, + "grad_norm": 2.0872931585370282, + "learning_rate": 3.8808315005960305e-06, + "loss": 0.6711, + "step": 5625 + }, + { + "epoch": 0.58, + "grad_norm": 1.8026638556504297, + "learning_rate": 3.879190859885758e-06, + "loss": 0.6181, + "step": 5626 + }, + { + "epoch": 0.58, + "grad_norm": 1.7741256125064448, + "learning_rate": 3.877550346225591e-06, + "loss": 0.7367, + "step": 5627 + }, + { + "epoch": 0.59, + "grad_norm": 2.012507567318553, + "learning_rate": 3.8759099598014895e-06, + "loss": 0.6178, + "step": 5628 + }, + { + "epoch": 0.59, + "grad_norm": 2.0531286926195667, + "learning_rate": 3.8742697007994e-06, + "loss": 0.6073, + "step": 5629 + }, + { + "epoch": 0.59, + "grad_norm": 1.8202572945880635, + "learning_rate": 3.872629569405257e-06, + "loss": 0.6469, + "step": 5630 + }, + { + "epoch": 0.59, + "grad_norm": 1.796969026042274, + "learning_rate": 3.870989565804979e-06, + "loss": 0.6439, + "step": 5631 + }, + { + "epoch": 0.59, + "grad_norm": 2.1220990590037574, + "learning_rate": 3.869349690184468e-06, + "loss": 0.6905, + "step": 5632 + }, + { + "epoch": 0.59, + "grad_norm": 2.0824399282601846, + "learning_rate": 3.867709942729613e-06, + "loss": 0.693, + "step": 5633 + }, + { + "epoch": 0.59, + "grad_norm": 1.7845464618082878, + "learning_rate": 3.866070323626291e-06, + "loss": 0.5927, + "step": 5634 + }, + { + "epoch": 0.59, + "grad_norm": 1.9299347986415278, + "learning_rate": 3.864430833060363e-06, + "loss": 0.6342, + "step": 5635 + }, + { + "epoch": 0.59, + "grad_norm": 1.7117444670691249, + "learning_rate": 3.86279147121767e-06, + "loss": 0.6953, + "step": 5636 + }, + { + "epoch": 0.59, + "grad_norm": 1.828004952003326, + "learning_rate": 3.8611522382840476e-06, + "loss": 0.6119, + "step": 5637 + }, + { + "epoch": 0.59, + "grad_norm": 1.8602018760685013, + "learning_rate": 3.859513134445308e-06, + "loss": 0.6153, + "step": 5638 + }, + { + "epoch": 0.59, + "grad_norm": 1.8653514922373067, + "learning_rate": 3.857874159887256e-06, + "loss": 0.5868, + "step": 5639 + }, + { + "epoch": 0.59, + "grad_norm": 2.06982897795329, + "learning_rate": 3.856235314795678e-06, + "loss": 0.6817, + "step": 5640 + }, + { + "epoch": 0.59, + "grad_norm": 2.1179240984389147, + "learning_rate": 3.8545965993563454e-06, + "loss": 0.6013, + "step": 5641 + }, + { + "epoch": 0.59, + "grad_norm": 1.819858665984067, + "learning_rate": 3.85295801375502e-06, + "loss": 0.5963, + "step": 5642 + }, + { + "epoch": 0.59, + "grad_norm": 1.9063952687832015, + "learning_rate": 3.851319558177439e-06, + "loss": 0.6431, + "step": 5643 + }, + { + "epoch": 0.59, + "grad_norm": 2.191630045938788, + "learning_rate": 3.8496812328093335e-06, + "loss": 0.6341, + "step": 5644 + }, + { + "epoch": 0.59, + "grad_norm": 2.070202153822423, + "learning_rate": 3.848043037836417e-06, + "loss": 0.6904, + "step": 5645 + }, + { + "epoch": 0.59, + "grad_norm": 2.223577708768498, + "learning_rate": 3.846404973444388e-06, + "loss": 0.6196, + "step": 5646 + }, + { + "epoch": 0.59, + "grad_norm": 1.8708658636793065, + "learning_rate": 3.8447670398189305e-06, + "loss": 0.6542, + "step": 5647 + }, + { + "epoch": 0.59, + "grad_norm": 1.9332690937542027, + "learning_rate": 3.843129237145713e-06, + "loss": 0.6483, + "step": 5648 + }, + { + "epoch": 0.59, + "grad_norm": 1.8760128233078004, + "learning_rate": 3.841491565610393e-06, + "loss": 0.6364, + "step": 5649 + }, + { + "epoch": 0.59, + "grad_norm": 1.833179006583722, + "learning_rate": 3.839854025398606e-06, + "loss": 0.5539, + "step": 5650 + }, + { + "epoch": 0.59, + "grad_norm": 1.7178320341789401, + "learning_rate": 3.838216616695977e-06, + "loss": 0.5955, + "step": 5651 + }, + { + "epoch": 0.59, + "grad_norm": 2.043925966235596, + "learning_rate": 3.836579339688116e-06, + "loss": 0.5994, + "step": 5652 + }, + { + "epoch": 0.59, + "grad_norm": 2.013863381361781, + "learning_rate": 3.83494219456062e-06, + "loss": 0.7002, + "step": 5653 + }, + { + "epoch": 0.59, + "grad_norm": 1.7479848563782898, + "learning_rate": 3.833305181499065e-06, + "loss": 0.6117, + "step": 5654 + }, + { + "epoch": 0.59, + "grad_norm": 1.843085118521347, + "learning_rate": 3.831668300689019e-06, + "loss": 0.5921, + "step": 5655 + }, + { + "epoch": 0.59, + "grad_norm": 1.8414043257902304, + "learning_rate": 3.8300315523160295e-06, + "loss": 0.6534, + "step": 5656 + }, + { + "epoch": 0.59, + "grad_norm": 1.8521076862406898, + "learning_rate": 3.828394936565633e-06, + "loss": 0.6813, + "step": 5657 + }, + { + "epoch": 0.59, + "grad_norm": 1.964166539127327, + "learning_rate": 3.826758453623348e-06, + "loss": 0.5947, + "step": 5658 + }, + { + "epoch": 0.59, + "grad_norm": 1.7844201494772378, + "learning_rate": 3.825122103674679e-06, + "loss": 0.6167, + "step": 5659 + }, + { + "epoch": 0.59, + "grad_norm": 1.7340840036629404, + "learning_rate": 3.823485886905116e-06, + "loss": 0.6266, + "step": 5660 + }, + { + "epoch": 0.59, + "grad_norm": 1.8099618683785281, + "learning_rate": 3.821849803500134e-06, + "loss": 0.6138, + "step": 5661 + }, + { + "epoch": 0.59, + "grad_norm": 1.944243244863238, + "learning_rate": 3.820213853645193e-06, + "loss": 0.6171, + "step": 5662 + }, + { + "epoch": 0.59, + "grad_norm": 1.8178877996961225, + "learning_rate": 3.8185780375257356e-06, + "loss": 0.6275, + "step": 5663 + }, + { + "epoch": 0.59, + "grad_norm": 1.9352643805301815, + "learning_rate": 3.816942355327191e-06, + "loss": 0.5816, + "step": 5664 + }, + { + "epoch": 0.59, + "grad_norm": 1.8325507491789237, + "learning_rate": 3.815306807234974e-06, + "loss": 0.6927, + "step": 5665 + }, + { + "epoch": 0.59, + "grad_norm": 1.837901350432591, + "learning_rate": 3.8136713934344825e-06, + "loss": 0.6256, + "step": 5666 + }, + { + "epoch": 0.59, + "grad_norm": 2.111254838340173, + "learning_rate": 3.812036114111101e-06, + "loss": 0.5796, + "step": 5667 + }, + { + "epoch": 0.59, + "grad_norm": 1.8187308591668736, + "learning_rate": 3.8104009694501963e-06, + "loss": 0.6156, + "step": 5668 + }, + { + "epoch": 0.59, + "grad_norm": 1.8485495400562826, + "learning_rate": 3.808765959637123e-06, + "loss": 0.7708, + "step": 5669 + }, + { + "epoch": 0.59, + "grad_norm": 1.8249275555676456, + "learning_rate": 3.8071310848572173e-06, + "loss": 0.5503, + "step": 5670 + }, + { + "epoch": 0.59, + "grad_norm": 2.1032973373284682, + "learning_rate": 3.8054963452958025e-06, + "loss": 0.6755, + "step": 5671 + }, + { + "epoch": 0.59, + "grad_norm": 1.9971403602282247, + "learning_rate": 3.8038617411381876e-06, + "loss": 0.6573, + "step": 5672 + }, + { + "epoch": 0.59, + "grad_norm": 1.795638769861292, + "learning_rate": 3.80222727256966e-06, + "loss": 0.6491, + "step": 5673 + }, + { + "epoch": 0.59, + "grad_norm": 1.871045292034552, + "learning_rate": 3.800592939775498e-06, + "loss": 0.5779, + "step": 5674 + }, + { + "epoch": 0.59, + "grad_norm": 2.1159675641961306, + "learning_rate": 3.798958742940963e-06, + "loss": 0.6505, + "step": 5675 + }, + { + "epoch": 0.59, + "grad_norm": 1.9957808122960192, + "learning_rate": 3.7973246822513e-06, + "loss": 0.6975, + "step": 5676 + }, + { + "epoch": 0.59, + "grad_norm": 1.8929123275811768, + "learning_rate": 3.795690757891739e-06, + "loss": 0.6495, + "step": 5677 + }, + { + "epoch": 0.59, + "grad_norm": 1.8000004498952757, + "learning_rate": 3.794056970047495e-06, + "loss": 0.6205, + "step": 5678 + }, + { + "epoch": 0.59, + "grad_norm": 2.016000272727104, + "learning_rate": 3.7924233189037697e-06, + "loss": 0.6353, + "step": 5679 + }, + { + "epoch": 0.59, + "grad_norm": 1.9150551526282293, + "learning_rate": 3.7907898046457416e-06, + "loss": 0.6361, + "step": 5680 + }, + { + "epoch": 0.59, + "grad_norm": 1.905395762182365, + "learning_rate": 3.789156427458581e-06, + "loss": 0.6509, + "step": 5681 + }, + { + "epoch": 0.59, + "grad_norm": 2.0270735773997557, + "learning_rate": 3.787523187527442e-06, + "loss": 0.6225, + "step": 5682 + }, + { + "epoch": 0.59, + "grad_norm": 1.8523260107443824, + "learning_rate": 3.7858900850374596e-06, + "loss": 0.5487, + "step": 5683 + }, + { + "epoch": 0.59, + "grad_norm": 1.8400704301179405, + "learning_rate": 3.7842571201737568e-06, + "loss": 0.6822, + "step": 5684 + }, + { + "epoch": 0.59, + "grad_norm": 2.213633936194134, + "learning_rate": 3.7826242931214386e-06, + "loss": 0.6599, + "step": 5685 + }, + { + "epoch": 0.59, + "grad_norm": 1.847807446734992, + "learning_rate": 3.780991604065598e-06, + "loss": 0.5762, + "step": 5686 + }, + { + "epoch": 0.59, + "grad_norm": 1.7717364388265258, + "learning_rate": 3.7793590531913047e-06, + "loss": 0.5142, + "step": 5687 + }, + { + "epoch": 0.59, + "grad_norm": 2.06824980539038, + "learning_rate": 3.777726640683621e-06, + "loss": 0.6043, + "step": 5688 + }, + { + "epoch": 0.59, + "grad_norm": 2.0951430699266713, + "learning_rate": 3.7760943667275884e-06, + "loss": 0.6901, + "step": 5689 + }, + { + "epoch": 0.59, + "grad_norm": 1.9928453372788413, + "learning_rate": 3.7744622315082358e-06, + "loss": 0.5764, + "step": 5690 + }, + { + "epoch": 0.59, + "grad_norm": 2.0971652173091364, + "learning_rate": 3.7728302352105743e-06, + "loss": 0.6022, + "step": 5691 + }, + { + "epoch": 0.59, + "grad_norm": 2.0164644631521598, + "learning_rate": 3.7711983780196006e-06, + "loss": 0.5844, + "step": 5692 + }, + { + "epoch": 0.59, + "grad_norm": 2.121542379135994, + "learning_rate": 3.7695666601202944e-06, + "loss": 0.7505, + "step": 5693 + }, + { + "epoch": 0.59, + "grad_norm": 1.7159088934028508, + "learning_rate": 3.767935081697622e-06, + "loss": 0.5846, + "step": 5694 + }, + { + "epoch": 0.59, + "grad_norm": 1.7520439948117041, + "learning_rate": 3.766303642936529e-06, + "loss": 0.5672, + "step": 5695 + }, + { + "epoch": 0.59, + "grad_norm": 1.7646205963873725, + "learning_rate": 3.76467234402195e-06, + "loss": 0.699, + "step": 5696 + }, + { + "epoch": 0.59, + "grad_norm": 1.7997889644069125, + "learning_rate": 3.763041185138802e-06, + "loss": 0.6302, + "step": 5697 + }, + { + "epoch": 0.59, + "grad_norm": 1.7988871525135666, + "learning_rate": 3.7614101664719866e-06, + "loss": 0.651, + "step": 5698 + }, + { + "epoch": 0.59, + "grad_norm": 1.8411621906976277, + "learning_rate": 3.759779288206388e-06, + "loss": 0.6294, + "step": 5699 + }, + { + "epoch": 0.59, + "grad_norm": 1.8931612809567004, + "learning_rate": 3.758148550526877e-06, + "loss": 0.5979, + "step": 5700 + }, + { + "epoch": 0.59, + "grad_norm": 1.8264053952182302, + "learning_rate": 3.7565179536183067e-06, + "loss": 0.6323, + "step": 5701 + }, + { + "epoch": 0.59, + "grad_norm": 1.8994850468593727, + "learning_rate": 3.7548874976655126e-06, + "loss": 0.6624, + "step": 5702 + }, + { + "epoch": 0.59, + "grad_norm": 1.8525791579411612, + "learning_rate": 3.7532571828533175e-06, + "loss": 0.5778, + "step": 5703 + }, + { + "epoch": 0.59, + "grad_norm": 1.914265211398974, + "learning_rate": 3.751627009366527e-06, + "loss": 0.6037, + "step": 5704 + }, + { + "epoch": 0.59, + "grad_norm": 1.9860797713158134, + "learning_rate": 3.7499969773899304e-06, + "loss": 0.6279, + "step": 5705 + }, + { + "epoch": 0.59, + "grad_norm": 2.4453142555832756, + "learning_rate": 3.748367087108301e-06, + "loss": 0.6271, + "step": 5706 + }, + { + "epoch": 0.59, + "grad_norm": 2.3538812133651965, + "learning_rate": 3.7467373387063973e-06, + "loss": 0.715, + "step": 5707 + }, + { + "epoch": 0.59, + "grad_norm": 2.0233372795899656, + "learning_rate": 3.7451077323689587e-06, + "loss": 0.5361, + "step": 5708 + }, + { + "epoch": 0.59, + "grad_norm": 1.8810865910977235, + "learning_rate": 3.7434782682807137e-06, + "loss": 0.6368, + "step": 5709 + }, + { + "epoch": 0.59, + "grad_norm": 1.9864037253071856, + "learning_rate": 3.7418489466263663e-06, + "loss": 0.6532, + "step": 5710 + }, + { + "epoch": 0.59, + "grad_norm": 1.7612719043231966, + "learning_rate": 3.740219767590613e-06, + "loss": 0.6277, + "step": 5711 + }, + { + "epoch": 0.59, + "grad_norm": 1.9914586483658314, + "learning_rate": 3.738590731358129e-06, + "loss": 0.6322, + "step": 5712 + }, + { + "epoch": 0.59, + "grad_norm": 2.0569132735077074, + "learning_rate": 3.736961838113575e-06, + "loss": 0.5606, + "step": 5713 + }, + { + "epoch": 0.59, + "grad_norm": 1.7802065674190661, + "learning_rate": 3.7353330880415963e-06, + "loss": 0.5631, + "step": 5714 + }, + { + "epoch": 0.59, + "grad_norm": 1.919709196487885, + "learning_rate": 3.7337044813268204e-06, + "loss": 0.6351, + "step": 5715 + }, + { + "epoch": 0.59, + "grad_norm": 1.805335375870117, + "learning_rate": 3.732076018153861e-06, + "loss": 0.5069, + "step": 5716 + }, + { + "epoch": 0.59, + "grad_norm": 1.9844777052067826, + "learning_rate": 3.73044769870731e-06, + "loss": 0.6736, + "step": 5717 + }, + { + "epoch": 0.59, + "grad_norm": 1.9877835438191704, + "learning_rate": 3.728819523171748e-06, + "loss": 0.6128, + "step": 5718 + }, + { + "epoch": 0.59, + "grad_norm": 1.8767135977824283, + "learning_rate": 3.7271914917317397e-06, + "loss": 0.7063, + "step": 5719 + }, + { + "epoch": 0.59, + "grad_norm": 1.979452378878533, + "learning_rate": 3.7255636045718295e-06, + "loss": 0.7067, + "step": 5720 + }, + { + "epoch": 0.59, + "grad_norm": 1.88533991292467, + "learning_rate": 3.723935861876549e-06, + "loss": 0.6882, + "step": 5721 + }, + { + "epoch": 0.59, + "grad_norm": 2.027102272390913, + "learning_rate": 3.722308263830412e-06, + "loss": 0.659, + "step": 5722 + }, + { + "epoch": 0.59, + "grad_norm": 2.023782394536249, + "learning_rate": 3.720680810617917e-06, + "loss": 0.6214, + "step": 5723 + }, + { + "epoch": 0.6, + "grad_norm": 2.229036069263612, + "learning_rate": 3.719053502423543e-06, + "loss": 0.6, + "step": 5724 + }, + { + "epoch": 0.6, + "grad_norm": 1.8634729871510112, + "learning_rate": 3.7174263394317565e-06, + "loss": 0.5997, + "step": 5725 + }, + { + "epoch": 0.6, + "grad_norm": 2.162803651867326, + "learning_rate": 3.715799321827004e-06, + "loss": 0.7181, + "step": 5726 + }, + { + "epoch": 0.6, + "grad_norm": 1.7501057047105535, + "learning_rate": 3.714172449793718e-06, + "loss": 0.6726, + "step": 5727 + }, + { + "epoch": 0.6, + "grad_norm": 1.821385233254919, + "learning_rate": 3.7125457235163144e-06, + "loss": 0.6426, + "step": 5728 + }, + { + "epoch": 0.6, + "grad_norm": 1.87698360403373, + "learning_rate": 3.7109191431791902e-06, + "loss": 0.5506, + "step": 5729 + }, + { + "epoch": 0.6, + "grad_norm": 1.9541379681456583, + "learning_rate": 3.7092927089667293e-06, + "loss": 0.6531, + "step": 5730 + }, + { + "epoch": 0.6, + "grad_norm": 1.9558109805930095, + "learning_rate": 3.7076664210632972e-06, + "loss": 0.665, + "step": 5731 + }, + { + "epoch": 0.6, + "grad_norm": 1.8327803792621096, + "learning_rate": 3.7060402796532414e-06, + "loss": 0.5689, + "step": 5732 + }, + { + "epoch": 0.6, + "grad_norm": 1.7268754577479737, + "learning_rate": 3.7044142849208953e-06, + "loss": 0.6506, + "step": 5733 + }, + { + "epoch": 0.6, + "grad_norm": 1.8842322171157952, + "learning_rate": 3.7027884370505753e-06, + "loss": 0.6301, + "step": 5734 + }, + { + "epoch": 0.6, + "grad_norm": 1.7694469325931714, + "learning_rate": 3.701162736226579e-06, + "loss": 0.5931, + "step": 5735 + }, + { + "epoch": 0.6, + "grad_norm": 1.9551315711768495, + "learning_rate": 3.699537182633189e-06, + "loss": 0.6189, + "step": 5736 + }, + { + "epoch": 0.6, + "grad_norm": 2.0096838711866436, + "learning_rate": 3.6979117764546735e-06, + "loss": 0.7111, + "step": 5737 + }, + { + "epoch": 0.6, + "grad_norm": 1.688583682617302, + "learning_rate": 3.6962865178752804e-06, + "loss": 0.5609, + "step": 5738 + }, + { + "epoch": 0.6, + "grad_norm": 1.8099317008081695, + "learning_rate": 3.69466140707924e-06, + "loss": 0.7394, + "step": 5739 + }, + { + "epoch": 0.6, + "grad_norm": 2.0311676895133384, + "learning_rate": 3.6930364442507693e-06, + "loss": 0.6102, + "step": 5740 + }, + { + "epoch": 0.6, + "grad_norm": 1.953417217917916, + "learning_rate": 3.6914116295740678e-06, + "loss": 0.6152, + "step": 5741 + }, + { + "epoch": 0.6, + "grad_norm": 1.871734797375731, + "learning_rate": 3.6897869632333157e-06, + "loss": 0.554, + "step": 5742 + }, + { + "epoch": 0.6, + "grad_norm": 1.9371609557756013, + "learning_rate": 3.6881624454126797e-06, + "loss": 0.6307, + "step": 5743 + }, + { + "epoch": 0.6, + "grad_norm": 1.9869409405219924, + "learning_rate": 3.686538076296307e-06, + "loss": 0.6393, + "step": 5744 + }, + { + "epoch": 0.6, + "grad_norm": 2.1734737870284913, + "learning_rate": 3.6849138560683305e-06, + "loss": 0.6261, + "step": 5745 + }, + { + "epoch": 0.6, + "grad_norm": 1.9689826803452837, + "learning_rate": 3.683289784912866e-06, + "loss": 0.5688, + "step": 5746 + }, + { + "epoch": 0.6, + "grad_norm": 1.8841430860004258, + "learning_rate": 3.681665863014008e-06, + "loss": 0.5939, + "step": 5747 + }, + { + "epoch": 0.6, + "grad_norm": 2.068574065898926, + "learning_rate": 3.6800420905558378e-06, + "loss": 0.6049, + "step": 5748 + }, + { + "epoch": 0.6, + "grad_norm": 1.8856912763649123, + "learning_rate": 3.6784184677224204e-06, + "loss": 0.6656, + "step": 5749 + }, + { + "epoch": 0.6, + "grad_norm": 2.067502585225236, + "learning_rate": 3.6767949946978026e-06, + "loss": 0.6474, + "step": 5750 + }, + { + "epoch": 0.6, + "grad_norm": 2.041166249135522, + "learning_rate": 3.6751716716660146e-06, + "loss": 0.5992, + "step": 5751 + }, + { + "epoch": 0.6, + "grad_norm": 1.934731123565553, + "learning_rate": 3.673548498811068e-06, + "loss": 0.6147, + "step": 5752 + }, + { + "epoch": 0.6, + "grad_norm": 1.7182470344213379, + "learning_rate": 3.671925476316962e-06, + "loss": 0.6081, + "step": 5753 + }, + { + "epoch": 0.6, + "grad_norm": 2.2804452109132676, + "learning_rate": 3.6703026043676715e-06, + "loss": 0.7114, + "step": 5754 + }, + { + "epoch": 0.6, + "grad_norm": 2.216666784325599, + "learning_rate": 3.6686798831471594e-06, + "loss": 0.6938, + "step": 5755 + }, + { + "epoch": 0.6, + "grad_norm": 1.7060098887404596, + "learning_rate": 3.6670573128393704e-06, + "loss": 0.562, + "step": 5756 + }, + { + "epoch": 0.6, + "grad_norm": 1.781597914983373, + "learning_rate": 3.6654348936282324e-06, + "loss": 0.6731, + "step": 5757 + }, + { + "epoch": 0.6, + "grad_norm": 1.880276423977131, + "learning_rate": 3.663812625697657e-06, + "loss": 0.6672, + "step": 5758 + }, + { + "epoch": 0.6, + "grad_norm": 2.0046983496227435, + "learning_rate": 3.6621905092315357e-06, + "loss": 0.6595, + "step": 5759 + }, + { + "epoch": 0.6, + "grad_norm": 1.8122835785911924, + "learning_rate": 3.6605685444137463e-06, + "loss": 0.6348, + "step": 5760 + }, + { + "epoch": 0.6, + "grad_norm": 1.8979209319451233, + "learning_rate": 3.658946731428147e-06, + "loss": 0.6303, + "step": 5761 + }, + { + "epoch": 0.6, + "grad_norm": 1.9871359315563444, + "learning_rate": 3.6573250704585783e-06, + "loss": 0.673, + "step": 5762 + }, + { + "epoch": 0.6, + "grad_norm": 2.174620750981435, + "learning_rate": 3.655703561688867e-06, + "loss": 0.7261, + "step": 5763 + }, + { + "epoch": 0.6, + "grad_norm": 1.9509384450832072, + "learning_rate": 3.6540822053028185e-06, + "loss": 0.603, + "step": 5764 + }, + { + "epoch": 0.6, + "grad_norm": 2.123960386898783, + "learning_rate": 3.6524610014842234e-06, + "loss": 0.6584, + "step": 5765 + }, + { + "epoch": 0.6, + "grad_norm": 1.8173412461480742, + "learning_rate": 3.6508399504168547e-06, + "loss": 0.6287, + "step": 5766 + }, + { + "epoch": 0.6, + "grad_norm": 1.9404913184513584, + "learning_rate": 3.6492190522844673e-06, + "loss": 0.656, + "step": 5767 + }, + { + "epoch": 0.6, + "grad_norm": 1.7743176436764714, + "learning_rate": 3.647598307270801e-06, + "loss": 0.5844, + "step": 5768 + }, + { + "epoch": 0.6, + "grad_norm": 1.8949196446662082, + "learning_rate": 3.6459777155595733e-06, + "loss": 0.6704, + "step": 5769 + }, + { + "epoch": 0.6, + "grad_norm": 2.2475526530936225, + "learning_rate": 3.64435727733449e-06, + "loss": 0.6862, + "step": 5770 + }, + { + "epoch": 0.6, + "grad_norm": 1.9168800541401423, + "learning_rate": 3.6427369927792354e-06, + "loss": 0.5947, + "step": 5771 + }, + { + "epoch": 0.6, + "grad_norm": 1.9223629195860992, + "learning_rate": 3.6411168620774795e-06, + "loss": 0.5721, + "step": 5772 + }, + { + "epoch": 0.6, + "grad_norm": 1.9231743963862156, + "learning_rate": 3.639496885412872e-06, + "loss": 0.5919, + "step": 5773 + }, + { + "epoch": 0.6, + "grad_norm": 1.6447425060429046, + "learning_rate": 3.6378770629690476e-06, + "loss": 0.581, + "step": 5774 + }, + { + "epoch": 0.6, + "grad_norm": 2.0694344031077168, + "learning_rate": 3.6362573949296242e-06, + "loss": 0.586, + "step": 5775 + }, + { + "epoch": 0.6, + "grad_norm": 1.7199178479200063, + "learning_rate": 3.634637881478196e-06, + "loss": 0.6447, + "step": 5776 + }, + { + "epoch": 0.6, + "grad_norm": 2.1261329402743763, + "learning_rate": 3.633018522798346e-06, + "loss": 0.5911, + "step": 5777 + }, + { + "epoch": 0.6, + "grad_norm": 2.0734597415985627, + "learning_rate": 3.6313993190736375e-06, + "loss": 0.6408, + "step": 5778 + }, + { + "epoch": 0.6, + "grad_norm": 2.0006256078542535, + "learning_rate": 3.629780270487617e-06, + "loss": 0.6043, + "step": 5779 + }, + { + "epoch": 0.6, + "grad_norm": 1.9745647927432075, + "learning_rate": 3.628161377223814e-06, + "loss": 0.7071, + "step": 5780 + }, + { + "epoch": 0.6, + "grad_norm": 1.6602642905256981, + "learning_rate": 3.626542639465738e-06, + "loss": 0.5602, + "step": 5781 + }, + { + "epoch": 0.6, + "grad_norm": 1.9001677004630113, + "learning_rate": 3.624924057396883e-06, + "loss": 0.6861, + "step": 5782 + }, + { + "epoch": 0.6, + "grad_norm": 2.0813915739240447, + "learning_rate": 3.6233056312007226e-06, + "loss": 0.7129, + "step": 5783 + }, + { + "epoch": 0.6, + "grad_norm": 2.0518818520506477, + "learning_rate": 3.6216873610607155e-06, + "loss": 0.5889, + "step": 5784 + }, + { + "epoch": 0.6, + "grad_norm": 1.726267024963287, + "learning_rate": 3.620069247160303e-06, + "loss": 0.6046, + "step": 5785 + }, + { + "epoch": 0.6, + "grad_norm": 1.6789114214946894, + "learning_rate": 3.618451289682905e-06, + "loss": 0.6614, + "step": 5786 + }, + { + "epoch": 0.6, + "grad_norm": 1.8891420844312576, + "learning_rate": 3.6168334888119295e-06, + "loss": 0.604, + "step": 5787 + }, + { + "epoch": 0.6, + "grad_norm": 1.6271989700635565, + "learning_rate": 3.6152158447307607e-06, + "loss": 0.5604, + "step": 5788 + }, + { + "epoch": 0.6, + "grad_norm": 1.960085702524464, + "learning_rate": 3.6135983576227694e-06, + "loss": 0.7115, + "step": 5789 + }, + { + "epoch": 0.6, + "grad_norm": 2.055291396604353, + "learning_rate": 3.6119810276713085e-06, + "loss": 0.6249, + "step": 5790 + }, + { + "epoch": 0.6, + "grad_norm": 1.8777115393478256, + "learning_rate": 3.6103638550597074e-06, + "loss": 0.6346, + "step": 5791 + }, + { + "epoch": 0.6, + "grad_norm": 1.8694877602773852, + "learning_rate": 3.6087468399712842e-06, + "loss": 0.6529, + "step": 5792 + }, + { + "epoch": 0.6, + "grad_norm": 2.007269671680019, + "learning_rate": 3.607129982589337e-06, + "loss": 0.5934, + "step": 5793 + }, + { + "epoch": 0.6, + "grad_norm": 1.771068848538593, + "learning_rate": 3.6055132830971446e-06, + "loss": 0.5829, + "step": 5794 + }, + { + "epoch": 0.6, + "grad_norm": 1.9968602046327504, + "learning_rate": 3.60389674167797e-06, + "loss": 0.6455, + "step": 5795 + }, + { + "epoch": 0.6, + "grad_norm": 1.9085289823709048, + "learning_rate": 3.6022803585150574e-06, + "loss": 0.6621, + "step": 5796 + }, + { + "epoch": 0.6, + "grad_norm": 1.8562577460156162, + "learning_rate": 3.6006641337916335e-06, + "loss": 0.6193, + "step": 5797 + }, + { + "epoch": 0.6, + "grad_norm": 2.0116263130134566, + "learning_rate": 3.5990480676909055e-06, + "loss": 0.7013, + "step": 5798 + }, + { + "epoch": 0.6, + "grad_norm": 1.8696463786190631, + "learning_rate": 3.597432160396064e-06, + "loss": 0.592, + "step": 5799 + }, + { + "epoch": 0.6, + "grad_norm": 1.9258705805458398, + "learning_rate": 3.5958164120902816e-06, + "loss": 0.6927, + "step": 5800 + }, + { + "epoch": 0.6, + "grad_norm": 1.788877244074425, + "learning_rate": 3.5942008229567128e-06, + "loss": 0.6083, + "step": 5801 + }, + { + "epoch": 0.6, + "grad_norm": 1.819861269764366, + "learning_rate": 3.592585393178494e-06, + "loss": 0.5593, + "step": 5802 + }, + { + "epoch": 0.6, + "grad_norm": 2.176572903100845, + "learning_rate": 3.590970122938742e-06, + "loss": 0.6081, + "step": 5803 + }, + { + "epoch": 0.6, + "grad_norm": 1.8421415635256122, + "learning_rate": 3.589355012420558e-06, + "loss": 0.5901, + "step": 5804 + }, + { + "epoch": 0.6, + "grad_norm": 1.7830410387814137, + "learning_rate": 3.587740061807024e-06, + "loss": 0.7421, + "step": 5805 + }, + { + "epoch": 0.6, + "grad_norm": 1.901456241678915, + "learning_rate": 3.5861252712812032e-06, + "loss": 0.5899, + "step": 5806 + }, + { + "epoch": 0.6, + "grad_norm": 1.740330243536855, + "learning_rate": 3.5845106410261417e-06, + "loss": 0.6182, + "step": 5807 + }, + { + "epoch": 0.6, + "grad_norm": 1.9849175676505417, + "learning_rate": 3.5828961712248667e-06, + "loss": 0.6465, + "step": 5808 + }, + { + "epoch": 0.6, + "grad_norm": 1.7925579462512344, + "learning_rate": 3.5812818620603883e-06, + "loss": 0.6665, + "step": 5809 + }, + { + "epoch": 0.6, + "grad_norm": 1.963203797068177, + "learning_rate": 3.579667713715697e-06, + "loss": 0.6477, + "step": 5810 + }, + { + "epoch": 0.6, + "grad_norm": 1.90177521042788, + "learning_rate": 3.5780537263737657e-06, + "loss": 0.717, + "step": 5811 + }, + { + "epoch": 0.6, + "grad_norm": 1.9095440405290003, + "learning_rate": 3.576439900217552e-06, + "loss": 0.6262, + "step": 5812 + }, + { + "epoch": 0.6, + "grad_norm": 2.0776672586275713, + "learning_rate": 3.574826235429988e-06, + "loss": 0.6042, + "step": 5813 + }, + { + "epoch": 0.6, + "grad_norm": 2.115976450204016, + "learning_rate": 3.5732127321939925e-06, + "loss": 0.7406, + "step": 5814 + }, + { + "epoch": 0.6, + "grad_norm": 1.8460290362187606, + "learning_rate": 3.5715993906924663e-06, + "loss": 0.5892, + "step": 5815 + }, + { + "epoch": 0.6, + "grad_norm": 1.8060615761663243, + "learning_rate": 3.5699862111082917e-06, + "loss": 0.6119, + "step": 5816 + }, + { + "epoch": 0.6, + "grad_norm": 1.8786816582722556, + "learning_rate": 3.56837319362433e-06, + "loss": 0.6141, + "step": 5817 + }, + { + "epoch": 0.6, + "grad_norm": 1.8902089472404162, + "learning_rate": 3.566760338423427e-06, + "loss": 0.6255, + "step": 5818 + }, + { + "epoch": 0.6, + "grad_norm": 1.859106348664016, + "learning_rate": 3.5651476456884103e-06, + "loss": 0.6933, + "step": 5819 + }, + { + "epoch": 0.6, + "grad_norm": 2.06543221051437, + "learning_rate": 3.5635351156020853e-06, + "loss": 0.6413, + "step": 5820 + }, + { + "epoch": 0.61, + "grad_norm": 1.929133796522785, + "learning_rate": 3.5619227483472417e-06, + "loss": 0.6346, + "step": 5821 + }, + { + "epoch": 0.61, + "grad_norm": 1.7355976170586247, + "learning_rate": 3.560310544106652e-06, + "loss": 0.5988, + "step": 5822 + }, + { + "epoch": 0.61, + "grad_norm": 1.876723408507209, + "learning_rate": 3.5586985030630685e-06, + "loss": 0.5479, + "step": 5823 + }, + { + "epoch": 0.61, + "grad_norm": 1.7976909367236422, + "learning_rate": 3.557086625399224e-06, + "loss": 0.6151, + "step": 5824 + }, + { + "epoch": 0.61, + "grad_norm": 1.7457066314116323, + "learning_rate": 3.555474911297835e-06, + "loss": 0.6609, + "step": 5825 + }, + { + "epoch": 0.61, + "grad_norm": 1.8359401944106575, + "learning_rate": 3.553863360941598e-06, + "loss": 0.6886, + "step": 5826 + }, + { + "epoch": 0.61, + "grad_norm": 1.9190157263515015, + "learning_rate": 3.552251974513194e-06, + "loss": 0.6556, + "step": 5827 + }, + { + "epoch": 0.61, + "grad_norm": 1.7954112329008878, + "learning_rate": 3.5506407521952783e-06, + "loss": 0.598, + "step": 5828 + }, + { + "epoch": 0.61, + "grad_norm": 1.7418220797119244, + "learning_rate": 3.5490296941704948e-06, + "loss": 0.6735, + "step": 5829 + }, + { + "epoch": 0.61, + "grad_norm": 1.8969973614397408, + "learning_rate": 3.547418800621466e-06, + "loss": 0.6207, + "step": 5830 + }, + { + "epoch": 0.61, + "grad_norm": 1.950442180880578, + "learning_rate": 3.545808071730795e-06, + "loss": 0.604, + "step": 5831 + }, + { + "epoch": 0.61, + "grad_norm": 1.6785701177368428, + "learning_rate": 3.544197507681068e-06, + "loss": 0.6512, + "step": 5832 + }, + { + "epoch": 0.61, + "grad_norm": 1.7768027074664066, + "learning_rate": 3.5425871086548513e-06, + "loss": 0.5867, + "step": 5833 + }, + { + "epoch": 0.61, + "grad_norm": 1.8996288570794853, + "learning_rate": 3.540976874834694e-06, + "loss": 0.6475, + "step": 5834 + }, + { + "epoch": 0.61, + "grad_norm": 2.1207513817114783, + "learning_rate": 3.539366806403123e-06, + "loss": 0.6733, + "step": 5835 + }, + { + "epoch": 0.61, + "grad_norm": 1.8518276147104666, + "learning_rate": 3.5377569035426494e-06, + "loss": 0.6066, + "step": 5836 + }, + { + "epoch": 0.61, + "grad_norm": 1.9845097936191345, + "learning_rate": 3.536147166435765e-06, + "loss": 0.7186, + "step": 5837 + }, + { + "epoch": 0.61, + "grad_norm": 1.8353019441328098, + "learning_rate": 3.534537595264944e-06, + "loss": 0.6514, + "step": 5838 + }, + { + "epoch": 0.61, + "grad_norm": 1.7635849381509756, + "learning_rate": 3.532928190212639e-06, + "loss": 0.6148, + "step": 5839 + }, + { + "epoch": 0.61, + "grad_norm": 1.650677732272397, + "learning_rate": 3.5313189514612867e-06, + "loss": 0.5851, + "step": 5840 + }, + { + "epoch": 0.61, + "grad_norm": 1.9688433425039455, + "learning_rate": 3.529709879193301e-06, + "loss": 0.607, + "step": 5841 + }, + { + "epoch": 0.61, + "grad_norm": 1.7671648226840087, + "learning_rate": 3.5281009735910822e-06, + "loss": 0.5911, + "step": 5842 + }, + { + "epoch": 0.61, + "grad_norm": 2.4477526780203385, + "learning_rate": 3.5264922348370066e-06, + "loss": 0.6202, + "step": 5843 + }, + { + "epoch": 0.61, + "grad_norm": 1.7697614181136327, + "learning_rate": 3.524883663113435e-06, + "loss": 0.6076, + "step": 5844 + }, + { + "epoch": 0.61, + "grad_norm": 1.8436384469037794, + "learning_rate": 3.523275258602708e-06, + "loss": 0.667, + "step": 5845 + }, + { + "epoch": 0.61, + "grad_norm": 1.7263835471812945, + "learning_rate": 3.5216670214871475e-06, + "loss": 0.6649, + "step": 5846 + }, + { + "epoch": 0.61, + "grad_norm": 1.7266754034601193, + "learning_rate": 3.520058951949056e-06, + "loss": 0.5832, + "step": 5847 + }, + { + "epoch": 0.61, + "grad_norm": 1.9772191883626575, + "learning_rate": 3.518451050170718e-06, + "loss": 0.6297, + "step": 5848 + }, + { + "epoch": 0.61, + "grad_norm": 1.903340236332747, + "learning_rate": 3.5168433163344005e-06, + "loss": 0.6192, + "step": 5849 + }, + { + "epoch": 0.61, + "grad_norm": 1.7564343946683172, + "learning_rate": 3.5152357506223444e-06, + "loss": 0.5977, + "step": 5850 + }, + { + "epoch": 0.61, + "grad_norm": 1.7624806514627709, + "learning_rate": 3.5136283532167786e-06, + "loss": 0.578, + "step": 5851 + }, + { + "epoch": 0.61, + "grad_norm": 1.7577685153115692, + "learning_rate": 3.5120211242999115e-06, + "loss": 0.623, + "step": 5852 + }, + { + "epoch": 0.61, + "grad_norm": 2.0273575366018726, + "learning_rate": 3.5104140640539302e-06, + "loss": 0.5983, + "step": 5853 + }, + { + "epoch": 0.61, + "grad_norm": 1.945975183296312, + "learning_rate": 3.508807172661006e-06, + "loss": 0.6439, + "step": 5854 + }, + { + "epoch": 0.61, + "grad_norm": 1.9502084132528867, + "learning_rate": 3.5072004503032876e-06, + "loss": 0.6141, + "step": 5855 + }, + { + "epoch": 0.61, + "grad_norm": 1.7862146823553806, + "learning_rate": 3.5055938971629096e-06, + "loss": 0.587, + "step": 5856 + }, + { + "epoch": 0.61, + "grad_norm": 1.8574712577859098, + "learning_rate": 3.5039875134219784e-06, + "loss": 0.6534, + "step": 5857 + }, + { + "epoch": 0.61, + "grad_norm": 1.8248444230202323, + "learning_rate": 3.5023812992625905e-06, + "loss": 0.625, + "step": 5858 + }, + { + "epoch": 0.61, + "grad_norm": 1.85422148267425, + "learning_rate": 3.5007752548668173e-06, + "loss": 0.5738, + "step": 5859 + }, + { + "epoch": 0.61, + "grad_norm": 1.638592929436304, + "learning_rate": 3.499169380416715e-06, + "loss": 0.5042, + "step": 5860 + }, + { + "epoch": 0.61, + "grad_norm": 2.0112792439563982, + "learning_rate": 3.4975636760943177e-06, + "loss": 0.5882, + "step": 5861 + }, + { + "epoch": 0.61, + "grad_norm": 1.7295258052488887, + "learning_rate": 3.4959581420816413e-06, + "loss": 0.6108, + "step": 5862 + }, + { + "epoch": 0.61, + "grad_norm": 1.9019332337346095, + "learning_rate": 3.4943527785606824e-06, + "loss": 0.6732, + "step": 5863 + }, + { + "epoch": 0.61, + "grad_norm": 1.9081740044039712, + "learning_rate": 3.49274758571342e-06, + "loss": 0.5051, + "step": 5864 + }, + { + "epoch": 0.61, + "grad_norm": 1.8635557360728945, + "learning_rate": 3.491142563721808e-06, + "loss": 0.6049, + "step": 5865 + }, + { + "epoch": 0.61, + "grad_norm": 1.583335793635603, + "learning_rate": 3.489537712767786e-06, + "loss": 0.5906, + "step": 5866 + }, + { + "epoch": 0.61, + "grad_norm": 1.7730511646734124, + "learning_rate": 3.487933033033274e-06, + "loss": 0.6527, + "step": 5867 + }, + { + "epoch": 0.61, + "grad_norm": 2.125335096700481, + "learning_rate": 3.486328524700171e-06, + "loss": 0.5927, + "step": 5868 + }, + { + "epoch": 0.61, + "grad_norm": 2.0856948814252436, + "learning_rate": 3.4847241879503574e-06, + "loss": 0.7139, + "step": 5869 + }, + { + "epoch": 0.61, + "grad_norm": 1.8465188230889573, + "learning_rate": 3.4831200229656935e-06, + "loss": 0.6565, + "step": 5870 + }, + { + "epoch": 0.61, + "grad_norm": 1.8051867760992228, + "learning_rate": 3.4815160299280225e-06, + "loss": 0.5733, + "step": 5871 + }, + { + "epoch": 0.61, + "grad_norm": 1.9613480970154895, + "learning_rate": 3.4799122090191638e-06, + "loss": 0.633, + "step": 5872 + }, + { + "epoch": 0.61, + "grad_norm": 1.7007000511001091, + "learning_rate": 3.47830856042092e-06, + "loss": 0.5902, + "step": 5873 + }, + { + "epoch": 0.61, + "grad_norm": 1.7704526339579787, + "learning_rate": 3.476705084315074e-06, + "loss": 0.5984, + "step": 5874 + }, + { + "epoch": 0.61, + "grad_norm": 2.0012874669262692, + "learning_rate": 3.47510178088339e-06, + "loss": 0.6879, + "step": 5875 + }, + { + "epoch": 0.61, + "grad_norm": 1.8050110100253667, + "learning_rate": 3.4734986503076096e-06, + "loss": 0.5875, + "step": 5876 + }, + { + "epoch": 0.61, + "grad_norm": 2.132743737087264, + "learning_rate": 3.4718956927694593e-06, + "loss": 0.6638, + "step": 5877 + }, + { + "epoch": 0.61, + "grad_norm": 1.848961511546181, + "learning_rate": 3.4702929084506433e-06, + "loss": 0.6154, + "step": 5878 + }, + { + "epoch": 0.61, + "grad_norm": 1.7426350866830753, + "learning_rate": 3.468690297532843e-06, + "loss": 0.57, + "step": 5879 + }, + { + "epoch": 0.61, + "grad_norm": 1.9103389699630549, + "learning_rate": 3.467087860197726e-06, + "loss": 0.6717, + "step": 5880 + }, + { + "epoch": 0.61, + "grad_norm": 1.9028495231307951, + "learning_rate": 3.4654855966269373e-06, + "loss": 0.625, + "step": 5881 + }, + { + "epoch": 0.61, + "grad_norm": 1.888079361566811, + "learning_rate": 3.4638835070021027e-06, + "loss": 0.6302, + "step": 5882 + }, + { + "epoch": 0.61, + "grad_norm": 1.845351370141749, + "learning_rate": 3.462281591504828e-06, + "loss": 0.6333, + "step": 5883 + }, + { + "epoch": 0.61, + "grad_norm": 1.732023585322755, + "learning_rate": 3.4606798503166994e-06, + "loss": 0.6475, + "step": 5884 + }, + { + "epoch": 0.61, + "grad_norm": 1.8978626563200731, + "learning_rate": 3.4590782836192837e-06, + "loss": 0.6542, + "step": 5885 + }, + { + "epoch": 0.61, + "grad_norm": 1.869401160752364, + "learning_rate": 3.45747689159413e-06, + "loss": 0.5564, + "step": 5886 + }, + { + "epoch": 0.61, + "grad_norm": 1.868999390580027, + "learning_rate": 3.455875674422761e-06, + "loss": 0.579, + "step": 5887 + }, + { + "epoch": 0.61, + "grad_norm": 1.832886798253401, + "learning_rate": 3.4542746322866842e-06, + "loss": 0.4929, + "step": 5888 + }, + { + "epoch": 0.61, + "grad_norm": 1.913784743810917, + "learning_rate": 3.452673765367389e-06, + "loss": 0.6379, + "step": 5889 + }, + { + "epoch": 0.61, + "grad_norm": 1.9179201733844766, + "learning_rate": 3.4510730738463417e-06, + "loss": 0.6739, + "step": 5890 + }, + { + "epoch": 0.61, + "grad_norm": 1.8110837161807765, + "learning_rate": 3.4494725579049904e-06, + "loss": 0.6973, + "step": 5891 + }, + { + "epoch": 0.61, + "grad_norm": 1.987201446085457, + "learning_rate": 3.4478722177247624e-06, + "loss": 0.6087, + "step": 5892 + }, + { + "epoch": 0.61, + "grad_norm": 1.9980930635109033, + "learning_rate": 3.4462720534870673e-06, + "loss": 0.8181, + "step": 5893 + }, + { + "epoch": 0.61, + "grad_norm": 1.8706741387059407, + "learning_rate": 3.4446720653732883e-06, + "loss": 0.6902, + "step": 5894 + }, + { + "epoch": 0.61, + "grad_norm": 1.9409228527860711, + "learning_rate": 3.4430722535647966e-06, + "loss": 0.6133, + "step": 5895 + }, + { + "epoch": 0.61, + "grad_norm": 2.0029309963359085, + "learning_rate": 3.4414726182429388e-06, + "loss": 0.6602, + "step": 5896 + }, + { + "epoch": 0.61, + "grad_norm": 1.8613225733512109, + "learning_rate": 3.439873159589043e-06, + "loss": 0.6484, + "step": 5897 + }, + { + "epoch": 0.61, + "grad_norm": 2.332715989410047, + "learning_rate": 3.438273877784417e-06, + "loss": 0.6361, + "step": 5898 + }, + { + "epoch": 0.61, + "grad_norm": 1.988217349346568, + "learning_rate": 3.4366747730103486e-06, + "loss": 0.5936, + "step": 5899 + }, + { + "epoch": 0.61, + "grad_norm": 1.8721773260571348, + "learning_rate": 3.435075845448105e-06, + "loss": 0.5663, + "step": 5900 + }, + { + "epoch": 0.61, + "grad_norm": 1.9144622101502546, + "learning_rate": 3.4334770952789354e-06, + "loss": 0.5372, + "step": 5901 + }, + { + "epoch": 0.61, + "grad_norm": 2.1063634623324843, + "learning_rate": 3.4318785226840646e-06, + "loss": 0.5908, + "step": 5902 + }, + { + "epoch": 0.61, + "grad_norm": 1.920747599443631, + "learning_rate": 3.4302801278447028e-06, + "loss": 0.6181, + "step": 5903 + }, + { + "epoch": 0.61, + "grad_norm": 2.3644717939435353, + "learning_rate": 3.4286819109420346e-06, + "loss": 0.7202, + "step": 5904 + }, + { + "epoch": 0.61, + "grad_norm": 1.9060094333028947, + "learning_rate": 3.4270838721572277e-06, + "loss": 0.6332, + "step": 5905 + }, + { + "epoch": 0.61, + "grad_norm": 1.8361097469958723, + "learning_rate": 3.4254860116714284e-06, + "loss": 0.6645, + "step": 5906 + }, + { + "epoch": 0.61, + "grad_norm": 1.9388690291273964, + "learning_rate": 3.4238883296657656e-06, + "loss": 0.5735, + "step": 5907 + }, + { + "epoch": 0.61, + "grad_norm": 1.9303120106787002, + "learning_rate": 3.4222908263213438e-06, + "loss": 0.6181, + "step": 5908 + }, + { + "epoch": 0.61, + "grad_norm": 1.8319157145588998, + "learning_rate": 3.4206935018192496e-06, + "loss": 0.5852, + "step": 5909 + }, + { + "epoch": 0.61, + "grad_norm": 1.8890578866580727, + "learning_rate": 3.4190963563405482e-06, + "loss": 0.6554, + "step": 5910 + }, + { + "epoch": 0.61, + "grad_norm": 1.719008543508964, + "learning_rate": 3.4174993900662854e-06, + "loss": 0.5783, + "step": 5911 + }, + { + "epoch": 0.61, + "grad_norm": 1.9381651528278054, + "learning_rate": 3.4159026031774873e-06, + "loss": 0.6329, + "step": 5912 + }, + { + "epoch": 0.61, + "grad_norm": 1.9071649235590062, + "learning_rate": 3.4143059958551576e-06, + "loss": 0.5583, + "step": 5913 + }, + { + "epoch": 0.61, + "grad_norm": 2.6027346173908796, + "learning_rate": 3.4127095682802823e-06, + "loss": 0.6465, + "step": 5914 + }, + { + "epoch": 0.61, + "grad_norm": 1.7520728181991414, + "learning_rate": 3.4111133206338257e-06, + "loss": 0.615, + "step": 5915 + }, + { + "epoch": 0.61, + "grad_norm": 2.061567472873294, + "learning_rate": 3.409517253096729e-06, + "loss": 0.6207, + "step": 5916 + }, + { + "epoch": 0.62, + "grad_norm": 2.017959712602565, + "learning_rate": 3.407921365849917e-06, + "loss": 0.6219, + "step": 5917 + }, + { + "epoch": 0.62, + "grad_norm": 1.6541870573418043, + "learning_rate": 3.406325659074293e-06, + "loss": 0.5519, + "step": 5918 + }, + { + "epoch": 0.62, + "grad_norm": 2.0052219645499934, + "learning_rate": 3.404730132950739e-06, + "loss": 0.6655, + "step": 5919 + }, + { + "epoch": 0.62, + "grad_norm": 2.373349253904208, + "learning_rate": 3.403134787660117e-06, + "loss": 0.638, + "step": 5920 + }, + { + "epoch": 0.62, + "grad_norm": 1.9857596614742279, + "learning_rate": 3.4015396233832687e-06, + "loss": 0.6129, + "step": 5921 + }, + { + "epoch": 0.62, + "grad_norm": 1.9865545357725798, + "learning_rate": 3.3999446403010156e-06, + "loss": 0.6958, + "step": 5922 + }, + { + "epoch": 0.62, + "grad_norm": 2.2225225785812492, + "learning_rate": 3.398349838594159e-06, + "loss": 0.6527, + "step": 5923 + }, + { + "epoch": 0.62, + "grad_norm": 1.965865099296652, + "learning_rate": 3.3967552184434753e-06, + "loss": 0.6616, + "step": 5924 + }, + { + "epoch": 0.62, + "grad_norm": 1.7599925725199548, + "learning_rate": 3.395160780029726e-06, + "loss": 0.6546, + "step": 5925 + }, + { + "epoch": 0.62, + "grad_norm": 1.7892999068048223, + "learning_rate": 3.393566523533649e-06, + "loss": 0.5618, + "step": 5926 + }, + { + "epoch": 0.62, + "grad_norm": 1.907038190708332, + "learning_rate": 3.391972449135964e-06, + "loss": 0.6594, + "step": 5927 + }, + { + "epoch": 0.62, + "grad_norm": 2.0532475771298313, + "learning_rate": 3.3903785570173665e-06, + "loss": 0.7017, + "step": 5928 + }, + { + "epoch": 0.62, + "grad_norm": 1.8848373537785805, + "learning_rate": 3.388784847358534e-06, + "loss": 0.636, + "step": 5929 + }, + { + "epoch": 0.62, + "grad_norm": 1.710821951197328, + "learning_rate": 3.387191320340125e-06, + "loss": 0.6399, + "step": 5930 + }, + { + "epoch": 0.62, + "grad_norm": 1.8487227564053192, + "learning_rate": 3.3855979761427705e-06, + "loss": 0.6817, + "step": 5931 + }, + { + "epoch": 0.62, + "grad_norm": 1.907129570306386, + "learning_rate": 3.384004814947087e-06, + "loss": 0.6738, + "step": 5932 + }, + { + "epoch": 0.62, + "grad_norm": 1.8995663612234321, + "learning_rate": 3.382411836933669e-06, + "loss": 0.5573, + "step": 5933 + }, + { + "epoch": 0.62, + "grad_norm": 1.756986647365497, + "learning_rate": 3.3808190422830887e-06, + "loss": 0.6431, + "step": 5934 + }, + { + "epoch": 0.62, + "grad_norm": 1.7715018375793272, + "learning_rate": 3.379226431175899e-06, + "loss": 0.7003, + "step": 5935 + }, + { + "epoch": 0.62, + "grad_norm": 2.035648914953771, + "learning_rate": 3.377634003792632e-06, + "loss": 0.6059, + "step": 5936 + }, + { + "epoch": 0.62, + "grad_norm": 2.014535610980337, + "learning_rate": 3.3760417603137976e-06, + "loss": 0.5789, + "step": 5937 + }, + { + "epoch": 0.62, + "grad_norm": 1.7341296370744583, + "learning_rate": 3.374449700919887e-06, + "loss": 0.6037, + "step": 5938 + }, + { + "epoch": 0.62, + "grad_norm": 1.9227588740393737, + "learning_rate": 3.372857825791367e-06, + "loss": 0.7159, + "step": 5939 + }, + { + "epoch": 0.62, + "grad_norm": 1.8532934737084343, + "learning_rate": 3.371266135108687e-06, + "loss": 0.6038, + "step": 5940 + }, + { + "epoch": 0.62, + "grad_norm": 1.8469207224534798, + "learning_rate": 3.3696746290522737e-06, + "loss": 0.6791, + "step": 5941 + }, + { + "epoch": 0.62, + "grad_norm": 1.9161878551856997, + "learning_rate": 3.368083307802535e-06, + "loss": 0.5931, + "step": 5942 + }, + { + "epoch": 0.62, + "grad_norm": 1.7666500679293722, + "learning_rate": 3.3664921715398534e-06, + "loss": 0.5605, + "step": 5943 + }, + { + "epoch": 0.62, + "grad_norm": 1.9254176807282661, + "learning_rate": 3.3649012204445953e-06, + "loss": 0.564, + "step": 5944 + }, + { + "epoch": 0.62, + "grad_norm": 1.9573810160395808, + "learning_rate": 3.3633104546971052e-06, + "loss": 0.6968, + "step": 5945 + }, + { + "epoch": 0.62, + "grad_norm": 2.0044533425543785, + "learning_rate": 3.3617198744777023e-06, + "loss": 0.6491, + "step": 5946 + }, + { + "epoch": 0.62, + "grad_norm": 1.763232630775274, + "learning_rate": 3.3601294799666896e-06, + "loss": 0.6697, + "step": 5947 + }, + { + "epoch": 0.62, + "grad_norm": 2.098459423812884, + "learning_rate": 3.3585392713443464e-06, + "loss": 0.572, + "step": 5948 + }, + { + "epoch": 0.62, + "grad_norm": 1.6914031236102578, + "learning_rate": 3.356949248790934e-06, + "loss": 0.5091, + "step": 5949 + }, + { + "epoch": 0.62, + "grad_norm": 2.1716317317901366, + "learning_rate": 3.3553594124866897e-06, + "loss": 0.5757, + "step": 5950 + }, + { + "epoch": 0.62, + "grad_norm": 2.0048322695335354, + "learning_rate": 3.3537697626118286e-06, + "loss": 0.5663, + "step": 5951 + }, + { + "epoch": 0.62, + "grad_norm": 1.9550871662432008, + "learning_rate": 3.3521802993465513e-06, + "loss": 0.6769, + "step": 5952 + }, + { + "epoch": 0.62, + "grad_norm": 1.94484331544079, + "learning_rate": 3.350591022871027e-06, + "loss": 0.6025, + "step": 5953 + }, + { + "epoch": 0.62, + "grad_norm": 1.8226399499008687, + "learning_rate": 3.349001933365411e-06, + "loss": 0.6214, + "step": 5954 + }, + { + "epoch": 0.62, + "grad_norm": 1.8242034051010882, + "learning_rate": 3.3474130310098373e-06, + "loss": 0.646, + "step": 5955 + }, + { + "epoch": 0.62, + "grad_norm": 1.9622212853904062, + "learning_rate": 3.345824315984415e-06, + "loss": 0.6289, + "step": 5956 + }, + { + "epoch": 0.62, + "grad_norm": 2.120458213494939, + "learning_rate": 3.3442357884692354e-06, + "loss": 0.6833, + "step": 5957 + }, + { + "epoch": 0.62, + "grad_norm": 2.1252064549314236, + "learning_rate": 3.3426474486443673e-06, + "loss": 0.5385, + "step": 5958 + }, + { + "epoch": 0.62, + "grad_norm": 1.9993862933069262, + "learning_rate": 3.3410592966898565e-06, + "loss": 0.5866, + "step": 5959 + }, + { + "epoch": 0.62, + "grad_norm": 1.794889673681397, + "learning_rate": 3.3394713327857325e-06, + "loss": 0.5263, + "step": 5960 + }, + { + "epoch": 0.62, + "grad_norm": 1.856636747820928, + "learning_rate": 3.3378835571119953e-06, + "loss": 0.5611, + "step": 5961 + }, + { + "epoch": 0.62, + "grad_norm": 1.7722291953552272, + "learning_rate": 3.3362959698486307e-06, + "loss": 0.6446, + "step": 5962 + }, + { + "epoch": 0.62, + "grad_norm": 2.0901179185866945, + "learning_rate": 3.3347085711756012e-06, + "loss": 0.5646, + "step": 5963 + }, + { + "epoch": 0.62, + "grad_norm": 1.836967165172361, + "learning_rate": 3.333121361272847e-06, + "loss": 0.5881, + "step": 5964 + }, + { + "epoch": 0.62, + "grad_norm": 2.102039726838115, + "learning_rate": 3.331534340320287e-06, + "loss": 0.644, + "step": 5965 + }, + { + "epoch": 0.62, + "grad_norm": 1.9283561150549664, + "learning_rate": 3.3299475084978195e-06, + "loss": 0.67, + "step": 5966 + }, + { + "epoch": 0.62, + "grad_norm": 1.9259529340587442, + "learning_rate": 3.328360865985323e-06, + "loss": 0.5941, + "step": 5967 + }, + { + "epoch": 0.62, + "grad_norm": 1.943892691297661, + "learning_rate": 3.3267744129626483e-06, + "loss": 0.6427, + "step": 5968 + }, + { + "epoch": 0.62, + "grad_norm": 1.9953597123623206, + "learning_rate": 3.3251881496096313e-06, + "loss": 0.6807, + "step": 5969 + }, + { + "epoch": 0.62, + "grad_norm": 1.6739874388513096, + "learning_rate": 3.3236020761060834e-06, + "loss": 0.4788, + "step": 5970 + }, + { + "epoch": 0.62, + "grad_norm": 2.0506706084869584, + "learning_rate": 3.322016192631795e-06, + "loss": 0.6735, + "step": 5971 + }, + { + "epoch": 0.62, + "grad_norm": 1.9665235766949143, + "learning_rate": 3.320430499366536e-06, + "loss": 0.6661, + "step": 5972 + }, + { + "epoch": 0.62, + "grad_norm": 1.7042912287859477, + "learning_rate": 3.3188449964900527e-06, + "loss": 0.5868, + "step": 5973 + }, + { + "epoch": 0.62, + "grad_norm": 1.7429369415654616, + "learning_rate": 3.3172596841820713e-06, + "loss": 0.5986, + "step": 5974 + }, + { + "epoch": 0.62, + "grad_norm": 1.846606375173926, + "learning_rate": 3.315674562622297e-06, + "loss": 0.604, + "step": 5975 + }, + { + "epoch": 0.62, + "grad_norm": 1.8538542112748246, + "learning_rate": 3.31408963199041e-06, + "loss": 0.6191, + "step": 5976 + }, + { + "epoch": 0.62, + "grad_norm": 1.9148060253706294, + "learning_rate": 3.312504892466073e-06, + "loss": 0.6505, + "step": 5977 + }, + { + "epoch": 0.62, + "grad_norm": 1.78279362426924, + "learning_rate": 3.310920344228925e-06, + "loss": 0.6188, + "step": 5978 + }, + { + "epoch": 0.62, + "grad_norm": 1.8717477405727647, + "learning_rate": 3.3093359874585832e-06, + "loss": 0.6894, + "step": 5979 + }, + { + "epoch": 0.62, + "grad_norm": 1.749990881968654, + "learning_rate": 3.3077518223346448e-06, + "loss": 0.6087, + "step": 5980 + }, + { + "epoch": 0.62, + "grad_norm": 1.7233031652705608, + "learning_rate": 3.3061678490366824e-06, + "loss": 0.649, + "step": 5981 + }, + { + "epoch": 0.62, + "grad_norm": 1.962583380637211, + "learning_rate": 3.3045840677442485e-06, + "loss": 0.5858, + "step": 5982 + }, + { + "epoch": 0.62, + "grad_norm": 2.1968870344761626, + "learning_rate": 3.303000478636874e-06, + "loss": 0.6908, + "step": 5983 + }, + { + "epoch": 0.62, + "grad_norm": 1.9286378413854515, + "learning_rate": 3.3014170818940677e-06, + "loss": 0.5823, + "step": 5984 + }, + { + "epoch": 0.62, + "grad_norm": 1.9232889386442449, + "learning_rate": 3.2998338776953163e-06, + "loss": 0.6119, + "step": 5985 + }, + { + "epoch": 0.62, + "grad_norm": 1.8910875657751063, + "learning_rate": 3.2982508662200864e-06, + "loss": 0.6875, + "step": 5986 + }, + { + "epoch": 0.62, + "grad_norm": 1.9224963141768185, + "learning_rate": 3.2966680476478196e-06, + "loss": 0.6215, + "step": 5987 + }, + { + "epoch": 0.62, + "grad_norm": 2.2111799563581465, + "learning_rate": 3.295085422157939e-06, + "loss": 0.6311, + "step": 5988 + }, + { + "epoch": 0.62, + "grad_norm": 1.7623188008375033, + "learning_rate": 3.2935029899298444e-06, + "loss": 0.5608, + "step": 5989 + }, + { + "epoch": 0.62, + "grad_norm": 1.8634780884349817, + "learning_rate": 3.291920751142912e-06, + "loss": 0.669, + "step": 5990 + }, + { + "epoch": 0.62, + "grad_norm": 2.0076300266293696, + "learning_rate": 3.290338705976497e-06, + "loss": 0.6168, + "step": 5991 + }, + { + "epoch": 0.62, + "grad_norm": 1.9389766540870652, + "learning_rate": 3.2887568546099346e-06, + "loss": 0.6409, + "step": 5992 + }, + { + "epoch": 0.62, + "grad_norm": 1.8669408346664602, + "learning_rate": 3.287175197222537e-06, + "loss": 0.6464, + "step": 5993 + }, + { + "epoch": 0.62, + "grad_norm": 2.1106017656825755, + "learning_rate": 3.2855937339935933e-06, + "loss": 0.6207, + "step": 5994 + }, + { + "epoch": 0.62, + "grad_norm": 1.8279681125103633, + "learning_rate": 3.284012465102372e-06, + "loss": 0.5878, + "step": 5995 + }, + { + "epoch": 0.62, + "grad_norm": 2.096349156684369, + "learning_rate": 3.282431390728118e-06, + "loss": 0.6014, + "step": 5996 + }, + { + "epoch": 0.62, + "grad_norm": 2.032892431253479, + "learning_rate": 3.280850511050058e-06, + "loss": 0.689, + "step": 5997 + }, + { + "epoch": 0.62, + "grad_norm": 1.9785676746663783, + "learning_rate": 3.279269826247389e-06, + "loss": 0.4741, + "step": 5998 + }, + { + "epoch": 0.62, + "grad_norm": 1.676119831126687, + "learning_rate": 3.2776893364992936e-06, + "loss": 0.5467, + "step": 5999 + }, + { + "epoch": 0.62, + "grad_norm": 1.90052864684741, + "learning_rate": 3.2761090419849286e-06, + "loss": 0.5841, + "step": 6000 + }, + { + "epoch": 0.62, + "grad_norm": 2.229316068199629, + "learning_rate": 3.2745289428834294e-06, + "loss": 0.5785, + "step": 6001 + }, + { + "epoch": 0.62, + "grad_norm": 2.0934347201144146, + "learning_rate": 3.2729490393739093e-06, + "loss": 0.6735, + "step": 6002 + }, + { + "epoch": 0.62, + "grad_norm": 2.048084478335236, + "learning_rate": 3.2713693316354593e-06, + "loss": 0.6202, + "step": 6003 + }, + { + "epoch": 0.62, + "grad_norm": 1.9404776557417234, + "learning_rate": 3.269789819847151e-06, + "loss": 0.6798, + "step": 6004 + }, + { + "epoch": 0.62, + "grad_norm": 1.9162170778231924, + "learning_rate": 3.2682105041880264e-06, + "loss": 0.5845, + "step": 6005 + }, + { + "epoch": 0.62, + "grad_norm": 1.9545142603110053, + "learning_rate": 3.2666313848371113e-06, + "loss": 0.6539, + "step": 6006 + }, + { + "epoch": 0.62, + "grad_norm": 1.794309367132827, + "learning_rate": 3.265052461973409e-06, + "loss": 0.5696, + "step": 6007 + }, + { + "epoch": 0.62, + "grad_norm": 1.842487367019058, + "learning_rate": 3.2634737357758994e-06, + "loss": 0.6149, + "step": 6008 + }, + { + "epoch": 0.62, + "grad_norm": 1.723064752719736, + "learning_rate": 3.26189520642354e-06, + "loss": 0.5282, + "step": 6009 + }, + { + "epoch": 0.62, + "grad_norm": 2.096348196467705, + "learning_rate": 3.2603168740952645e-06, + "loss": 0.5349, + "step": 6010 + }, + { + "epoch": 0.62, + "grad_norm": 2.0586495432214025, + "learning_rate": 3.2587387389699895e-06, + "loss": 0.6791, + "step": 6011 + }, + { + "epoch": 0.62, + "grad_norm": 1.8244663325194614, + "learning_rate": 3.257160801226601e-06, + "loss": 0.6393, + "step": 6012 + }, + { + "epoch": 0.63, + "grad_norm": 1.8902371345502247, + "learning_rate": 3.255583061043971e-06, + "loss": 0.4821, + "step": 6013 + }, + { + "epoch": 0.63, + "grad_norm": 1.719480343071909, + "learning_rate": 3.2540055186009428e-06, + "loss": 0.5497, + "step": 6014 + }, + { + "epoch": 0.63, + "grad_norm": 1.876300759582372, + "learning_rate": 3.252428174076341e-06, + "loss": 0.6434, + "step": 6015 + }, + { + "epoch": 0.63, + "grad_norm": 2.3262050183986918, + "learning_rate": 3.250851027648967e-06, + "loss": 0.6501, + "step": 6016 + }, + { + "epoch": 0.63, + "grad_norm": 1.8741006195400267, + "learning_rate": 3.2492740794975985e-06, + "loss": 0.5569, + "step": 6017 + }, + { + "epoch": 0.63, + "grad_norm": 1.992043370643798, + "learning_rate": 3.247697329800992e-06, + "loss": 0.7077, + "step": 6018 + }, + { + "epoch": 0.63, + "grad_norm": 2.0062231422855503, + "learning_rate": 3.246120778737883e-06, + "loss": 0.6345, + "step": 6019 + }, + { + "epoch": 0.63, + "grad_norm": 1.8615543687360108, + "learning_rate": 3.2445444264869783e-06, + "loss": 0.6295, + "step": 6020 + }, + { + "epoch": 0.63, + "grad_norm": 1.9316303752921364, + "learning_rate": 3.2429682732269685e-06, + "loss": 0.5939, + "step": 6021 + }, + { + "epoch": 0.63, + "grad_norm": 1.9484002498233846, + "learning_rate": 3.2413923191365203e-06, + "loss": 0.6531, + "step": 6022 + }, + { + "epoch": 0.63, + "grad_norm": 2.0450463816269737, + "learning_rate": 3.239816564394276e-06, + "loss": 0.5847, + "step": 6023 + }, + { + "epoch": 0.63, + "grad_norm": 1.8031836096624272, + "learning_rate": 3.2382410091788567e-06, + "loss": 0.6577, + "step": 6024 + }, + { + "epoch": 0.63, + "grad_norm": 2.2460149673113237, + "learning_rate": 3.2366656536688614e-06, + "loss": 0.5988, + "step": 6025 + }, + { + "epoch": 0.63, + "grad_norm": 1.896275207069309, + "learning_rate": 3.235090498042866e-06, + "loss": 0.5827, + "step": 6026 + }, + { + "epoch": 0.63, + "grad_norm": 1.9320872061025782, + "learning_rate": 3.2335155424794205e-06, + "loss": 0.6269, + "step": 6027 + }, + { + "epoch": 0.63, + "grad_norm": 1.6624219058989238, + "learning_rate": 3.2319407871570574e-06, + "loss": 0.663, + "step": 6028 + }, + { + "epoch": 0.63, + "grad_norm": 1.8285245189929658, + "learning_rate": 3.2303662322542835e-06, + "loss": 0.6115, + "step": 6029 + }, + { + "epoch": 0.63, + "grad_norm": 1.9162669741798801, + "learning_rate": 3.228791877949583e-06, + "loss": 0.6262, + "step": 6030 + }, + { + "epoch": 0.63, + "grad_norm": 1.9680777538685632, + "learning_rate": 3.2272177244214198e-06, + "loss": 0.5613, + "step": 6031 + }, + { + "epoch": 0.63, + "grad_norm": 2.0801896158128046, + "learning_rate": 3.2256437718482312e-06, + "loss": 0.5686, + "step": 6032 + }, + { + "epoch": 0.63, + "grad_norm": 2.217100063165203, + "learning_rate": 3.2240700204084353e-06, + "loss": 0.6403, + "step": 6033 + }, + { + "epoch": 0.63, + "grad_norm": 1.8945485168220204, + "learning_rate": 3.222496470280427e-06, + "loss": 0.6851, + "step": 6034 + }, + { + "epoch": 0.63, + "grad_norm": 1.8383948040993052, + "learning_rate": 3.220923121642573e-06, + "loss": 0.6221, + "step": 6035 + }, + { + "epoch": 0.63, + "grad_norm": 1.9516341763000284, + "learning_rate": 3.219349974673223e-06, + "loss": 0.62, + "step": 6036 + }, + { + "epoch": 0.63, + "grad_norm": 1.8320310813825704, + "learning_rate": 3.217777029550703e-06, + "loss": 0.6195, + "step": 6037 + }, + { + "epoch": 0.63, + "grad_norm": 1.847472645041628, + "learning_rate": 3.2162042864533154e-06, + "loss": 0.56, + "step": 6038 + }, + { + "epoch": 0.63, + "grad_norm": 1.978049182310532, + "learning_rate": 3.214631745559339e-06, + "loss": 0.6112, + "step": 6039 + }, + { + "epoch": 0.63, + "grad_norm": 1.8176012495141787, + "learning_rate": 3.2130594070470307e-06, + "loss": 0.5219, + "step": 6040 + }, + { + "epoch": 0.63, + "grad_norm": 1.8666199364201022, + "learning_rate": 3.2114872710946243e-06, + "loss": 0.6095, + "step": 6041 + }, + { + "epoch": 0.63, + "grad_norm": 1.969500493585429, + "learning_rate": 3.2099153378803294e-06, + "loss": 0.6168, + "step": 6042 + }, + { + "epoch": 0.63, + "grad_norm": 1.7974069563510562, + "learning_rate": 3.2083436075823353e-06, + "loss": 0.6153, + "step": 6043 + }, + { + "epoch": 0.63, + "grad_norm": 1.8639577027509828, + "learning_rate": 3.206772080378804e-06, + "loss": 0.612, + "step": 6044 + }, + { + "epoch": 0.63, + "grad_norm": 1.9270717940972086, + "learning_rate": 3.205200756447878e-06, + "loss": 0.6043, + "step": 6045 + }, + { + "epoch": 0.63, + "grad_norm": 1.7652996412307986, + "learning_rate": 3.2036296359676777e-06, + "loss": 0.6276, + "step": 6046 + }, + { + "epoch": 0.63, + "grad_norm": 1.995920741483894, + "learning_rate": 3.2020587191162956e-06, + "loss": 0.482, + "step": 6047 + }, + { + "epoch": 0.63, + "grad_norm": 1.9227514888406252, + "learning_rate": 3.2004880060718072e-06, + "loss": 0.6573, + "step": 6048 + }, + { + "epoch": 0.63, + "grad_norm": 1.7486882907902446, + "learning_rate": 3.1989174970122594e-06, + "loss": 0.6222, + "step": 6049 + }, + { + "epoch": 0.63, + "grad_norm": 2.138887268029901, + "learning_rate": 3.197347192115679e-06, + "loss": 0.5945, + "step": 6050 + }, + { + "epoch": 0.63, + "grad_norm": 2.109380295021703, + "learning_rate": 3.1957770915600696e-06, + "loss": 0.6232, + "step": 6051 + }, + { + "epoch": 0.63, + "grad_norm": 1.7196849956781948, + "learning_rate": 3.19420719552341e-06, + "loss": 0.4851, + "step": 6052 + }, + { + "epoch": 0.63, + "grad_norm": 1.917722969325608, + "learning_rate": 3.1926375041836573e-06, + "loss": 0.5262, + "step": 6053 + }, + { + "epoch": 0.63, + "grad_norm": 1.8732009281586104, + "learning_rate": 3.1910680177187453e-06, + "loss": 0.6462, + "step": 6054 + }, + { + "epoch": 0.63, + "grad_norm": 1.9473297187009115, + "learning_rate": 3.189498736306584e-06, + "loss": 0.6695, + "step": 6055 + }, + { + "epoch": 0.63, + "grad_norm": 1.7892239645860206, + "learning_rate": 3.187929660125063e-06, + "loss": 0.5702, + "step": 6056 + }, + { + "epoch": 0.63, + "grad_norm": 1.888941220834544, + "learning_rate": 3.186360789352041e-06, + "loss": 0.6687, + "step": 6057 + }, + { + "epoch": 0.63, + "grad_norm": 2.0322577338706784, + "learning_rate": 3.1847921241653614e-06, + "loss": 0.6612, + "step": 6058 + }, + { + "epoch": 0.63, + "grad_norm": 1.9369396393065463, + "learning_rate": 3.18322366474284e-06, + "loss": 0.6982, + "step": 6059 + }, + { + "epoch": 0.63, + "grad_norm": 2.277936713753932, + "learning_rate": 3.181655411262272e-06, + "loss": 0.5452, + "step": 6060 + }, + { + "epoch": 0.63, + "grad_norm": 1.9074840722455486, + "learning_rate": 3.1800873639014276e-06, + "loss": 0.4769, + "step": 6061 + }, + { + "epoch": 0.63, + "grad_norm": 1.9101919714628626, + "learning_rate": 3.1785195228380527e-06, + "loss": 0.6405, + "step": 6062 + }, + { + "epoch": 0.63, + "grad_norm": 1.9123614225065813, + "learning_rate": 3.176951888249875e-06, + "loss": 0.6504, + "step": 6063 + }, + { + "epoch": 0.63, + "grad_norm": 2.2213263420816998, + "learning_rate": 3.1753844603145894e-06, + "loss": 0.6375, + "step": 6064 + }, + { + "epoch": 0.63, + "grad_norm": 1.9608161818046423, + "learning_rate": 3.1738172392098752e-06, + "loss": 0.5604, + "step": 6065 + }, + { + "epoch": 0.63, + "grad_norm": 1.995234308228533, + "learning_rate": 3.172250225113386e-06, + "loss": 0.6577, + "step": 6066 + }, + { + "epoch": 0.63, + "grad_norm": 2.0377980498966806, + "learning_rate": 3.170683418202751e-06, + "loss": 0.6402, + "step": 6067 + }, + { + "epoch": 0.63, + "grad_norm": 1.8853013788369504, + "learning_rate": 3.1691168186555778e-06, + "loss": 0.5516, + "step": 6068 + }, + { + "epoch": 0.63, + "grad_norm": 1.71003810845978, + "learning_rate": 3.1675504266494493e-06, + "loss": 0.6465, + "step": 6069 + }, + { + "epoch": 0.63, + "grad_norm": 1.6686351788488085, + "learning_rate": 3.1659842423619237e-06, + "loss": 0.5669, + "step": 6070 + }, + { + "epoch": 0.63, + "grad_norm": 2.061531475156852, + "learning_rate": 3.1644182659705403e-06, + "loss": 0.5795, + "step": 6071 + }, + { + "epoch": 0.63, + "grad_norm": 1.762438560341012, + "learning_rate": 3.162852497652807e-06, + "loss": 0.6474, + "step": 6072 + }, + { + "epoch": 0.63, + "grad_norm": 1.9593057376669718, + "learning_rate": 3.161286937586214e-06, + "loss": 0.6712, + "step": 6073 + }, + { + "epoch": 0.63, + "grad_norm": 2.0318391689656377, + "learning_rate": 3.159721585948228e-06, + "loss": 0.6308, + "step": 6074 + }, + { + "epoch": 0.63, + "grad_norm": 1.931145018821637, + "learning_rate": 3.158156442916288e-06, + "loss": 0.7147, + "step": 6075 + }, + { + "epoch": 0.63, + "grad_norm": 2.337635038951802, + "learning_rate": 3.156591508667814e-06, + "loss": 0.734, + "step": 6076 + }, + { + "epoch": 0.63, + "grad_norm": 1.9031520641680848, + "learning_rate": 3.1550267833801993e-06, + "loss": 0.6095, + "step": 6077 + }, + { + "epoch": 0.63, + "grad_norm": 1.7487631178062244, + "learning_rate": 3.1534622672308165e-06, + "loss": 0.6306, + "step": 6078 + }, + { + "epoch": 0.63, + "grad_norm": 1.9160299182081961, + "learning_rate": 3.151897960397009e-06, + "loss": 0.6177, + "step": 6079 + }, + { + "epoch": 0.63, + "grad_norm": 1.9860175732897307, + "learning_rate": 3.150333863056102e-06, + "loss": 0.7167, + "step": 6080 + }, + { + "epoch": 0.63, + "grad_norm": 1.7346053348798447, + "learning_rate": 3.148769975385394e-06, + "loss": 0.5572, + "step": 6081 + }, + { + "epoch": 0.63, + "grad_norm": 1.7993775431735983, + "learning_rate": 3.147206297562162e-06, + "loss": 0.576, + "step": 6082 + }, + { + "epoch": 0.63, + "grad_norm": 1.8139002403350746, + "learning_rate": 3.1456428297636555e-06, + "loss": 0.5244, + "step": 6083 + }, + { + "epoch": 0.63, + "grad_norm": 1.6929454845149765, + "learning_rate": 3.1440795721671036e-06, + "loss": 0.7666, + "step": 6084 + }, + { + "epoch": 0.63, + "grad_norm": 1.7528536995455826, + "learning_rate": 3.1425165249497118e-06, + "loss": 0.599, + "step": 6085 + }, + { + "epoch": 0.63, + "grad_norm": 2.0191237440247836, + "learning_rate": 3.140953688288658e-06, + "loss": 0.6796, + "step": 6086 + }, + { + "epoch": 0.63, + "grad_norm": 1.9680374968614391, + "learning_rate": 3.1393910623611007e-06, + "loss": 0.6019, + "step": 6087 + }, + { + "epoch": 0.63, + "grad_norm": 1.9442953822047284, + "learning_rate": 3.137828647344171e-06, + "loss": 0.6288, + "step": 6088 + }, + { + "epoch": 0.63, + "grad_norm": 1.981265816746576, + "learning_rate": 3.1362664434149782e-06, + "loss": 0.5979, + "step": 6089 + }, + { + "epoch": 0.63, + "grad_norm": 1.726191123392917, + "learning_rate": 3.134704450750607e-06, + "loss": 0.6121, + "step": 6090 + }, + { + "epoch": 0.63, + "grad_norm": 1.6721715460247808, + "learning_rate": 3.133142669528118e-06, + "loss": 0.5627, + "step": 6091 + }, + { + "epoch": 0.63, + "grad_norm": 1.9570130159305819, + "learning_rate": 3.1315810999245483e-06, + "loss": 0.6396, + "step": 6092 + }, + { + "epoch": 0.63, + "grad_norm": 1.995162107251734, + "learning_rate": 3.1300197421169125e-06, + "loss": 0.7216, + "step": 6093 + }, + { + "epoch": 0.63, + "grad_norm": 2.1451911595083795, + "learning_rate": 3.1284585962821957e-06, + "loss": 0.6261, + "step": 6094 + }, + { + "epoch": 0.63, + "grad_norm": 1.901859286098706, + "learning_rate": 3.126897662597364e-06, + "loss": 0.677, + "step": 6095 + }, + { + "epoch": 0.63, + "grad_norm": 1.889639426022135, + "learning_rate": 3.1253369412393584e-06, + "loss": 0.6535, + "step": 6096 + }, + { + "epoch": 0.63, + "grad_norm": 1.8686229215951038, + "learning_rate": 3.1237764323850964e-06, + "loss": 0.6528, + "step": 6097 + }, + { + "epoch": 0.63, + "grad_norm": 2.311920832241536, + "learning_rate": 3.12221613621147e-06, + "loss": 0.6193, + "step": 6098 + }, + { + "epoch": 0.63, + "grad_norm": 1.9239392502057686, + "learning_rate": 3.1206560528953467e-06, + "loss": 0.6082, + "step": 6099 + }, + { + "epoch": 0.63, + "grad_norm": 1.8118553614299049, + "learning_rate": 3.1190961826135744e-06, + "loss": 0.635, + "step": 6100 + }, + { + "epoch": 0.63, + "grad_norm": 1.7947350380331701, + "learning_rate": 3.1175365255429685e-06, + "loss": 0.572, + "step": 6101 + }, + { + "epoch": 0.63, + "grad_norm": 1.840175405623914, + "learning_rate": 3.115977081860327e-06, + "loss": 0.56, + "step": 6102 + }, + { + "epoch": 0.63, + "grad_norm": 2.014363206044062, + "learning_rate": 3.1144178517424217e-06, + "loss": 0.5143, + "step": 6103 + }, + { + "epoch": 0.63, + "grad_norm": 1.692597253312618, + "learning_rate": 3.1128588353660006e-06, + "loss": 0.5608, + "step": 6104 + }, + { + "epoch": 0.63, + "grad_norm": 2.2750803784856393, + "learning_rate": 3.111300032907787e-06, + "loss": 0.5791, + "step": 6105 + }, + { + "epoch": 0.63, + "grad_norm": 1.7369305752415842, + "learning_rate": 3.1097414445444796e-06, + "loss": 0.5949, + "step": 6106 + }, + { + "epoch": 0.63, + "grad_norm": 2.220660084433447, + "learning_rate": 3.1081830704527535e-06, + "loss": 0.6184, + "step": 6107 + }, + { + "epoch": 0.63, + "grad_norm": 1.8162843137045013, + "learning_rate": 3.1066249108092616e-06, + "loss": 0.6274, + "step": 6108 + }, + { + "epoch": 0.64, + "grad_norm": 1.7452644073711159, + "learning_rate": 3.1050669657906257e-06, + "loss": 0.5825, + "step": 6109 + }, + { + "epoch": 0.64, + "grad_norm": 2.109057932821261, + "learning_rate": 3.10350923557345e-06, + "loss": 0.6562, + "step": 6110 + }, + { + "epoch": 0.64, + "grad_norm": 2.07479718970697, + "learning_rate": 3.101951720334312e-06, + "loss": 0.5884, + "step": 6111 + }, + { + "epoch": 0.64, + "grad_norm": 1.9154797157879695, + "learning_rate": 3.1003944202497655e-06, + "loss": 0.5808, + "step": 6112 + }, + { + "epoch": 0.64, + "grad_norm": 2.0766652476271794, + "learning_rate": 3.0988373354963387e-06, + "loss": 0.6579, + "step": 6113 + }, + { + "epoch": 0.64, + "grad_norm": 2.0534225279153886, + "learning_rate": 3.097280466250536e-06, + "loss": 0.6693, + "step": 6114 + }, + { + "epoch": 0.64, + "grad_norm": 1.9602854542078891, + "learning_rate": 3.0957238126888384e-06, + "loss": 0.5914, + "step": 6115 + }, + { + "epoch": 0.64, + "grad_norm": 1.8801721093726407, + "learning_rate": 3.0941673749877e-06, + "loss": 0.6304, + "step": 6116 + }, + { + "epoch": 0.64, + "grad_norm": 2.0461787180132682, + "learning_rate": 3.0926111533235526e-06, + "loss": 0.6503, + "step": 6117 + }, + { + "epoch": 0.64, + "grad_norm": 1.9878343487415226, + "learning_rate": 3.091055147872802e-06, + "loss": 0.7512, + "step": 6118 + }, + { + "epoch": 0.64, + "grad_norm": 2.1283625476528787, + "learning_rate": 3.0894993588118318e-06, + "loss": 0.5694, + "step": 6119 + }, + { + "epoch": 0.64, + "grad_norm": 1.8045352095592917, + "learning_rate": 3.087943786316999e-06, + "loss": 0.6016, + "step": 6120 + }, + { + "epoch": 0.64, + "grad_norm": 1.8931603547365077, + "learning_rate": 3.0863884305646364e-06, + "loss": 0.5812, + "step": 6121 + }, + { + "epoch": 0.64, + "grad_norm": 1.804868568230148, + "learning_rate": 3.0848332917310532e-06, + "loss": 0.5961, + "step": 6122 + }, + { + "epoch": 0.64, + "grad_norm": 1.9803781360337822, + "learning_rate": 3.0832783699925307e-06, + "loss": 0.7075, + "step": 6123 + }, + { + "epoch": 0.64, + "grad_norm": 1.8810190808368576, + "learning_rate": 3.081723665525331e-06, + "loss": 0.6251, + "step": 6124 + }, + { + "epoch": 0.64, + "grad_norm": 1.8230914084353704, + "learning_rate": 3.0801691785056863e-06, + "loss": 0.6425, + "step": 6125 + }, + { + "epoch": 0.64, + "grad_norm": 1.801356178216089, + "learning_rate": 3.0786149091098087e-06, + "loss": 0.5365, + "step": 6126 + }, + { + "epoch": 0.64, + "grad_norm": 1.8914863670162068, + "learning_rate": 3.0770608575138825e-06, + "loss": 0.6569, + "step": 6127 + }, + { + "epoch": 0.64, + "grad_norm": 1.8896895490041579, + "learning_rate": 3.075507023894069e-06, + "loss": 0.6015, + "step": 6128 + }, + { + "epoch": 0.64, + "grad_norm": 2.1105225623734287, + "learning_rate": 3.0739534084265032e-06, + "loss": 0.6685, + "step": 6129 + }, + { + "epoch": 0.64, + "grad_norm": 1.9830540304838769, + "learning_rate": 3.072400011287299e-06, + "loss": 0.6105, + "step": 6130 + }, + { + "epoch": 0.64, + "grad_norm": 2.1810675903640586, + "learning_rate": 3.0708468326525376e-06, + "loss": 0.6237, + "step": 6131 + }, + { + "epoch": 0.64, + "grad_norm": 1.8404334794364534, + "learning_rate": 3.069293872698284e-06, + "loss": 0.6088, + "step": 6132 + }, + { + "epoch": 0.64, + "grad_norm": 1.772357721649005, + "learning_rate": 3.0677411316005744e-06, + "loss": 0.6274, + "step": 6133 + }, + { + "epoch": 0.64, + "grad_norm": 1.8695683122224707, + "learning_rate": 3.066188609535421e-06, + "loss": 0.7137, + "step": 6134 + }, + { + "epoch": 0.64, + "grad_norm": 1.7470798861679386, + "learning_rate": 3.0646363066788114e-06, + "loss": 0.5847, + "step": 6135 + }, + { + "epoch": 0.64, + "grad_norm": 1.8760125269130379, + "learning_rate": 3.063084223206708e-06, + "loss": 0.6395, + "step": 6136 + }, + { + "epoch": 0.64, + "grad_norm": 1.879803637103201, + "learning_rate": 3.0615323592950495e-06, + "loss": 0.5537, + "step": 6137 + }, + { + "epoch": 0.64, + "grad_norm": 2.02465353465098, + "learning_rate": 3.0599807151197446e-06, + "loss": 0.6646, + "step": 6138 + }, + { + "epoch": 0.64, + "grad_norm": 2.013051093932397, + "learning_rate": 3.0584292908566836e-06, + "loss": 0.601, + "step": 6139 + }, + { + "epoch": 0.64, + "grad_norm": 1.8921168905387726, + "learning_rate": 3.056878086681729e-06, + "loss": 0.5706, + "step": 6140 + }, + { + "epoch": 0.64, + "grad_norm": 1.6381413943978955, + "learning_rate": 3.055327102770719e-06, + "loss": 0.5007, + "step": 6141 + }, + { + "epoch": 0.64, + "grad_norm": 1.9367879723399168, + "learning_rate": 3.053776339299467e-06, + "loss": 0.5682, + "step": 6142 + }, + { + "epoch": 0.64, + "grad_norm": 1.8429248685663757, + "learning_rate": 3.0522257964437586e-06, + "loss": 0.5974, + "step": 6143 + }, + { + "epoch": 0.64, + "grad_norm": 1.9168065728136365, + "learning_rate": 3.050675474379361e-06, + "loss": 0.5974, + "step": 6144 + }, + { + "epoch": 0.64, + "grad_norm": 1.749866080818118, + "learning_rate": 3.0491253732820063e-06, + "loss": 0.6956, + "step": 6145 + }, + { + "epoch": 0.64, + "grad_norm": 1.9739705708202513, + "learning_rate": 3.0475754933274106e-06, + "loss": 0.6371, + "step": 6146 + }, + { + "epoch": 0.64, + "grad_norm": 1.7874894202016325, + "learning_rate": 3.0460258346912615e-06, + "loss": 0.6762, + "step": 6147 + }, + { + "epoch": 0.64, + "grad_norm": 1.9124508607094237, + "learning_rate": 3.044476397549221e-06, + "loss": 0.6693, + "step": 6148 + }, + { + "epoch": 0.64, + "grad_norm": 1.7300970486859781, + "learning_rate": 3.042927182076927e-06, + "loss": 0.5399, + "step": 6149 + }, + { + "epoch": 0.64, + "grad_norm": 1.9977396120353856, + "learning_rate": 3.0413781884499916e-06, + "loss": 0.6662, + "step": 6150 + }, + { + "epoch": 0.64, + "grad_norm": 1.7582271936489247, + "learning_rate": 3.0398294168440023e-06, + "loss": 0.6091, + "step": 6151 + }, + { + "epoch": 0.64, + "grad_norm": 2.0479440125108272, + "learning_rate": 3.0382808674345228e-06, + "loss": 0.6703, + "step": 6152 + }, + { + "epoch": 0.64, + "grad_norm": 1.9526668255453477, + "learning_rate": 3.036732540397087e-06, + "loss": 0.527, + "step": 6153 + }, + { + "epoch": 0.64, + "grad_norm": 1.7155652955499163, + "learning_rate": 3.035184435907208e-06, + "loss": 0.5533, + "step": 6154 + }, + { + "epoch": 0.64, + "grad_norm": 1.9812973705820782, + "learning_rate": 3.0336365541403723e-06, + "loss": 0.5941, + "step": 6155 + }, + { + "epoch": 0.64, + "grad_norm": 1.8769962170542873, + "learning_rate": 3.0320888952720414e-06, + "loss": 0.596, + "step": 6156 + }, + { + "epoch": 0.64, + "grad_norm": 1.923258773106532, + "learning_rate": 3.0305414594776505e-06, + "loss": 0.6878, + "step": 6157 + }, + { + "epoch": 0.64, + "grad_norm": 2.3812002253021145, + "learning_rate": 3.0289942469326106e-06, + "loss": 0.5918, + "step": 6158 + }, + { + "epoch": 0.64, + "grad_norm": 1.7311215954566284, + "learning_rate": 3.0274472578123095e-06, + "loss": 0.5567, + "step": 6159 + }, + { + "epoch": 0.64, + "grad_norm": 1.8309946265303754, + "learning_rate": 3.0259004922921033e-06, + "loss": 0.5843, + "step": 6160 + }, + { + "epoch": 0.64, + "grad_norm": 1.8394554565201098, + "learning_rate": 3.0243539505473275e-06, + "loss": 0.529, + "step": 6161 + }, + { + "epoch": 0.64, + "grad_norm": 1.8519372361597473, + "learning_rate": 3.0228076327532925e-06, + "loss": 0.5625, + "step": 6162 + }, + { + "epoch": 0.64, + "grad_norm": 1.8204661064383223, + "learning_rate": 3.021261539085282e-06, + "loss": 0.5559, + "step": 6163 + }, + { + "epoch": 0.64, + "grad_norm": 1.782344163712384, + "learning_rate": 3.019715669718554e-06, + "loss": 0.604, + "step": 6164 + }, + { + "epoch": 0.64, + "grad_norm": 1.6640325517922043, + "learning_rate": 3.018170024828343e-06, + "loss": 0.5919, + "step": 6165 + }, + { + "epoch": 0.64, + "grad_norm": 2.037145171502773, + "learning_rate": 3.016624604589855e-06, + "loss": 0.6316, + "step": 6166 + }, + { + "epoch": 0.64, + "grad_norm": 1.6769480920552722, + "learning_rate": 3.0150794091782753e-06, + "loss": 0.6897, + "step": 6167 + }, + { + "epoch": 0.64, + "grad_norm": 2.0464765956664976, + "learning_rate": 3.013534438768756e-06, + "loss": 0.63, + "step": 6168 + }, + { + "epoch": 0.64, + "grad_norm": 1.916989468935177, + "learning_rate": 3.0119896935364305e-06, + "loss": 0.5742, + "step": 6169 + }, + { + "epoch": 0.64, + "grad_norm": 1.9472328083428492, + "learning_rate": 3.010445173656405e-06, + "loss": 0.6717, + "step": 6170 + }, + { + "epoch": 0.64, + "grad_norm": 1.8992859253335288, + "learning_rate": 3.0089008793037587e-06, + "loss": 0.6509, + "step": 6171 + }, + { + "epoch": 0.64, + "grad_norm": 1.916102090617852, + "learning_rate": 3.0073568106535465e-06, + "loss": 0.6445, + "step": 6172 + }, + { + "epoch": 0.64, + "grad_norm": 1.9064952084707214, + "learning_rate": 3.005812967880798e-06, + "loss": 0.6296, + "step": 6173 + }, + { + "epoch": 0.64, + "grad_norm": 1.807783199230649, + "learning_rate": 3.004269351160518e-06, + "loss": 0.6381, + "step": 6174 + }, + { + "epoch": 0.64, + "grad_norm": 1.9174580476574532, + "learning_rate": 3.00272596066768e-06, + "loss": 0.6323, + "step": 6175 + }, + { + "epoch": 0.64, + "grad_norm": 1.9492380466914858, + "learning_rate": 3.001182796577239e-06, + "loss": 0.6298, + "step": 6176 + }, + { + "epoch": 0.64, + "grad_norm": 1.6924745865542088, + "learning_rate": 2.9996398590641203e-06, + "loss": 0.5399, + "step": 6177 + }, + { + "epoch": 0.64, + "grad_norm": 2.033275499357687, + "learning_rate": 2.998097148303225e-06, + "loss": 0.7353, + "step": 6178 + }, + { + "epoch": 0.64, + "grad_norm": 1.9026486542697536, + "learning_rate": 2.9965546644694287e-06, + "loss": 0.5794, + "step": 6179 + }, + { + "epoch": 0.64, + "grad_norm": 2.0228080110290105, + "learning_rate": 2.995012407737581e-06, + "loss": 0.6467, + "step": 6180 + }, + { + "epoch": 0.64, + "grad_norm": 1.7861014751773872, + "learning_rate": 2.993470378282505e-06, + "loss": 0.6066, + "step": 6181 + }, + { + "epoch": 0.64, + "grad_norm": 1.839263166878586, + "learning_rate": 2.9919285762789983e-06, + "loss": 0.5448, + "step": 6182 + }, + { + "epoch": 0.64, + "grad_norm": 1.7072274410368804, + "learning_rate": 2.990387001901834e-06, + "loss": 0.5578, + "step": 6183 + }, + { + "epoch": 0.64, + "grad_norm": 1.6431673164569158, + "learning_rate": 2.988845655325756e-06, + "loss": 0.6351, + "step": 6184 + }, + { + "epoch": 0.64, + "grad_norm": 1.9010572335147635, + "learning_rate": 2.987304536725486e-06, + "loss": 0.6327, + "step": 6185 + }, + { + "epoch": 0.64, + "grad_norm": 1.6632693290336715, + "learning_rate": 2.9857636462757193e-06, + "loss": 0.5914, + "step": 6186 + }, + { + "epoch": 0.64, + "grad_norm": 2.1089846908801015, + "learning_rate": 2.984222984151124e-06, + "loss": 0.6263, + "step": 6187 + }, + { + "epoch": 0.64, + "grad_norm": 1.817502491313046, + "learning_rate": 2.9826825505263427e-06, + "loss": 0.6505, + "step": 6188 + }, + { + "epoch": 0.64, + "grad_norm": 1.8606164928292765, + "learning_rate": 2.981142345575994e-06, + "loss": 0.6494, + "step": 6189 + }, + { + "epoch": 0.64, + "grad_norm": 1.8351620719634854, + "learning_rate": 2.979602369474667e-06, + "loss": 0.522, + "step": 6190 + }, + { + "epoch": 0.64, + "grad_norm": 1.6703557627008776, + "learning_rate": 2.9780626223969256e-06, + "loss": 0.5111, + "step": 6191 + }, + { + "epoch": 0.64, + "grad_norm": 1.9164204387222075, + "learning_rate": 2.976523104517312e-06, + "loss": 0.6429, + "step": 6192 + }, + { + "epoch": 0.64, + "grad_norm": 1.9499476254969241, + "learning_rate": 2.9749838160103372e-06, + "loss": 0.6298, + "step": 6193 + }, + { + "epoch": 0.64, + "grad_norm": 2.0213576043893964, + "learning_rate": 2.9734447570504898e-06, + "loss": 0.6417, + "step": 6194 + }, + { + "epoch": 0.64, + "grad_norm": 1.8219746597887712, + "learning_rate": 2.97190592781223e-06, + "loss": 0.6355, + "step": 6195 + }, + { + "epoch": 0.64, + "grad_norm": 1.9916439598009843, + "learning_rate": 2.9703673284699945e-06, + "loss": 0.7587, + "step": 6196 + }, + { + "epoch": 0.64, + "grad_norm": 1.8151195802546238, + "learning_rate": 2.9688289591981887e-06, + "loss": 0.6119, + "step": 6197 + }, + { + "epoch": 0.64, + "grad_norm": 1.9629854400858462, + "learning_rate": 2.9672908201711986e-06, + "loss": 0.6639, + "step": 6198 + }, + { + "epoch": 0.64, + "grad_norm": 2.0632728967170935, + "learning_rate": 2.96575291156338e-06, + "loss": 0.6161, + "step": 6199 + }, + { + "epoch": 0.64, + "grad_norm": 2.0544901998541976, + "learning_rate": 2.9642152335490633e-06, + "loss": 0.5926, + "step": 6200 + }, + { + "epoch": 0.64, + "grad_norm": 1.8887067530096628, + "learning_rate": 2.9626777863025535e-06, + "loss": 0.6645, + "step": 6201 + }, + { + "epoch": 0.64, + "grad_norm": 1.9767964985943947, + "learning_rate": 2.961140569998129e-06, + "loss": 0.5331, + "step": 6202 + }, + { + "epoch": 0.64, + "grad_norm": 1.9912020438130313, + "learning_rate": 2.959603584810041e-06, + "loss": 0.595, + "step": 6203 + }, + { + "epoch": 0.64, + "grad_norm": 1.8860676490378074, + "learning_rate": 2.9580668309125203e-06, + "loss": 0.6943, + "step": 6204 + }, + { + "epoch": 0.65, + "grad_norm": 2.2330331418419878, + "learning_rate": 2.95653030847976e-06, + "loss": 0.6634, + "step": 6205 + }, + { + "epoch": 0.65, + "grad_norm": 2.081180271310904, + "learning_rate": 2.954994017685937e-06, + "loss": 0.5837, + "step": 6206 + }, + { + "epoch": 0.65, + "grad_norm": 1.8421768186562804, + "learning_rate": 2.9534579587051976e-06, + "loss": 0.6476, + "step": 6207 + }, + { + "epoch": 0.65, + "grad_norm": 1.8404952869659887, + "learning_rate": 2.9519221317116644e-06, + "loss": 0.6179, + "step": 6208 + }, + { + "epoch": 0.65, + "grad_norm": 1.8764757322689638, + "learning_rate": 2.9503865368794303e-06, + "loss": 0.5937, + "step": 6209 + }, + { + "epoch": 0.65, + "grad_norm": 2.0970231465593434, + "learning_rate": 2.948851174382565e-06, + "loss": 0.6625, + "step": 6210 + }, + { + "epoch": 0.65, + "grad_norm": 2.198985907859825, + "learning_rate": 2.947316044395112e-06, + "loss": 0.7062, + "step": 6211 + }, + { + "epoch": 0.65, + "grad_norm": 1.9215926364945755, + "learning_rate": 2.9457811470910837e-06, + "loss": 0.6313, + "step": 6212 + }, + { + "epoch": 0.65, + "grad_norm": 1.8199210140099955, + "learning_rate": 2.944246482644471e-06, + "loss": 0.583, + "step": 6213 + }, + { + "epoch": 0.65, + "grad_norm": 2.1819976857706016, + "learning_rate": 2.9427120512292368e-06, + "loss": 0.7136, + "step": 6214 + }, + { + "epoch": 0.65, + "grad_norm": 2.0989651592874425, + "learning_rate": 2.941177853019318e-06, + "loss": 0.6884, + "step": 6215 + }, + { + "epoch": 0.65, + "grad_norm": 1.8185220766754597, + "learning_rate": 2.9396438881886234e-06, + "loss": 0.6199, + "step": 6216 + }, + { + "epoch": 0.65, + "grad_norm": 1.8386182832764755, + "learning_rate": 2.9381101569110393e-06, + "loss": 0.5703, + "step": 6217 + }, + { + "epoch": 0.65, + "grad_norm": 1.7960263200181878, + "learning_rate": 2.936576659360421e-06, + "loss": 0.5296, + "step": 6218 + }, + { + "epoch": 0.65, + "grad_norm": 2.0704183498446374, + "learning_rate": 2.9350433957105995e-06, + "loss": 0.6312, + "step": 6219 + }, + { + "epoch": 0.65, + "grad_norm": 1.7147161698387252, + "learning_rate": 2.933510366135378e-06, + "loss": 0.5166, + "step": 6220 + }, + { + "epoch": 0.65, + "grad_norm": 2.0667639388096304, + "learning_rate": 2.9319775708085364e-06, + "loss": 0.6563, + "step": 6221 + }, + { + "epoch": 0.65, + "grad_norm": 1.8954356153973286, + "learning_rate": 2.930445009903824e-06, + "loss": 0.6817, + "step": 6222 + }, + { + "epoch": 0.65, + "grad_norm": 2.1322815169531784, + "learning_rate": 2.9289126835949657e-06, + "loss": 0.6872, + "step": 6223 + }, + { + "epoch": 0.65, + "grad_norm": 1.9819330643264859, + "learning_rate": 2.9273805920556586e-06, + "loss": 0.6843, + "step": 6224 + }, + { + "epoch": 0.65, + "grad_norm": 2.065487847700474, + "learning_rate": 2.9258487354595754e-06, + "loss": 0.723, + "step": 6225 + }, + { + "epoch": 0.65, + "grad_norm": 1.928418370629797, + "learning_rate": 2.9243171139803617e-06, + "loss": 0.6875, + "step": 6226 + }, + { + "epoch": 0.65, + "grad_norm": 1.9522013519745338, + "learning_rate": 2.9227857277916325e-06, + "loss": 0.655, + "step": 6227 + }, + { + "epoch": 0.65, + "grad_norm": 2.082709028367856, + "learning_rate": 2.9212545770669814e-06, + "loss": 0.6324, + "step": 6228 + }, + { + "epoch": 0.65, + "grad_norm": 2.057952654711287, + "learning_rate": 2.919723661979972e-06, + "loss": 0.5704, + "step": 6229 + }, + { + "epoch": 0.65, + "grad_norm": 1.9576733185893804, + "learning_rate": 2.918192982704143e-06, + "loss": 0.6425, + "step": 6230 + }, + { + "epoch": 0.65, + "grad_norm": 1.6660456041502691, + "learning_rate": 2.9166625394130066e-06, + "loss": 0.5195, + "step": 6231 + }, + { + "epoch": 0.65, + "grad_norm": 1.9031996084090193, + "learning_rate": 2.9151323322800433e-06, + "loss": 0.7252, + "step": 6232 + }, + { + "epoch": 0.65, + "grad_norm": 1.933943035659668, + "learning_rate": 2.913602361478716e-06, + "loss": 0.5009, + "step": 6233 + }, + { + "epoch": 0.65, + "grad_norm": 2.235638647690862, + "learning_rate": 2.912072627182453e-06, + "loss": 0.6169, + "step": 6234 + }, + { + "epoch": 0.65, + "grad_norm": 1.7939098430985856, + "learning_rate": 2.910543129564658e-06, + "loss": 0.6096, + "step": 6235 + }, + { + "epoch": 0.65, + "grad_norm": 1.9999371977297142, + "learning_rate": 2.9090138687987075e-06, + "loss": 0.626, + "step": 6236 + }, + { + "epoch": 0.65, + "grad_norm": 1.7553733344385136, + "learning_rate": 2.9074848450579545e-06, + "loss": 0.6289, + "step": 6237 + }, + { + "epoch": 0.65, + "grad_norm": 2.013563698558594, + "learning_rate": 2.9059560585157197e-06, + "loss": 0.6329, + "step": 6238 + }, + { + "epoch": 0.65, + "grad_norm": 1.9821038414510794, + "learning_rate": 2.9044275093453034e-06, + "loss": 0.5404, + "step": 6239 + }, + { + "epoch": 0.65, + "grad_norm": 1.880071424325999, + "learning_rate": 2.9028991977199705e-06, + "loss": 0.6771, + "step": 6240 + }, + { + "epoch": 0.65, + "grad_norm": 1.8332617137796834, + "learning_rate": 2.9013711238129693e-06, + "loss": 0.6157, + "step": 6241 + }, + { + "epoch": 0.65, + "grad_norm": 1.9401173115427022, + "learning_rate": 2.899843287797513e-06, + "loss": 0.645, + "step": 6242 + }, + { + "epoch": 0.65, + "grad_norm": 2.0386914173802833, + "learning_rate": 2.8983156898467885e-06, + "loss": 0.6974, + "step": 6243 + }, + { + "epoch": 0.65, + "grad_norm": 1.6509330084534442, + "learning_rate": 2.896788330133962e-06, + "loss": 0.5447, + "step": 6244 + }, + { + "epoch": 0.65, + "grad_norm": 1.94624189730017, + "learning_rate": 2.8952612088321636e-06, + "loss": 0.6139, + "step": 6245 + }, + { + "epoch": 0.65, + "grad_norm": 1.9680166782133846, + "learning_rate": 2.893734326114506e-06, + "loss": 0.653, + "step": 6246 + }, + { + "epoch": 0.65, + "grad_norm": 1.9087806201292232, + "learning_rate": 2.8922076821540657e-06, + "loss": 0.5587, + "step": 6247 + }, + { + "epoch": 0.65, + "grad_norm": 1.9714706233304224, + "learning_rate": 2.8906812771239034e-06, + "loss": 0.666, + "step": 6248 + }, + { + "epoch": 0.65, + "grad_norm": 1.8099975489747797, + "learning_rate": 2.889155111197036e-06, + "loss": 0.5629, + "step": 6249 + }, + { + "epoch": 0.65, + "grad_norm": 1.8264082657553349, + "learning_rate": 2.887629184546471e-06, + "loss": 0.5704, + "step": 6250 + }, + { + "epoch": 0.65, + "grad_norm": 2.2058717301943824, + "learning_rate": 2.8861034973451753e-06, + "loss": 0.7257, + "step": 6251 + }, + { + "epoch": 0.65, + "grad_norm": 1.8518154957166706, + "learning_rate": 2.8845780497660996e-06, + "loss": 0.6928, + "step": 6252 + }, + { + "epoch": 0.65, + "grad_norm": 1.9484583542830034, + "learning_rate": 2.883052841982157e-06, + "loss": 0.6393, + "step": 6253 + }, + { + "epoch": 0.65, + "grad_norm": 1.7555508796894785, + "learning_rate": 2.8815278741662433e-06, + "loss": 0.5886, + "step": 6254 + }, + { + "epoch": 0.65, + "grad_norm": 1.994950526398456, + "learning_rate": 2.88000314649122e-06, + "loss": 0.5842, + "step": 6255 + }, + { + "epoch": 0.65, + "grad_norm": 1.8662457743534389, + "learning_rate": 2.878478659129923e-06, + "loss": 0.584, + "step": 6256 + }, + { + "epoch": 0.65, + "grad_norm": 1.9420532601252392, + "learning_rate": 2.8769544122551606e-06, + "loss": 0.7123, + "step": 6257 + }, + { + "epoch": 0.65, + "grad_norm": 1.835196222874027, + "learning_rate": 2.875430406039719e-06, + "loss": 0.5903, + "step": 6258 + }, + { + "epoch": 0.65, + "grad_norm": 1.8756813320395385, + "learning_rate": 2.873906640656348e-06, + "loss": 0.5721, + "step": 6259 + }, + { + "epoch": 0.65, + "grad_norm": 1.9765833048671797, + "learning_rate": 2.8723831162777806e-06, + "loss": 0.6045, + "step": 6260 + }, + { + "epoch": 0.65, + "grad_norm": 2.032795441691923, + "learning_rate": 2.8708598330767105e-06, + "loss": 0.6522, + "step": 6261 + }, + { + "epoch": 0.65, + "grad_norm": 1.7505441325588476, + "learning_rate": 2.869336791225817e-06, + "loss": 0.5361, + "step": 6262 + }, + { + "epoch": 0.65, + "grad_norm": 1.9496901968364277, + "learning_rate": 2.867813990897742e-06, + "loss": 0.6319, + "step": 6263 + }, + { + "epoch": 0.65, + "grad_norm": 1.848210036629578, + "learning_rate": 2.8662914322651046e-06, + "loss": 0.5804, + "step": 6264 + }, + { + "epoch": 0.65, + "grad_norm": 2.0674380320579235, + "learning_rate": 2.864769115500493e-06, + "loss": 0.5805, + "step": 6265 + }, + { + "epoch": 0.65, + "grad_norm": 1.864513334235877, + "learning_rate": 2.8632470407764746e-06, + "loss": 0.6263, + "step": 6266 + }, + { + "epoch": 0.65, + "grad_norm": 1.9868287714690414, + "learning_rate": 2.8617252082655813e-06, + "loss": 0.568, + "step": 6267 + }, + { + "epoch": 0.65, + "grad_norm": 2.003702026459403, + "learning_rate": 2.860203618140325e-06, + "loss": 0.6686, + "step": 6268 + }, + { + "epoch": 0.65, + "grad_norm": 1.9781283187382133, + "learning_rate": 2.858682270573183e-06, + "loss": 0.5742, + "step": 6269 + }, + { + "epoch": 0.65, + "grad_norm": 1.8962333723378362, + "learning_rate": 2.857161165736613e-06, + "loss": 0.562, + "step": 6270 + }, + { + "epoch": 0.65, + "grad_norm": 1.9548515931569148, + "learning_rate": 2.8556403038030385e-06, + "loss": 0.7561, + "step": 6271 + }, + { + "epoch": 0.65, + "grad_norm": 2.0789855989958763, + "learning_rate": 2.8541196849448582e-06, + "loss": 0.5907, + "step": 6272 + }, + { + "epoch": 0.65, + "grad_norm": 1.9881196918457014, + "learning_rate": 2.8525993093344407e-06, + "loss": 0.6955, + "step": 6273 + }, + { + "epoch": 0.65, + "grad_norm": 1.968486206575867, + "learning_rate": 2.8510791771441327e-06, + "loss": 0.5586, + "step": 6274 + }, + { + "epoch": 0.65, + "grad_norm": 1.7880365080779919, + "learning_rate": 2.8495592885462476e-06, + "loss": 0.5935, + "step": 6275 + }, + { + "epoch": 0.65, + "grad_norm": 2.2478113171852816, + "learning_rate": 2.848039643713075e-06, + "loss": 0.5541, + "step": 6276 + }, + { + "epoch": 0.65, + "grad_norm": 1.888739198842372, + "learning_rate": 2.8465202428168753e-06, + "loss": 0.6149, + "step": 6277 + }, + { + "epoch": 0.65, + "grad_norm": 2.1536997825091837, + "learning_rate": 2.8450010860298784e-06, + "loss": 0.7143, + "step": 6278 + }, + { + "epoch": 0.65, + "grad_norm": 1.814516674311857, + "learning_rate": 2.8434821735242935e-06, + "loss": 0.6282, + "step": 6279 + }, + { + "epoch": 0.65, + "grad_norm": 2.01424137795222, + "learning_rate": 2.841963505472294e-06, + "loss": 0.6298, + "step": 6280 + }, + { + "epoch": 0.65, + "grad_norm": 1.9768897380178496, + "learning_rate": 2.8404450820460326e-06, + "loss": 0.6343, + "step": 6281 + }, + { + "epoch": 0.65, + "grad_norm": 2.327163802642382, + "learning_rate": 2.838926903417629e-06, + "loss": 0.7132, + "step": 6282 + }, + { + "epoch": 0.65, + "grad_norm": 1.968349224205387, + "learning_rate": 2.837408969759181e-06, + "loss": 0.5983, + "step": 6283 + }, + { + "epoch": 0.65, + "grad_norm": 1.7460529943975944, + "learning_rate": 2.8358912812427497e-06, + "loss": 0.5936, + "step": 6284 + }, + { + "epoch": 0.65, + "grad_norm": 2.0593792416805363, + "learning_rate": 2.834373838040382e-06, + "loss": 0.6621, + "step": 6285 + }, + { + "epoch": 0.65, + "grad_norm": 2.0565360915182045, + "learning_rate": 2.832856640324078e-06, + "loss": 0.6497, + "step": 6286 + }, + { + "epoch": 0.65, + "grad_norm": 1.9303234583627442, + "learning_rate": 2.831339688265829e-06, + "loss": 0.6376, + "step": 6287 + }, + { + "epoch": 0.65, + "grad_norm": 2.1135210848654555, + "learning_rate": 2.829822982037585e-06, + "loss": 0.71, + "step": 6288 + }, + { + "epoch": 0.65, + "grad_norm": 2.0335602868059968, + "learning_rate": 2.8283065218112775e-06, + "loss": 0.6106, + "step": 6289 + }, + { + "epoch": 0.65, + "grad_norm": 2.0336428429947935, + "learning_rate": 2.826790307758802e-06, + "loss": 0.7293, + "step": 6290 + }, + { + "epoch": 0.65, + "grad_norm": 1.9485197680578654, + "learning_rate": 2.8252743400520345e-06, + "loss": 0.6481, + "step": 6291 + }, + { + "epoch": 0.65, + "grad_norm": 1.9136820372082457, + "learning_rate": 2.8237586188628153e-06, + "loss": 0.5989, + "step": 6292 + }, + { + "epoch": 0.65, + "grad_norm": 1.8096490093595976, + "learning_rate": 2.8222431443629617e-06, + "loss": 0.5179, + "step": 6293 + }, + { + "epoch": 0.65, + "grad_norm": 1.9304167213931087, + "learning_rate": 2.820727916724257e-06, + "loss": 0.6775, + "step": 6294 + }, + { + "epoch": 0.65, + "grad_norm": 1.853944468773223, + "learning_rate": 2.8192129361184685e-06, + "loss": 0.6527, + "step": 6295 + }, + { + "epoch": 0.65, + "grad_norm": 1.9454826200033084, + "learning_rate": 2.8176982027173206e-06, + "loss": 0.5919, + "step": 6296 + }, + { + "epoch": 0.65, + "grad_norm": 2.0221472638370903, + "learning_rate": 2.816183716692522e-06, + "loss": 0.5724, + "step": 6297 + }, + { + "epoch": 0.65, + "grad_norm": 2.116282365722759, + "learning_rate": 2.8146694782157447e-06, + "loss": 0.6334, + "step": 6298 + }, + { + "epoch": 0.65, + "grad_norm": 1.9071368964119593, + "learning_rate": 2.8131554874586396e-06, + "loss": 0.612, + "step": 6299 + }, + { + "epoch": 0.65, + "grad_norm": 1.9872247909004435, + "learning_rate": 2.8116417445928245e-06, + "loss": 0.6012, + "step": 6300 + }, + { + "epoch": 0.65, + "grad_norm": 1.976827376780296, + "learning_rate": 2.810128249789892e-06, + "loss": 0.6022, + "step": 6301 + }, + { + "epoch": 0.66, + "grad_norm": 1.805685551078074, + "learning_rate": 2.808615003221401e-06, + "loss": 0.5421, + "step": 6302 + }, + { + "epoch": 0.66, + "grad_norm": 1.8361096040269862, + "learning_rate": 2.8071020050588927e-06, + "loss": 0.5828, + "step": 6303 + }, + { + "epoch": 0.66, + "grad_norm": 2.117462557354621, + "learning_rate": 2.8055892554738683e-06, + "loss": 0.67, + "step": 6304 + }, + { + "epoch": 0.66, + "grad_norm": 1.9521166314972613, + "learning_rate": 2.804076754637812e-06, + "loss": 0.6384, + "step": 6305 + }, + { + "epoch": 0.66, + "grad_norm": 1.887459953486852, + "learning_rate": 2.8025645027221704e-06, + "loss": 0.5704, + "step": 6306 + }, + { + "epoch": 0.66, + "grad_norm": 1.867313977993055, + "learning_rate": 2.801052499898369e-06, + "loss": 0.5305, + "step": 6307 + }, + { + "epoch": 0.66, + "grad_norm": 1.9997508923479637, + "learning_rate": 2.7995407463378004e-06, + "loss": 0.6093, + "step": 6308 + }, + { + "epoch": 0.66, + "grad_norm": 1.8703093721931023, + "learning_rate": 2.7980292422118282e-06, + "loss": 0.613, + "step": 6309 + }, + { + "epoch": 0.66, + "grad_norm": 2.061021653564945, + "learning_rate": 2.7965179876917946e-06, + "loss": 0.5967, + "step": 6310 + }, + { + "epoch": 0.66, + "grad_norm": 1.843007255448845, + "learning_rate": 2.795006982949006e-06, + "loss": 0.6791, + "step": 6311 + }, + { + "epoch": 0.66, + "grad_norm": 1.9714166575628622, + "learning_rate": 2.7934962281547422e-06, + "loss": 0.6519, + "step": 6312 + }, + { + "epoch": 0.66, + "grad_norm": 1.7854552666884103, + "learning_rate": 2.7919857234802593e-06, + "loss": 0.5518, + "step": 6313 + }, + { + "epoch": 0.66, + "grad_norm": 1.9856717506321173, + "learning_rate": 2.7904754690967808e-06, + "loss": 0.6034, + "step": 6314 + }, + { + "epoch": 0.66, + "grad_norm": 2.2295217500421063, + "learning_rate": 2.7889654651754987e-06, + "loss": 0.53, + "step": 6315 + }, + { + "epoch": 0.66, + "grad_norm": 1.6963943668827721, + "learning_rate": 2.7874557118875863e-06, + "loss": 0.5937, + "step": 6316 + }, + { + "epoch": 0.66, + "grad_norm": 2.053957569068724, + "learning_rate": 2.785946209404178e-06, + "loss": 0.7016, + "step": 6317 + }, + { + "epoch": 0.66, + "grad_norm": 1.911492913171324, + "learning_rate": 2.784436957896388e-06, + "loss": 0.7116, + "step": 6318 + }, + { + "epoch": 0.66, + "grad_norm": 1.8810697368746152, + "learning_rate": 2.7829279575352953e-06, + "loss": 0.5795, + "step": 6319 + }, + { + "epoch": 0.66, + "grad_norm": 1.9914019124478899, + "learning_rate": 2.781419208491958e-06, + "loss": 0.5958, + "step": 6320 + }, + { + "epoch": 0.66, + "grad_norm": 1.712755966691504, + "learning_rate": 2.7799107109373956e-06, + "loss": 0.5469, + "step": 6321 + }, + { + "epoch": 0.66, + "grad_norm": 1.8143583254344464, + "learning_rate": 2.7784024650426133e-06, + "loss": 0.6282, + "step": 6322 + }, + { + "epoch": 0.66, + "grad_norm": 2.1419849838155, + "learning_rate": 2.7768944709785705e-06, + "loss": 0.7249, + "step": 6323 + }, + { + "epoch": 0.66, + "grad_norm": 1.7170329232042192, + "learning_rate": 2.775386728916212e-06, + "loss": 0.5251, + "step": 6324 + }, + { + "epoch": 0.66, + "grad_norm": 2.742228854334704, + "learning_rate": 2.7738792390264456e-06, + "loss": 0.6825, + "step": 6325 + }, + { + "epoch": 0.66, + "grad_norm": 1.8459902103849364, + "learning_rate": 2.772372001480159e-06, + "loss": 0.6152, + "step": 6326 + }, + { + "epoch": 0.66, + "grad_norm": 1.8776493794011735, + "learning_rate": 2.7708650164482e-06, + "loss": 0.639, + "step": 6327 + }, + { + "epoch": 0.66, + "grad_norm": 2.063971101948811, + "learning_rate": 2.7693582841013996e-06, + "loss": 0.6364, + "step": 6328 + }, + { + "epoch": 0.66, + "grad_norm": 2.036669543108079, + "learning_rate": 2.767851804610552e-06, + "loss": 0.64, + "step": 6329 + }, + { + "epoch": 0.66, + "grad_norm": 1.6546039954617904, + "learning_rate": 2.7663455781464245e-06, + "loss": 0.5424, + "step": 6330 + }, + { + "epoch": 0.66, + "grad_norm": 1.8036933164626912, + "learning_rate": 2.7648396048797554e-06, + "loss": 0.5899, + "step": 6331 + }, + { + "epoch": 0.66, + "grad_norm": 1.8285545473432434, + "learning_rate": 2.7633338849812593e-06, + "loss": 0.5668, + "step": 6332 + }, + { + "epoch": 0.66, + "grad_norm": 1.8127905625169214, + "learning_rate": 2.7618284186216137e-06, + "loss": 0.6676, + "step": 6333 + }, + { + "epoch": 0.66, + "grad_norm": 2.153394766265645, + "learning_rate": 2.760323205971476e-06, + "loss": 0.6618, + "step": 6334 + }, + { + "epoch": 0.66, + "grad_norm": 1.9747255781767126, + "learning_rate": 2.7588182472014668e-06, + "loss": 0.6172, + "step": 6335 + }, + { + "epoch": 0.66, + "grad_norm": 2.0717810839011723, + "learning_rate": 2.757313542482185e-06, + "loss": 0.6448, + "step": 6336 + }, + { + "epoch": 0.66, + "grad_norm": 1.9391684607298043, + "learning_rate": 2.7558090919841972e-06, + "loss": 0.6767, + "step": 6337 + }, + { + "epoch": 0.66, + "grad_norm": 1.900234867377518, + "learning_rate": 2.7543048958780395e-06, + "loss": 0.567, + "step": 6338 + }, + { + "epoch": 0.66, + "grad_norm": 2.026675318690998, + "learning_rate": 2.7528009543342197e-06, + "loss": 0.6441, + "step": 6339 + }, + { + "epoch": 0.66, + "grad_norm": 1.9912333544995164, + "learning_rate": 2.751297267523223e-06, + "loss": 0.6838, + "step": 6340 + }, + { + "epoch": 0.66, + "grad_norm": 1.8455002654000203, + "learning_rate": 2.7497938356154957e-06, + "loss": 0.4949, + "step": 6341 + }, + { + "epoch": 0.66, + "grad_norm": 1.944518401536242, + "learning_rate": 2.748290658781465e-06, + "loss": 0.5808, + "step": 6342 + }, + { + "epoch": 0.66, + "grad_norm": 1.8734162282177562, + "learning_rate": 2.746787737191521e-06, + "loss": 0.6036, + "step": 6343 + }, + { + "epoch": 0.66, + "grad_norm": 2.0569689392989594, + "learning_rate": 2.7452850710160305e-06, + "loss": 0.6238, + "step": 6344 + }, + { + "epoch": 0.66, + "grad_norm": 1.859492968004961, + "learning_rate": 2.7437826604253296e-06, + "loss": 0.6002, + "step": 6345 + }, + { + "epoch": 0.66, + "grad_norm": 1.9239528734480238, + "learning_rate": 2.7422805055897224e-06, + "loss": 0.6592, + "step": 6346 + }, + { + "epoch": 0.66, + "grad_norm": 1.7751066614137307, + "learning_rate": 2.74077860667949e-06, + "loss": 0.6482, + "step": 6347 + }, + { + "epoch": 0.66, + "grad_norm": 1.8529491712765056, + "learning_rate": 2.7392769638648775e-06, + "loss": 0.6862, + "step": 6348 + }, + { + "epoch": 0.66, + "grad_norm": 1.8998307106205168, + "learning_rate": 2.73777557731611e-06, + "loss": 0.6126, + "step": 6349 + }, + { + "epoch": 0.66, + "grad_norm": 2.118611629913783, + "learning_rate": 2.736274447203373e-06, + "loss": 0.627, + "step": 6350 + }, + { + "epoch": 0.66, + "grad_norm": 1.9556808686347844, + "learning_rate": 2.7347735736968318e-06, + "loss": 0.6527, + "step": 6351 + }, + { + "epoch": 0.66, + "grad_norm": 2.013462843348238, + "learning_rate": 2.733272956966615e-06, + "loss": 0.7476, + "step": 6352 + }, + { + "epoch": 0.66, + "grad_norm": 1.8135008649655475, + "learning_rate": 2.7317725971828302e-06, + "loss": 0.6531, + "step": 6353 + }, + { + "epoch": 0.66, + "grad_norm": 1.9304868034857994, + "learning_rate": 2.7302724945155486e-06, + "loss": 0.6678, + "step": 6354 + }, + { + "epoch": 0.66, + "grad_norm": 1.9647717480570688, + "learning_rate": 2.728772649134818e-06, + "loss": 0.6529, + "step": 6355 + }, + { + "epoch": 0.66, + "grad_norm": 1.920259935679486, + "learning_rate": 2.7272730612106513e-06, + "loss": 0.5923, + "step": 6356 + }, + { + "epoch": 0.66, + "grad_norm": 2.1502531709807124, + "learning_rate": 2.7257737309130393e-06, + "loss": 0.6569, + "step": 6357 + }, + { + "epoch": 0.66, + "grad_norm": 1.9442863146247567, + "learning_rate": 2.7242746584119364e-06, + "loss": 0.6219, + "step": 6358 + }, + { + "epoch": 0.66, + "grad_norm": 1.8695957383517858, + "learning_rate": 2.722775843877275e-06, + "loss": 0.5603, + "step": 6359 + }, + { + "epoch": 0.66, + "grad_norm": 2.1452012554076223, + "learning_rate": 2.7212772874789484e-06, + "loss": 0.6129, + "step": 6360 + }, + { + "epoch": 0.66, + "grad_norm": 1.7405043357807028, + "learning_rate": 2.7197789893868307e-06, + "loss": 0.6054, + "step": 6361 + }, + { + "epoch": 0.66, + "grad_norm": 1.8611986630402495, + "learning_rate": 2.7182809497707594e-06, + "loss": 0.647, + "step": 6362 + }, + { + "epoch": 0.66, + "grad_norm": 1.8830375405003674, + "learning_rate": 2.7167831688005502e-06, + "loss": 0.5694, + "step": 6363 + }, + { + "epoch": 0.66, + "grad_norm": 1.7562835116328723, + "learning_rate": 2.715285646645981e-06, + "loss": 0.6591, + "step": 6364 + }, + { + "epoch": 0.66, + "grad_norm": 1.7368453184663584, + "learning_rate": 2.7137883834768076e-06, + "loss": 0.5398, + "step": 6365 + }, + { + "epoch": 0.66, + "grad_norm": 1.9427737888494707, + "learning_rate": 2.7122913794627528e-06, + "loss": 0.6606, + "step": 6366 + }, + { + "epoch": 0.66, + "grad_norm": 1.8309498526643528, + "learning_rate": 2.710794634773508e-06, + "loss": 0.5763, + "step": 6367 + }, + { + "epoch": 0.66, + "grad_norm": 1.8981317509223192, + "learning_rate": 2.7092981495787387e-06, + "loss": 0.5609, + "step": 6368 + }, + { + "epoch": 0.66, + "grad_norm": 2.0381064382923353, + "learning_rate": 2.7078019240480826e-06, + "loss": 0.5834, + "step": 6369 + }, + { + "epoch": 0.66, + "grad_norm": 2.0360199613465335, + "learning_rate": 2.706305958351141e-06, + "loss": 0.5999, + "step": 6370 + }, + { + "epoch": 0.66, + "grad_norm": 1.8851392663074493, + "learning_rate": 2.7048102526574948e-06, + "loss": 0.5561, + "step": 6371 + }, + { + "epoch": 0.66, + "grad_norm": 2.0319791070048185, + "learning_rate": 2.7033148071366866e-06, + "loss": 0.6536, + "step": 6372 + }, + { + "epoch": 0.66, + "grad_norm": 1.9323663914875056, + "learning_rate": 2.7018196219582404e-06, + "loss": 0.59, + "step": 6373 + }, + { + "epoch": 0.66, + "grad_norm": 2.148618386983816, + "learning_rate": 2.700324697291634e-06, + "loss": 0.6185, + "step": 6374 + }, + { + "epoch": 0.66, + "grad_norm": 1.875273962996191, + "learning_rate": 2.698830033306334e-06, + "loss": 0.6204, + "step": 6375 + }, + { + "epoch": 0.66, + "grad_norm": 2.050493079361734, + "learning_rate": 2.6973356301717633e-06, + "loss": 0.6846, + "step": 6376 + }, + { + "epoch": 0.66, + "grad_norm": 1.9716423747607381, + "learning_rate": 2.6958414880573257e-06, + "loss": 0.552, + "step": 6377 + }, + { + "epoch": 0.66, + "grad_norm": 2.0713906418446144, + "learning_rate": 2.694347607132387e-06, + "loss": 0.6723, + "step": 6378 + }, + { + "epoch": 0.66, + "grad_norm": 1.9824685803451119, + "learning_rate": 2.692853987566291e-06, + "loss": 0.5323, + "step": 6379 + }, + { + "epoch": 0.66, + "grad_norm": 2.0917417738256425, + "learning_rate": 2.6913606295283436e-06, + "loss": 0.7374, + "step": 6380 + }, + { + "epoch": 0.66, + "grad_norm": 1.8222154905337453, + "learning_rate": 2.689867533187829e-06, + "loss": 0.6312, + "step": 6381 + }, + { + "epoch": 0.66, + "grad_norm": 2.0814088840248615, + "learning_rate": 2.688374698713997e-06, + "loss": 0.6002, + "step": 6382 + }, + { + "epoch": 0.66, + "grad_norm": 2.193140630954233, + "learning_rate": 2.6868821262760673e-06, + "loss": 0.732, + "step": 6383 + }, + { + "epoch": 0.66, + "grad_norm": 1.753162209988003, + "learning_rate": 2.6853898160432347e-06, + "loss": 0.536, + "step": 6384 + }, + { + "epoch": 0.66, + "grad_norm": 1.7609052391335116, + "learning_rate": 2.683897768184657e-06, + "loss": 0.523, + "step": 6385 + }, + { + "epoch": 0.66, + "grad_norm": 1.892836073277086, + "learning_rate": 2.6824059828694715e-06, + "loss": 0.6081, + "step": 6386 + }, + { + "epoch": 0.66, + "grad_norm": 1.644640787589624, + "learning_rate": 2.6809144602667747e-06, + "loss": 0.4926, + "step": 6387 + }, + { + "epoch": 0.66, + "grad_norm": 1.7857425923134511, + "learning_rate": 2.6794232005456468e-06, + "loss": 0.6601, + "step": 6388 + }, + { + "epoch": 0.66, + "grad_norm": 1.9604816465995976, + "learning_rate": 2.6779322038751217e-06, + "loss": 0.6702, + "step": 6389 + }, + { + "epoch": 0.66, + "grad_norm": 1.938706881138097, + "learning_rate": 2.676441470424218e-06, + "loss": 0.5822, + "step": 6390 + }, + { + "epoch": 0.66, + "grad_norm": 1.9790553819807086, + "learning_rate": 2.674951000361916e-06, + "loss": 0.5655, + "step": 6391 + }, + { + "epoch": 0.66, + "grad_norm": 2.078457287587751, + "learning_rate": 2.673460793857173e-06, + "loss": 0.6475, + "step": 6392 + }, + { + "epoch": 0.66, + "grad_norm": 2.1570179625592285, + "learning_rate": 2.6719708510789077e-06, + "loss": 0.651, + "step": 6393 + }, + { + "epoch": 0.66, + "grad_norm": 1.820206500935895, + "learning_rate": 2.6704811721960174e-06, + "loss": 0.6093, + "step": 6394 + }, + { + "epoch": 0.66, + "grad_norm": 1.8501666310944005, + "learning_rate": 2.6689917573773615e-06, + "loss": 0.6263, + "step": 6395 + }, + { + "epoch": 0.66, + "grad_norm": 2.0029446573228453, + "learning_rate": 2.6675026067917808e-06, + "loss": 0.6917, + "step": 6396 + }, + { + "epoch": 0.66, + "grad_norm": 1.8953373502231254, + "learning_rate": 2.6660137206080703e-06, + "loss": 0.6434, + "step": 6397 + }, + { + "epoch": 0.67, + "grad_norm": 2.2750848498069702, + "learning_rate": 2.66452509899501e-06, + "loss": 0.6846, + "step": 6398 + }, + { + "epoch": 0.67, + "grad_norm": 1.7810113358824438, + "learning_rate": 2.66303674212134e-06, + "loss": 0.6938, + "step": 6399 + }, + { + "epoch": 0.67, + "grad_norm": 1.8671929417800113, + "learning_rate": 2.6615486501557765e-06, + "loss": 0.6661, + "step": 6400 + }, + { + "epoch": 0.67, + "grad_norm": 1.8051102463110213, + "learning_rate": 2.660060823267001e-06, + "loss": 0.5891, + "step": 6401 + }, + { + "epoch": 0.67, + "grad_norm": 1.9343515924482275, + "learning_rate": 2.6585732616236705e-06, + "loss": 0.6584, + "step": 6402 + }, + { + "epoch": 0.67, + "grad_norm": 1.8462900458666527, + "learning_rate": 2.657085965394406e-06, + "loss": 0.6426, + "step": 6403 + }, + { + "epoch": 0.67, + "grad_norm": 1.9951702962287277, + "learning_rate": 2.655598934747801e-06, + "loss": 0.5901, + "step": 6404 + }, + { + "epoch": 0.67, + "grad_norm": 1.9579632915860867, + "learning_rate": 2.654112169852418e-06, + "loss": 0.6673, + "step": 6405 + }, + { + "epoch": 0.67, + "grad_norm": 1.8767384202403952, + "learning_rate": 2.652625670876794e-06, + "loss": 0.6249, + "step": 6406 + }, + { + "epoch": 0.67, + "grad_norm": 1.965704158295992, + "learning_rate": 2.6511394379894274e-06, + "loss": 0.632, + "step": 6407 + }, + { + "epoch": 0.67, + "grad_norm": 2.1880462552255833, + "learning_rate": 2.6496534713587952e-06, + "loss": 0.8164, + "step": 6408 + }, + { + "epoch": 0.67, + "grad_norm": 1.9072224748583726, + "learning_rate": 2.648167771153337e-06, + "loss": 0.5904, + "step": 6409 + }, + { + "epoch": 0.67, + "grad_norm": 1.8003432003879964, + "learning_rate": 2.6466823375414686e-06, + "loss": 0.5516, + "step": 6410 + }, + { + "epoch": 0.67, + "grad_norm": 1.940253426639748, + "learning_rate": 2.6451971706915713e-06, + "loss": 0.6669, + "step": 6411 + }, + { + "epoch": 0.67, + "grad_norm": 1.784289714434628, + "learning_rate": 2.6437122707719964e-06, + "loss": 0.6114, + "step": 6412 + }, + { + "epoch": 0.67, + "grad_norm": 1.8651482926009573, + "learning_rate": 2.6422276379510635e-06, + "loss": 0.564, + "step": 6413 + }, + { + "epoch": 0.67, + "grad_norm": 1.7832427381408453, + "learning_rate": 2.6407432723970694e-06, + "loss": 0.6461, + "step": 6414 + }, + { + "epoch": 0.67, + "grad_norm": 1.8443549195554543, + "learning_rate": 2.6392591742782704e-06, + "loss": 0.6369, + "step": 6415 + }, + { + "epoch": 0.67, + "grad_norm": 1.8770648648089452, + "learning_rate": 2.637775343762902e-06, + "loss": 0.6854, + "step": 6416 + }, + { + "epoch": 0.67, + "grad_norm": 1.7688361860428412, + "learning_rate": 2.6362917810191597e-06, + "loss": 0.5941, + "step": 6417 + }, + { + "epoch": 0.67, + "grad_norm": 2.052556551651509, + "learning_rate": 2.634808486215219e-06, + "loss": 0.6625, + "step": 6418 + }, + { + "epoch": 0.67, + "grad_norm": 1.8203041172722234, + "learning_rate": 2.633325459519218e-06, + "loss": 0.5459, + "step": 6419 + }, + { + "epoch": 0.67, + "grad_norm": 1.9295198108217466, + "learning_rate": 2.6318427010992644e-06, + "loss": 0.6982, + "step": 6420 + }, + { + "epoch": 0.67, + "grad_norm": 1.8076841531615386, + "learning_rate": 2.6303602111234394e-06, + "loss": 0.6608, + "step": 6421 + }, + { + "epoch": 0.67, + "grad_norm": 1.9542372550586, + "learning_rate": 2.6288779897597894e-06, + "loss": 0.6433, + "step": 6422 + }, + { + "epoch": 0.67, + "grad_norm": 1.7732041112945003, + "learning_rate": 2.627396037176336e-06, + "loss": 0.6528, + "step": 6423 + }, + { + "epoch": 0.67, + "grad_norm": 1.6589412606005547, + "learning_rate": 2.6259143535410635e-06, + "loss": 0.6287, + "step": 6424 + }, + { + "epoch": 0.67, + "grad_norm": 1.9124574288954266, + "learning_rate": 2.6244329390219347e-06, + "loss": 0.6141, + "step": 6425 + }, + { + "epoch": 0.67, + "grad_norm": 2.0180214984374745, + "learning_rate": 2.6229517937868687e-06, + "loss": 0.6131, + "step": 6426 + }, + { + "epoch": 0.67, + "grad_norm": 2.402149675102522, + "learning_rate": 2.621470918003768e-06, + "loss": 0.6226, + "step": 6427 + }, + { + "epoch": 0.67, + "grad_norm": 2.0034872878783734, + "learning_rate": 2.6199903118404934e-06, + "loss": 0.6166, + "step": 6428 + }, + { + "epoch": 0.67, + "grad_norm": 1.9773378931296095, + "learning_rate": 2.6185099754648846e-06, + "loss": 0.6453, + "step": 6429 + }, + { + "epoch": 0.67, + "grad_norm": 1.7467829991028916, + "learning_rate": 2.617029909044742e-06, + "loss": 0.5366, + "step": 6430 + }, + { + "epoch": 0.67, + "grad_norm": 1.802869220129952, + "learning_rate": 2.615550112747844e-06, + "loss": 0.6091, + "step": 6431 + }, + { + "epoch": 0.67, + "grad_norm": 2.0939771876409927, + "learning_rate": 2.614070586741929e-06, + "loss": 0.666, + "step": 6432 + }, + { + "epoch": 0.67, + "grad_norm": 1.8675237371257338, + "learning_rate": 2.612591331194717e-06, + "loss": 0.6055, + "step": 6433 + }, + { + "epoch": 0.67, + "grad_norm": 1.8547868852263747, + "learning_rate": 2.611112346273881e-06, + "loss": 0.7287, + "step": 6434 + }, + { + "epoch": 0.67, + "grad_norm": 2.125456167041699, + "learning_rate": 2.6096336321470796e-06, + "loss": 0.6753, + "step": 6435 + }, + { + "epoch": 0.67, + "grad_norm": 1.7754033248669119, + "learning_rate": 2.608155188981927e-06, + "loss": 0.6194, + "step": 6436 + }, + { + "epoch": 0.67, + "grad_norm": 1.9086265288313402, + "learning_rate": 2.6066770169460198e-06, + "loss": 0.6433, + "step": 6437 + }, + { + "epoch": 0.67, + "grad_norm": 2.0396739748065693, + "learning_rate": 2.605199116206912e-06, + "loss": 0.632, + "step": 6438 + }, + { + "epoch": 0.67, + "grad_norm": 2.0428624717964166, + "learning_rate": 2.603721486932137e-06, + "loss": 0.5821, + "step": 6439 + }, + { + "epoch": 0.67, + "grad_norm": 1.9531009300773505, + "learning_rate": 2.602244129289189e-06, + "loss": 0.6206, + "step": 6440 + }, + { + "epoch": 0.67, + "grad_norm": 1.6950662619278845, + "learning_rate": 2.6007670434455357e-06, + "loss": 0.6074, + "step": 6441 + }, + { + "epoch": 0.67, + "grad_norm": 2.2333214792807508, + "learning_rate": 2.599290229568612e-06, + "loss": 0.6618, + "step": 6442 + }, + { + "epoch": 0.67, + "grad_norm": 1.8847057365906073, + "learning_rate": 2.5978136878258255e-06, + "loss": 0.5443, + "step": 6443 + }, + { + "epoch": 0.67, + "grad_norm": 1.9719360697628705, + "learning_rate": 2.596337418384548e-06, + "loss": 0.568, + "step": 6444 + }, + { + "epoch": 0.67, + "grad_norm": 2.078216650575824, + "learning_rate": 2.594861421412126e-06, + "loss": 0.714, + "step": 6445 + }, + { + "epoch": 0.67, + "grad_norm": 1.7669285768038225, + "learning_rate": 2.5933856970758693e-06, + "loss": 0.6496, + "step": 6446 + }, + { + "epoch": 0.67, + "grad_norm": 1.7929630099134348, + "learning_rate": 2.591910245543063e-06, + "loss": 0.5728, + "step": 6447 + }, + { + "epoch": 0.67, + "grad_norm": 1.9304811982301202, + "learning_rate": 2.5904350669809554e-06, + "loss": 0.6409, + "step": 6448 + }, + { + "epoch": 0.67, + "grad_norm": 1.9345487202393299, + "learning_rate": 2.5889601615567657e-06, + "loss": 0.6752, + "step": 6449 + }, + { + "epoch": 0.67, + "grad_norm": 2.2505125627362528, + "learning_rate": 2.5874855294376853e-06, + "loss": 0.5156, + "step": 6450 + }, + { + "epoch": 0.67, + "grad_norm": 2.0098075540433054, + "learning_rate": 2.586011170790872e-06, + "loss": 0.6312, + "step": 6451 + }, + { + "epoch": 0.67, + "grad_norm": 1.7605667785806696, + "learning_rate": 2.5845370857834497e-06, + "loss": 0.5879, + "step": 6452 + }, + { + "epoch": 0.67, + "grad_norm": 1.9979475108404847, + "learning_rate": 2.583063274582518e-06, + "loss": 0.669, + "step": 6453 + }, + { + "epoch": 0.67, + "grad_norm": 1.9387775598018375, + "learning_rate": 2.581589737355138e-06, + "loss": 0.6661, + "step": 6454 + }, + { + "epoch": 0.67, + "grad_norm": 2.017076403377001, + "learning_rate": 2.5801164742683484e-06, + "loss": 0.625, + "step": 6455 + }, + { + "epoch": 0.67, + "grad_norm": 1.9337213692942614, + "learning_rate": 2.5786434854891482e-06, + "loss": 0.719, + "step": 6456 + }, + { + "epoch": 0.67, + "grad_norm": 2.0865829156729423, + "learning_rate": 2.5771707711845096e-06, + "loss": 0.6503, + "step": 6457 + }, + { + "epoch": 0.67, + "grad_norm": 2.2744014743252934, + "learning_rate": 2.5756983315213748e-06, + "loss": 0.7139, + "step": 6458 + }, + { + "epoch": 0.67, + "grad_norm": 1.6805694624059913, + "learning_rate": 2.5742261666666506e-06, + "loss": 0.6167, + "step": 6459 + }, + { + "epoch": 0.67, + "grad_norm": 1.8025014515127096, + "learning_rate": 2.5727542767872188e-06, + "loss": 0.6049, + "step": 6460 + }, + { + "epoch": 0.67, + "grad_norm": 1.8839136762795483, + "learning_rate": 2.5712826620499227e-06, + "loss": 0.5673, + "step": 6461 + }, + { + "epoch": 0.67, + "grad_norm": 1.997611542367455, + "learning_rate": 2.569811322621584e-06, + "loss": 0.6339, + "step": 6462 + }, + { + "epoch": 0.67, + "grad_norm": 2.107752055939515, + "learning_rate": 2.5683402586689788e-06, + "loss": 0.6669, + "step": 6463 + }, + { + "epoch": 0.67, + "grad_norm": 2.1043518474334606, + "learning_rate": 2.5668694703588683e-06, + "loss": 0.626, + "step": 6464 + }, + { + "epoch": 0.67, + "grad_norm": 1.904909097519436, + "learning_rate": 2.565398957857969e-06, + "loss": 0.5783, + "step": 6465 + }, + { + "epoch": 0.67, + "grad_norm": 1.9161233226944596, + "learning_rate": 2.5639287213329767e-06, + "loss": 0.6308, + "step": 6466 + }, + { + "epoch": 0.67, + "grad_norm": 2.0568730245550317, + "learning_rate": 2.5624587609505475e-06, + "loss": 0.6569, + "step": 6467 + }, + { + "epoch": 0.67, + "grad_norm": 1.8986904541545218, + "learning_rate": 2.5609890768773126e-06, + "loss": 0.5687, + "step": 6468 + }, + { + "epoch": 0.67, + "grad_norm": 1.925080769355097, + "learning_rate": 2.5595196692798664e-06, + "loss": 0.6665, + "step": 6469 + }, + { + "epoch": 0.67, + "grad_norm": 2.1620329086880075, + "learning_rate": 2.5580505383247796e-06, + "loss": 0.6607, + "step": 6470 + }, + { + "epoch": 0.67, + "grad_norm": 1.8802935287691611, + "learning_rate": 2.5565816841785785e-06, + "loss": 0.6097, + "step": 6471 + }, + { + "epoch": 0.67, + "grad_norm": 2.046944025991196, + "learning_rate": 2.555113107007773e-06, + "loss": 0.6806, + "step": 6472 + }, + { + "epoch": 0.67, + "grad_norm": 1.8878402057696602, + "learning_rate": 2.55364480697883e-06, + "loss": 0.6483, + "step": 6473 + }, + { + "epoch": 0.67, + "grad_norm": 1.8425257728641522, + "learning_rate": 2.5521767842581947e-06, + "loss": 0.5859, + "step": 6474 + }, + { + "epoch": 0.67, + "grad_norm": 1.6780569604641604, + "learning_rate": 2.5507090390122704e-06, + "loss": 0.657, + "step": 6475 + }, + { + "epoch": 0.67, + "grad_norm": 1.9087001854008494, + "learning_rate": 2.5492415714074387e-06, + "loss": 0.7312, + "step": 6476 + }, + { + "epoch": 0.67, + "grad_norm": 2.1493828843849827, + "learning_rate": 2.5477743816100443e-06, + "loss": 0.7286, + "step": 6477 + }, + { + "epoch": 0.67, + "grad_norm": 1.8397832863127606, + "learning_rate": 2.5463074697864006e-06, + "loss": 0.5727, + "step": 6478 + }, + { + "epoch": 0.67, + "grad_norm": 1.9416792168693122, + "learning_rate": 2.544840836102789e-06, + "loss": 0.6589, + "step": 6479 + }, + { + "epoch": 0.67, + "grad_norm": 1.7431185986584572, + "learning_rate": 2.543374480725464e-06, + "loss": 0.5923, + "step": 6480 + }, + { + "epoch": 0.67, + "grad_norm": 1.8136659982858228, + "learning_rate": 2.5419084038206422e-06, + "loss": 0.616, + "step": 6481 + }, + { + "epoch": 0.67, + "grad_norm": 1.793225841672104, + "learning_rate": 2.540442605554516e-06, + "loss": 0.6643, + "step": 6482 + }, + { + "epoch": 0.67, + "grad_norm": 1.6887884282795877, + "learning_rate": 2.5389770860932374e-06, + "loss": 0.6283, + "step": 6483 + }, + { + "epoch": 0.67, + "grad_norm": 2.0298876307442724, + "learning_rate": 2.5375118456029345e-06, + "loss": 0.5921, + "step": 6484 + }, + { + "epoch": 0.67, + "grad_norm": 1.861398698981882, + "learning_rate": 2.5360468842497004e-06, + "loss": 0.5887, + "step": 6485 + }, + { + "epoch": 0.67, + "grad_norm": 2.031756505301046, + "learning_rate": 2.5345822021995934e-06, + "loss": 0.7035, + "step": 6486 + }, + { + "epoch": 0.67, + "grad_norm": 1.9440950572916977, + "learning_rate": 2.5331177996186494e-06, + "loss": 0.6454, + "step": 6487 + }, + { + "epoch": 0.67, + "grad_norm": 2.018185652460288, + "learning_rate": 2.5316536766728605e-06, + "loss": 0.6834, + "step": 6488 + }, + { + "epoch": 0.67, + "grad_norm": 1.786345392863452, + "learning_rate": 2.5301898335281994e-06, + "loss": 0.6012, + "step": 6489 + }, + { + "epoch": 0.67, + "grad_norm": 2.094253077479235, + "learning_rate": 2.5287262703505973e-06, + "loss": 0.6962, + "step": 6490 + }, + { + "epoch": 0.67, + "grad_norm": 2.016490719493658, + "learning_rate": 2.5272629873059564e-06, + "loss": 0.7104, + "step": 6491 + }, + { + "epoch": 0.67, + "grad_norm": 1.942400372969221, + "learning_rate": 2.525799984560152e-06, + "loss": 0.6432, + "step": 6492 + }, + { + "epoch": 0.67, + "grad_norm": 1.841903429983575, + "learning_rate": 2.524337262279022e-06, + "loss": 0.621, + "step": 6493 + }, + { + "epoch": 0.68, + "grad_norm": 1.8022659474271276, + "learning_rate": 2.5228748206283716e-06, + "loss": 0.6844, + "step": 6494 + }, + { + "epoch": 0.68, + "grad_norm": 1.9084102317140246, + "learning_rate": 2.521412659773982e-06, + "loss": 0.653, + "step": 6495 + }, + { + "epoch": 0.68, + "grad_norm": 2.003649909793403, + "learning_rate": 2.5199507798815926e-06, + "loss": 0.6809, + "step": 6496 + }, + { + "epoch": 0.68, + "grad_norm": 1.806198288915302, + "learning_rate": 2.5184891811169203e-06, + "loss": 0.513, + "step": 6497 + }, + { + "epoch": 0.68, + "grad_norm": 2.0289321659239388, + "learning_rate": 2.5170278636456413e-06, + "loss": 0.6757, + "step": 6498 + }, + { + "epoch": 0.68, + "grad_norm": 2.1476325520610784, + "learning_rate": 2.515566827633411e-06, + "loss": 0.6057, + "step": 6499 + }, + { + "epoch": 0.68, + "grad_norm": 2.1238370270123315, + "learning_rate": 2.5141060732458366e-06, + "loss": 0.6114, + "step": 6500 + }, + { + "epoch": 0.68, + "grad_norm": 1.8882367928318948, + "learning_rate": 2.512645600648511e-06, + "loss": 0.6919, + "step": 6501 + }, + { + "epoch": 0.68, + "grad_norm": 1.7336732581718404, + "learning_rate": 2.511185410006981e-06, + "loss": 0.6119, + "step": 6502 + }, + { + "epoch": 0.68, + "grad_norm": 1.8563482336968113, + "learning_rate": 2.5097255014867733e-06, + "loss": 0.6606, + "step": 6503 + }, + { + "epoch": 0.68, + "grad_norm": 2.1097536526043412, + "learning_rate": 2.508265875253372e-06, + "loss": 0.6581, + "step": 6504 + }, + { + "epoch": 0.68, + "grad_norm": 2.0103292190475575, + "learning_rate": 2.5068065314722378e-06, + "loss": 0.5797, + "step": 6505 + }, + { + "epoch": 0.68, + "grad_norm": 1.6519741874999505, + "learning_rate": 2.5053474703087943e-06, + "loss": 0.5882, + "step": 6506 + }, + { + "epoch": 0.68, + "grad_norm": 2.0533959837032523, + "learning_rate": 2.5038886919284333e-06, + "loss": 0.7001, + "step": 6507 + }, + { + "epoch": 0.68, + "grad_norm": 1.8805266087401824, + "learning_rate": 2.5024301964965157e-06, + "loss": 0.5486, + "step": 6508 + }, + { + "epoch": 0.68, + "grad_norm": 2.1279785478582482, + "learning_rate": 2.500971984178372e-06, + "loss": 0.6061, + "step": 6509 + }, + { + "epoch": 0.68, + "grad_norm": 1.938630985747579, + "learning_rate": 2.4995140551392965e-06, + "loss": 0.7083, + "step": 6510 + }, + { + "epoch": 0.68, + "grad_norm": 1.8335443599118721, + "learning_rate": 2.4980564095445562e-06, + "loss": 0.5235, + "step": 6511 + }, + { + "epoch": 0.68, + "grad_norm": 2.148717212825636, + "learning_rate": 2.4965990475593814e-06, + "loss": 0.5978, + "step": 6512 + }, + { + "epoch": 0.68, + "grad_norm": 1.824758821910578, + "learning_rate": 2.495141969348975e-06, + "loss": 0.5902, + "step": 6513 + }, + { + "epoch": 0.68, + "grad_norm": 1.7780524543423788, + "learning_rate": 2.493685175078504e-06, + "loss": 0.6284, + "step": 6514 + }, + { + "epoch": 0.68, + "grad_norm": 1.8609798057641382, + "learning_rate": 2.492228664913104e-06, + "loss": 0.6661, + "step": 6515 + }, + { + "epoch": 0.68, + "grad_norm": 1.8370681072012822, + "learning_rate": 2.4907724390178762e-06, + "loss": 0.5885, + "step": 6516 + }, + { + "epoch": 0.68, + "grad_norm": 1.895445022270483, + "learning_rate": 2.489316497557897e-06, + "loss": 0.5775, + "step": 6517 + }, + { + "epoch": 0.68, + "grad_norm": 1.7219155092631773, + "learning_rate": 2.487860840698201e-06, + "loss": 0.6289, + "step": 6518 + }, + { + "epoch": 0.68, + "grad_norm": 2.0460365604603816, + "learning_rate": 2.4864054686037993e-06, + "loss": 0.6826, + "step": 6519 + }, + { + "epoch": 0.68, + "grad_norm": 1.7147425195038142, + "learning_rate": 2.4849503814396624e-06, + "loss": 0.5136, + "step": 6520 + }, + { + "epoch": 0.68, + "grad_norm": 1.6858038167010478, + "learning_rate": 2.4834955793707376e-06, + "loss": 0.6515, + "step": 6521 + }, + { + "epoch": 0.68, + "grad_norm": 2.021509740545536, + "learning_rate": 2.4820410625619325e-06, + "loss": 0.5946, + "step": 6522 + }, + { + "epoch": 0.68, + "grad_norm": 2.044099212949625, + "learning_rate": 2.4805868311781228e-06, + "loss": 0.6226, + "step": 6523 + }, + { + "epoch": 0.68, + "grad_norm": 2.0554872349030506, + "learning_rate": 2.4791328853841577e-06, + "loss": 0.6877, + "step": 6524 + }, + { + "epoch": 0.68, + "grad_norm": 1.7756247535084098, + "learning_rate": 2.4776792253448465e-06, + "loss": 0.6656, + "step": 6525 + }, + { + "epoch": 0.68, + "grad_norm": 1.9002880458700975, + "learning_rate": 2.4762258512249745e-06, + "loss": 0.721, + "step": 6526 + }, + { + "epoch": 0.68, + "grad_norm": 1.7127361731051247, + "learning_rate": 2.4747727631892847e-06, + "loss": 0.5513, + "step": 6527 + }, + { + "epoch": 0.68, + "grad_norm": 1.804770412332777, + "learning_rate": 2.4733199614024978e-06, + "loss": 0.5056, + "step": 6528 + }, + { + "epoch": 0.68, + "grad_norm": 1.9003328353324378, + "learning_rate": 2.4718674460292945e-06, + "loss": 0.5586, + "step": 6529 + }, + { + "epoch": 0.68, + "grad_norm": 1.8474351176816664, + "learning_rate": 2.470415217234326e-06, + "loss": 0.5364, + "step": 6530 + }, + { + "epoch": 0.68, + "grad_norm": 1.8999533795354315, + "learning_rate": 2.468963275182209e-06, + "loss": 0.6509, + "step": 6531 + }, + { + "epoch": 0.68, + "grad_norm": 1.7821564463248776, + "learning_rate": 2.467511620037533e-06, + "loss": 0.6039, + "step": 6532 + }, + { + "epoch": 0.68, + "grad_norm": 1.815283031049219, + "learning_rate": 2.466060251964848e-06, + "loss": 0.5468, + "step": 6533 + }, + { + "epoch": 0.68, + "grad_norm": 1.7976932655952507, + "learning_rate": 2.4646091711286783e-06, + "loss": 0.6342, + "step": 6534 + }, + { + "epoch": 0.68, + "grad_norm": 2.1496032676946073, + "learning_rate": 2.4631583776935087e-06, + "loss": 0.6687, + "step": 6535 + }, + { + "epoch": 0.68, + "grad_norm": 1.9563009201475414, + "learning_rate": 2.4617078718237996e-06, + "loss": 0.5885, + "step": 6536 + }, + { + "epoch": 0.68, + "grad_norm": 1.9025425900601731, + "learning_rate": 2.4602576536839672e-06, + "loss": 0.7126, + "step": 6537 + }, + { + "epoch": 0.68, + "grad_norm": 2.2229623466585378, + "learning_rate": 2.4588077234384084e-06, + "loss": 0.6516, + "step": 6538 + }, + { + "epoch": 0.68, + "grad_norm": 2.078758706067484, + "learning_rate": 2.457358081251476e-06, + "loss": 0.6454, + "step": 6539 + }, + { + "epoch": 0.68, + "grad_norm": 2.206863288113695, + "learning_rate": 2.4559087272875e-06, + "loss": 0.5985, + "step": 6540 + }, + { + "epoch": 0.68, + "grad_norm": 1.7974189151535742, + "learning_rate": 2.454459661710768e-06, + "loss": 0.6699, + "step": 6541 + }, + { + "epoch": 0.68, + "grad_norm": 2.183890569895723, + "learning_rate": 2.453010884685545e-06, + "loss": 0.6428, + "step": 6542 + }, + { + "epoch": 0.68, + "grad_norm": 2.1368283835643664, + "learning_rate": 2.451562396376055e-06, + "loss": 0.6995, + "step": 6543 + }, + { + "epoch": 0.68, + "grad_norm": 1.8922048559177875, + "learning_rate": 2.4501141969464936e-06, + "loss": 0.7507, + "step": 6544 + }, + { + "epoch": 0.68, + "grad_norm": 2.044874405618413, + "learning_rate": 2.4486662865610194e-06, + "loss": 0.682, + "step": 6545 + }, + { + "epoch": 0.68, + "grad_norm": 1.903529162451403, + "learning_rate": 2.447218665383766e-06, + "loss": 0.5761, + "step": 6546 + }, + { + "epoch": 0.68, + "grad_norm": 1.7651894250163067, + "learning_rate": 2.445771333578825e-06, + "loss": 0.5569, + "step": 6547 + }, + { + "epoch": 0.68, + "grad_norm": 1.930097576630646, + "learning_rate": 2.4443242913102645e-06, + "loss": 0.6176, + "step": 6548 + }, + { + "epoch": 0.68, + "grad_norm": 1.9430342493397708, + "learning_rate": 2.44287753874211e-06, + "loss": 0.5452, + "step": 6549 + }, + { + "epoch": 0.68, + "grad_norm": 1.7755635352299155, + "learning_rate": 2.4414310760383635e-06, + "loss": 0.5851, + "step": 6550 + }, + { + "epoch": 0.68, + "grad_norm": 1.8019542492761174, + "learning_rate": 2.439984903362988e-06, + "loss": 0.5942, + "step": 6551 + }, + { + "epoch": 0.68, + "grad_norm": 1.8952673414148868, + "learning_rate": 2.4385390208799153e-06, + "loss": 0.5434, + "step": 6552 + }, + { + "epoch": 0.68, + "grad_norm": 1.6146056407936027, + "learning_rate": 2.437093428753042e-06, + "loss": 0.53, + "step": 6553 + }, + { + "epoch": 0.68, + "grad_norm": 2.0490607931816394, + "learning_rate": 2.4356481271462396e-06, + "loss": 0.6576, + "step": 6554 + }, + { + "epoch": 0.68, + "grad_norm": 2.0066381095536445, + "learning_rate": 2.434203116223336e-06, + "loss": 0.6539, + "step": 6555 + }, + { + "epoch": 0.68, + "grad_norm": 1.9100614870915273, + "learning_rate": 2.4327583961481356e-06, + "loss": 0.6054, + "step": 6556 + }, + { + "epoch": 0.68, + "grad_norm": 1.9574341823528867, + "learning_rate": 2.4313139670844016e-06, + "loss": 0.618, + "step": 6557 + }, + { + "epoch": 0.68, + "grad_norm": 1.8779301828805786, + "learning_rate": 2.429869829195872e-06, + "loss": 0.5621, + "step": 6558 + }, + { + "epoch": 0.68, + "grad_norm": 2.06963245031177, + "learning_rate": 2.4284259826462475e-06, + "loss": 0.6054, + "step": 6559 + }, + { + "epoch": 0.68, + "grad_norm": 2.074995237798964, + "learning_rate": 2.4269824275991925e-06, + "loss": 0.539, + "step": 6560 + }, + { + "epoch": 0.68, + "grad_norm": 1.8641536750051724, + "learning_rate": 2.425539164218348e-06, + "loss": 0.5851, + "step": 6561 + }, + { + "epoch": 0.68, + "grad_norm": 1.7855445771394514, + "learning_rate": 2.4240961926673107e-06, + "loss": 0.6371, + "step": 6562 + }, + { + "epoch": 0.68, + "grad_norm": 2.033972613929812, + "learning_rate": 2.422653513109654e-06, + "loss": 0.6142, + "step": 6563 + }, + { + "epoch": 0.68, + "grad_norm": 2.1191051352226, + "learning_rate": 2.42121112570891e-06, + "loss": 0.5777, + "step": 6564 + }, + { + "epoch": 0.68, + "grad_norm": 1.8636220501849985, + "learning_rate": 2.4197690306285855e-06, + "loss": 0.6975, + "step": 6565 + }, + { + "epoch": 0.68, + "grad_norm": 2.255684862296978, + "learning_rate": 2.4183272280321477e-06, + "loss": 0.6607, + "step": 6566 + }, + { + "epoch": 0.68, + "grad_norm": 1.9075617579966504, + "learning_rate": 2.416885718083035e-06, + "loss": 0.5841, + "step": 6567 + }, + { + "epoch": 0.68, + "grad_norm": 1.9785565108430303, + "learning_rate": 2.4154445009446457e-06, + "loss": 0.6007, + "step": 6568 + }, + { + "epoch": 0.68, + "grad_norm": 2.046547103393187, + "learning_rate": 2.414003576780357e-06, + "loss": 0.6176, + "step": 6569 + }, + { + "epoch": 0.68, + "grad_norm": 2.021562045047473, + "learning_rate": 2.4125629457535003e-06, + "loss": 0.6506, + "step": 6570 + }, + { + "epoch": 0.68, + "grad_norm": 1.9063754184105355, + "learning_rate": 2.4111226080273832e-06, + "loss": 0.6206, + "step": 6571 + }, + { + "epoch": 0.68, + "grad_norm": 1.7439814488302037, + "learning_rate": 2.409682563765273e-06, + "loss": 0.6125, + "step": 6572 + }, + { + "epoch": 0.68, + "grad_norm": 1.7171532890783108, + "learning_rate": 2.408242813130412e-06, + "loss": 0.5728, + "step": 6573 + }, + { + "epoch": 0.68, + "grad_norm": 1.9647678953123011, + "learning_rate": 2.406803356285997e-06, + "loss": 0.661, + "step": 6574 + }, + { + "epoch": 0.68, + "grad_norm": 1.76511392894678, + "learning_rate": 2.4053641933952043e-06, + "loss": 0.4648, + "step": 6575 + }, + { + "epoch": 0.68, + "grad_norm": 1.9107782987664588, + "learning_rate": 2.4039253246211673e-06, + "loss": 0.5542, + "step": 6576 + }, + { + "epoch": 0.68, + "grad_norm": 1.912398300444987, + "learning_rate": 2.402486750126994e-06, + "loss": 0.6348, + "step": 6577 + }, + { + "epoch": 0.68, + "grad_norm": 1.7869081393535953, + "learning_rate": 2.401048470075751e-06, + "loss": 0.5759, + "step": 6578 + }, + { + "epoch": 0.68, + "grad_norm": 1.8508351567029224, + "learning_rate": 2.399610484630479e-06, + "loss": 0.6337, + "step": 6579 + }, + { + "epoch": 0.68, + "grad_norm": 1.9020644166463498, + "learning_rate": 2.3981727939541806e-06, + "loss": 0.5074, + "step": 6580 + }, + { + "epoch": 0.68, + "grad_norm": 1.8625920857309566, + "learning_rate": 2.396735398209825e-06, + "loss": 0.5701, + "step": 6581 + }, + { + "epoch": 0.68, + "grad_norm": 1.8707396576414643, + "learning_rate": 2.3952982975603494e-06, + "loss": 0.6144, + "step": 6582 + }, + { + "epoch": 0.68, + "grad_norm": 1.7674720029464268, + "learning_rate": 2.3938614921686592e-06, + "loss": 0.4661, + "step": 6583 + }, + { + "epoch": 0.68, + "grad_norm": 2.1866573593617296, + "learning_rate": 2.392424982197622e-06, + "loss": 0.7057, + "step": 6584 + }, + { + "epoch": 0.68, + "grad_norm": 1.5827145809561256, + "learning_rate": 2.3909887678100774e-06, + "loss": 0.494, + "step": 6585 + }, + { + "epoch": 0.68, + "grad_norm": 1.9495572070111211, + "learning_rate": 2.3895528491688246e-06, + "loss": 0.6217, + "step": 6586 + }, + { + "epoch": 0.68, + "grad_norm": 1.840105029036223, + "learning_rate": 2.388117226436638e-06, + "loss": 0.6843, + "step": 6587 + }, + { + "epoch": 0.68, + "grad_norm": 2.1018687611008997, + "learning_rate": 2.3866818997762507e-06, + "loss": 0.6491, + "step": 6588 + }, + { + "epoch": 0.68, + "grad_norm": 1.9982661992960102, + "learning_rate": 2.3852468693503635e-06, + "loss": 0.7238, + "step": 6589 + }, + { + "epoch": 0.69, + "grad_norm": 1.8010975053442853, + "learning_rate": 2.3838121353216494e-06, + "loss": 0.6081, + "step": 6590 + }, + { + "epoch": 0.69, + "grad_norm": 2.0237199118531266, + "learning_rate": 2.3823776978527412e-06, + "loss": 0.6033, + "step": 6591 + }, + { + "epoch": 0.69, + "grad_norm": 1.8954053993498832, + "learning_rate": 2.380943557106239e-06, + "loss": 0.6438, + "step": 6592 + }, + { + "epoch": 0.69, + "grad_norm": 1.8578242402001044, + "learning_rate": 2.379509713244715e-06, + "loss": 0.6441, + "step": 6593 + }, + { + "epoch": 0.69, + "grad_norm": 1.913753592633508, + "learning_rate": 2.3780761664306988e-06, + "loss": 0.5426, + "step": 6594 + }, + { + "epoch": 0.69, + "grad_norm": 1.8191071740448894, + "learning_rate": 2.3766429168266958e-06, + "loss": 0.5654, + "step": 6595 + }, + { + "epoch": 0.69, + "grad_norm": 1.775553219293777, + "learning_rate": 2.375209964595171e-06, + "loss": 0.601, + "step": 6596 + }, + { + "epoch": 0.69, + "grad_norm": 2.0852233066631856, + "learning_rate": 2.3737773098985556e-06, + "loss": 0.5894, + "step": 6597 + }, + { + "epoch": 0.69, + "grad_norm": 1.811993129732441, + "learning_rate": 2.3723449528992527e-06, + "loss": 0.5774, + "step": 6598 + }, + { + "epoch": 0.69, + "grad_norm": 2.3368875265028968, + "learning_rate": 2.3709128937596248e-06, + "loss": 0.6608, + "step": 6599 + }, + { + "epoch": 0.69, + "grad_norm": 2.145791013358563, + "learning_rate": 2.3694811326420074e-06, + "loss": 0.6564, + "step": 6600 + }, + { + "epoch": 0.69, + "grad_norm": 2.0766083446244052, + "learning_rate": 2.3680496697086956e-06, + "loss": 0.6666, + "step": 6601 + }, + { + "epoch": 0.69, + "grad_norm": 1.9026813857247935, + "learning_rate": 2.366618505121957e-06, + "loss": 0.6696, + "step": 6602 + }, + { + "epoch": 0.69, + "grad_norm": 1.7019084860970937, + "learning_rate": 2.365187639044021e-06, + "loss": 0.574, + "step": 6603 + }, + { + "epoch": 0.69, + "grad_norm": 2.0689299845262132, + "learning_rate": 2.3637570716370835e-06, + "loss": 0.6265, + "step": 6604 + }, + { + "epoch": 0.69, + "grad_norm": 2.0029581171015454, + "learning_rate": 2.362326803063306e-06, + "loss": 0.5978, + "step": 6605 + }, + { + "epoch": 0.69, + "grad_norm": 1.7689417076989542, + "learning_rate": 2.360896833484822e-06, + "loss": 0.5353, + "step": 6606 + }, + { + "epoch": 0.69, + "grad_norm": 2.0905140765067185, + "learning_rate": 2.3594671630637223e-06, + "loss": 0.6662, + "step": 6607 + }, + { + "epoch": 0.69, + "grad_norm": 1.768719947566444, + "learning_rate": 2.3580377919620716e-06, + "loss": 0.6265, + "step": 6608 + }, + { + "epoch": 0.69, + "grad_norm": 2.1019145030463755, + "learning_rate": 2.3566087203418946e-06, + "loss": 0.6763, + "step": 6609 + }, + { + "epoch": 0.69, + "grad_norm": 1.8952776750300382, + "learning_rate": 2.3551799483651894e-06, + "loss": 0.5889, + "step": 6610 + }, + { + "epoch": 0.69, + "grad_norm": 2.1165276188915803, + "learning_rate": 2.3537514761939083e-06, + "loss": 0.7131, + "step": 6611 + }, + { + "epoch": 0.69, + "grad_norm": 1.8468004323684801, + "learning_rate": 2.3523233039899827e-06, + "loss": 0.6277, + "step": 6612 + }, + { + "epoch": 0.69, + "grad_norm": 1.9635660498133092, + "learning_rate": 2.3508954319153e-06, + "loss": 0.577, + "step": 6613 + }, + { + "epoch": 0.69, + "grad_norm": 2.173612272307502, + "learning_rate": 2.3494678601317204e-06, + "loss": 0.5449, + "step": 6614 + }, + { + "epoch": 0.69, + "grad_norm": 2.09870439358293, + "learning_rate": 2.3480405888010654e-06, + "loss": 0.5512, + "step": 6615 + }, + { + "epoch": 0.69, + "grad_norm": 1.9973381551511349, + "learning_rate": 2.3466136180851274e-06, + "loss": 0.5502, + "step": 6616 + }, + { + "epoch": 0.69, + "grad_norm": 2.0266125822500776, + "learning_rate": 2.345186948145659e-06, + "loss": 0.5873, + "step": 6617 + }, + { + "epoch": 0.69, + "grad_norm": 1.7407141658103176, + "learning_rate": 2.343760579144382e-06, + "loss": 0.6076, + "step": 6618 + }, + { + "epoch": 0.69, + "grad_norm": 2.1528326826833104, + "learning_rate": 2.342334511242982e-06, + "loss": 0.6227, + "step": 6619 + }, + { + "epoch": 0.69, + "grad_norm": 1.877769142307935, + "learning_rate": 2.3409087446031144e-06, + "loss": 0.5574, + "step": 6620 + }, + { + "epoch": 0.69, + "grad_norm": 1.970392303151823, + "learning_rate": 2.3394832793863955e-06, + "loss": 0.5651, + "step": 6621 + }, + { + "epoch": 0.69, + "grad_norm": 1.896484797097487, + "learning_rate": 2.338058115754413e-06, + "loss": 0.6181, + "step": 6622 + }, + { + "epoch": 0.69, + "grad_norm": 1.8879614504411355, + "learning_rate": 2.336633253868714e-06, + "loss": 0.5099, + "step": 6623 + }, + { + "epoch": 0.69, + "grad_norm": 1.9349750897671085, + "learning_rate": 2.335208693890819e-06, + "loss": 0.6151, + "step": 6624 + }, + { + "epoch": 0.69, + "grad_norm": 1.9611926446015844, + "learning_rate": 2.333784435982206e-06, + "loss": 0.5171, + "step": 6625 + }, + { + "epoch": 0.69, + "grad_norm": 2.0197304863197316, + "learning_rate": 2.3323604803043225e-06, + "loss": 0.6571, + "step": 6626 + }, + { + "epoch": 0.69, + "grad_norm": 1.9238997930011825, + "learning_rate": 2.3309368270185863e-06, + "loss": 0.6127, + "step": 6627 + }, + { + "epoch": 0.69, + "grad_norm": 1.9924563648365212, + "learning_rate": 2.3295134762863713e-06, + "loss": 0.6, + "step": 6628 + }, + { + "epoch": 0.69, + "grad_norm": 2.1233139368564826, + "learning_rate": 2.3280904282690268e-06, + "loss": 0.6467, + "step": 6629 + }, + { + "epoch": 0.69, + "grad_norm": 1.6964172462449443, + "learning_rate": 2.3266676831278625e-06, + "loss": 0.5363, + "step": 6630 + }, + { + "epoch": 0.69, + "grad_norm": 2.111431211506792, + "learning_rate": 2.325245241024151e-06, + "loss": 0.5895, + "step": 6631 + }, + { + "epoch": 0.69, + "grad_norm": 1.8468775750315043, + "learning_rate": 2.3238231021191392e-06, + "loss": 0.5522, + "step": 6632 + }, + { + "epoch": 0.69, + "grad_norm": 1.865844535939304, + "learning_rate": 2.3224012665740327e-06, + "loss": 0.5739, + "step": 6633 + }, + { + "epoch": 0.69, + "grad_norm": 2.0846033087576568, + "learning_rate": 2.3209797345500025e-06, + "loss": 0.7064, + "step": 6634 + }, + { + "epoch": 0.69, + "grad_norm": 1.9161200839699233, + "learning_rate": 2.3195585062081904e-06, + "loss": 0.6521, + "step": 6635 + }, + { + "epoch": 0.69, + "grad_norm": 2.0009721784284826, + "learning_rate": 2.3181375817096986e-06, + "loss": 0.6137, + "step": 6636 + }, + { + "epoch": 0.69, + "grad_norm": 1.9519332289640217, + "learning_rate": 2.3167169612155997e-06, + "loss": 0.5762, + "step": 6637 + }, + { + "epoch": 0.69, + "grad_norm": 1.837292165340319, + "learning_rate": 2.315296644886926e-06, + "loss": 0.6546, + "step": 6638 + }, + { + "epoch": 0.69, + "grad_norm": 1.9042025982700623, + "learning_rate": 2.313876632884683e-06, + "loss": 0.6275, + "step": 6639 + }, + { + "epoch": 0.69, + "grad_norm": 1.8454885332773772, + "learning_rate": 2.3124569253698305e-06, + "loss": 0.6006, + "step": 6640 + }, + { + "epoch": 0.69, + "grad_norm": 1.7636093008937368, + "learning_rate": 2.3110375225033056e-06, + "loss": 0.5944, + "step": 6641 + }, + { + "epoch": 0.69, + "grad_norm": 1.7390161950750196, + "learning_rate": 2.3096184244460025e-06, + "loss": 0.6379, + "step": 6642 + }, + { + "epoch": 0.69, + "grad_norm": 2.069866503330104, + "learning_rate": 2.3081996313587873e-06, + "loss": 0.6273, + "step": 6643 + }, + { + "epoch": 0.69, + "grad_norm": 1.752133432566502, + "learning_rate": 2.306781143402485e-06, + "loss": 0.5812, + "step": 6644 + }, + { + "epoch": 0.69, + "grad_norm": 1.8767393290787764, + "learning_rate": 2.305362960737893e-06, + "loss": 0.563, + "step": 6645 + }, + { + "epoch": 0.69, + "grad_norm": 2.088228977799822, + "learning_rate": 2.3039450835257663e-06, + "loss": 0.6123, + "step": 6646 + }, + { + "epoch": 0.69, + "grad_norm": 1.8447026144205156, + "learning_rate": 2.3025275119268352e-06, + "loss": 0.5872, + "step": 6647 + }, + { + "epoch": 0.69, + "grad_norm": 1.9480786651098552, + "learning_rate": 2.3011102461017816e-06, + "loss": 0.6293, + "step": 6648 + }, + { + "epoch": 0.69, + "grad_norm": 1.9232095483162872, + "learning_rate": 2.299693286211267e-06, + "loss": 0.7181, + "step": 6649 + }, + { + "epoch": 0.69, + "grad_norm": 1.9315203503161686, + "learning_rate": 2.298276632415908e-06, + "loss": 0.6642, + "step": 6650 + }, + { + "epoch": 0.69, + "grad_norm": 2.075640982086266, + "learning_rate": 2.296860284876293e-06, + "loss": 0.6924, + "step": 6651 + }, + { + "epoch": 0.69, + "grad_norm": 1.9253684895073375, + "learning_rate": 2.2954442437529705e-06, + "loss": 0.5573, + "step": 6652 + }, + { + "epoch": 0.69, + "grad_norm": 1.9033756169578906, + "learning_rate": 2.294028509206461e-06, + "loss": 0.5732, + "step": 6653 + }, + { + "epoch": 0.69, + "grad_norm": 2.097629789333268, + "learning_rate": 2.292613081397243e-06, + "loss": 0.7167, + "step": 6654 + }, + { + "epoch": 0.69, + "grad_norm": 2.047506195550759, + "learning_rate": 2.2911979604857636e-06, + "loss": 0.5737, + "step": 6655 + }, + { + "epoch": 0.69, + "grad_norm": 2.1135148834024093, + "learning_rate": 2.289783146632434e-06, + "loss": 0.6373, + "step": 6656 + }, + { + "epoch": 0.69, + "grad_norm": 2.153401373307362, + "learning_rate": 2.2883686399976335e-06, + "loss": 0.6147, + "step": 6657 + }, + { + "epoch": 0.69, + "grad_norm": 1.8547858747646147, + "learning_rate": 2.2869544407417016e-06, + "loss": 0.6559, + "step": 6658 + }, + { + "epoch": 0.69, + "grad_norm": 1.871073109375712, + "learning_rate": 2.2855405490249498e-06, + "loss": 0.6087, + "step": 6659 + }, + { + "epoch": 0.69, + "grad_norm": 1.9986690467749073, + "learning_rate": 2.2841269650076468e-06, + "loss": 0.6595, + "step": 6660 + }, + { + "epoch": 0.69, + "grad_norm": 1.9087678195779618, + "learning_rate": 2.282713688850034e-06, + "loss": 0.5527, + "step": 6661 + }, + { + "epoch": 0.69, + "grad_norm": 2.014305188228896, + "learning_rate": 2.281300720712313e-06, + "loss": 0.7236, + "step": 6662 + }, + { + "epoch": 0.69, + "grad_norm": 1.9398579036796748, + "learning_rate": 2.2798880607546486e-06, + "loss": 0.5963, + "step": 6663 + }, + { + "epoch": 0.69, + "grad_norm": 2.3211520585069536, + "learning_rate": 2.2784757091371797e-06, + "loss": 0.5343, + "step": 6664 + }, + { + "epoch": 0.69, + "grad_norm": 2.0825157206727343, + "learning_rate": 2.2770636660199983e-06, + "loss": 0.6533, + "step": 6665 + }, + { + "epoch": 0.69, + "grad_norm": 1.8372912903103717, + "learning_rate": 2.275651931563173e-06, + "loss": 0.7069, + "step": 6666 + }, + { + "epoch": 0.69, + "grad_norm": 1.8541732518907224, + "learning_rate": 2.274240505926728e-06, + "loss": 0.6389, + "step": 6667 + }, + { + "epoch": 0.69, + "grad_norm": 2.049805210637795, + "learning_rate": 2.2728293892706595e-06, + "loss": 0.6054, + "step": 6668 + }, + { + "epoch": 0.69, + "grad_norm": 2.13594935059443, + "learning_rate": 2.271418581754924e-06, + "loss": 0.6606, + "step": 6669 + }, + { + "epoch": 0.69, + "grad_norm": 2.0250909170956506, + "learning_rate": 2.2700080835394444e-06, + "loss": 0.6581, + "step": 6670 + }, + { + "epoch": 0.69, + "grad_norm": 2.056761095650911, + "learning_rate": 2.2685978947841077e-06, + "loss": 0.6361, + "step": 6671 + }, + { + "epoch": 0.69, + "grad_norm": 2.265738878544974, + "learning_rate": 2.2671880156487695e-06, + "loss": 0.7253, + "step": 6672 + }, + { + "epoch": 0.69, + "grad_norm": 1.8661634959294517, + "learning_rate": 2.265778446293245e-06, + "loss": 0.6385, + "step": 6673 + }, + { + "epoch": 0.69, + "grad_norm": 1.8345999506199009, + "learning_rate": 2.264369186877319e-06, + "loss": 0.6667, + "step": 6674 + }, + { + "epoch": 0.69, + "grad_norm": 2.1188435197091833, + "learning_rate": 2.2629602375607373e-06, + "loss": 0.6107, + "step": 6675 + }, + { + "epoch": 0.69, + "grad_norm": 1.766286234357666, + "learning_rate": 2.2615515985032164e-06, + "loss": 0.5546, + "step": 6676 + }, + { + "epoch": 0.69, + "grad_norm": 1.9151920493197223, + "learning_rate": 2.260143269864427e-06, + "loss": 0.6048, + "step": 6677 + }, + { + "epoch": 0.69, + "grad_norm": 1.7001073092306225, + "learning_rate": 2.258735251804017e-06, + "loss": 0.4986, + "step": 6678 + }, + { + "epoch": 0.69, + "grad_norm": 1.995324748961039, + "learning_rate": 2.2573275444815886e-06, + "loss": 0.6454, + "step": 6679 + }, + { + "epoch": 0.69, + "grad_norm": 2.121005092218295, + "learning_rate": 2.255920148056717e-06, + "loss": 0.7024, + "step": 6680 + }, + { + "epoch": 0.69, + "grad_norm": 2.1720784642317454, + "learning_rate": 2.2545130626889363e-06, + "loss": 0.5723, + "step": 6681 + }, + { + "epoch": 0.69, + "grad_norm": 2.0099728903305523, + "learning_rate": 2.25310628853775e-06, + "loss": 0.6592, + "step": 6682 + }, + { + "epoch": 0.69, + "grad_norm": 2.1435430270674973, + "learning_rate": 2.251699825762621e-06, + "loss": 0.5571, + "step": 6683 + }, + { + "epoch": 0.69, + "grad_norm": 1.9626458440348495, + "learning_rate": 2.2502936745229852e-06, + "loss": 0.6454, + "step": 6684 + }, + { + "epoch": 0.69, + "grad_norm": 1.7483370034305972, + "learning_rate": 2.2488878349782306e-06, + "loss": 0.5895, + "step": 6685 + }, + { + "epoch": 0.7, + "grad_norm": 1.9414199777315755, + "learning_rate": 2.2474823072877226e-06, + "loss": 0.6565, + "step": 6686 + }, + { + "epoch": 0.7, + "grad_norm": 1.9913410893273478, + "learning_rate": 2.2460770916107823e-06, + "loss": 0.6351, + "step": 6687 + }, + { + "epoch": 0.7, + "grad_norm": 2.04831872577914, + "learning_rate": 2.244672188106702e-06, + "loss": 0.5214, + "step": 6688 + }, + { + "epoch": 0.7, + "grad_norm": 2.3021743565866615, + "learning_rate": 2.243267596934732e-06, + "loss": 0.6176, + "step": 6689 + }, + { + "epoch": 0.7, + "grad_norm": 1.9373025833593858, + "learning_rate": 2.241863318254095e-06, + "loss": 0.6219, + "step": 6690 + }, + { + "epoch": 0.7, + "grad_norm": 1.8598534415891308, + "learning_rate": 2.2404593522239715e-06, + "loss": 0.6604, + "step": 6691 + }, + { + "epoch": 0.7, + "grad_norm": 1.6833312249023875, + "learning_rate": 2.239055699003509e-06, + "loss": 0.5226, + "step": 6692 + }, + { + "epoch": 0.7, + "grad_norm": 1.963731738827018, + "learning_rate": 2.2376523587518184e-06, + "loss": 0.6036, + "step": 6693 + }, + { + "epoch": 0.7, + "grad_norm": 2.017374029375341, + "learning_rate": 2.23624933162798e-06, + "loss": 0.7601, + "step": 6694 + }, + { + "epoch": 0.7, + "grad_norm": 1.8393505053809505, + "learning_rate": 2.23484661779103e-06, + "loss": 0.602, + "step": 6695 + }, + { + "epoch": 0.7, + "grad_norm": 2.012100195733019, + "learning_rate": 2.2334442173999794e-06, + "loss": 0.7229, + "step": 6696 + }, + { + "epoch": 0.7, + "grad_norm": 1.829512485563068, + "learning_rate": 2.232042130613793e-06, + "loss": 0.6148, + "step": 6697 + }, + { + "epoch": 0.7, + "grad_norm": 1.662329440268267, + "learning_rate": 2.2306403575914103e-06, + "loss": 0.6035, + "step": 6698 + }, + { + "epoch": 0.7, + "grad_norm": 1.926230971491137, + "learning_rate": 2.229238898491728e-06, + "loss": 0.5508, + "step": 6699 + }, + { + "epoch": 0.7, + "grad_norm": 1.9284111002937832, + "learning_rate": 2.2278377534736067e-06, + "loss": 0.652, + "step": 6700 + }, + { + "epoch": 0.7, + "grad_norm": 2.023252463481603, + "learning_rate": 2.2264369226958794e-06, + "loss": 0.5936, + "step": 6701 + }, + { + "epoch": 0.7, + "grad_norm": 2.0858121565124996, + "learning_rate": 2.225036406317334e-06, + "loss": 0.687, + "step": 6702 + }, + { + "epoch": 0.7, + "grad_norm": 2.0405317157720906, + "learning_rate": 2.2236362044967304e-06, + "loss": 0.6622, + "step": 6703 + }, + { + "epoch": 0.7, + "grad_norm": 1.8435761932262933, + "learning_rate": 2.2222363173927853e-06, + "loss": 0.6013, + "step": 6704 + }, + { + "epoch": 0.7, + "grad_norm": 2.0622278757264785, + "learning_rate": 2.2208367451641886e-06, + "loss": 0.5633, + "step": 6705 + }, + { + "epoch": 0.7, + "grad_norm": 1.8044115147358368, + "learning_rate": 2.219437487969588e-06, + "loss": 0.5886, + "step": 6706 + }, + { + "epoch": 0.7, + "grad_norm": 1.869590527714706, + "learning_rate": 2.2180385459675964e-06, + "loss": 0.598, + "step": 6707 + }, + { + "epoch": 0.7, + "grad_norm": 1.991125138912976, + "learning_rate": 2.2166399193167905e-06, + "loss": 0.6672, + "step": 6708 + }, + { + "epoch": 0.7, + "grad_norm": 2.078202290454276, + "learning_rate": 2.2152416081757154e-06, + "loss": 0.5946, + "step": 6709 + }, + { + "epoch": 0.7, + "grad_norm": 1.9841789691260552, + "learning_rate": 2.213843612702876e-06, + "loss": 0.624, + "step": 6710 + }, + { + "epoch": 0.7, + "grad_norm": 1.9292348217406463, + "learning_rate": 2.212445933056745e-06, + "loss": 0.5677, + "step": 6711 + }, + { + "epoch": 0.7, + "grad_norm": 1.7298685570499033, + "learning_rate": 2.211048569395754e-06, + "loss": 0.6022, + "step": 6712 + }, + { + "epoch": 0.7, + "grad_norm": 1.9413989348010925, + "learning_rate": 2.2096515218783084e-06, + "loss": 0.5198, + "step": 6713 + }, + { + "epoch": 0.7, + "grad_norm": 2.106895321722421, + "learning_rate": 2.208254790662763e-06, + "loss": 0.6963, + "step": 6714 + }, + { + "epoch": 0.7, + "grad_norm": 1.899806937788191, + "learning_rate": 2.2068583759074513e-06, + "loss": 0.6799, + "step": 6715 + }, + { + "epoch": 0.7, + "grad_norm": 1.919074168896033, + "learning_rate": 2.2054622777706612e-06, + "loss": 0.6711, + "step": 6716 + }, + { + "epoch": 0.7, + "grad_norm": 1.8067940485711798, + "learning_rate": 2.204066496410653e-06, + "loss": 0.5445, + "step": 6717 + }, + { + "epoch": 0.7, + "grad_norm": 1.9616475474941888, + "learning_rate": 2.2026710319856407e-06, + "loss": 0.6405, + "step": 6718 + }, + { + "epoch": 0.7, + "grad_norm": 2.1720562157049415, + "learning_rate": 2.2012758846538135e-06, + "loss": 0.5855, + "step": 6719 + }, + { + "epoch": 0.7, + "grad_norm": 1.9905847516723316, + "learning_rate": 2.199881054573315e-06, + "loss": 0.6609, + "step": 6720 + }, + { + "epoch": 0.7, + "grad_norm": 1.8413328911147004, + "learning_rate": 2.1984865419022633e-06, + "loss": 0.515, + "step": 6721 + }, + { + "epoch": 0.7, + "grad_norm": 1.7123430797565387, + "learning_rate": 2.197092346798726e-06, + "loss": 0.5539, + "step": 6722 + }, + { + "epoch": 0.7, + "grad_norm": 1.809983657480813, + "learning_rate": 2.1956984694207495e-06, + "loss": 0.5613, + "step": 6723 + }, + { + "epoch": 0.7, + "grad_norm": 1.9263268915106502, + "learning_rate": 2.1943049099263333e-06, + "loss": 0.6399, + "step": 6724 + }, + { + "epoch": 0.7, + "grad_norm": 2.005700997387467, + "learning_rate": 2.1929116684734493e-06, + "loss": 0.746, + "step": 6725 + }, + { + "epoch": 0.7, + "grad_norm": 2.057704807111089, + "learning_rate": 2.1915187452200255e-06, + "loss": 0.5946, + "step": 6726 + }, + { + "epoch": 0.7, + "grad_norm": 1.9196933178433027, + "learning_rate": 2.190126140323962e-06, + "loss": 0.601, + "step": 6727 + }, + { + "epoch": 0.7, + "grad_norm": 2.10808635939514, + "learning_rate": 2.188733853943116e-06, + "loss": 0.683, + "step": 6728 + }, + { + "epoch": 0.7, + "grad_norm": 1.9521402765230733, + "learning_rate": 2.1873418862353095e-06, + "loss": 0.5989, + "step": 6729 + }, + { + "epoch": 0.7, + "grad_norm": 1.9980956231048448, + "learning_rate": 2.1859502373583336e-06, + "loss": 0.6752, + "step": 6730 + }, + { + "epoch": 0.7, + "grad_norm": 1.9240445744428603, + "learning_rate": 2.184558907469938e-06, + "loss": 0.739, + "step": 6731 + }, + { + "epoch": 0.7, + "grad_norm": 1.7796759312492878, + "learning_rate": 2.1831678967278356e-06, + "loss": 0.6363, + "step": 6732 + }, + { + "epoch": 0.7, + "grad_norm": 1.8523858957919184, + "learning_rate": 2.18177720528971e-06, + "loss": 0.6039, + "step": 6733 + }, + { + "epoch": 0.7, + "grad_norm": 1.895909602685592, + "learning_rate": 2.1803868333131996e-06, + "loss": 0.5693, + "step": 6734 + }, + { + "epoch": 0.7, + "grad_norm": 2.0462928765800226, + "learning_rate": 2.1789967809559144e-06, + "loss": 0.6011, + "step": 6735 + }, + { + "epoch": 0.7, + "grad_norm": 1.8139579333458893, + "learning_rate": 2.177607048375423e-06, + "loss": 0.5731, + "step": 6736 + }, + { + "epoch": 0.7, + "grad_norm": 1.9562113590511148, + "learning_rate": 2.1762176357292582e-06, + "loss": 0.6779, + "step": 6737 + }, + { + "epoch": 0.7, + "grad_norm": 2.015963062808654, + "learning_rate": 2.174828543174921e-06, + "loss": 0.7395, + "step": 6738 + }, + { + "epoch": 0.7, + "grad_norm": 1.7438058430933063, + "learning_rate": 2.17343977086987e-06, + "loss": 0.5682, + "step": 6739 + }, + { + "epoch": 0.7, + "grad_norm": 2.3548843240144435, + "learning_rate": 2.172051318971533e-06, + "loss": 0.7333, + "step": 6740 + }, + { + "epoch": 0.7, + "grad_norm": 1.8381592551491213, + "learning_rate": 2.170663187637297e-06, + "loss": 0.6598, + "step": 6741 + }, + { + "epoch": 0.7, + "grad_norm": 2.153703994101113, + "learning_rate": 2.169275377024516e-06, + "loss": 0.713, + "step": 6742 + }, + { + "epoch": 0.7, + "grad_norm": 1.9617566205072865, + "learning_rate": 2.1678878872905063e-06, + "loss": 0.6191, + "step": 6743 + }, + { + "epoch": 0.7, + "grad_norm": 1.9815659603480604, + "learning_rate": 2.1665007185925468e-06, + "loss": 0.58, + "step": 6744 + }, + { + "epoch": 0.7, + "grad_norm": 1.8177032013635215, + "learning_rate": 2.16511387108788e-06, + "loss": 0.4916, + "step": 6745 + }, + { + "epoch": 0.7, + "grad_norm": 1.8158442719708523, + "learning_rate": 2.1637273449337156e-06, + "loss": 0.6302, + "step": 6746 + }, + { + "epoch": 0.7, + "grad_norm": 2.027352451014181, + "learning_rate": 2.1623411402872206e-06, + "loss": 0.5283, + "step": 6747 + }, + { + "epoch": 0.7, + "grad_norm": 2.4065306653749294, + "learning_rate": 2.160955257305534e-06, + "loss": 0.7473, + "step": 6748 + }, + { + "epoch": 0.7, + "grad_norm": 2.218280632800332, + "learning_rate": 2.159569696145749e-06, + "loss": 0.6691, + "step": 6749 + }, + { + "epoch": 0.7, + "grad_norm": 1.8586197979012782, + "learning_rate": 2.158184456964932e-06, + "loss": 0.5736, + "step": 6750 + }, + { + "epoch": 0.7, + "grad_norm": 1.7866938651306266, + "learning_rate": 2.1567995399201018e-06, + "loss": 0.6215, + "step": 6751 + }, + { + "epoch": 0.7, + "grad_norm": 2.135629174836658, + "learning_rate": 2.155414945168251e-06, + "loss": 0.6011, + "step": 6752 + }, + { + "epoch": 0.7, + "grad_norm": 1.9452854659735015, + "learning_rate": 2.1540306728663274e-06, + "loss": 0.5866, + "step": 6753 + }, + { + "epoch": 0.7, + "grad_norm": 2.086410546915578, + "learning_rate": 2.152646723171251e-06, + "loss": 0.5784, + "step": 6754 + }, + { + "epoch": 0.7, + "grad_norm": 1.8368795960451758, + "learning_rate": 2.1512630962398954e-06, + "loss": 0.6354, + "step": 6755 + }, + { + "epoch": 0.7, + "grad_norm": 2.070655836107693, + "learning_rate": 2.1498797922291075e-06, + "loss": 0.5932, + "step": 6756 + }, + { + "epoch": 0.7, + "grad_norm": 1.7569569690003866, + "learning_rate": 2.1484968112956884e-06, + "loss": 0.5498, + "step": 6757 + }, + { + "epoch": 0.7, + "grad_norm": 1.821829426027513, + "learning_rate": 2.1471141535964126e-06, + "loss": 0.6307, + "step": 6758 + }, + { + "epoch": 0.7, + "grad_norm": 1.898338128453966, + "learning_rate": 2.1457318192880043e-06, + "loss": 0.6396, + "step": 6759 + }, + { + "epoch": 0.7, + "grad_norm": 1.8115570550325248, + "learning_rate": 2.144349808527165e-06, + "loss": 0.6435, + "step": 6760 + }, + { + "epoch": 0.7, + "grad_norm": 1.8336892208081155, + "learning_rate": 2.14296812147055e-06, + "loss": 0.5665, + "step": 6761 + }, + { + "epoch": 0.7, + "grad_norm": 1.8332267785810863, + "learning_rate": 2.1415867582747847e-06, + "loss": 0.5371, + "step": 6762 + }, + { + "epoch": 0.7, + "grad_norm": 1.9140615577561217, + "learning_rate": 2.1402057190964503e-06, + "loss": 0.5874, + "step": 6763 + }, + { + "epoch": 0.7, + "grad_norm": 2.0702064176398993, + "learning_rate": 2.1388250040921007e-06, + "loss": 0.6677, + "step": 6764 + }, + { + "epoch": 0.7, + "grad_norm": 1.6913066775000711, + "learning_rate": 2.137444613418244e-06, + "loss": 0.5909, + "step": 6765 + }, + { + "epoch": 0.7, + "grad_norm": 1.8374709309177701, + "learning_rate": 2.1360645472313556e-06, + "loss": 0.5114, + "step": 6766 + }, + { + "epoch": 0.7, + "grad_norm": 1.9940985284039772, + "learning_rate": 2.134684805687876e-06, + "loss": 0.7109, + "step": 6767 + }, + { + "epoch": 0.7, + "grad_norm": 1.8285598217429264, + "learning_rate": 2.1333053889442033e-06, + "loss": 0.6866, + "step": 6768 + }, + { + "epoch": 0.7, + "grad_norm": 1.8855850725628227, + "learning_rate": 2.131926297156707e-06, + "loss": 0.5933, + "step": 6769 + }, + { + "epoch": 0.7, + "grad_norm": 2.004497545118367, + "learning_rate": 2.130547530481712e-06, + "loss": 0.6405, + "step": 6770 + }, + { + "epoch": 0.7, + "grad_norm": 1.7597570513215508, + "learning_rate": 2.1291690890755078e-06, + "loss": 0.6394, + "step": 6771 + }, + { + "epoch": 0.7, + "grad_norm": 2.0693726659785225, + "learning_rate": 2.1277909730943526e-06, + "loss": 0.6361, + "step": 6772 + }, + { + "epoch": 0.7, + "grad_norm": 1.8601120817243084, + "learning_rate": 2.126413182694461e-06, + "loss": 0.5867, + "step": 6773 + }, + { + "epoch": 0.7, + "grad_norm": 2.0572739362735324, + "learning_rate": 2.125035718032013e-06, + "loss": 0.7085, + "step": 6774 + }, + { + "epoch": 0.7, + "grad_norm": 1.9037558097151903, + "learning_rate": 2.123658579263155e-06, + "loss": 0.6568, + "step": 6775 + }, + { + "epoch": 0.7, + "grad_norm": 2.0402613319469007, + "learning_rate": 2.1222817665439893e-06, + "loss": 0.7298, + "step": 6776 + }, + { + "epoch": 0.7, + "grad_norm": 1.9559818195133252, + "learning_rate": 2.1209052800305897e-06, + "loss": 0.6301, + "step": 6777 + }, + { + "epoch": 0.7, + "grad_norm": 1.8799592134451022, + "learning_rate": 2.119529119878985e-06, + "loss": 0.5894, + "step": 6778 + }, + { + "epoch": 0.7, + "grad_norm": 1.8268326940086235, + "learning_rate": 2.1181532862451746e-06, + "loss": 0.6109, + "step": 6779 + }, + { + "epoch": 0.7, + "grad_norm": 2.115331740345243, + "learning_rate": 2.1167777792851153e-06, + "loss": 0.5923, + "step": 6780 + }, + { + "epoch": 0.7, + "grad_norm": 2.0187530099260917, + "learning_rate": 2.1154025991547283e-06, + "loss": 0.5897, + "step": 6781 + }, + { + "epoch": 0.7, + "grad_norm": 1.9106316038087288, + "learning_rate": 2.114027746009897e-06, + "loss": 0.6144, + "step": 6782 + }, + { + "epoch": 0.71, + "grad_norm": 1.8569111712932218, + "learning_rate": 2.112653220006472e-06, + "loss": 0.6748, + "step": 6783 + }, + { + "epoch": 0.71, + "grad_norm": 1.8045272023207288, + "learning_rate": 2.1112790213002592e-06, + "loss": 0.6509, + "step": 6784 + }, + { + "epoch": 0.71, + "grad_norm": 2.1234892959481653, + "learning_rate": 2.1099051500470368e-06, + "loss": 0.78, + "step": 6785 + }, + { + "epoch": 0.71, + "grad_norm": 1.77970791042375, + "learning_rate": 2.1085316064025375e-06, + "loss": 0.5062, + "step": 6786 + }, + { + "epoch": 0.71, + "grad_norm": 1.9188195927515572, + "learning_rate": 2.1071583905224643e-06, + "loss": 0.5566, + "step": 6787 + }, + { + "epoch": 0.71, + "grad_norm": 1.9184818718964722, + "learning_rate": 2.105785502562472e-06, + "loss": 0.6935, + "step": 6788 + }, + { + "epoch": 0.71, + "grad_norm": 2.1765053123488634, + "learning_rate": 2.1044129426781925e-06, + "loss": 0.6602, + "step": 6789 + }, + { + "epoch": 0.71, + "grad_norm": 2.0392376211455963, + "learning_rate": 2.1030407110252077e-06, + "loss": 0.6014, + "step": 6790 + }, + { + "epoch": 0.71, + "grad_norm": 1.779595008058516, + "learning_rate": 2.1016688077590726e-06, + "loss": 0.6121, + "step": 6791 + }, + { + "epoch": 0.71, + "grad_norm": 1.8969276425103183, + "learning_rate": 2.100297233035296e-06, + "loss": 0.6356, + "step": 6792 + }, + { + "epoch": 0.71, + "grad_norm": 2.090929508806987, + "learning_rate": 2.0989259870093575e-06, + "loss": 0.7559, + "step": 6793 + }, + { + "epoch": 0.71, + "grad_norm": 2.0298868348206267, + "learning_rate": 2.0975550698366924e-06, + "loss": 0.669, + "step": 6794 + }, + { + "epoch": 0.71, + "grad_norm": 1.9494565795464749, + "learning_rate": 2.096184481672707e-06, + "loss": 0.5928, + "step": 6795 + }, + { + "epoch": 0.71, + "grad_norm": 1.9858014912834072, + "learning_rate": 2.0948142226727584e-06, + "loss": 0.67, + "step": 6796 + }, + { + "epoch": 0.71, + "grad_norm": 1.9407007847129225, + "learning_rate": 2.0934442929921783e-06, + "loss": 0.539, + "step": 6797 + }, + { + "epoch": 0.71, + "grad_norm": 2.0414032198405, + "learning_rate": 2.0920746927862523e-06, + "loss": 0.6952, + "step": 6798 + }, + { + "epoch": 0.71, + "grad_norm": 1.959079680325511, + "learning_rate": 2.0907054222102367e-06, + "loss": 0.5815, + "step": 6799 + }, + { + "epoch": 0.71, + "grad_norm": 1.8445652931836316, + "learning_rate": 2.0893364814193424e-06, + "loss": 0.5937, + "step": 6800 + }, + { + "epoch": 0.71, + "grad_norm": 1.8116444330336892, + "learning_rate": 2.0879678705687495e-06, + "loss": 0.667, + "step": 6801 + }, + { + "epoch": 0.71, + "grad_norm": 1.8516217311768648, + "learning_rate": 2.0865995898135965e-06, + "loss": 0.5826, + "step": 6802 + }, + { + "epoch": 0.71, + "grad_norm": 1.9105961253495354, + "learning_rate": 2.0852316393089837e-06, + "loss": 0.5449, + "step": 6803 + }, + { + "epoch": 0.71, + "grad_norm": 2.1050591561753578, + "learning_rate": 2.083864019209981e-06, + "loss": 0.6291, + "step": 6804 + }, + { + "epoch": 0.71, + "grad_norm": 1.8619748597791044, + "learning_rate": 2.08249672967161e-06, + "loss": 0.5646, + "step": 6805 + }, + { + "epoch": 0.71, + "grad_norm": 1.8444285783299936, + "learning_rate": 2.081129770848867e-06, + "loss": 0.6131, + "step": 6806 + }, + { + "epoch": 0.71, + "grad_norm": 2.271770773829363, + "learning_rate": 2.079763142896699e-06, + "loss": 0.5903, + "step": 6807 + }, + { + "epoch": 0.71, + "grad_norm": 2.052492209017752, + "learning_rate": 2.0783968459700253e-06, + "loss": 0.607, + "step": 6808 + }, + { + "epoch": 0.71, + "grad_norm": 1.8228278521126375, + "learning_rate": 2.077030880223722e-06, + "loss": 0.6362, + "step": 6809 + }, + { + "epoch": 0.71, + "grad_norm": 2.1175619753666206, + "learning_rate": 2.0756652458126285e-06, + "loss": 0.635, + "step": 6810 + }, + { + "epoch": 0.71, + "grad_norm": 1.85016309421988, + "learning_rate": 2.074299942891546e-06, + "loss": 0.6413, + "step": 6811 + }, + { + "epoch": 0.71, + "grad_norm": 1.8552690267386498, + "learning_rate": 2.0729349716152424e-06, + "loss": 0.5696, + "step": 6812 + }, + { + "epoch": 0.71, + "grad_norm": 2.1779947285160848, + "learning_rate": 2.071570332138442e-06, + "loss": 0.6923, + "step": 6813 + }, + { + "epoch": 0.71, + "grad_norm": 1.8964953958247945, + "learning_rate": 2.0702060246158378e-06, + "loss": 0.6115, + "step": 6814 + }, + { + "epoch": 0.71, + "grad_norm": 1.799391893677241, + "learning_rate": 2.068842049202078e-06, + "loss": 0.6503, + "step": 6815 + }, + { + "epoch": 0.71, + "grad_norm": 1.8439040043393595, + "learning_rate": 2.0674784060517803e-06, + "loss": 0.6019, + "step": 6816 + }, + { + "epoch": 0.71, + "grad_norm": 2.0442863866035004, + "learning_rate": 2.066115095319521e-06, + "loss": 0.5634, + "step": 6817 + }, + { + "epoch": 0.71, + "grad_norm": 1.9730979415829402, + "learning_rate": 2.0647521171598376e-06, + "loss": 0.6527, + "step": 6818 + }, + { + "epoch": 0.71, + "grad_norm": 1.965494335587785, + "learning_rate": 2.0633894717272308e-06, + "loss": 0.5889, + "step": 6819 + }, + { + "epoch": 0.71, + "grad_norm": 2.0617167262437013, + "learning_rate": 2.0620271591761666e-06, + "loss": 0.6645, + "step": 6820 + }, + { + "epoch": 0.71, + "grad_norm": 2.0622153316834453, + "learning_rate": 2.060665179661068e-06, + "loss": 0.6241, + "step": 6821 + }, + { + "epoch": 0.71, + "grad_norm": 1.9657200879940147, + "learning_rate": 2.0593035333363275e-06, + "loss": 0.6282, + "step": 6822 + }, + { + "epoch": 0.71, + "grad_norm": 2.1417828950209485, + "learning_rate": 2.0579422203562905e-06, + "loss": 0.6605, + "step": 6823 + }, + { + "epoch": 0.71, + "grad_norm": 1.8466583378272763, + "learning_rate": 2.056581240875276e-06, + "loss": 0.5548, + "step": 6824 + }, + { + "epoch": 0.71, + "grad_norm": 1.9783061847720764, + "learning_rate": 2.055220595047551e-06, + "loss": 0.5555, + "step": 6825 + }, + { + "epoch": 0.71, + "grad_norm": 1.7190514762365583, + "learning_rate": 2.053860283027358e-06, + "loss": 0.5731, + "step": 6826 + }, + { + "epoch": 0.71, + "grad_norm": 2.2693999624623724, + "learning_rate": 2.0525003049688923e-06, + "loss": 0.521, + "step": 6827 + }, + { + "epoch": 0.71, + "grad_norm": 2.028005261832898, + "learning_rate": 2.0511406610263196e-06, + "loss": 0.6534, + "step": 6828 + }, + { + "epoch": 0.71, + "grad_norm": 1.8211798337378464, + "learning_rate": 2.0497813513537583e-06, + "loss": 0.5156, + "step": 6829 + }, + { + "epoch": 0.71, + "grad_norm": 2.0316985269130874, + "learning_rate": 2.048422376105299e-06, + "loss": 0.5355, + "step": 6830 + }, + { + "epoch": 0.71, + "grad_norm": 2.035569698591049, + "learning_rate": 2.047063735434985e-06, + "loss": 0.5921, + "step": 6831 + }, + { + "epoch": 0.71, + "grad_norm": 1.8627838823785647, + "learning_rate": 2.045705429496831e-06, + "loss": 0.5364, + "step": 6832 + }, + { + "epoch": 0.71, + "grad_norm": 1.9083218950041783, + "learning_rate": 2.044347458444802e-06, + "loss": 0.6016, + "step": 6833 + }, + { + "epoch": 0.71, + "grad_norm": 1.925848000350107, + "learning_rate": 2.042989822432837e-06, + "loss": 0.624, + "step": 6834 + }, + { + "epoch": 0.71, + "grad_norm": 2.098946597015218, + "learning_rate": 2.041632521614828e-06, + "loss": 0.5922, + "step": 6835 + }, + { + "epoch": 0.71, + "grad_norm": 2.100563969003895, + "learning_rate": 2.040275556144637e-06, + "loss": 0.6121, + "step": 6836 + }, + { + "epoch": 0.71, + "grad_norm": 1.9737206038407717, + "learning_rate": 2.03891892617608e-06, + "loss": 0.5919, + "step": 6837 + }, + { + "epoch": 0.71, + "grad_norm": 1.9297318075312375, + "learning_rate": 2.0375626318629418e-06, + "loss": 0.6429, + "step": 6838 + }, + { + "epoch": 0.71, + "grad_norm": 2.4570564234245285, + "learning_rate": 2.036206673358964e-06, + "loss": 0.7009, + "step": 6839 + }, + { + "epoch": 0.71, + "grad_norm": 1.7813614587881377, + "learning_rate": 2.034851050817852e-06, + "loss": 0.5578, + "step": 6840 + }, + { + "epoch": 0.71, + "grad_norm": 2.100717069970721, + "learning_rate": 2.0334957643932757e-06, + "loss": 0.5658, + "step": 6841 + }, + { + "epoch": 0.71, + "grad_norm": 2.02686330607163, + "learning_rate": 2.032140814238861e-06, + "loss": 0.5931, + "step": 6842 + }, + { + "epoch": 0.71, + "grad_norm": 1.6161797864253376, + "learning_rate": 2.030786200508203e-06, + "loss": 0.4844, + "step": 6843 + }, + { + "epoch": 0.71, + "grad_norm": 1.9412498996165422, + "learning_rate": 2.0294319233548516e-06, + "loss": 0.5354, + "step": 6844 + }, + { + "epoch": 0.71, + "grad_norm": 1.986315746322376, + "learning_rate": 2.028077982932325e-06, + "loss": 0.6693, + "step": 6845 + }, + { + "epoch": 0.71, + "grad_norm": 1.86339065340936, + "learning_rate": 2.026724379394098e-06, + "loss": 0.5476, + "step": 6846 + }, + { + "epoch": 0.71, + "grad_norm": 1.7315228539679266, + "learning_rate": 2.0253711128936104e-06, + "loss": 0.5505, + "step": 6847 + }, + { + "epoch": 0.71, + "grad_norm": 2.0758485452326267, + "learning_rate": 2.0240181835842605e-06, + "loss": 0.629, + "step": 6848 + }, + { + "epoch": 0.71, + "grad_norm": 1.98434925798822, + "learning_rate": 2.0226655916194127e-06, + "loss": 0.555, + "step": 6849 + }, + { + "epoch": 0.71, + "grad_norm": 1.963022411365478, + "learning_rate": 2.0213133371523893e-06, + "loss": 0.6524, + "step": 6850 + }, + { + "epoch": 0.71, + "grad_norm": 2.0305066724656267, + "learning_rate": 2.0199614203364787e-06, + "loss": 0.6561, + "step": 6851 + }, + { + "epoch": 0.71, + "grad_norm": 1.9822291874774338, + "learning_rate": 2.018609841324925e-06, + "loss": 0.584, + "step": 6852 + }, + { + "epoch": 0.71, + "grad_norm": 2.2247395503229326, + "learning_rate": 2.0172586002709403e-06, + "loss": 0.6224, + "step": 6853 + }, + { + "epoch": 0.71, + "grad_norm": 2.364632538967232, + "learning_rate": 2.0159076973276954e-06, + "loss": 0.6781, + "step": 6854 + }, + { + "epoch": 0.71, + "grad_norm": 2.2066032797604134, + "learning_rate": 2.014557132648321e-06, + "loss": 0.7244, + "step": 6855 + }, + { + "epoch": 0.71, + "grad_norm": 1.854399879858053, + "learning_rate": 2.0132069063859107e-06, + "loss": 0.5404, + "step": 6856 + }, + { + "epoch": 0.71, + "grad_norm": 2.198675457264703, + "learning_rate": 2.0118570186935234e-06, + "loss": 0.679, + "step": 6857 + }, + { + "epoch": 0.71, + "grad_norm": 1.9050712289570522, + "learning_rate": 2.010507469724173e-06, + "loss": 0.702, + "step": 6858 + }, + { + "epoch": 0.71, + "grad_norm": 2.0735079350659302, + "learning_rate": 2.0091582596308423e-06, + "loss": 0.6322, + "step": 6859 + }, + { + "epoch": 0.71, + "grad_norm": 2.010600304322372, + "learning_rate": 2.007809388566468e-06, + "loss": 0.7113, + "step": 6860 + }, + { + "epoch": 0.71, + "grad_norm": 1.7636218721247439, + "learning_rate": 2.0064608566839584e-06, + "loss": 0.5361, + "step": 6861 + }, + { + "epoch": 0.71, + "grad_norm": 1.9501623701778918, + "learning_rate": 2.0051126641361697e-06, + "loss": 0.6234, + "step": 6862 + }, + { + "epoch": 0.71, + "grad_norm": 1.9960117280272525, + "learning_rate": 2.0037648110759324e-06, + "loss": 0.6327, + "step": 6863 + }, + { + "epoch": 0.71, + "grad_norm": 1.817735464885767, + "learning_rate": 2.0024172976560296e-06, + "loss": 0.5358, + "step": 6864 + }, + { + "epoch": 0.71, + "grad_norm": 2.1683050481880564, + "learning_rate": 2.001070124029214e-06, + "loss": 0.5889, + "step": 6865 + }, + { + "epoch": 0.71, + "grad_norm": 1.8559272054114377, + "learning_rate": 1.9997232903481916e-06, + "loss": 0.5864, + "step": 6866 + }, + { + "epoch": 0.71, + "grad_norm": 2.101506830003224, + "learning_rate": 1.9983767967656364e-06, + "loss": 0.6091, + "step": 6867 + }, + { + "epoch": 0.71, + "grad_norm": 1.9674756726064457, + "learning_rate": 1.9970306434341806e-06, + "loss": 0.6021, + "step": 6868 + }, + { + "epoch": 0.71, + "grad_norm": 1.9449109915690936, + "learning_rate": 1.9956848305064156e-06, + "loss": 0.5826, + "step": 6869 + }, + { + "epoch": 0.71, + "grad_norm": 2.312848931186964, + "learning_rate": 1.994339358134901e-06, + "loss": 0.5566, + "step": 6870 + }, + { + "epoch": 0.71, + "grad_norm": 1.8944276535814153, + "learning_rate": 1.992994226472152e-06, + "loss": 0.6597, + "step": 6871 + }, + { + "epoch": 0.71, + "grad_norm": 2.045736585954296, + "learning_rate": 1.9916494356706447e-06, + "loss": 0.6818, + "step": 6872 + }, + { + "epoch": 0.71, + "grad_norm": 1.8958380409998368, + "learning_rate": 1.9903049858828226e-06, + "loss": 0.5696, + "step": 6873 + }, + { + "epoch": 0.71, + "grad_norm": 2.0705398801996773, + "learning_rate": 1.9889608772610837e-06, + "loss": 0.5976, + "step": 6874 + }, + { + "epoch": 0.71, + "grad_norm": 2.0054209308593265, + "learning_rate": 1.987617109957793e-06, + "loss": 0.5429, + "step": 6875 + }, + { + "epoch": 0.71, + "grad_norm": 2.041343124527013, + "learning_rate": 1.9862736841252734e-06, + "loss": 0.6029, + "step": 6876 + }, + { + "epoch": 0.71, + "grad_norm": 1.918282480016842, + "learning_rate": 1.984930599915807e-06, + "loss": 0.6197, + "step": 6877 + }, + { + "epoch": 0.71, + "grad_norm": 1.75857226275821, + "learning_rate": 1.983587857481645e-06, + "loss": 0.5959, + "step": 6878 + }, + { + "epoch": 0.72, + "grad_norm": 1.918945998094483, + "learning_rate": 1.9822454569749895e-06, + "loss": 0.5751, + "step": 6879 + }, + { + "epoch": 0.72, + "grad_norm": 2.102556873591886, + "learning_rate": 1.980903398548015e-06, + "loss": 0.6102, + "step": 6880 + }, + { + "epoch": 0.72, + "grad_norm": 1.894996073709637, + "learning_rate": 1.9795616823528457e-06, + "loss": 0.6569, + "step": 6881 + }, + { + "epoch": 0.72, + "grad_norm": 1.9034762382422525, + "learning_rate": 1.978220308541578e-06, + "loss": 0.6017, + "step": 6882 + }, + { + "epoch": 0.72, + "grad_norm": 1.8933476962584155, + "learning_rate": 1.9768792772662616e-06, + "loss": 0.5561, + "step": 6883 + }, + { + "epoch": 0.72, + "grad_norm": 1.804196672010393, + "learning_rate": 1.9755385886789107e-06, + "loss": 0.6527, + "step": 6884 + }, + { + "epoch": 0.72, + "grad_norm": 1.8793796521053991, + "learning_rate": 1.9741982429314977e-06, + "loss": 0.6567, + "step": 6885 + }, + { + "epoch": 0.72, + "grad_norm": 1.8753444491107256, + "learning_rate": 1.972858240175962e-06, + "loss": 0.6202, + "step": 6886 + }, + { + "epoch": 0.72, + "grad_norm": 2.0290386500817785, + "learning_rate": 1.9715185805641974e-06, + "loss": 0.5861, + "step": 6887 + }, + { + "epoch": 0.72, + "grad_norm": 1.847889462311458, + "learning_rate": 1.970179264248065e-06, + "loss": 0.6332, + "step": 6888 + }, + { + "epoch": 0.72, + "grad_norm": 2.080563361469307, + "learning_rate": 1.9688402913793804e-06, + "loss": 0.6881, + "step": 6889 + }, + { + "epoch": 0.72, + "grad_norm": 1.9954194703008998, + "learning_rate": 1.967501662109928e-06, + "loss": 0.6653, + "step": 6890 + }, + { + "epoch": 0.72, + "grad_norm": 2.077546547685542, + "learning_rate": 1.9661633765914467e-06, + "loss": 0.6304, + "step": 6891 + }, + { + "epoch": 0.72, + "grad_norm": 2.2298547879318162, + "learning_rate": 1.964825434975639e-06, + "loss": 0.6967, + "step": 6892 + }, + { + "epoch": 0.72, + "grad_norm": 2.201827109812438, + "learning_rate": 1.9634878374141662e-06, + "loss": 0.5941, + "step": 6893 + }, + { + "epoch": 0.72, + "grad_norm": 2.2384917852547126, + "learning_rate": 1.962150584058657e-06, + "loss": 0.666, + "step": 6894 + }, + { + "epoch": 0.72, + "grad_norm": 2.113834279753955, + "learning_rate": 1.9608136750606917e-06, + "loss": 0.6328, + "step": 6895 + }, + { + "epoch": 0.72, + "grad_norm": 2.117062026332053, + "learning_rate": 1.959477110571821e-06, + "loss": 0.6106, + "step": 6896 + }, + { + "epoch": 0.72, + "grad_norm": 1.8239442513943565, + "learning_rate": 1.958140890743549e-06, + "loss": 0.5488, + "step": 6897 + }, + { + "epoch": 0.72, + "grad_norm": 1.8460499061498403, + "learning_rate": 1.956805015727348e-06, + "loss": 0.4865, + "step": 6898 + }, + { + "epoch": 0.72, + "grad_norm": 2.1680120538340493, + "learning_rate": 1.955469485674641e-06, + "loss": 0.6363, + "step": 6899 + }, + { + "epoch": 0.72, + "grad_norm": 2.0433806599917523, + "learning_rate": 1.9541343007368225e-06, + "loss": 0.5699, + "step": 6900 + }, + { + "epoch": 0.72, + "grad_norm": 1.9010351880597207, + "learning_rate": 1.95279946106524e-06, + "loss": 0.6144, + "step": 6901 + }, + { + "epoch": 0.72, + "grad_norm": 2.015347226433184, + "learning_rate": 1.951464966811209e-06, + "loss": 0.5687, + "step": 6902 + }, + { + "epoch": 0.72, + "grad_norm": 1.8436595068007384, + "learning_rate": 1.9501308181259986e-06, + "loss": 0.5836, + "step": 6903 + }, + { + "epoch": 0.72, + "grad_norm": 1.8664745085447245, + "learning_rate": 1.948797015160845e-06, + "loss": 0.6421, + "step": 6904 + }, + { + "epoch": 0.72, + "grad_norm": 2.125979204251544, + "learning_rate": 1.947463558066941e-06, + "loss": 0.6685, + "step": 6905 + }, + { + "epoch": 0.72, + "grad_norm": 1.9993794382514192, + "learning_rate": 1.94613044699544e-06, + "loss": 0.628, + "step": 6906 + }, + { + "epoch": 0.72, + "grad_norm": 1.7669592256552917, + "learning_rate": 1.944797682097461e-06, + "loss": 0.5624, + "step": 6907 + }, + { + "epoch": 0.72, + "grad_norm": 1.8143771752464506, + "learning_rate": 1.9434652635240775e-06, + "loss": 0.6319, + "step": 6908 + }, + { + "epoch": 0.72, + "grad_norm": 1.6875843744992083, + "learning_rate": 1.9421331914263293e-06, + "loss": 0.5459, + "step": 6909 + }, + { + "epoch": 0.72, + "grad_norm": 2.0810872582365767, + "learning_rate": 1.9408014659552133e-06, + "loss": 0.6141, + "step": 6910 + }, + { + "epoch": 0.72, + "grad_norm": 1.8317397564624678, + "learning_rate": 1.9394700872616856e-06, + "loss": 0.569, + "step": 6911 + }, + { + "epoch": 0.72, + "grad_norm": 1.807931318726164, + "learning_rate": 1.9381390554966705e-06, + "loss": 0.6178, + "step": 6912 + }, + { + "epoch": 0.72, + "grad_norm": 2.014838295762959, + "learning_rate": 1.9368083708110454e-06, + "loss": 0.5852, + "step": 6913 + }, + { + "epoch": 0.72, + "grad_norm": 1.7712989338761949, + "learning_rate": 1.935478033355649e-06, + "loss": 0.6652, + "step": 6914 + }, + { + "epoch": 0.72, + "grad_norm": 1.899489708641119, + "learning_rate": 1.9341480432812867e-06, + "loss": 0.5458, + "step": 6915 + }, + { + "epoch": 0.72, + "grad_norm": 1.719903119059106, + "learning_rate": 1.9328184007387163e-06, + "loss": 0.5132, + "step": 6916 + }, + { + "epoch": 0.72, + "grad_norm": 2.2829620839907903, + "learning_rate": 1.9314891058786644e-06, + "loss": 0.4828, + "step": 6917 + }, + { + "epoch": 0.72, + "grad_norm": 1.8957640141264724, + "learning_rate": 1.930160158851811e-06, + "loss": 0.5468, + "step": 6918 + }, + { + "epoch": 0.72, + "grad_norm": 1.975808639599023, + "learning_rate": 1.9288315598088024e-06, + "loss": 0.6222, + "step": 6919 + }, + { + "epoch": 0.72, + "grad_norm": 1.9159184447005044, + "learning_rate": 1.9275033089002413e-06, + "loss": 0.5843, + "step": 6920 + }, + { + "epoch": 0.72, + "grad_norm": 1.865664172333486, + "learning_rate": 1.9261754062766937e-06, + "loss": 0.667, + "step": 6921 + }, + { + "epoch": 0.72, + "grad_norm": 1.8030021224323394, + "learning_rate": 1.9248478520886815e-06, + "loss": 0.5762, + "step": 6922 + }, + { + "epoch": 0.72, + "grad_norm": 1.9875737286149218, + "learning_rate": 1.923520646486695e-06, + "loss": 0.5619, + "step": 6923 + }, + { + "epoch": 0.72, + "grad_norm": 1.8218277493472246, + "learning_rate": 1.9221937896211773e-06, + "loss": 0.602, + "step": 6924 + }, + { + "epoch": 0.72, + "grad_norm": 1.9715617137675558, + "learning_rate": 1.920867281642538e-06, + "loss": 0.6224, + "step": 6925 + }, + { + "epoch": 0.72, + "grad_norm": 2.131266043950465, + "learning_rate": 1.9195411227011403e-06, + "loss": 0.5832, + "step": 6926 + }, + { + "epoch": 0.72, + "grad_norm": 2.0327060331736404, + "learning_rate": 1.9182153129473167e-06, + "loss": 0.7407, + "step": 6927 + }, + { + "epoch": 0.72, + "grad_norm": 1.8922822746296175, + "learning_rate": 1.916889852531353e-06, + "loss": 0.5791, + "step": 6928 + }, + { + "epoch": 0.72, + "grad_norm": 1.9156447848338625, + "learning_rate": 1.9155647416034972e-06, + "loss": 0.5832, + "step": 6929 + }, + { + "epoch": 0.72, + "grad_norm": 1.8836997633797954, + "learning_rate": 1.914239980313958e-06, + "loss": 0.6571, + "step": 6930 + }, + { + "epoch": 0.72, + "grad_norm": 2.041638407251988, + "learning_rate": 1.912915568812906e-06, + "loss": 0.5327, + "step": 6931 + }, + { + "epoch": 0.72, + "grad_norm": 1.958615762904026, + "learning_rate": 1.9115915072504683e-06, + "loss": 0.6959, + "step": 6932 + }, + { + "epoch": 0.72, + "grad_norm": 1.9335752293452386, + "learning_rate": 1.9102677957767384e-06, + "loss": 0.6213, + "step": 6933 + }, + { + "epoch": 0.72, + "grad_norm": 1.8751908142036637, + "learning_rate": 1.9089444345417636e-06, + "loss": 0.6393, + "step": 6934 + }, + { + "epoch": 0.72, + "grad_norm": 1.8739242366864037, + "learning_rate": 1.9076214236955585e-06, + "loss": 0.6625, + "step": 6935 + }, + { + "epoch": 0.72, + "grad_norm": 1.862653258537508, + "learning_rate": 1.9062987633880876e-06, + "loss": 0.6272, + "step": 6936 + }, + { + "epoch": 0.72, + "grad_norm": 1.9757417242906068, + "learning_rate": 1.9049764537692872e-06, + "loss": 0.5792, + "step": 6937 + }, + { + "epoch": 0.72, + "grad_norm": 2.0341787737940793, + "learning_rate": 1.903654494989045e-06, + "loss": 0.66, + "step": 6938 + }, + { + "epoch": 0.72, + "grad_norm": 1.8427766740494667, + "learning_rate": 1.9023328871972163e-06, + "loss": 0.5716, + "step": 6939 + }, + { + "epoch": 0.72, + "grad_norm": 2.024770792040896, + "learning_rate": 1.9010116305436094e-06, + "loss": 0.6085, + "step": 6940 + }, + { + "epoch": 0.72, + "grad_norm": 1.8948892940354372, + "learning_rate": 1.8996907251779988e-06, + "loss": 0.5694, + "step": 6941 + }, + { + "epoch": 0.72, + "grad_norm": 1.889140157202737, + "learning_rate": 1.8983701712501163e-06, + "loss": 0.6102, + "step": 6942 + }, + { + "epoch": 0.72, + "grad_norm": 2.0194435530553716, + "learning_rate": 1.8970499689096516e-06, + "loss": 0.6385, + "step": 6943 + }, + { + "epoch": 0.72, + "grad_norm": 1.9671159856485523, + "learning_rate": 1.895730118306261e-06, + "loss": 0.6185, + "step": 6944 + }, + { + "epoch": 0.72, + "grad_norm": 1.906154410239305, + "learning_rate": 1.8944106195895535e-06, + "loss": 0.5676, + "step": 6945 + }, + { + "epoch": 0.72, + "grad_norm": 2.1301700751404984, + "learning_rate": 1.8930914729091055e-06, + "loss": 0.6528, + "step": 6946 + }, + { + "epoch": 0.72, + "grad_norm": 1.8831602503537106, + "learning_rate": 1.8917726784144458e-06, + "loss": 0.5776, + "step": 6947 + }, + { + "epoch": 0.72, + "grad_norm": 2.062453945989891, + "learning_rate": 1.890454236255071e-06, + "loss": 0.61, + "step": 6948 + }, + { + "epoch": 0.72, + "grad_norm": 1.9660287314255194, + "learning_rate": 1.8891361465804326e-06, + "loss": 0.6367, + "step": 6949 + }, + { + "epoch": 0.72, + "grad_norm": 1.9541472768077464, + "learning_rate": 1.8878184095399428e-06, + "loss": 0.5979, + "step": 6950 + }, + { + "epoch": 0.72, + "grad_norm": 1.7630023901018252, + "learning_rate": 1.886501025282974e-06, + "loss": 0.5508, + "step": 6951 + }, + { + "epoch": 0.72, + "grad_norm": 1.693350200238252, + "learning_rate": 1.8851839939588617e-06, + "loss": 0.5917, + "step": 6952 + }, + { + "epoch": 0.72, + "grad_norm": 1.9405268803538889, + "learning_rate": 1.8838673157168956e-06, + "loss": 0.6026, + "step": 6953 + }, + { + "epoch": 0.72, + "grad_norm": 1.8552942459894206, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.6218, + "step": 6954 + }, + { + "epoch": 0.72, + "grad_norm": 1.8592020844670467, + "learning_rate": 1.8812350190763822e-06, + "loss": 0.6586, + "step": 6955 + }, + { + "epoch": 0.72, + "grad_norm": 2.0291508697413643, + "learning_rate": 1.8799194009762201e-06, + "loss": 0.6224, + "step": 6956 + }, + { + "epoch": 0.72, + "grad_norm": 2.0727455489975797, + "learning_rate": 1.8786041365549784e-06, + "loss": 0.7172, + "step": 6957 + }, + { + "epoch": 0.72, + "grad_norm": 1.8228323748167872, + "learning_rate": 1.8772892259617487e-06, + "loss": 0.5831, + "step": 6958 + }, + { + "epoch": 0.72, + "grad_norm": 1.8902269228729196, + "learning_rate": 1.875974669345582e-06, + "loss": 0.6655, + "step": 6959 + }, + { + "epoch": 0.72, + "grad_norm": 1.962322524254648, + "learning_rate": 1.8746604668554952e-06, + "loss": 0.5862, + "step": 6960 + }, + { + "epoch": 0.72, + "grad_norm": 2.2883723322771603, + "learning_rate": 1.8733466186404565e-06, + "loss": 0.6552, + "step": 6961 + }, + { + "epoch": 0.72, + "grad_norm": 1.8072852247050788, + "learning_rate": 1.8720331248494012e-06, + "loss": 0.5782, + "step": 6962 + }, + { + "epoch": 0.72, + "grad_norm": 2.123278381013698, + "learning_rate": 1.8707199856312186e-06, + "loss": 0.7116, + "step": 6963 + }, + { + "epoch": 0.72, + "grad_norm": 2.0807833558357838, + "learning_rate": 1.8694072011347636e-06, + "loss": 0.7126, + "step": 6964 + }, + { + "epoch": 0.72, + "grad_norm": 2.1994360140008165, + "learning_rate": 1.8680947715088465e-06, + "loss": 0.6547, + "step": 6965 + }, + { + "epoch": 0.72, + "grad_norm": 2.3923087716494797, + "learning_rate": 1.8667826969022379e-06, + "loss": 0.6168, + "step": 6966 + }, + { + "epoch": 0.72, + "grad_norm": 2.2628861411700947, + "learning_rate": 1.8654709774636676e-06, + "loss": 0.6133, + "step": 6967 + }, + { + "epoch": 0.72, + "grad_norm": 1.9026703854680282, + "learning_rate": 1.8641596133418305e-06, + "loss": 0.5893, + "step": 6968 + }, + { + "epoch": 0.72, + "grad_norm": 2.024038972343844, + "learning_rate": 1.8628486046853728e-06, + "loss": 0.6062, + "step": 6969 + }, + { + "epoch": 0.72, + "grad_norm": 1.8290810303514669, + "learning_rate": 1.8615379516429084e-06, + "loss": 0.6898, + "step": 6970 + }, + { + "epoch": 0.72, + "grad_norm": 1.9372867721866058, + "learning_rate": 1.8602276543630044e-06, + "loss": 0.6593, + "step": 6971 + }, + { + "epoch": 0.72, + "grad_norm": 2.0472330529227487, + "learning_rate": 1.858917712994195e-06, + "loss": 0.5912, + "step": 6972 + }, + { + "epoch": 0.72, + "grad_norm": 2.0346582715337576, + "learning_rate": 1.8576081276849633e-06, + "loss": 0.6109, + "step": 6973 + }, + { + "epoch": 0.72, + "grad_norm": 1.881561791570671, + "learning_rate": 1.8562988985837632e-06, + "loss": 0.6301, + "step": 6974 + }, + { + "epoch": 0.73, + "grad_norm": 1.9364107609270342, + "learning_rate": 1.8549900258389992e-06, + "loss": 0.5445, + "step": 6975 + }, + { + "epoch": 0.73, + "grad_norm": 1.764202402624153, + "learning_rate": 1.853681509599044e-06, + "loss": 0.5492, + "step": 6976 + }, + { + "epoch": 0.73, + "grad_norm": 1.9477673427273048, + "learning_rate": 1.852373350012221e-06, + "loss": 0.5653, + "step": 6977 + }, + { + "epoch": 0.73, + "grad_norm": 2.0400789263657946, + "learning_rate": 1.8510655472268212e-06, + "loss": 0.5676, + "step": 6978 + }, + { + "epoch": 0.73, + "grad_norm": 1.9127120678077745, + "learning_rate": 1.8497581013910904e-06, + "loss": 0.6438, + "step": 6979 + }, + { + "epoch": 0.73, + "grad_norm": 1.9248975570888085, + "learning_rate": 1.848451012653233e-06, + "loss": 0.6097, + "step": 6980 + }, + { + "epoch": 0.73, + "grad_norm": 1.9422713468844135, + "learning_rate": 1.8471442811614177e-06, + "loss": 0.6654, + "step": 6981 + }, + { + "epoch": 0.73, + "grad_norm": 2.160167412360216, + "learning_rate": 1.8458379070637678e-06, + "loss": 0.6017, + "step": 6982 + }, + { + "epoch": 0.73, + "grad_norm": 2.0222848743451896, + "learning_rate": 1.8445318905083703e-06, + "loss": 0.7048, + "step": 6983 + }, + { + "epoch": 0.73, + "grad_norm": 2.057612384950042, + "learning_rate": 1.843226231643267e-06, + "loss": 0.6671, + "step": 6984 + }, + { + "epoch": 0.73, + "grad_norm": 1.7047056832519873, + "learning_rate": 1.8419209306164653e-06, + "loss": 0.6048, + "step": 6985 + }, + { + "epoch": 0.73, + "grad_norm": 2.060269652286786, + "learning_rate": 1.8406159875759266e-06, + "loss": 0.6438, + "step": 6986 + }, + { + "epoch": 0.73, + "grad_norm": 2.159682548383432, + "learning_rate": 1.8393114026695736e-06, + "loss": 0.5876, + "step": 6987 + }, + { + "epoch": 0.73, + "grad_norm": 1.8583493961058244, + "learning_rate": 1.8380071760452862e-06, + "loss": 0.7386, + "step": 6988 + }, + { + "epoch": 0.73, + "grad_norm": 1.7793109875706183, + "learning_rate": 1.83670330785091e-06, + "loss": 0.5908, + "step": 6989 + }, + { + "epoch": 0.73, + "grad_norm": 2.1854173295401274, + "learning_rate": 1.8353997982342425e-06, + "loss": 0.6927, + "step": 6990 + }, + { + "epoch": 0.73, + "grad_norm": 1.6687385136463946, + "learning_rate": 1.8340966473430477e-06, + "loss": 0.5707, + "step": 6991 + }, + { + "epoch": 0.73, + "grad_norm": 1.817059248514043, + "learning_rate": 1.8327938553250407e-06, + "loss": 0.5635, + "step": 6992 + }, + { + "epoch": 0.73, + "grad_norm": 1.9641887104756441, + "learning_rate": 1.8314914223279046e-06, + "loss": 0.7102, + "step": 6993 + }, + { + "epoch": 0.73, + "grad_norm": 1.7640602296208145, + "learning_rate": 1.8301893484992755e-06, + "loss": 0.6463, + "step": 6994 + }, + { + "epoch": 0.73, + "grad_norm": 2.2745339932755324, + "learning_rate": 1.8288876339867511e-06, + "loss": 0.7781, + "step": 6995 + }, + { + "epoch": 0.73, + "grad_norm": 1.8716453246074705, + "learning_rate": 1.8275862789378862e-06, + "loss": 0.6602, + "step": 6996 + }, + { + "epoch": 0.73, + "grad_norm": 2.353529730295554, + "learning_rate": 1.826285283500201e-06, + "loss": 0.7568, + "step": 6997 + }, + { + "epoch": 0.73, + "grad_norm": 1.9426701427221278, + "learning_rate": 1.8249846478211663e-06, + "loss": 0.7043, + "step": 6998 + }, + { + "epoch": 0.73, + "grad_norm": 1.9052478319387323, + "learning_rate": 1.8236843720482206e-06, + "loss": 0.631, + "step": 6999 + }, + { + "epoch": 0.73, + "grad_norm": 1.884005001155357, + "learning_rate": 1.822384456328754e-06, + "loss": 0.5732, + "step": 7000 + }, + { + "epoch": 0.73, + "grad_norm": 1.906581597726097, + "learning_rate": 1.8210849008101244e-06, + "loss": 0.6591, + "step": 7001 + }, + { + "epoch": 0.73, + "grad_norm": 2.032360075463133, + "learning_rate": 1.8197857056396372e-06, + "loss": 0.5087, + "step": 7002 + }, + { + "epoch": 0.73, + "grad_norm": 2.072367018499558, + "learning_rate": 1.8184868709645686e-06, + "loss": 0.6859, + "step": 7003 + }, + { + "epoch": 0.73, + "grad_norm": 1.9137751075131972, + "learning_rate": 1.8171883969321458e-06, + "loss": 0.6251, + "step": 7004 + }, + { + "epoch": 0.73, + "grad_norm": 2.016277252510613, + "learning_rate": 1.815890283689561e-06, + "loss": 0.7028, + "step": 7005 + }, + { + "epoch": 0.73, + "grad_norm": 1.8514033044622462, + "learning_rate": 1.8145925313839597e-06, + "loss": 0.6176, + "step": 7006 + }, + { + "epoch": 0.73, + "grad_norm": 1.707417328439556, + "learning_rate": 1.8132951401624527e-06, + "loss": 0.5991, + "step": 7007 + }, + { + "epoch": 0.73, + "grad_norm": 2.05162512516868, + "learning_rate": 1.811998110172104e-06, + "loss": 0.6503, + "step": 7008 + }, + { + "epoch": 0.73, + "grad_norm": 1.8997173006855768, + "learning_rate": 1.8107014415599416e-06, + "loss": 0.6086, + "step": 7009 + }, + { + "epoch": 0.73, + "grad_norm": 1.9842498026525295, + "learning_rate": 1.8094051344729497e-06, + "loss": 0.6816, + "step": 7010 + }, + { + "epoch": 0.73, + "grad_norm": 1.8316376168242585, + "learning_rate": 1.808109189058071e-06, + "loss": 0.5557, + "step": 7011 + }, + { + "epoch": 0.73, + "grad_norm": 1.841851848534049, + "learning_rate": 1.8068136054622076e-06, + "loss": 0.6038, + "step": 7012 + }, + { + "epoch": 0.73, + "grad_norm": 1.7337353429307356, + "learning_rate": 1.8055183838322243e-06, + "loss": 0.6381, + "step": 7013 + }, + { + "epoch": 0.73, + "grad_norm": 2.066314680142265, + "learning_rate": 1.804223524314938e-06, + "loss": 0.6052, + "step": 7014 + }, + { + "epoch": 0.73, + "grad_norm": 1.9221531255760698, + "learning_rate": 1.802929027057133e-06, + "loss": 0.6784, + "step": 7015 + }, + { + "epoch": 0.73, + "grad_norm": 1.8537567633360141, + "learning_rate": 1.8016348922055448e-06, + "loss": 0.6299, + "step": 7016 + }, + { + "epoch": 0.73, + "grad_norm": 1.9730320203082514, + "learning_rate": 1.8003411199068704e-06, + "loss": 0.5932, + "step": 7017 + }, + { + "epoch": 0.73, + "grad_norm": 1.9237631065008112, + "learning_rate": 1.7990477103077691e-06, + "loss": 0.6454, + "step": 7018 + }, + { + "epoch": 0.73, + "grad_norm": 1.9063776911922903, + "learning_rate": 1.7977546635548527e-06, + "loss": 0.5453, + "step": 7019 + }, + { + "epoch": 0.73, + "grad_norm": 1.7272938089567267, + "learning_rate": 1.7964619797946991e-06, + "loss": 0.5609, + "step": 7020 + }, + { + "epoch": 0.73, + "grad_norm": 1.9139613548645424, + "learning_rate": 1.7951696591738382e-06, + "loss": 0.6083, + "step": 7021 + }, + { + "epoch": 0.73, + "grad_norm": 2.0076928391817583, + "learning_rate": 1.7938777018387648e-06, + "loss": 0.6729, + "step": 7022 + }, + { + "epoch": 0.73, + "grad_norm": 1.8414781413829706, + "learning_rate": 1.7925861079359268e-06, + "loss": 0.6657, + "step": 7023 + }, + { + "epoch": 0.73, + "grad_norm": 1.9926494137956807, + "learning_rate": 1.7912948776117385e-06, + "loss": 0.5832, + "step": 7024 + }, + { + "epoch": 0.73, + "grad_norm": 2.047037722371666, + "learning_rate": 1.7900040110125611e-06, + "loss": 0.5773, + "step": 7025 + }, + { + "epoch": 0.73, + "grad_norm": 1.9110124757511402, + "learning_rate": 1.7887135082847274e-06, + "loss": 0.6048, + "step": 7026 + }, + { + "epoch": 0.73, + "grad_norm": 1.9091170712990175, + "learning_rate": 1.7874233695745191e-06, + "loss": 0.5744, + "step": 7027 + }, + { + "epoch": 0.73, + "grad_norm": 2.1501681795544605, + "learning_rate": 1.786133595028185e-06, + "loss": 0.6717, + "step": 7028 + }, + { + "epoch": 0.73, + "grad_norm": 1.8727818568701433, + "learning_rate": 1.7848441847919246e-06, + "loss": 0.6262, + "step": 7029 + }, + { + "epoch": 0.73, + "grad_norm": 2.0302444195043146, + "learning_rate": 1.7835551390119033e-06, + "loss": 0.6903, + "step": 7030 + }, + { + "epoch": 0.73, + "grad_norm": 1.8811800640863703, + "learning_rate": 1.7822664578342403e-06, + "loss": 0.5732, + "step": 7031 + }, + { + "epoch": 0.73, + "grad_norm": 2.0044708198114662, + "learning_rate": 1.7809781414050147e-06, + "loss": 0.6878, + "step": 7032 + }, + { + "epoch": 0.73, + "grad_norm": 2.0627253959610394, + "learning_rate": 1.7796901898702628e-06, + "loss": 0.6258, + "step": 7033 + }, + { + "epoch": 0.73, + "grad_norm": 1.9855655422507894, + "learning_rate": 1.7784026033759844e-06, + "loss": 0.7077, + "step": 7034 + }, + { + "epoch": 0.73, + "grad_norm": 2.1111295924164546, + "learning_rate": 1.777115382068132e-06, + "loss": 0.6692, + "step": 7035 + }, + { + "epoch": 0.73, + "grad_norm": 2.0552672782335533, + "learning_rate": 1.7758285260926228e-06, + "loss": 0.6351, + "step": 7036 + }, + { + "epoch": 0.73, + "grad_norm": 1.790621899903027, + "learning_rate": 1.7745420355953253e-06, + "loss": 0.5409, + "step": 7037 + }, + { + "epoch": 0.73, + "grad_norm": 2.0282472564011536, + "learning_rate": 1.7732559107220765e-06, + "loss": 0.6969, + "step": 7038 + }, + { + "epoch": 0.73, + "grad_norm": 1.8675459459068524, + "learning_rate": 1.7719701516186578e-06, + "loss": 0.5922, + "step": 7039 + }, + { + "epoch": 0.73, + "grad_norm": 2.2556574076998035, + "learning_rate": 1.770684758430824e-06, + "loss": 0.6478, + "step": 7040 + }, + { + "epoch": 0.73, + "grad_norm": 2.4928599008992904, + "learning_rate": 1.769399731304277e-06, + "loss": 0.5609, + "step": 7041 + }, + { + "epoch": 0.73, + "grad_norm": 1.8371152290767596, + "learning_rate": 1.7681150703846867e-06, + "loss": 0.6117, + "step": 7042 + }, + { + "epoch": 0.73, + "grad_norm": 2.036972378473435, + "learning_rate": 1.7668307758176717e-06, + "loss": 0.6935, + "step": 7043 + }, + { + "epoch": 0.73, + "grad_norm": 1.9198084244760627, + "learning_rate": 1.7655468477488191e-06, + "loss": 0.5391, + "step": 7044 + }, + { + "epoch": 0.73, + "grad_norm": 1.7153890540288939, + "learning_rate": 1.7642632863236653e-06, + "loss": 0.5468, + "step": 7045 + }, + { + "epoch": 0.73, + "grad_norm": 1.9903008100611193, + "learning_rate": 1.7629800916877126e-06, + "loss": 0.697, + "step": 7046 + }, + { + "epoch": 0.73, + "grad_norm": 1.9312849352503563, + "learning_rate": 1.7616972639864166e-06, + "loss": 0.5932, + "step": 7047 + }, + { + "epoch": 0.73, + "grad_norm": 1.9990339340200853, + "learning_rate": 1.7604148033651925e-06, + "loss": 0.6599, + "step": 7048 + }, + { + "epoch": 0.73, + "grad_norm": 2.1694031802547125, + "learning_rate": 1.7591327099694167e-06, + "loss": 0.6993, + "step": 7049 + }, + { + "epoch": 0.73, + "grad_norm": 1.8424469644992956, + "learning_rate": 1.7578509839444202e-06, + "loss": 0.5265, + "step": 7050 + }, + { + "epoch": 0.73, + "grad_norm": 1.890458213661419, + "learning_rate": 1.756569625435493e-06, + "loss": 0.6508, + "step": 7051 + }, + { + "epoch": 0.73, + "grad_norm": 2.0184026859189967, + "learning_rate": 1.7552886345878879e-06, + "loss": 0.6978, + "step": 7052 + }, + { + "epoch": 0.73, + "grad_norm": 2.0127389742757353, + "learning_rate": 1.7540080115468095e-06, + "loss": 0.6505, + "step": 7053 + }, + { + "epoch": 0.73, + "grad_norm": 2.3335684879728342, + "learning_rate": 1.752727756457423e-06, + "loss": 0.6635, + "step": 7054 + }, + { + "epoch": 0.73, + "grad_norm": 1.9125226367301975, + "learning_rate": 1.7514478694648563e-06, + "loss": 0.6221, + "step": 7055 + }, + { + "epoch": 0.73, + "grad_norm": 2.2096866866360534, + "learning_rate": 1.7501683507141876e-06, + "loss": 0.6165, + "step": 7056 + }, + { + "epoch": 0.73, + "grad_norm": 1.8901019668590993, + "learning_rate": 1.7488892003504615e-06, + "loss": 0.6364, + "step": 7057 + }, + { + "epoch": 0.73, + "grad_norm": 1.835005177582281, + "learning_rate": 1.7476104185186737e-06, + "loss": 0.5718, + "step": 7058 + }, + { + "epoch": 0.73, + "grad_norm": 1.7368860915767115, + "learning_rate": 1.7463320053637844e-06, + "loss": 0.6054, + "step": 7059 + }, + { + "epoch": 0.73, + "grad_norm": 2.0065600854979793, + "learning_rate": 1.745053961030706e-06, + "loss": 0.7173, + "step": 7060 + }, + { + "epoch": 0.73, + "grad_norm": 1.9553760769747448, + "learning_rate": 1.743776285664317e-06, + "loss": 0.6248, + "step": 7061 + }, + { + "epoch": 0.73, + "grad_norm": 1.9170304004440861, + "learning_rate": 1.7424989794094426e-06, + "loss": 0.5648, + "step": 7062 + }, + { + "epoch": 0.73, + "grad_norm": 1.8843076141856252, + "learning_rate": 1.7412220424108778e-06, + "loss": 0.5752, + "step": 7063 + }, + { + "epoch": 0.73, + "grad_norm": 1.892460870380063, + "learning_rate": 1.739945474813367e-06, + "loss": 0.6732, + "step": 7064 + }, + { + "epoch": 0.73, + "grad_norm": 2.064212792184756, + "learning_rate": 1.7386692767616204e-06, + "loss": 0.6855, + "step": 7065 + }, + { + "epoch": 0.73, + "grad_norm": 2.0042471676355587, + "learning_rate": 1.737393448400298e-06, + "loss": 0.6518, + "step": 7066 + }, + { + "epoch": 0.73, + "grad_norm": 2.1605114964526453, + "learning_rate": 1.7361179898740265e-06, + "loss": 0.5507, + "step": 7067 + }, + { + "epoch": 0.73, + "grad_norm": 1.9867984391295228, + "learning_rate": 1.7348429013273844e-06, + "loss": 0.583, + "step": 7068 + }, + { + "epoch": 0.73, + "grad_norm": 1.8676294756248204, + "learning_rate": 1.73356818290491e-06, + "loss": 0.5919, + "step": 7069 + }, + { + "epoch": 0.73, + "grad_norm": 2.0320162519150218, + "learning_rate": 1.7322938347510986e-06, + "loss": 0.6553, + "step": 7070 + }, + { + "epoch": 0.74, + "grad_norm": 1.996766764699223, + "learning_rate": 1.731019857010408e-06, + "loss": 0.6392, + "step": 7071 + }, + { + "epoch": 0.74, + "grad_norm": 2.0880116579155645, + "learning_rate": 1.7297462498272476e-06, + "loss": 0.6324, + "step": 7072 + }, + { + "epoch": 0.74, + "grad_norm": 1.9530026108454182, + "learning_rate": 1.728473013345991e-06, + "loss": 0.6077, + "step": 7073 + }, + { + "epoch": 0.74, + "grad_norm": 1.853123715597261, + "learning_rate": 1.727200147710964e-06, + "loss": 0.6759, + "step": 7074 + }, + { + "epoch": 0.74, + "grad_norm": 1.6900306342788634, + "learning_rate": 1.7259276530664577e-06, + "loss": 0.6573, + "step": 7075 + }, + { + "epoch": 0.74, + "grad_norm": 1.808885919868968, + "learning_rate": 1.7246555295567102e-06, + "loss": 0.6117, + "step": 7076 + }, + { + "epoch": 0.74, + "grad_norm": 1.962491279043586, + "learning_rate": 1.7233837773259288e-06, + "loss": 0.563, + "step": 7077 + }, + { + "epoch": 0.74, + "grad_norm": 2.0288719441359766, + "learning_rate": 1.7221123965182712e-06, + "loss": 0.5779, + "step": 7078 + }, + { + "epoch": 0.74, + "grad_norm": 2.122216006034978, + "learning_rate": 1.720841387277858e-06, + "loss": 0.6406, + "step": 7079 + }, + { + "epoch": 0.74, + "grad_norm": 1.8225722448977553, + "learning_rate": 1.7195707497487624e-06, + "loss": 0.6658, + "step": 7080 + }, + { + "epoch": 0.74, + "grad_norm": 1.8260903677576232, + "learning_rate": 1.7183004840750223e-06, + "loss": 0.5738, + "step": 7081 + }, + { + "epoch": 0.74, + "grad_norm": 1.9433119305101705, + "learning_rate": 1.7170305904006252e-06, + "loss": 0.6746, + "step": 7082 + }, + { + "epoch": 0.74, + "grad_norm": 2.277323368339559, + "learning_rate": 1.7157610688695248e-06, + "loss": 0.6327, + "step": 7083 + }, + { + "epoch": 0.74, + "grad_norm": 1.9858333985133685, + "learning_rate": 1.714491919625627e-06, + "loss": 0.6856, + "step": 7084 + }, + { + "epoch": 0.74, + "grad_norm": 1.875314227630569, + "learning_rate": 1.7132231428127949e-06, + "loss": 0.5694, + "step": 7085 + }, + { + "epoch": 0.74, + "grad_norm": 1.8591613869115624, + "learning_rate": 1.7119547385748552e-06, + "loss": 0.6008, + "step": 7086 + }, + { + "epoch": 0.74, + "grad_norm": 2.1645944722824653, + "learning_rate": 1.710686707055586e-06, + "loss": 0.5587, + "step": 7087 + }, + { + "epoch": 0.74, + "grad_norm": 1.934409516211822, + "learning_rate": 1.7094190483987282e-06, + "loss": 0.556, + "step": 7088 + }, + { + "epoch": 0.74, + "grad_norm": 1.5143520523481275, + "learning_rate": 1.708151762747977e-06, + "loss": 0.5124, + "step": 7089 + }, + { + "epoch": 0.74, + "grad_norm": 1.8513917629013414, + "learning_rate": 1.7068848502469866e-06, + "loss": 0.5633, + "step": 7090 + }, + { + "epoch": 0.74, + "grad_norm": 2.0804716730668145, + "learning_rate": 1.7056183110393666e-06, + "loss": 0.7043, + "step": 7091 + }, + { + "epoch": 0.74, + "grad_norm": 1.8489263596056118, + "learning_rate": 1.7043521452686902e-06, + "loss": 0.6114, + "step": 7092 + }, + { + "epoch": 0.74, + "grad_norm": 1.9388587591896767, + "learning_rate": 1.7030863530784814e-06, + "loss": 0.5371, + "step": 7093 + }, + { + "epoch": 0.74, + "grad_norm": 1.8122345033573635, + "learning_rate": 1.7018209346122272e-06, + "loss": 0.5691, + "step": 7094 + }, + { + "epoch": 0.74, + "grad_norm": 2.0118884418469616, + "learning_rate": 1.7005558900133678e-06, + "loss": 0.6261, + "step": 7095 + }, + { + "epoch": 0.74, + "grad_norm": 1.734039688883811, + "learning_rate": 1.6992912194253065e-06, + "loss": 0.5549, + "step": 7096 + }, + { + "epoch": 0.74, + "grad_norm": 2.1291135005730237, + "learning_rate": 1.6980269229913965e-06, + "loss": 0.7039, + "step": 7097 + }, + { + "epoch": 0.74, + "grad_norm": 2.040583129036302, + "learning_rate": 1.696763000854959e-06, + "loss": 0.6216, + "step": 7098 + }, + { + "epoch": 0.74, + "grad_norm": 1.7362654789557477, + "learning_rate": 1.69549945315926e-06, + "loss": 0.5089, + "step": 7099 + }, + { + "epoch": 0.74, + "grad_norm": 1.8933279122929687, + "learning_rate": 1.6942362800475343e-06, + "loss": 0.6009, + "step": 7100 + }, + { + "epoch": 0.74, + "grad_norm": 2.0124255601190577, + "learning_rate": 1.6929734816629674e-06, + "loss": 0.672, + "step": 7101 + }, + { + "epoch": 0.74, + "grad_norm": 2.0199051597392446, + "learning_rate": 1.6917110581487067e-06, + "loss": 0.5459, + "step": 7102 + }, + { + "epoch": 0.74, + "grad_norm": 2.024380319703663, + "learning_rate": 1.690449009647853e-06, + "loss": 0.6074, + "step": 7103 + }, + { + "epoch": 0.74, + "grad_norm": 1.7856784382758137, + "learning_rate": 1.6891873363034693e-06, + "loss": 0.4908, + "step": 7104 + }, + { + "epoch": 0.74, + "grad_norm": 1.8101573491267204, + "learning_rate": 1.6879260382585727e-06, + "loss": 0.5492, + "step": 7105 + }, + { + "epoch": 0.74, + "grad_norm": 1.9753378786530036, + "learning_rate": 1.686665115656137e-06, + "loss": 0.6705, + "step": 7106 + }, + { + "epoch": 0.74, + "grad_norm": 1.9769168029519377, + "learning_rate": 1.6854045686390947e-06, + "loss": 0.6015, + "step": 7107 + }, + { + "epoch": 0.74, + "grad_norm": 2.0534968822940582, + "learning_rate": 1.6841443973503384e-06, + "loss": 0.5757, + "step": 7108 + }, + { + "epoch": 0.74, + "grad_norm": 2.1367136070784816, + "learning_rate": 1.6828846019327128e-06, + "loss": 0.6714, + "step": 7109 + }, + { + "epoch": 0.74, + "grad_norm": 1.9896795722232372, + "learning_rate": 1.6816251825290265e-06, + "loss": 0.6377, + "step": 7110 + }, + { + "epoch": 0.74, + "grad_norm": 2.051473929576098, + "learning_rate": 1.680366139282038e-06, + "loss": 0.6382, + "step": 7111 + }, + { + "epoch": 0.74, + "grad_norm": 1.845927910849788, + "learning_rate": 1.6791074723344719e-06, + "loss": 0.6508, + "step": 7112 + }, + { + "epoch": 0.74, + "grad_norm": 2.1883562893787234, + "learning_rate": 1.6778491818289995e-06, + "loss": 0.5516, + "step": 7113 + }, + { + "epoch": 0.74, + "grad_norm": 2.334556242298653, + "learning_rate": 1.6765912679082592e-06, + "loss": 0.6544, + "step": 7114 + }, + { + "epoch": 0.74, + "grad_norm": 1.7710546453451088, + "learning_rate": 1.67533373071484e-06, + "loss": 0.6163, + "step": 7115 + }, + { + "epoch": 0.74, + "grad_norm": 2.0008488447691177, + "learning_rate": 1.6740765703912942e-06, + "loss": 0.6591, + "step": 7116 + }, + { + "epoch": 0.74, + "grad_norm": 1.9710345983015822, + "learning_rate": 1.6728197870801244e-06, + "loss": 0.6042, + "step": 7117 + }, + { + "epoch": 0.74, + "grad_norm": 2.0038441788739054, + "learning_rate": 1.6715633809237974e-06, + "loss": 0.5822, + "step": 7118 + }, + { + "epoch": 0.74, + "grad_norm": 1.987280070196863, + "learning_rate": 1.6703073520647316e-06, + "loss": 0.5685, + "step": 7119 + }, + { + "epoch": 0.74, + "grad_norm": 2.2229776012908724, + "learning_rate": 1.6690517006453071e-06, + "loss": 0.6201, + "step": 7120 + }, + { + "epoch": 0.74, + "grad_norm": 2.0164190353791467, + "learning_rate": 1.6677964268078584e-06, + "loss": 0.6305, + "step": 7121 + }, + { + "epoch": 0.74, + "grad_norm": 1.9884428866003194, + "learning_rate": 1.6665415306946764e-06, + "loss": 0.6417, + "step": 7122 + }, + { + "epoch": 0.74, + "grad_norm": 1.7354582036347406, + "learning_rate": 1.665287012448013e-06, + "loss": 0.5151, + "step": 7123 + }, + { + "epoch": 0.74, + "grad_norm": 2.2562445463263816, + "learning_rate": 1.6640328722100723e-06, + "loss": 0.6702, + "step": 7124 + }, + { + "epoch": 0.74, + "grad_norm": 1.8771207954912414, + "learning_rate": 1.6627791101230222e-06, + "loss": 0.5857, + "step": 7125 + }, + { + "epoch": 0.74, + "grad_norm": 2.507413450280522, + "learning_rate": 1.6615257263289809e-06, + "loss": 0.5257, + "step": 7126 + }, + { + "epoch": 0.74, + "grad_norm": 2.057813853105924, + "learning_rate": 1.6602727209700276e-06, + "loss": 0.6122, + "step": 7127 + }, + { + "epoch": 0.74, + "grad_norm": 2.309053131838771, + "learning_rate": 1.659020094188195e-06, + "loss": 0.5922, + "step": 7128 + }, + { + "epoch": 0.74, + "grad_norm": 2.210830259594895, + "learning_rate": 1.6577678461254797e-06, + "loss": 0.6566, + "step": 7129 + }, + { + "epoch": 0.74, + "grad_norm": 1.8767799929288522, + "learning_rate": 1.6565159769238276e-06, + "loss": 0.6357, + "step": 7130 + }, + { + "epoch": 0.74, + "grad_norm": 1.9208667473650485, + "learning_rate": 1.6552644867251483e-06, + "loss": 0.5577, + "step": 7131 + }, + { + "epoch": 0.74, + "grad_norm": 1.9185506346962085, + "learning_rate": 1.6540133756713017e-06, + "loss": 0.6611, + "step": 7132 + }, + { + "epoch": 0.74, + "grad_norm": 1.8123867213865923, + "learning_rate": 1.6527626439041128e-06, + "loss": 0.5769, + "step": 7133 + }, + { + "epoch": 0.74, + "grad_norm": 1.6904443041818638, + "learning_rate": 1.6515122915653564e-06, + "loss": 0.5196, + "step": 7134 + }, + { + "epoch": 0.74, + "grad_norm": 1.8405788256050637, + "learning_rate": 1.6502623187967675e-06, + "loss": 0.5961, + "step": 7135 + }, + { + "epoch": 0.74, + "grad_norm": 2.276571617181377, + "learning_rate": 1.6490127257400363e-06, + "loss": 0.5643, + "step": 7136 + }, + { + "epoch": 0.74, + "grad_norm": 2.08819425335901, + "learning_rate": 1.6477635125368136e-06, + "loss": 0.6155, + "step": 7137 + }, + { + "epoch": 0.74, + "grad_norm": 1.9634742797877143, + "learning_rate": 1.6465146793287028e-06, + "loss": 0.7454, + "step": 7138 + }, + { + "epoch": 0.74, + "grad_norm": 1.9754650148020452, + "learning_rate": 1.645266226257269e-06, + "loss": 0.5814, + "step": 7139 + }, + { + "epoch": 0.74, + "grad_norm": 1.9078401551432211, + "learning_rate": 1.6440181534640277e-06, + "loss": 0.6458, + "step": 7140 + }, + { + "epoch": 0.74, + "grad_norm": 1.9830325654887706, + "learning_rate": 1.6427704610904594e-06, + "loss": 0.6258, + "step": 7141 + }, + { + "epoch": 0.74, + "grad_norm": 1.9722224448626224, + "learning_rate": 1.6415231492779942e-06, + "loss": 0.6054, + "step": 7142 + }, + { + "epoch": 0.74, + "grad_norm": 2.1482120325582414, + "learning_rate": 1.640276218168023e-06, + "loss": 0.5688, + "step": 7143 + }, + { + "epoch": 0.74, + "grad_norm": 1.8700239696691592, + "learning_rate": 1.6390296679018909e-06, + "loss": 0.6565, + "step": 7144 + }, + { + "epoch": 0.74, + "grad_norm": 2.0072980679497032, + "learning_rate": 1.637783498620904e-06, + "loss": 0.7163, + "step": 7145 + }, + { + "epoch": 0.74, + "grad_norm": 1.744011926537701, + "learning_rate": 1.6365377104663206e-06, + "loss": 0.5255, + "step": 7146 + }, + { + "epoch": 0.74, + "grad_norm": 2.1432552564208316, + "learning_rate": 1.63529230357936e-06, + "loss": 0.7444, + "step": 7147 + }, + { + "epoch": 0.74, + "grad_norm": 2.0570680988223917, + "learning_rate": 1.6340472781011935e-06, + "loss": 0.6855, + "step": 7148 + }, + { + "epoch": 0.74, + "grad_norm": 2.1021390615422724, + "learning_rate": 1.6328026341729547e-06, + "loss": 0.5668, + "step": 7149 + }, + { + "epoch": 0.74, + "grad_norm": 2.1758359200731814, + "learning_rate": 1.6315583719357298e-06, + "loss": 0.6835, + "step": 7150 + }, + { + "epoch": 0.74, + "grad_norm": 1.9660860516559455, + "learning_rate": 1.630314491530563e-06, + "loss": 0.5572, + "step": 7151 + }, + { + "epoch": 0.74, + "grad_norm": 2.0277568894165072, + "learning_rate": 1.6290709930984533e-06, + "loss": 0.703, + "step": 7152 + }, + { + "epoch": 0.74, + "grad_norm": 1.7083389944187983, + "learning_rate": 1.6278278767803617e-06, + "loss": 0.5905, + "step": 7153 + }, + { + "epoch": 0.74, + "grad_norm": 1.7879894519009019, + "learning_rate": 1.6265851427171996e-06, + "loss": 0.5972, + "step": 7154 + }, + { + "epoch": 0.74, + "grad_norm": 1.8278090663267998, + "learning_rate": 1.625342791049841e-06, + "loss": 0.6401, + "step": 7155 + }, + { + "epoch": 0.74, + "grad_norm": 1.857344031323788, + "learning_rate": 1.6241008219191107e-06, + "loss": 0.5997, + "step": 7156 + }, + { + "epoch": 0.74, + "grad_norm": 1.9243269148131965, + "learning_rate": 1.622859235465795e-06, + "loss": 0.6302, + "step": 7157 + }, + { + "epoch": 0.74, + "grad_norm": 1.9731235103533473, + "learning_rate": 1.6216180318306352e-06, + "loss": 0.6789, + "step": 7158 + }, + { + "epoch": 0.74, + "grad_norm": 2.1769733748993194, + "learning_rate": 1.6203772111543247e-06, + "loss": 0.5889, + "step": 7159 + }, + { + "epoch": 0.74, + "grad_norm": 1.8509294311347555, + "learning_rate": 1.6191367735775231e-06, + "loss": 0.5998, + "step": 7160 + }, + { + "epoch": 0.74, + "grad_norm": 1.8165697626069977, + "learning_rate": 1.6178967192408367e-06, + "loss": 0.5656, + "step": 7161 + }, + { + "epoch": 0.74, + "grad_norm": 1.9585567515408426, + "learning_rate": 1.616657048284836e-06, + "loss": 0.67, + "step": 7162 + }, + { + "epoch": 0.74, + "grad_norm": 1.8907536973431627, + "learning_rate": 1.6154177608500415e-06, + "loss": 0.6474, + "step": 7163 + }, + { + "epoch": 0.74, + "grad_norm": 1.8221553661698418, + "learning_rate": 1.6141788570769385e-06, + "loss": 0.6146, + "step": 7164 + }, + { + "epoch": 0.74, + "grad_norm": 1.8889485316058878, + "learning_rate": 1.6129403371059576e-06, + "loss": 0.6079, + "step": 7165 + }, + { + "epoch": 0.74, + "grad_norm": 1.8222971298134154, + "learning_rate": 1.611702201077497e-06, + "loss": 0.6486, + "step": 7166 + }, + { + "epoch": 0.75, + "grad_norm": 2.0447254833968795, + "learning_rate": 1.610464449131902e-06, + "loss": 0.6075, + "step": 7167 + }, + { + "epoch": 0.75, + "grad_norm": 1.7665235599646913, + "learning_rate": 1.609227081409484e-06, + "loss": 0.6228, + "step": 7168 + }, + { + "epoch": 0.75, + "grad_norm": 1.964368754990478, + "learning_rate": 1.607990098050501e-06, + "loss": 0.6175, + "step": 7169 + }, + { + "epoch": 0.75, + "grad_norm": 2.029133317798616, + "learning_rate": 1.6067534991951754e-06, + "loss": 0.6734, + "step": 7170 + }, + { + "epoch": 0.75, + "grad_norm": 1.8409734232164388, + "learning_rate": 1.6055172849836826e-06, + "loss": 0.6309, + "step": 7171 + }, + { + "epoch": 0.75, + "grad_norm": 1.7783753720824762, + "learning_rate": 1.6042814555561525e-06, + "loss": 0.5715, + "step": 7172 + }, + { + "epoch": 0.75, + "grad_norm": 1.698699527789748, + "learning_rate": 1.603046011052673e-06, + "loss": 0.6219, + "step": 7173 + }, + { + "epoch": 0.75, + "grad_norm": 1.950831771168677, + "learning_rate": 1.6018109516132917e-06, + "loss": 0.6084, + "step": 7174 + }, + { + "epoch": 0.75, + "grad_norm": 1.7085437834335682, + "learning_rate": 1.600576277378007e-06, + "loss": 0.5615, + "step": 7175 + }, + { + "epoch": 0.75, + "grad_norm": 1.803822529222228, + "learning_rate": 1.5993419884867783e-06, + "loss": 0.5288, + "step": 7176 + }, + { + "epoch": 0.75, + "grad_norm": 2.0799801699470155, + "learning_rate": 1.5981080850795171e-06, + "loss": 0.6573, + "step": 7177 + }, + { + "epoch": 0.75, + "grad_norm": 2.074780152890361, + "learning_rate": 1.5968745672960961e-06, + "loss": 0.6823, + "step": 7178 + }, + { + "epoch": 0.75, + "grad_norm": 1.9974810974208348, + "learning_rate": 1.59564143527634e-06, + "loss": 0.6208, + "step": 7179 + }, + { + "epoch": 0.75, + "grad_norm": 2.024925483595224, + "learning_rate": 1.5944086891600314e-06, + "loss": 0.6646, + "step": 7180 + }, + { + "epoch": 0.75, + "grad_norm": 2.182335939230614, + "learning_rate": 1.5931763290869073e-06, + "loss": 0.6291, + "step": 7181 + }, + { + "epoch": 0.75, + "grad_norm": 2.1038938622619363, + "learning_rate": 1.5919443551966662e-06, + "loss": 0.5521, + "step": 7182 + }, + { + "epoch": 0.75, + "grad_norm": 2.2416642109686515, + "learning_rate": 1.5907127676289564e-06, + "loss": 0.6313, + "step": 7183 + }, + { + "epoch": 0.75, + "grad_norm": 1.8541008455898098, + "learning_rate": 1.589481566523388e-06, + "loss": 0.4875, + "step": 7184 + }, + { + "epoch": 0.75, + "grad_norm": 1.9562324429343436, + "learning_rate": 1.5882507520195218e-06, + "loss": 0.5673, + "step": 7185 + }, + { + "epoch": 0.75, + "grad_norm": 2.064914677261776, + "learning_rate": 1.587020324256881e-06, + "loss": 0.6345, + "step": 7186 + }, + { + "epoch": 0.75, + "grad_norm": 1.9817317237939902, + "learning_rate": 1.5857902833749395e-06, + "loss": 0.5916, + "step": 7187 + }, + { + "epoch": 0.75, + "grad_norm": 1.801570357563867, + "learning_rate": 1.5845606295131284e-06, + "loss": 0.5671, + "step": 7188 + }, + { + "epoch": 0.75, + "grad_norm": 1.817221361338044, + "learning_rate": 1.5833313628108388e-06, + "loss": 0.5532, + "step": 7189 + }, + { + "epoch": 0.75, + "grad_norm": 1.854113231046822, + "learning_rate": 1.5821024834074134e-06, + "loss": 0.6285, + "step": 7190 + }, + { + "epoch": 0.75, + "grad_norm": 2.165618397080074, + "learning_rate": 1.5808739914421512e-06, + "loss": 0.5933, + "step": 7191 + }, + { + "epoch": 0.75, + "grad_norm": 1.9316848594411447, + "learning_rate": 1.5796458870543124e-06, + "loss": 0.6431, + "step": 7192 + }, + { + "epoch": 0.75, + "grad_norm": 2.0443009173966904, + "learning_rate": 1.5784181703831059e-06, + "loss": 0.6558, + "step": 7193 + }, + { + "epoch": 0.75, + "grad_norm": 2.0513270749303976, + "learning_rate": 1.577190841567704e-06, + "loss": 0.6386, + "step": 7194 + }, + { + "epoch": 0.75, + "grad_norm": 1.9096009741492599, + "learning_rate": 1.575963900747229e-06, + "loss": 0.6275, + "step": 7195 + }, + { + "epoch": 0.75, + "grad_norm": 2.0390290313937824, + "learning_rate": 1.5747373480607607e-06, + "loss": 0.6873, + "step": 7196 + }, + { + "epoch": 0.75, + "grad_norm": 1.9505996653243933, + "learning_rate": 1.5735111836473393e-06, + "loss": 0.6114, + "step": 7197 + }, + { + "epoch": 0.75, + "grad_norm": 2.010251955965999, + "learning_rate": 1.5722854076459538e-06, + "loss": 0.6547, + "step": 7198 + }, + { + "epoch": 0.75, + "grad_norm": 1.9576954525895818, + "learning_rate": 1.5710600201955567e-06, + "loss": 0.6175, + "step": 7199 + }, + { + "epoch": 0.75, + "grad_norm": 1.971259045738693, + "learning_rate": 1.5698350214350483e-06, + "loss": 0.6752, + "step": 7200 + }, + { + "epoch": 0.75, + "grad_norm": 1.9285497231593764, + "learning_rate": 1.5686104115032952e-06, + "loss": 0.5635, + "step": 7201 + }, + { + "epoch": 0.75, + "grad_norm": 2.070817821852628, + "learning_rate": 1.567386190539107e-06, + "loss": 0.6503, + "step": 7202 + }, + { + "epoch": 0.75, + "grad_norm": 1.8660756222471904, + "learning_rate": 1.5661623586812607e-06, + "loss": 0.548, + "step": 7203 + }, + { + "epoch": 0.75, + "grad_norm": 1.941836274898934, + "learning_rate": 1.5649389160684813e-06, + "loss": 0.526, + "step": 7204 + }, + { + "epoch": 0.75, + "grad_norm": 2.032513766314926, + "learning_rate": 1.5637158628394572e-06, + "loss": 0.5919, + "step": 7205 + }, + { + "epoch": 0.75, + "grad_norm": 2.0981992425710727, + "learning_rate": 1.5624931991328246e-06, + "loss": 0.6707, + "step": 7206 + }, + { + "epoch": 0.75, + "grad_norm": 2.0306085746889875, + "learning_rate": 1.5612709250871822e-06, + "loss": 0.6382, + "step": 7207 + }, + { + "epoch": 0.75, + "grad_norm": 1.9767774040239565, + "learning_rate": 1.5600490408410807e-06, + "loss": 0.541, + "step": 7208 + }, + { + "epoch": 0.75, + "grad_norm": 1.89306377323277, + "learning_rate": 1.5588275465330277e-06, + "loss": 0.6256, + "step": 7209 + }, + { + "epoch": 0.75, + "grad_norm": 1.961898068094439, + "learning_rate": 1.5576064423014846e-06, + "loss": 0.6037, + "step": 7210 + }, + { + "epoch": 0.75, + "grad_norm": 1.8167320785570311, + "learning_rate": 1.5563857282848738e-06, + "loss": 0.6319, + "step": 7211 + }, + { + "epoch": 0.75, + "grad_norm": 2.0044052618558608, + "learning_rate": 1.555165404621567e-06, + "loss": 0.6699, + "step": 7212 + }, + { + "epoch": 0.75, + "grad_norm": 1.8192002546413146, + "learning_rate": 1.5539454714498985e-06, + "loss": 0.5856, + "step": 7213 + }, + { + "epoch": 0.75, + "grad_norm": 1.828513720311586, + "learning_rate": 1.5527259289081508e-06, + "loss": 0.6237, + "step": 7214 + }, + { + "epoch": 0.75, + "grad_norm": 1.962328010297825, + "learning_rate": 1.5515067771345694e-06, + "loss": 0.7629, + "step": 7215 + }, + { + "epoch": 0.75, + "grad_norm": 1.8235786907604312, + "learning_rate": 1.5502880162673506e-06, + "loss": 0.6629, + "step": 7216 + }, + { + "epoch": 0.75, + "grad_norm": 2.1373109789266316, + "learning_rate": 1.5490696464446475e-06, + "loss": 0.6834, + "step": 7217 + }, + { + "epoch": 0.75, + "grad_norm": 1.798807028243252, + "learning_rate": 1.5478516678045686e-06, + "loss": 0.6461, + "step": 7218 + }, + { + "epoch": 0.75, + "grad_norm": 2.073920849769466, + "learning_rate": 1.546634080485181e-06, + "loss": 0.584, + "step": 7219 + }, + { + "epoch": 0.75, + "grad_norm": 2.143982758312892, + "learning_rate": 1.545416884624502e-06, + "loss": 0.6263, + "step": 7220 + }, + { + "epoch": 0.75, + "grad_norm": 1.868656609798202, + "learning_rate": 1.5442000803605117e-06, + "loss": 0.6062, + "step": 7221 + }, + { + "epoch": 0.75, + "grad_norm": 2.102607752104285, + "learning_rate": 1.5429836678311382e-06, + "loss": 0.6158, + "step": 7222 + }, + { + "epoch": 0.75, + "grad_norm": 2.015804353429865, + "learning_rate": 1.5417676471742716e-06, + "loss": 0.6549, + "step": 7223 + }, + { + "epoch": 0.75, + "grad_norm": 1.6390174876555401, + "learning_rate": 1.5405520185277533e-06, + "loss": 0.553, + "step": 7224 + }, + { + "epoch": 0.75, + "grad_norm": 2.0043174338965635, + "learning_rate": 1.5393367820293809e-06, + "loss": 0.5936, + "step": 7225 + }, + { + "epoch": 0.75, + "grad_norm": 2.1450822606543487, + "learning_rate": 1.5381219378169103e-06, + "loss": 0.6166, + "step": 7226 + }, + { + "epoch": 0.75, + "grad_norm": 1.9942427306633814, + "learning_rate": 1.5369074860280509e-06, + "loss": 0.5527, + "step": 7227 + }, + { + "epoch": 0.75, + "grad_norm": 1.9786504142555035, + "learning_rate": 1.5356934268004648e-06, + "loss": 0.5666, + "step": 7228 + }, + { + "epoch": 0.75, + "grad_norm": 1.6346339842485804, + "learning_rate": 1.534479760271776e-06, + "loss": 0.4102, + "step": 7229 + }, + { + "epoch": 0.75, + "grad_norm": 2.1155433193814477, + "learning_rate": 1.5332664865795594e-06, + "loss": 0.6233, + "step": 7230 + }, + { + "epoch": 0.75, + "grad_norm": 2.0461764504626054, + "learning_rate": 1.532053605861345e-06, + "loss": 0.659, + "step": 7231 + }, + { + "epoch": 0.75, + "grad_norm": 1.9075305136903682, + "learning_rate": 1.5308411182546224e-06, + "loss": 0.6123, + "step": 7232 + }, + { + "epoch": 0.75, + "grad_norm": 1.898062658797633, + "learning_rate": 1.5296290238968303e-06, + "loss": 0.6636, + "step": 7233 + }, + { + "epoch": 0.75, + "grad_norm": 1.954671511226436, + "learning_rate": 1.5284173229253712e-06, + "loss": 0.5265, + "step": 7234 + }, + { + "epoch": 0.75, + "grad_norm": 2.2142740269246506, + "learning_rate": 1.527206015477594e-06, + "loss": 0.5625, + "step": 7235 + }, + { + "epoch": 0.75, + "grad_norm": 2.1394005619281624, + "learning_rate": 1.5259951016908108e-06, + "loss": 0.7228, + "step": 7236 + }, + { + "epoch": 0.75, + "grad_norm": 2.1596350480306192, + "learning_rate": 1.5247845817022827e-06, + "loss": 0.6213, + "step": 7237 + }, + { + "epoch": 0.75, + "grad_norm": 1.9822202370478568, + "learning_rate": 1.5235744556492337e-06, + "loss": 0.577, + "step": 7238 + }, + { + "epoch": 0.75, + "grad_norm": 1.849473911231631, + "learning_rate": 1.5223647236688317e-06, + "loss": 0.5713, + "step": 7239 + }, + { + "epoch": 0.75, + "grad_norm": 2.1778919669885215, + "learning_rate": 1.5211553858982115e-06, + "loss": 0.636, + "step": 7240 + }, + { + "epoch": 0.75, + "grad_norm": 1.9120175964258541, + "learning_rate": 1.5199464424744553e-06, + "loss": 0.6798, + "step": 7241 + }, + { + "epoch": 0.75, + "grad_norm": 2.2539495939251353, + "learning_rate": 1.5187378935346075e-06, + "loss": 0.4582, + "step": 7242 + }, + { + "epoch": 0.75, + "grad_norm": 2.0770941489672294, + "learning_rate": 1.5175297392156602e-06, + "loss": 0.6639, + "step": 7243 + }, + { + "epoch": 0.75, + "grad_norm": 1.6863575683493603, + "learning_rate": 1.516321979654568e-06, + "loss": 0.6796, + "step": 7244 + }, + { + "epoch": 0.75, + "grad_norm": 2.05137600474315, + "learning_rate": 1.5151146149882356e-06, + "loss": 0.6001, + "step": 7245 + }, + { + "epoch": 0.75, + "grad_norm": 2.04734677979564, + "learning_rate": 1.513907645353525e-06, + "loss": 0.7466, + "step": 7246 + }, + { + "epoch": 0.75, + "grad_norm": 1.7640317190712616, + "learning_rate": 1.5127010708872513e-06, + "loss": 0.5521, + "step": 7247 + }, + { + "epoch": 0.75, + "grad_norm": 1.9082985743121863, + "learning_rate": 1.5114948917261896e-06, + "loss": 0.6531, + "step": 7248 + }, + { + "epoch": 0.75, + "grad_norm": 2.068888380150388, + "learning_rate": 1.510289108007064e-06, + "loss": 0.573, + "step": 7249 + }, + { + "epoch": 0.75, + "grad_norm": 2.1610981787210704, + "learning_rate": 1.5090837198665602e-06, + "loss": 0.7273, + "step": 7250 + }, + { + "epoch": 0.75, + "grad_norm": 1.8145500032819155, + "learning_rate": 1.507878727441313e-06, + "loss": 0.5476, + "step": 7251 + }, + { + "epoch": 0.75, + "grad_norm": 2.1836728973091852, + "learning_rate": 1.5066741308679183e-06, + "loss": 0.7212, + "step": 7252 + }, + { + "epoch": 0.75, + "grad_norm": 2.026098366272406, + "learning_rate": 1.5054699302829217e-06, + "loss": 0.6131, + "step": 7253 + }, + { + "epoch": 0.75, + "grad_norm": 1.958268075225633, + "learning_rate": 1.5042661258228268e-06, + "loss": 0.7548, + "step": 7254 + }, + { + "epoch": 0.75, + "grad_norm": 2.2799034428142844, + "learning_rate": 1.5030627176240903e-06, + "loss": 0.6088, + "step": 7255 + }, + { + "epoch": 0.75, + "grad_norm": 1.8507112671260586, + "learning_rate": 1.5018597058231276e-06, + "loss": 0.6433, + "step": 7256 + }, + { + "epoch": 0.75, + "grad_norm": 1.9787360681863888, + "learning_rate": 1.500657090556305e-06, + "loss": 0.6084, + "step": 7257 + }, + { + "epoch": 0.75, + "grad_norm": 2.0335913118093716, + "learning_rate": 1.4994548719599478e-06, + "loss": 0.6161, + "step": 7258 + }, + { + "epoch": 0.75, + "grad_norm": 1.913473741873713, + "learning_rate": 1.4982530501703325e-06, + "loss": 0.599, + "step": 7259 + }, + { + "epoch": 0.75, + "grad_norm": 1.993400563984133, + "learning_rate": 1.4970516253236938e-06, + "loss": 0.6776, + "step": 7260 + }, + { + "epoch": 0.75, + "grad_norm": 1.8738338404676722, + "learning_rate": 1.4958505975562205e-06, + "loss": 0.6608, + "step": 7261 + }, + { + "epoch": 0.75, + "grad_norm": 1.9624680385206787, + "learning_rate": 1.4946499670040526e-06, + "loss": 0.646, + "step": 7262 + }, + { + "epoch": 0.75, + "grad_norm": 2.1031705360635313, + "learning_rate": 1.4934497338032926e-06, + "loss": 0.6785, + "step": 7263 + }, + { + "epoch": 0.76, + "grad_norm": 1.7375430132673397, + "learning_rate": 1.4922498980899907e-06, + "loss": 0.5263, + "step": 7264 + }, + { + "epoch": 0.76, + "grad_norm": 1.942208339687299, + "learning_rate": 1.4910504600001574e-06, + "loss": 0.6087, + "step": 7265 + }, + { + "epoch": 0.76, + "grad_norm": 1.7733528282248332, + "learning_rate": 1.489851419669755e-06, + "loss": 0.6096, + "step": 7266 + }, + { + "epoch": 0.76, + "grad_norm": 1.9702015428371882, + "learning_rate": 1.4886527772347015e-06, + "loss": 0.6524, + "step": 7267 + }, + { + "epoch": 0.76, + "grad_norm": 1.821655039369508, + "learning_rate": 1.4874545328308681e-06, + "loss": 0.6979, + "step": 7268 + }, + { + "epoch": 0.76, + "grad_norm": 1.6085812027586697, + "learning_rate": 1.486256686594086e-06, + "loss": 0.4883, + "step": 7269 + }, + { + "epoch": 0.76, + "grad_norm": 1.8103497969052265, + "learning_rate": 1.4850592386601342e-06, + "loss": 0.6498, + "step": 7270 + }, + { + "epoch": 0.76, + "grad_norm": 1.8821065569520434, + "learning_rate": 1.4838621891647537e-06, + "loss": 0.5811, + "step": 7271 + }, + { + "epoch": 0.76, + "grad_norm": 2.026422332113513, + "learning_rate": 1.482665538243634e-06, + "loss": 0.5675, + "step": 7272 + }, + { + "epoch": 0.76, + "grad_norm": 2.021526459853876, + "learning_rate": 1.4814692860324254e-06, + "loss": 0.6165, + "step": 7273 + }, + { + "epoch": 0.76, + "grad_norm": 2.2131193609875255, + "learning_rate": 1.4802734326667261e-06, + "loss": 0.6052, + "step": 7274 + }, + { + "epoch": 0.76, + "grad_norm": 1.9863781617996716, + "learning_rate": 1.4790779782820991e-06, + "loss": 0.7431, + "step": 7275 + }, + { + "epoch": 0.76, + "grad_norm": 1.899030267652968, + "learning_rate": 1.4778829230140479e-06, + "loss": 0.5902, + "step": 7276 + }, + { + "epoch": 0.76, + "grad_norm": 2.104359170256935, + "learning_rate": 1.4766882669980443e-06, + "loss": 0.5782, + "step": 7277 + }, + { + "epoch": 0.76, + "grad_norm": 2.3395206730302807, + "learning_rate": 1.4754940103695065e-06, + "loss": 0.6701, + "step": 7278 + }, + { + "epoch": 0.76, + "grad_norm": 1.9128557347137265, + "learning_rate": 1.4743001532638135e-06, + "loss": 0.5963, + "step": 7279 + }, + { + "epoch": 0.76, + "grad_norm": 1.839960847147049, + "learning_rate": 1.473106695816292e-06, + "loss": 0.5946, + "step": 7280 + }, + { + "epoch": 0.76, + "grad_norm": 1.8804694955721557, + "learning_rate": 1.4719136381622307e-06, + "loss": 0.5972, + "step": 7281 + }, + { + "epoch": 0.76, + "grad_norm": 2.1064096197759916, + "learning_rate": 1.4707209804368683e-06, + "loss": 0.6805, + "step": 7282 + }, + { + "epoch": 0.76, + "grad_norm": 2.1508059506711388, + "learning_rate": 1.4695287227753984e-06, + "loss": 0.6387, + "step": 7283 + }, + { + "epoch": 0.76, + "grad_norm": 1.934117540641856, + "learning_rate": 1.4683368653129698e-06, + "loss": 0.6755, + "step": 7284 + }, + { + "epoch": 0.76, + "grad_norm": 1.874609864466516, + "learning_rate": 1.4671454081846886e-06, + "loss": 0.7757, + "step": 7285 + }, + { + "epoch": 0.76, + "grad_norm": 1.8727817163935145, + "learning_rate": 1.4659543515256103e-06, + "loss": 0.7327, + "step": 7286 + }, + { + "epoch": 0.76, + "grad_norm": 1.9625655507945419, + "learning_rate": 1.4647636954707517e-06, + "loss": 0.5665, + "step": 7287 + }, + { + "epoch": 0.76, + "grad_norm": 1.9176017722412064, + "learning_rate": 1.4635734401550761e-06, + "loss": 0.5784, + "step": 7288 + }, + { + "epoch": 0.76, + "grad_norm": 1.9382591162652874, + "learning_rate": 1.4623835857135099e-06, + "loss": 0.6005, + "step": 7289 + }, + { + "epoch": 0.76, + "grad_norm": 2.0286029348295123, + "learning_rate": 1.4611941322809282e-06, + "loss": 0.6977, + "step": 7290 + }, + { + "epoch": 0.76, + "grad_norm": 1.9278560441634012, + "learning_rate": 1.4600050799921622e-06, + "loss": 0.6472, + "step": 7291 + }, + { + "epoch": 0.76, + "grad_norm": 1.9786842736043846, + "learning_rate": 1.4588164289819956e-06, + "loss": 0.7257, + "step": 7292 + }, + { + "epoch": 0.76, + "grad_norm": 1.919764223108238, + "learning_rate": 1.4576281793851726e-06, + "loss": 0.5598, + "step": 7293 + }, + { + "epoch": 0.76, + "grad_norm": 2.0586232862552425, + "learning_rate": 1.456440331336385e-06, + "loss": 0.5034, + "step": 7294 + }, + { + "epoch": 0.76, + "grad_norm": 1.8835266264519421, + "learning_rate": 1.4552528849702852e-06, + "loss": 0.6227, + "step": 7295 + }, + { + "epoch": 0.76, + "grad_norm": 1.9365442859333502, + "learning_rate": 1.454065840421473e-06, + "loss": 0.6033, + "step": 7296 + }, + { + "epoch": 0.76, + "grad_norm": 1.8303763066257501, + "learning_rate": 1.452879197824511e-06, + "loss": 0.6193, + "step": 7297 + }, + { + "epoch": 0.76, + "grad_norm": 1.714017227116502, + "learning_rate": 1.45169295731391e-06, + "loss": 0.5045, + "step": 7298 + }, + { + "epoch": 0.76, + "grad_norm": 1.9553749197730281, + "learning_rate": 1.450507119024135e-06, + "loss": 0.5116, + "step": 7299 + }, + { + "epoch": 0.76, + "grad_norm": 2.0489024004785206, + "learning_rate": 1.4493216830896112e-06, + "loss": 0.5511, + "step": 7300 + }, + { + "epoch": 0.76, + "grad_norm": 1.9177337191952164, + "learning_rate": 1.4481366496447113e-06, + "loss": 0.6671, + "step": 7301 + }, + { + "epoch": 0.76, + "grad_norm": 2.242369095193321, + "learning_rate": 1.4469520188237684e-06, + "loss": 0.6147, + "step": 7302 + }, + { + "epoch": 0.76, + "grad_norm": 1.9271683681568583, + "learning_rate": 1.4457677907610646e-06, + "loss": 0.6747, + "step": 7303 + }, + { + "epoch": 0.76, + "grad_norm": 1.9556100302069954, + "learning_rate": 1.4445839655908432e-06, + "loss": 0.6199, + "step": 7304 + }, + { + "epoch": 0.76, + "grad_norm": 2.0446941771338847, + "learning_rate": 1.4434005434472914e-06, + "loss": 0.7253, + "step": 7305 + }, + { + "epoch": 0.76, + "grad_norm": 2.180261152086753, + "learning_rate": 1.4422175244645613e-06, + "loss": 0.7472, + "step": 7306 + }, + { + "epoch": 0.76, + "grad_norm": 1.9436496640175285, + "learning_rate": 1.4410349087767521e-06, + "loss": 0.681, + "step": 7307 + }, + { + "epoch": 0.76, + "grad_norm": 1.957008494417684, + "learning_rate": 1.4398526965179233e-06, + "loss": 0.6194, + "step": 7308 + }, + { + "epoch": 0.76, + "grad_norm": 1.9913658996313202, + "learning_rate": 1.438670887822081e-06, + "loss": 0.5928, + "step": 7309 + }, + { + "epoch": 0.76, + "grad_norm": 1.9922918850877875, + "learning_rate": 1.437489482823195e-06, + "loss": 0.7113, + "step": 7310 + }, + { + "epoch": 0.76, + "grad_norm": 1.9105299964949578, + "learning_rate": 1.4363084816551798e-06, + "loss": 0.5878, + "step": 7311 + }, + { + "epoch": 0.76, + "grad_norm": 1.8437267826289236, + "learning_rate": 1.4351278844519134e-06, + "loss": 0.6025, + "step": 7312 + }, + { + "epoch": 0.76, + "grad_norm": 2.2123053516034568, + "learning_rate": 1.4339476913472177e-06, + "loss": 0.6034, + "step": 7313 + }, + { + "epoch": 0.76, + "grad_norm": 1.7734591969685631, + "learning_rate": 1.4327679024748785e-06, + "loss": 0.521, + "step": 7314 + }, + { + "epoch": 0.76, + "grad_norm": 2.0931515989249583, + "learning_rate": 1.4315885179686285e-06, + "loss": 0.6181, + "step": 7315 + }, + { + "epoch": 0.76, + "grad_norm": 1.8904524239638931, + "learning_rate": 1.4304095379621607e-06, + "loss": 0.5739, + "step": 7316 + }, + { + "epoch": 0.76, + "grad_norm": 1.8580069179145808, + "learning_rate": 1.4292309625891166e-06, + "loss": 0.4972, + "step": 7317 + }, + { + "epoch": 0.76, + "grad_norm": 1.8960742519739238, + "learning_rate": 1.4280527919830966e-06, + "loss": 0.5732, + "step": 7318 + }, + { + "epoch": 0.76, + "grad_norm": 1.8541559313701252, + "learning_rate": 1.4268750262776526e-06, + "loss": 0.6536, + "step": 7319 + }, + { + "epoch": 0.76, + "grad_norm": 2.079477738015932, + "learning_rate": 1.42569766560629e-06, + "loss": 0.6436, + "step": 7320 + }, + { + "epoch": 0.76, + "grad_norm": 1.8833701025509348, + "learning_rate": 1.4245207101024684e-06, + "loss": 0.6474, + "step": 7321 + }, + { + "epoch": 0.76, + "grad_norm": 2.0036752845734265, + "learning_rate": 1.4233441598996055e-06, + "loss": 0.6448, + "step": 7322 + }, + { + "epoch": 0.76, + "grad_norm": 1.9424022901674687, + "learning_rate": 1.4221680151310667e-06, + "loss": 0.5813, + "step": 7323 + }, + { + "epoch": 0.76, + "grad_norm": 1.776323088501909, + "learning_rate": 1.420992275930178e-06, + "loss": 0.6075, + "step": 7324 + }, + { + "epoch": 0.76, + "grad_norm": 1.8971379539053441, + "learning_rate": 1.4198169424302133e-06, + "loss": 0.5884, + "step": 7325 + }, + { + "epoch": 0.76, + "grad_norm": 1.9758928262230633, + "learning_rate": 1.4186420147644053e-06, + "loss": 0.6538, + "step": 7326 + }, + { + "epoch": 0.76, + "grad_norm": 2.0378828824189505, + "learning_rate": 1.4174674930659389e-06, + "loss": 0.686, + "step": 7327 + }, + { + "epoch": 0.76, + "grad_norm": 2.100855386114118, + "learning_rate": 1.4162933774679494e-06, + "loss": 0.5805, + "step": 7328 + }, + { + "epoch": 0.76, + "grad_norm": 1.8241299180054797, + "learning_rate": 1.4151196681035339e-06, + "loss": 0.5597, + "step": 7329 + }, + { + "epoch": 0.76, + "grad_norm": 2.3646475695480302, + "learning_rate": 1.4139463651057377e-06, + "loss": 0.696, + "step": 7330 + }, + { + "epoch": 0.76, + "grad_norm": 1.7871636366438264, + "learning_rate": 1.4127734686075589e-06, + "loss": 0.5538, + "step": 7331 + }, + { + "epoch": 0.76, + "grad_norm": 1.7624748734883664, + "learning_rate": 1.4116009787419555e-06, + "loss": 0.5067, + "step": 7332 + }, + { + "epoch": 0.76, + "grad_norm": 1.8830917235961375, + "learning_rate": 1.4104288956418326e-06, + "loss": 0.6711, + "step": 7333 + }, + { + "epoch": 0.76, + "grad_norm": 2.1333917259154473, + "learning_rate": 1.4092572194400556e-06, + "loss": 0.6778, + "step": 7334 + }, + { + "epoch": 0.76, + "grad_norm": 1.8922106726635324, + "learning_rate": 1.4080859502694399e-06, + "loss": 0.5897, + "step": 7335 + }, + { + "epoch": 0.76, + "grad_norm": 1.7979650669786016, + "learning_rate": 1.406915088262753e-06, + "loss": 0.6342, + "step": 7336 + }, + { + "epoch": 0.76, + "grad_norm": 1.8262225902179414, + "learning_rate": 1.4057446335527224e-06, + "loss": 0.5744, + "step": 7337 + }, + { + "epoch": 0.76, + "grad_norm": 1.939920334697821, + "learning_rate": 1.4045745862720227e-06, + "loss": 0.6456, + "step": 7338 + }, + { + "epoch": 0.76, + "grad_norm": 2.046921032309175, + "learning_rate": 1.4034049465532884e-06, + "loss": 0.6499, + "step": 7339 + }, + { + "epoch": 0.76, + "grad_norm": 2.0279532671601435, + "learning_rate": 1.4022357145291022e-06, + "loss": 0.6686, + "step": 7340 + }, + { + "epoch": 0.76, + "grad_norm": 1.876032959240246, + "learning_rate": 1.4010668903320068e-06, + "loss": 0.65, + "step": 7341 + }, + { + "epoch": 0.76, + "grad_norm": 2.010683002947994, + "learning_rate": 1.3998984740944898e-06, + "loss": 0.6212, + "step": 7342 + }, + { + "epoch": 0.76, + "grad_norm": 1.9942519888855668, + "learning_rate": 1.3987304659490019e-06, + "loss": 0.6769, + "step": 7343 + }, + { + "epoch": 0.76, + "grad_norm": 2.3241892030749804, + "learning_rate": 1.397562866027941e-06, + "loss": 0.7233, + "step": 7344 + }, + { + "epoch": 0.76, + "grad_norm": 1.8842048203883948, + "learning_rate": 1.3963956744636642e-06, + "loss": 0.483, + "step": 7345 + }, + { + "epoch": 0.76, + "grad_norm": 1.8132127750394096, + "learning_rate": 1.3952288913884754e-06, + "loss": 0.6027, + "step": 7346 + }, + { + "epoch": 0.76, + "grad_norm": 1.9408602426373183, + "learning_rate": 1.3940625169346406e-06, + "loss": 0.6706, + "step": 7347 + }, + { + "epoch": 0.76, + "grad_norm": 1.9509705460858806, + "learning_rate": 1.3928965512343705e-06, + "loss": 0.512, + "step": 7348 + }, + { + "epoch": 0.76, + "grad_norm": 1.846187361487381, + "learning_rate": 1.3917309944198392e-06, + "loss": 0.5747, + "step": 7349 + }, + { + "epoch": 0.76, + "grad_norm": 1.8360446999359556, + "learning_rate": 1.390565846623163e-06, + "loss": 0.5452, + "step": 7350 + }, + { + "epoch": 0.76, + "grad_norm": 2.207526838462662, + "learning_rate": 1.389401107976423e-06, + "loss": 0.6348, + "step": 7351 + }, + { + "epoch": 0.76, + "grad_norm": 1.6972499850348746, + "learning_rate": 1.3882367786116458e-06, + "loss": 0.6734, + "step": 7352 + }, + { + "epoch": 0.76, + "grad_norm": 1.9465037911461067, + "learning_rate": 1.3870728586608172e-06, + "loss": 0.638, + "step": 7353 + }, + { + "epoch": 0.76, + "grad_norm": 1.99105946713035, + "learning_rate": 1.3859093482558717e-06, + "loss": 0.4869, + "step": 7354 + }, + { + "epoch": 0.76, + "grad_norm": 1.9083214942937838, + "learning_rate": 1.3847462475287027e-06, + "loss": 0.6243, + "step": 7355 + }, + { + "epoch": 0.76, + "grad_norm": 1.8052126687935333, + "learning_rate": 1.3835835566111527e-06, + "loss": 0.5666, + "step": 7356 + }, + { + "epoch": 0.76, + "grad_norm": 1.8247580687949765, + "learning_rate": 1.3824212756350196e-06, + "loss": 0.5969, + "step": 7357 + }, + { + "epoch": 0.76, + "grad_norm": 1.931041342905744, + "learning_rate": 1.3812594047320526e-06, + "loss": 0.6333, + "step": 7358 + }, + { + "epoch": 0.76, + "grad_norm": 2.003570523038699, + "learning_rate": 1.3800979440339602e-06, + "loss": 0.6259, + "step": 7359 + }, + { + "epoch": 0.77, + "grad_norm": 1.9910752759176764, + "learning_rate": 1.3789368936723967e-06, + "loss": 0.596, + "step": 7360 + }, + { + "epoch": 0.77, + "grad_norm": 2.204108423906913, + "learning_rate": 1.3777762537789774e-06, + "loss": 0.6022, + "step": 7361 + }, + { + "epoch": 0.77, + "grad_norm": 1.7905234257212006, + "learning_rate": 1.3766160244852645e-06, + "loss": 0.666, + "step": 7362 + }, + { + "epoch": 0.77, + "grad_norm": 1.785127840395526, + "learning_rate": 1.375456205922779e-06, + "loss": 0.6004, + "step": 7363 + }, + { + "epoch": 0.77, + "grad_norm": 1.8382116223905691, + "learning_rate": 1.3742967982229915e-06, + "loss": 0.6217, + "step": 7364 + }, + { + "epoch": 0.77, + "grad_norm": 1.8116921007155165, + "learning_rate": 1.373137801517327e-06, + "loss": 0.5808, + "step": 7365 + }, + { + "epoch": 0.77, + "grad_norm": 2.0143148074817656, + "learning_rate": 1.371979215937166e-06, + "loss": 0.5981, + "step": 7366 + }, + { + "epoch": 0.77, + "grad_norm": 1.945327577763545, + "learning_rate": 1.3708210416138395e-06, + "loss": 0.7188, + "step": 7367 + }, + { + "epoch": 0.77, + "grad_norm": 1.9918025752529571, + "learning_rate": 1.3696632786786328e-06, + "loss": 0.6553, + "step": 7368 + }, + { + "epoch": 0.77, + "grad_norm": 1.8128557697830252, + "learning_rate": 1.368505927262787e-06, + "loss": 0.54, + "step": 7369 + }, + { + "epoch": 0.77, + "grad_norm": 1.8567139003564144, + "learning_rate": 1.3673489874974916e-06, + "loss": 0.6109, + "step": 7370 + }, + { + "epoch": 0.77, + "grad_norm": 1.8570547914301103, + "learning_rate": 1.3661924595138953e-06, + "loss": 0.5828, + "step": 7371 + }, + { + "epoch": 0.77, + "grad_norm": 2.0112052275541963, + "learning_rate": 1.3650363434430957e-06, + "loss": 0.643, + "step": 7372 + }, + { + "epoch": 0.77, + "grad_norm": 1.9738896384606492, + "learning_rate": 1.363880639416144e-06, + "loss": 0.7166, + "step": 7373 + }, + { + "epoch": 0.77, + "grad_norm": 1.9657301144532935, + "learning_rate": 1.3627253475640484e-06, + "loss": 0.5739, + "step": 7374 + }, + { + "epoch": 0.77, + "grad_norm": 1.8189747674386199, + "learning_rate": 1.3615704680177649e-06, + "loss": 0.5863, + "step": 7375 + }, + { + "epoch": 0.77, + "grad_norm": 1.9980086094799228, + "learning_rate": 1.3604160009082084e-06, + "loss": 0.6279, + "step": 7376 + }, + { + "epoch": 0.77, + "grad_norm": 1.8661162265592808, + "learning_rate": 1.359261946366242e-06, + "loss": 0.6268, + "step": 7377 + }, + { + "epoch": 0.77, + "grad_norm": 2.0760584257748347, + "learning_rate": 1.3581083045226884e-06, + "loss": 0.5509, + "step": 7378 + }, + { + "epoch": 0.77, + "grad_norm": 1.890930457422948, + "learning_rate": 1.3569550755083139e-06, + "loss": 0.4827, + "step": 7379 + }, + { + "epoch": 0.77, + "grad_norm": 1.8250307809861648, + "learning_rate": 1.3558022594538473e-06, + "loss": 0.6133, + "step": 7380 + }, + { + "epoch": 0.77, + "grad_norm": 1.9776684236653719, + "learning_rate": 1.3546498564899647e-06, + "loss": 0.65, + "step": 7381 + }, + { + "epoch": 0.77, + "grad_norm": 1.8375601370048764, + "learning_rate": 1.3534978667472998e-06, + "loss": 0.6446, + "step": 7382 + }, + { + "epoch": 0.77, + "grad_norm": 1.882348930178244, + "learning_rate": 1.3523462903564344e-06, + "loss": 0.6684, + "step": 7383 + }, + { + "epoch": 0.77, + "grad_norm": 2.009995487209896, + "learning_rate": 1.3511951274479096e-06, + "loss": 0.7314, + "step": 7384 + }, + { + "epoch": 0.77, + "grad_norm": 1.9752306252999285, + "learning_rate": 1.3500443781522131e-06, + "loss": 0.5498, + "step": 7385 + }, + { + "epoch": 0.77, + "grad_norm": 1.989631043234858, + "learning_rate": 1.3488940425997937e-06, + "loss": 0.6369, + "step": 7386 + }, + { + "epoch": 0.77, + "grad_norm": 2.0437233289117995, + "learning_rate": 1.3477441209210418e-06, + "loss": 0.6656, + "step": 7387 + }, + { + "epoch": 0.77, + "grad_norm": 1.8972633241525336, + "learning_rate": 1.3465946132463125e-06, + "loss": 0.6628, + "step": 7388 + }, + { + "epoch": 0.77, + "grad_norm": 1.8438369722578372, + "learning_rate": 1.3454455197059064e-06, + "loss": 0.5667, + "step": 7389 + }, + { + "epoch": 0.77, + "grad_norm": 2.0959707834954493, + "learning_rate": 1.3442968404300822e-06, + "loss": 0.5153, + "step": 7390 + }, + { + "epoch": 0.77, + "grad_norm": 1.6795735736077286, + "learning_rate": 1.3431485755490464e-06, + "loss": 0.5253, + "step": 7391 + }, + { + "epoch": 0.77, + "grad_norm": 1.8489557099642173, + "learning_rate": 1.342000725192964e-06, + "loss": 0.6495, + "step": 7392 + }, + { + "epoch": 0.77, + "grad_norm": 2.0025930868134916, + "learning_rate": 1.3408532894919502e-06, + "loss": 0.6117, + "step": 7393 + }, + { + "epoch": 0.77, + "grad_norm": 2.9342212627586495, + "learning_rate": 1.3397062685760715e-06, + "loss": 0.6195, + "step": 7394 + }, + { + "epoch": 0.77, + "grad_norm": 2.3274325479107087, + "learning_rate": 1.3385596625753494e-06, + "loss": 0.774, + "step": 7395 + }, + { + "epoch": 0.77, + "grad_norm": 2.007477669025387, + "learning_rate": 1.3374134716197602e-06, + "loss": 0.5757, + "step": 7396 + }, + { + "epoch": 0.77, + "grad_norm": 2.1464769668013823, + "learning_rate": 1.336267695839229e-06, + "loss": 0.6546, + "step": 7397 + }, + { + "epoch": 0.77, + "grad_norm": 2.063072317843152, + "learning_rate": 1.3351223353636378e-06, + "loss": 0.6652, + "step": 7398 + }, + { + "epoch": 0.77, + "grad_norm": 2.23065638452988, + "learning_rate": 1.3339773903228182e-06, + "loss": 0.6505, + "step": 7399 + }, + { + "epoch": 0.77, + "grad_norm": 2.0593425716320235, + "learning_rate": 1.3328328608465586e-06, + "loss": 0.6715, + "step": 7400 + }, + { + "epoch": 0.77, + "grad_norm": 1.8612256997436372, + "learning_rate": 1.3316887470645956e-06, + "loss": 0.5658, + "step": 7401 + }, + { + "epoch": 0.77, + "grad_norm": 2.092289554514257, + "learning_rate": 1.3305450491066207e-06, + "loss": 0.5539, + "step": 7402 + }, + { + "epoch": 0.77, + "grad_norm": 2.1098319168701223, + "learning_rate": 1.3294017671022812e-06, + "loss": 0.6631, + "step": 7403 + }, + { + "epoch": 0.77, + "grad_norm": 2.055282905776299, + "learning_rate": 1.328258901181172e-06, + "loss": 0.638, + "step": 7404 + }, + { + "epoch": 0.77, + "grad_norm": 1.8517537301996196, + "learning_rate": 1.3271164514728458e-06, + "loss": 0.617, + "step": 7405 + }, + { + "epoch": 0.77, + "grad_norm": 1.9401490645931452, + "learning_rate": 1.3259744181068041e-06, + "loss": 0.6754, + "step": 7406 + }, + { + "epoch": 0.77, + "grad_norm": 2.0376313058684565, + "learning_rate": 1.3248328012125022e-06, + "loss": 0.5623, + "step": 7407 + }, + { + "epoch": 0.77, + "grad_norm": 2.0528085589228446, + "learning_rate": 1.3236916009193517e-06, + "loss": 0.6334, + "step": 7408 + }, + { + "epoch": 0.77, + "grad_norm": 1.9909521335765008, + "learning_rate": 1.3225508173567125e-06, + "loss": 0.6425, + "step": 7409 + }, + { + "epoch": 0.77, + "grad_norm": 2.075921732696546, + "learning_rate": 1.3214104506538971e-06, + "loss": 0.5893, + "step": 7410 + }, + { + "epoch": 0.77, + "grad_norm": 1.906763513338803, + "learning_rate": 1.320270500940176e-06, + "loss": 0.6562, + "step": 7411 + }, + { + "epoch": 0.77, + "grad_norm": 1.9042795537620933, + "learning_rate": 1.3191309683447662e-06, + "loss": 0.5949, + "step": 7412 + }, + { + "epoch": 0.77, + "grad_norm": 1.876715715793952, + "learning_rate": 1.3179918529968422e-06, + "loss": 0.6021, + "step": 7413 + }, + { + "epoch": 0.77, + "grad_norm": 1.8171198061068894, + "learning_rate": 1.3168531550255275e-06, + "loss": 0.5601, + "step": 7414 + }, + { + "epoch": 0.77, + "grad_norm": 1.8263304397349878, + "learning_rate": 1.3157148745599035e-06, + "loss": 0.6062, + "step": 7415 + }, + { + "epoch": 0.77, + "grad_norm": 2.0517363085919533, + "learning_rate": 1.3145770117289957e-06, + "loss": 0.6128, + "step": 7416 + }, + { + "epoch": 0.77, + "grad_norm": 1.9454726843452275, + "learning_rate": 1.313439566661791e-06, + "loss": 0.5095, + "step": 7417 + }, + { + "epoch": 0.77, + "grad_norm": 1.9864994234352504, + "learning_rate": 1.3123025394872224e-06, + "loss": 0.567, + "step": 7418 + }, + { + "epoch": 0.77, + "grad_norm": 2.0924159820519774, + "learning_rate": 1.3111659303341824e-06, + "loss": 0.627, + "step": 7419 + }, + { + "epoch": 0.77, + "grad_norm": 2.0441005429459826, + "learning_rate": 1.3100297393315077e-06, + "loss": 0.6127, + "step": 7420 + }, + { + "epoch": 0.77, + "grad_norm": 2.011198133677825, + "learning_rate": 1.3088939666079958e-06, + "loss": 0.6564, + "step": 7421 + }, + { + "epoch": 0.77, + "grad_norm": 1.877266216814998, + "learning_rate": 1.3077586122923896e-06, + "loss": 0.565, + "step": 7422 + }, + { + "epoch": 0.77, + "grad_norm": 2.1366794824633017, + "learning_rate": 1.3066236765133933e-06, + "loss": 0.5936, + "step": 7423 + }, + { + "epoch": 0.77, + "grad_norm": 1.9038044812800596, + "learning_rate": 1.3054891593996515e-06, + "loss": 0.6186, + "step": 7424 + }, + { + "epoch": 0.77, + "grad_norm": 2.1095772614580666, + "learning_rate": 1.3043550610797728e-06, + "loss": 0.6388, + "step": 7425 + }, + { + "epoch": 0.77, + "grad_norm": 1.9489744092099393, + "learning_rate": 1.3032213816823113e-06, + "loss": 0.5858, + "step": 7426 + }, + { + "epoch": 0.77, + "grad_norm": 2.0758525689119067, + "learning_rate": 1.3020881213357783e-06, + "loss": 0.6035, + "step": 7427 + }, + { + "epoch": 0.77, + "grad_norm": 2.0709398095894356, + "learning_rate": 1.3009552801686331e-06, + "loss": 0.554, + "step": 7428 + }, + { + "epoch": 0.77, + "grad_norm": 1.866922642817398, + "learning_rate": 1.299822858309292e-06, + "loss": 0.5272, + "step": 7429 + }, + { + "epoch": 0.77, + "grad_norm": 1.8806440265200155, + "learning_rate": 1.29869085588612e-06, + "loss": 0.5618, + "step": 7430 + }, + { + "epoch": 0.77, + "grad_norm": 1.998869396446172, + "learning_rate": 1.2975592730274367e-06, + "loss": 0.6345, + "step": 7431 + }, + { + "epoch": 0.77, + "grad_norm": 1.8384656204172125, + "learning_rate": 1.296428109861511e-06, + "loss": 0.6149, + "step": 7432 + }, + { + "epoch": 0.77, + "grad_norm": 2.233235086672102, + "learning_rate": 1.2952973665165703e-06, + "loss": 0.6092, + "step": 7433 + }, + { + "epoch": 0.77, + "grad_norm": 2.6047023375364677, + "learning_rate": 1.2941670431207882e-06, + "loss": 0.6329, + "step": 7434 + }, + { + "epoch": 0.77, + "grad_norm": 2.0698950270152126, + "learning_rate": 1.293037139802295e-06, + "loss": 0.655, + "step": 7435 + }, + { + "epoch": 0.77, + "grad_norm": 2.403007806832603, + "learning_rate": 1.2919076566891703e-06, + "loss": 0.6415, + "step": 7436 + }, + { + "epoch": 0.77, + "grad_norm": 2.1414097281187785, + "learning_rate": 1.290778593909449e-06, + "loss": 0.6067, + "step": 7437 + }, + { + "epoch": 0.77, + "grad_norm": 2.2802961243526507, + "learning_rate": 1.2896499515911165e-06, + "loss": 0.7176, + "step": 7438 + }, + { + "epoch": 0.77, + "grad_norm": 1.8863817546743002, + "learning_rate": 1.2885217298621084e-06, + "loss": 0.6193, + "step": 7439 + }, + { + "epoch": 0.77, + "grad_norm": 2.044165537334881, + "learning_rate": 1.2873939288503185e-06, + "loss": 0.682, + "step": 7440 + }, + { + "epoch": 0.77, + "grad_norm": 2.115113504513422, + "learning_rate": 1.2862665486835861e-06, + "loss": 0.6313, + "step": 7441 + }, + { + "epoch": 0.77, + "grad_norm": 1.8510730137768605, + "learning_rate": 1.2851395894897101e-06, + "loss": 0.676, + "step": 7442 + }, + { + "epoch": 0.77, + "grad_norm": 2.0138120838329927, + "learning_rate": 1.2840130513964338e-06, + "loss": 0.5752, + "step": 7443 + }, + { + "epoch": 0.77, + "grad_norm": 1.8887083592061393, + "learning_rate": 1.2828869345314599e-06, + "loss": 0.5755, + "step": 7444 + }, + { + "epoch": 0.77, + "grad_norm": 2.0255619098514455, + "learning_rate": 1.2817612390224388e-06, + "loss": 0.6166, + "step": 7445 + }, + { + "epoch": 0.77, + "grad_norm": 1.9683394171262643, + "learning_rate": 1.2806359649969746e-06, + "loss": 0.5901, + "step": 7446 + }, + { + "epoch": 0.77, + "grad_norm": 2.1220195600084066, + "learning_rate": 1.2795111125826221e-06, + "loss": 0.6434, + "step": 7447 + }, + { + "epoch": 0.77, + "grad_norm": 1.8294003393097111, + "learning_rate": 1.2783866819068923e-06, + "loss": 0.6808, + "step": 7448 + }, + { + "epoch": 0.77, + "grad_norm": 1.8189125265698487, + "learning_rate": 1.2772626730972437e-06, + "loss": 0.4841, + "step": 7449 + }, + { + "epoch": 0.77, + "grad_norm": 1.6978368469146112, + "learning_rate": 1.2761390862810907e-06, + "loss": 0.6485, + "step": 7450 + }, + { + "epoch": 0.77, + "grad_norm": 2.2209030633430737, + "learning_rate": 1.2750159215857965e-06, + "loss": 0.6692, + "step": 7451 + }, + { + "epoch": 0.77, + "grad_norm": 1.929125071978997, + "learning_rate": 1.2738931791386827e-06, + "loss": 0.6442, + "step": 7452 + }, + { + "epoch": 0.77, + "grad_norm": 1.799506153570008, + "learning_rate": 1.2727708590670113e-06, + "loss": 0.5547, + "step": 7453 + }, + { + "epoch": 0.77, + "grad_norm": 2.13151128803415, + "learning_rate": 1.2716489614980093e-06, + "loss": 0.6322, + "step": 7454 + }, + { + "epoch": 0.77, + "grad_norm": 2.134629474837925, + "learning_rate": 1.2705274865588475e-06, + "loss": 0.5887, + "step": 7455 + }, + { + "epoch": 0.78, + "grad_norm": 1.858068512251421, + "learning_rate": 1.2694064343766532e-06, + "loss": 0.5537, + "step": 7456 + }, + { + "epoch": 0.78, + "grad_norm": 1.8715567367821404, + "learning_rate": 1.2682858050785018e-06, + "loss": 0.6063, + "step": 7457 + }, + { + "epoch": 0.78, + "grad_norm": 2.198960334446777, + "learning_rate": 1.2671655987914261e-06, + "loss": 0.5857, + "step": 7458 + }, + { + "epoch": 0.78, + "grad_norm": 2.1149071876653958, + "learning_rate": 1.266045815642405e-06, + "loss": 0.5134, + "step": 7459 + }, + { + "epoch": 0.78, + "grad_norm": 2.3036410022633413, + "learning_rate": 1.2649264557583758e-06, + "loss": 0.7023, + "step": 7460 + }, + { + "epoch": 0.78, + "grad_norm": 1.725475153183577, + "learning_rate": 1.2638075192662196e-06, + "loss": 0.5848, + "step": 7461 + }, + { + "epoch": 0.78, + "grad_norm": 1.7445337006329626, + "learning_rate": 1.2626890062927781e-06, + "loss": 0.5705, + "step": 7462 + }, + { + "epoch": 0.78, + "grad_norm": 1.9445544003943256, + "learning_rate": 1.2615709169648382e-06, + "loss": 0.5653, + "step": 7463 + }, + { + "epoch": 0.78, + "grad_norm": 1.86634440943093, + "learning_rate": 1.2604532514091444e-06, + "loss": 0.6699, + "step": 7464 + }, + { + "epoch": 0.78, + "grad_norm": 1.884500167863542, + "learning_rate": 1.2593360097523883e-06, + "loss": 0.5513, + "step": 7465 + }, + { + "epoch": 0.78, + "grad_norm": 2.0358534462218953, + "learning_rate": 1.2582191921212172e-06, + "loss": 0.6638, + "step": 7466 + }, + { + "epoch": 0.78, + "grad_norm": 1.9210922659682166, + "learning_rate": 1.257102798642229e-06, + "loss": 0.5599, + "step": 7467 + }, + { + "epoch": 0.78, + "grad_norm": 2.0339311660440456, + "learning_rate": 1.2559868294419702e-06, + "loss": 0.6709, + "step": 7468 + }, + { + "epoch": 0.78, + "grad_norm": 1.9172549578337978, + "learning_rate": 1.2548712846469469e-06, + "loss": 0.5646, + "step": 7469 + }, + { + "epoch": 0.78, + "grad_norm": 1.9444512462886203, + "learning_rate": 1.2537561643836087e-06, + "loss": 0.6041, + "step": 7470 + }, + { + "epoch": 0.78, + "grad_norm": 2.0064010781768653, + "learning_rate": 1.2526414687783616e-06, + "loss": 0.6542, + "step": 7471 + }, + { + "epoch": 0.78, + "grad_norm": 1.9136862206054561, + "learning_rate": 1.2515271979575645e-06, + "loss": 0.6535, + "step": 7472 + }, + { + "epoch": 0.78, + "grad_norm": 1.791221271933324, + "learning_rate": 1.2504133520475237e-06, + "loss": 0.615, + "step": 7473 + }, + { + "epoch": 0.78, + "grad_norm": 2.0331463728088317, + "learning_rate": 1.249299931174503e-06, + "loss": 0.7014, + "step": 7474 + }, + { + "epoch": 0.78, + "grad_norm": 2.092796442100189, + "learning_rate": 1.248186935464713e-06, + "loss": 0.6129, + "step": 7475 + }, + { + "epoch": 0.78, + "grad_norm": 2.0088765568010576, + "learning_rate": 1.2470743650443167e-06, + "loss": 0.6894, + "step": 7476 + }, + { + "epoch": 0.78, + "grad_norm": 2.1889027363813027, + "learning_rate": 1.2459622200394344e-06, + "loss": 0.6256, + "step": 7477 + }, + { + "epoch": 0.78, + "grad_norm": 1.9459517215941233, + "learning_rate": 1.2448505005761297e-06, + "loss": 0.6502, + "step": 7478 + }, + { + "epoch": 0.78, + "grad_norm": 1.958059668778648, + "learning_rate": 1.243739206780426e-06, + "loss": 0.5418, + "step": 7479 + }, + { + "epoch": 0.78, + "grad_norm": 2.0181150473837106, + "learning_rate": 1.2426283387782916e-06, + "loss": 0.686, + "step": 7480 + }, + { + "epoch": 0.78, + "grad_norm": 1.8056526635316388, + "learning_rate": 1.2415178966956531e-06, + "loss": 0.6975, + "step": 7481 + }, + { + "epoch": 0.78, + "grad_norm": 1.9223777564267408, + "learning_rate": 1.2404078806583835e-06, + "loss": 0.578, + "step": 7482 + }, + { + "epoch": 0.78, + "grad_norm": 1.87336105244118, + "learning_rate": 1.2392982907923096e-06, + "loss": 0.5269, + "step": 7483 + }, + { + "epoch": 0.78, + "grad_norm": 1.8268304700340083, + "learning_rate": 1.2381891272232083e-06, + "loss": 0.6357, + "step": 7484 + }, + { + "epoch": 0.78, + "grad_norm": 1.8174515571326288, + "learning_rate": 1.237080390076812e-06, + "loss": 0.5843, + "step": 7485 + }, + { + "epoch": 0.78, + "grad_norm": 1.9327780722397199, + "learning_rate": 1.2359720794788006e-06, + "loss": 0.6239, + "step": 7486 + }, + { + "epoch": 0.78, + "grad_norm": 2.0991839672424932, + "learning_rate": 1.2348641955548096e-06, + "loss": 0.6605, + "step": 7487 + }, + { + "epoch": 0.78, + "grad_norm": 1.8536105464415018, + "learning_rate": 1.2337567384304206e-06, + "loss": 0.6537, + "step": 7488 + }, + { + "epoch": 0.78, + "grad_norm": 2.024081612397799, + "learning_rate": 1.2326497082311756e-06, + "loss": 0.6295, + "step": 7489 + }, + { + "epoch": 0.78, + "grad_norm": 1.9662894712769203, + "learning_rate": 1.231543105082556e-06, + "loss": 0.7392, + "step": 7490 + }, + { + "epoch": 0.78, + "grad_norm": 2.0201598746186793, + "learning_rate": 1.230436929110007e-06, + "loss": 0.6617, + "step": 7491 + }, + { + "epoch": 0.78, + "grad_norm": 1.935461672788772, + "learning_rate": 1.2293311804389162e-06, + "loss": 0.6007, + "step": 7492 + }, + { + "epoch": 0.78, + "grad_norm": 1.7867631971998374, + "learning_rate": 1.2282258591946294e-06, + "loss": 0.5533, + "step": 7493 + }, + { + "epoch": 0.78, + "grad_norm": 1.9480879837331495, + "learning_rate": 1.2271209655024386e-06, + "loss": 0.5748, + "step": 7494 + }, + { + "epoch": 0.78, + "grad_norm": 1.8358355416673764, + "learning_rate": 1.2260164994875922e-06, + "loss": 0.5568, + "step": 7495 + }, + { + "epoch": 0.78, + "grad_norm": 1.916660508469549, + "learning_rate": 1.224912461275287e-06, + "loss": 0.5668, + "step": 7496 + }, + { + "epoch": 0.78, + "grad_norm": 2.1041973386188726, + "learning_rate": 1.2238088509906715e-06, + "loss": 0.6088, + "step": 7497 + }, + { + "epoch": 0.78, + "grad_norm": 1.8336319921741646, + "learning_rate": 1.2227056687588445e-06, + "loss": 0.6178, + "step": 7498 + }, + { + "epoch": 0.78, + "grad_norm": 1.9408959767507967, + "learning_rate": 1.221602914704862e-06, + "loss": 0.6562, + "step": 7499 + }, + { + "epoch": 0.78, + "grad_norm": 1.982054454174401, + "learning_rate": 1.2205005889537231e-06, + "loss": 0.6758, + "step": 7500 + }, + { + "epoch": 0.78, + "grad_norm": 2.242903224347222, + "learning_rate": 1.2193986916303862e-06, + "loss": 0.6497, + "step": 7501 + }, + { + "epoch": 0.78, + "grad_norm": 1.696679047260719, + "learning_rate": 1.2182972228597555e-06, + "loss": 0.6423, + "step": 7502 + }, + { + "epoch": 0.78, + "grad_norm": 2.4159106630224647, + "learning_rate": 1.2171961827666907e-06, + "loss": 0.6576, + "step": 7503 + }, + { + "epoch": 0.78, + "grad_norm": 2.15668495278231, + "learning_rate": 1.2160955714759997e-06, + "loss": 0.5918, + "step": 7504 + }, + { + "epoch": 0.78, + "grad_norm": 2.082528155431938, + "learning_rate": 1.2149953891124423e-06, + "loss": 0.6091, + "step": 7505 + }, + { + "epoch": 0.78, + "grad_norm": 1.9378865879606009, + "learning_rate": 1.2138956358007325e-06, + "loss": 0.6194, + "step": 7506 + }, + { + "epoch": 0.78, + "grad_norm": 1.9807419537639468, + "learning_rate": 1.2127963116655323e-06, + "loss": 0.563, + "step": 7507 + }, + { + "epoch": 0.78, + "grad_norm": 1.9001140039929365, + "learning_rate": 1.2116974168314549e-06, + "loss": 0.5075, + "step": 7508 + }, + { + "epoch": 0.78, + "grad_norm": 1.9044446353892586, + "learning_rate": 1.2105989514230699e-06, + "loss": 0.5632, + "step": 7509 + }, + { + "epoch": 0.78, + "grad_norm": 1.9036438238927869, + "learning_rate": 1.2095009155648908e-06, + "loss": 0.6259, + "step": 7510 + }, + { + "epoch": 0.78, + "grad_norm": 2.0289114282767042, + "learning_rate": 1.2084033093813897e-06, + "loss": 0.6061, + "step": 7511 + }, + { + "epoch": 0.78, + "grad_norm": 1.9769797796713287, + "learning_rate": 1.2073061329969843e-06, + "loss": 0.5597, + "step": 7512 + }, + { + "epoch": 0.78, + "grad_norm": 1.829432862973179, + "learning_rate": 1.2062093865360458e-06, + "loss": 0.5307, + "step": 7513 + }, + { + "epoch": 0.78, + "grad_norm": 1.9966239074516121, + "learning_rate": 1.205113070122898e-06, + "loss": 0.6177, + "step": 7514 + }, + { + "epoch": 0.78, + "grad_norm": 2.0151026946951824, + "learning_rate": 1.2040171838818128e-06, + "loss": 0.5806, + "step": 7515 + }, + { + "epoch": 0.78, + "grad_norm": 1.9245838703900684, + "learning_rate": 1.202921727937017e-06, + "loss": 0.5436, + "step": 7516 + }, + { + "epoch": 0.78, + "grad_norm": 2.3269530223457475, + "learning_rate": 1.201826702412685e-06, + "loss": 0.6014, + "step": 7517 + }, + { + "epoch": 0.78, + "grad_norm": 2.0196480903460583, + "learning_rate": 1.2007321074329464e-06, + "loss": 0.6542, + "step": 7518 + }, + { + "epoch": 0.78, + "grad_norm": 2.0366225641189617, + "learning_rate": 1.1996379431218792e-06, + "loss": 0.5832, + "step": 7519 + }, + { + "epoch": 0.78, + "grad_norm": 1.8691559040156394, + "learning_rate": 1.1985442096035116e-06, + "loss": 0.5792, + "step": 7520 + }, + { + "epoch": 0.78, + "grad_norm": 1.97785016962828, + "learning_rate": 1.1974509070018242e-06, + "loss": 0.6102, + "step": 7521 + }, + { + "epoch": 0.78, + "grad_norm": 1.909391688954166, + "learning_rate": 1.1963580354407523e-06, + "loss": 0.5965, + "step": 7522 + }, + { + "epoch": 0.78, + "grad_norm": 1.8397548132442958, + "learning_rate": 1.195265595044175e-06, + "loss": 0.5612, + "step": 7523 + }, + { + "epoch": 0.78, + "grad_norm": 1.8519815774481667, + "learning_rate": 1.1941735859359305e-06, + "loss": 0.6289, + "step": 7524 + }, + { + "epoch": 0.78, + "grad_norm": 1.9325455805100002, + "learning_rate": 1.193082008239801e-06, + "loss": 0.7232, + "step": 7525 + }, + { + "epoch": 0.78, + "grad_norm": 1.7899558628801764, + "learning_rate": 1.1919908620795274e-06, + "loss": 0.6425, + "step": 7526 + }, + { + "epoch": 0.78, + "grad_norm": 1.7674583673545878, + "learning_rate": 1.1909001475787917e-06, + "loss": 0.6329, + "step": 7527 + }, + { + "epoch": 0.78, + "grad_norm": 1.9014708835524068, + "learning_rate": 1.189809864861237e-06, + "loss": 0.5598, + "step": 7528 + }, + { + "epoch": 0.78, + "grad_norm": 1.9850467848833522, + "learning_rate": 1.1887200140504496e-06, + "loss": 0.6087, + "step": 7529 + }, + { + "epoch": 0.78, + "grad_norm": 1.8719486639364562, + "learning_rate": 1.187630595269974e-06, + "loss": 0.6111, + "step": 7530 + }, + { + "epoch": 0.78, + "grad_norm": 1.8776002051132321, + "learning_rate": 1.186541608643299e-06, + "loss": 0.6164, + "step": 7531 + }, + { + "epoch": 0.78, + "grad_norm": 2.10496286194707, + "learning_rate": 1.1854530542938697e-06, + "loss": 0.6653, + "step": 7532 + }, + { + "epoch": 0.78, + "grad_norm": 1.792696936223161, + "learning_rate": 1.184364932345079e-06, + "loss": 0.6264, + "step": 7533 + }, + { + "epoch": 0.78, + "grad_norm": 2.062502687433275, + "learning_rate": 1.1832772429202716e-06, + "loss": 0.6956, + "step": 7534 + }, + { + "epoch": 0.78, + "grad_norm": 2.051811267436275, + "learning_rate": 1.1821899861427415e-06, + "loss": 0.6218, + "step": 7535 + }, + { + "epoch": 0.78, + "grad_norm": 1.8590354079413396, + "learning_rate": 1.1811031621357388e-06, + "loss": 0.5887, + "step": 7536 + }, + { + "epoch": 0.78, + "grad_norm": 1.8694047326150192, + "learning_rate": 1.1800167710224585e-06, + "loss": 0.5381, + "step": 7537 + }, + { + "epoch": 0.78, + "grad_norm": 2.0369985151819887, + "learning_rate": 1.1789308129260518e-06, + "loss": 0.6614, + "step": 7538 + }, + { + "epoch": 0.78, + "grad_norm": 2.024847817291823, + "learning_rate": 1.1778452879696156e-06, + "loss": 0.5751, + "step": 7539 + }, + { + "epoch": 0.78, + "grad_norm": 2.0143477964838166, + "learning_rate": 1.1767601962762025e-06, + "loss": 0.668, + "step": 7540 + }, + { + "epoch": 0.78, + "grad_norm": 2.1439637308422053, + "learning_rate": 1.1756755379688133e-06, + "loss": 0.6613, + "step": 7541 + }, + { + "epoch": 0.78, + "grad_norm": 2.128249539956141, + "learning_rate": 1.1745913131703983e-06, + "loss": 0.594, + "step": 7542 + }, + { + "epoch": 0.78, + "grad_norm": 2.042668732426268, + "learning_rate": 1.1735075220038634e-06, + "loss": 0.6774, + "step": 7543 + }, + { + "epoch": 0.78, + "grad_norm": 1.7655693783203028, + "learning_rate": 1.1724241645920597e-06, + "loss": 0.5097, + "step": 7544 + }, + { + "epoch": 0.78, + "grad_norm": 1.9980257594538535, + "learning_rate": 1.1713412410577947e-06, + "loss": 0.5953, + "step": 7545 + }, + { + "epoch": 0.78, + "grad_norm": 1.9020372671647598, + "learning_rate": 1.1702587515238228e-06, + "loss": 0.6078, + "step": 7546 + }, + { + "epoch": 0.78, + "grad_norm": 1.84747773445059, + "learning_rate": 1.1691766961128486e-06, + "loss": 0.6325, + "step": 7547 + }, + { + "epoch": 0.78, + "grad_norm": 2.1840723467663334, + "learning_rate": 1.1680950749475328e-06, + "loss": 0.4747, + "step": 7548 + }, + { + "epoch": 0.78, + "grad_norm": 2.01935125924819, + "learning_rate": 1.1670138881504811e-06, + "loss": 0.6862, + "step": 7549 + }, + { + "epoch": 0.78, + "grad_norm": 1.790776100057919, + "learning_rate": 1.165933135844251e-06, + "loss": 0.6148, + "step": 7550 + }, + { + "epoch": 0.78, + "grad_norm": 1.9127701595519246, + "learning_rate": 1.1648528181513546e-06, + "loss": 0.6695, + "step": 7551 + }, + { + "epoch": 0.79, + "grad_norm": 1.8752750795920698, + "learning_rate": 1.1637729351942496e-06, + "loss": 0.6208, + "step": 7552 + }, + { + "epoch": 0.79, + "grad_norm": 2.0838979472639543, + "learning_rate": 1.16269348709535e-06, + "loss": 0.6042, + "step": 7553 + }, + { + "epoch": 0.79, + "grad_norm": 1.870248031496323, + "learning_rate": 1.1616144739770134e-06, + "loss": 0.5422, + "step": 7554 + }, + { + "epoch": 0.79, + "grad_norm": 1.985975392199706, + "learning_rate": 1.1605358959615559e-06, + "loss": 0.6439, + "step": 7555 + }, + { + "epoch": 0.79, + "grad_norm": 2.142807668782956, + "learning_rate": 1.1594577531712392e-06, + "loss": 0.638, + "step": 7556 + }, + { + "epoch": 0.79, + "grad_norm": 1.8963683455946494, + "learning_rate": 1.1583800457282763e-06, + "loss": 0.641, + "step": 7557 + }, + { + "epoch": 0.79, + "grad_norm": 1.8549487285628472, + "learning_rate": 1.1573027737548304e-06, + "loss": 0.5158, + "step": 7558 + }, + { + "epoch": 0.79, + "grad_norm": 1.7469422801211996, + "learning_rate": 1.156225937373019e-06, + "loss": 0.5104, + "step": 7559 + }, + { + "epoch": 0.79, + "grad_norm": 2.0926111621878314, + "learning_rate": 1.1551495367049047e-06, + "loss": 0.7087, + "step": 7560 + }, + { + "epoch": 0.79, + "grad_norm": 2.126301992165716, + "learning_rate": 1.154073571872507e-06, + "loss": 0.6058, + "step": 7561 + }, + { + "epoch": 0.79, + "grad_norm": 2.012168076568665, + "learning_rate": 1.1529980429977899e-06, + "loss": 0.6645, + "step": 7562 + }, + { + "epoch": 0.79, + "grad_norm": 1.7738548550605582, + "learning_rate": 1.151922950202674e-06, + "loss": 0.6511, + "step": 7563 + }, + { + "epoch": 0.79, + "grad_norm": 1.8809818116839208, + "learning_rate": 1.1508482936090226e-06, + "loss": 0.6599, + "step": 7564 + }, + { + "epoch": 0.79, + "grad_norm": 1.9829487911729462, + "learning_rate": 1.149774073338658e-06, + "loss": 0.6738, + "step": 7565 + }, + { + "epoch": 0.79, + "grad_norm": 2.1298374139158023, + "learning_rate": 1.1487002895133458e-06, + "loss": 0.6807, + "step": 7566 + }, + { + "epoch": 0.79, + "grad_norm": 1.7396917186181862, + "learning_rate": 1.1476269422548097e-06, + "loss": 0.5649, + "step": 7567 + }, + { + "epoch": 0.79, + "grad_norm": 2.163951307025049, + "learning_rate": 1.1465540316847158e-06, + "loss": 0.6927, + "step": 7568 + }, + { + "epoch": 0.79, + "grad_norm": 1.7608713500076538, + "learning_rate": 1.1454815579246874e-06, + "loss": 0.5432, + "step": 7569 + }, + { + "epoch": 0.79, + "grad_norm": 1.701479430283739, + "learning_rate": 1.1444095210962946e-06, + "loss": 0.5278, + "step": 7570 + }, + { + "epoch": 0.79, + "grad_norm": 1.9439676814040012, + "learning_rate": 1.1433379213210589e-06, + "loss": 0.6163, + "step": 7571 + }, + { + "epoch": 0.79, + "grad_norm": 2.041557258404316, + "learning_rate": 1.14226675872045e-06, + "loss": 0.5878, + "step": 7572 + }, + { + "epoch": 0.79, + "grad_norm": 1.8275477847578068, + "learning_rate": 1.1411960334158945e-06, + "loss": 0.5765, + "step": 7573 + }, + { + "epoch": 0.79, + "grad_norm": 1.9882523933445255, + "learning_rate": 1.1401257455287612e-06, + "loss": 0.6118, + "step": 7574 + }, + { + "epoch": 0.79, + "grad_norm": 2.1243807472913923, + "learning_rate": 1.1390558951803765e-06, + "loss": 0.5873, + "step": 7575 + }, + { + "epoch": 0.79, + "grad_norm": 2.036793715513729, + "learning_rate": 1.1379864824920116e-06, + "loss": 0.6901, + "step": 7576 + }, + { + "epoch": 0.79, + "grad_norm": 2.0003545707561745, + "learning_rate": 1.1369175075848931e-06, + "loss": 0.6905, + "step": 7577 + }, + { + "epoch": 0.79, + "grad_norm": 1.8163317896843112, + "learning_rate": 1.135848970580194e-06, + "loss": 0.5174, + "step": 7578 + }, + { + "epoch": 0.79, + "grad_norm": 1.9890937817343028, + "learning_rate": 1.1347808715990377e-06, + "loss": 0.6099, + "step": 7579 + }, + { + "epoch": 0.79, + "grad_norm": 2.0858479558113454, + "learning_rate": 1.1337132107625015e-06, + "loss": 0.5826, + "step": 7580 + }, + { + "epoch": 0.79, + "grad_norm": 1.9155881518121252, + "learning_rate": 1.1326459881916091e-06, + "loss": 0.6876, + "step": 7581 + }, + { + "epoch": 0.79, + "grad_norm": 1.9989956883343196, + "learning_rate": 1.1315792040073381e-06, + "loss": 0.6681, + "step": 7582 + }, + { + "epoch": 0.79, + "grad_norm": 1.9550742344176189, + "learning_rate": 1.1305128583306125e-06, + "loss": 0.6161, + "step": 7583 + }, + { + "epoch": 0.79, + "grad_norm": 2.0103579349657963, + "learning_rate": 1.1294469512823109e-06, + "loss": 0.6299, + "step": 7584 + }, + { + "epoch": 0.79, + "grad_norm": 1.9054575542751568, + "learning_rate": 1.128381482983259e-06, + "loss": 0.6079, + "step": 7585 + }, + { + "epoch": 0.79, + "grad_norm": 2.1391755475909, + "learning_rate": 1.1273164535542336e-06, + "loss": 0.6313, + "step": 7586 + }, + { + "epoch": 0.79, + "grad_norm": 1.981591108859556, + "learning_rate": 1.1262518631159602e-06, + "loss": 0.5913, + "step": 7587 + }, + { + "epoch": 0.79, + "grad_norm": 2.1379326081176777, + "learning_rate": 1.125187711789119e-06, + "loss": 0.6499, + "step": 7588 + }, + { + "epoch": 0.79, + "grad_norm": 1.982491035473915, + "learning_rate": 1.1241239996943348e-06, + "loss": 0.6315, + "step": 7589 + }, + { + "epoch": 0.79, + "grad_norm": 2.2170410264953224, + "learning_rate": 1.1230607269521886e-06, + "loss": 0.6647, + "step": 7590 + }, + { + "epoch": 0.79, + "grad_norm": 2.1017656060939376, + "learning_rate": 1.1219978936832054e-06, + "loss": 0.6512, + "step": 7591 + }, + { + "epoch": 0.79, + "grad_norm": 1.9103803015382639, + "learning_rate": 1.1209355000078664e-06, + "loss": 0.5763, + "step": 7592 + }, + { + "epoch": 0.79, + "grad_norm": 1.8881668957666067, + "learning_rate": 1.1198735460465987e-06, + "loss": 0.5324, + "step": 7593 + }, + { + "epoch": 0.79, + "grad_norm": 2.1092691410374727, + "learning_rate": 1.1188120319197798e-06, + "loss": 0.7296, + "step": 7594 + }, + { + "epoch": 0.79, + "grad_norm": 1.7952823369616315, + "learning_rate": 1.117750957747738e-06, + "loss": 0.5827, + "step": 7595 + }, + { + "epoch": 0.79, + "grad_norm": 1.9802795486580884, + "learning_rate": 1.1166903236507549e-06, + "loss": 0.6069, + "step": 7596 + }, + { + "epoch": 0.79, + "grad_norm": 1.9392038562759872, + "learning_rate": 1.1156301297490563e-06, + "loss": 0.6014, + "step": 7597 + }, + { + "epoch": 0.79, + "grad_norm": 2.0103638691186676, + "learning_rate": 1.1145703761628234e-06, + "loss": 0.6243, + "step": 7598 + }, + { + "epoch": 0.79, + "grad_norm": 1.854549954527706, + "learning_rate": 1.1135110630121837e-06, + "loss": 0.5502, + "step": 7599 + }, + { + "epoch": 0.79, + "grad_norm": 2.021354607363837, + "learning_rate": 1.1124521904172202e-06, + "loss": 0.5726, + "step": 7600 + }, + { + "epoch": 0.79, + "grad_norm": 1.9661146192099226, + "learning_rate": 1.1113937584979561e-06, + "loss": 0.6839, + "step": 7601 + }, + { + "epoch": 0.79, + "grad_norm": 1.7591677360991955, + "learning_rate": 1.1103357673743752e-06, + "loss": 0.6131, + "step": 7602 + }, + { + "epoch": 0.79, + "grad_norm": 1.9398835866330275, + "learning_rate": 1.109278217166404e-06, + "loss": 0.6187, + "step": 7603 + }, + { + "epoch": 0.79, + "grad_norm": 1.9393774468410334, + "learning_rate": 1.1082211079939248e-06, + "loss": 0.5964, + "step": 7604 + }, + { + "epoch": 0.79, + "grad_norm": 2.067156283501053, + "learning_rate": 1.107164439976764e-06, + "loss": 0.7105, + "step": 7605 + }, + { + "epoch": 0.79, + "grad_norm": 1.9167409651719103, + "learning_rate": 1.106108213234704e-06, + "loss": 0.656, + "step": 7606 + }, + { + "epoch": 0.79, + "grad_norm": 2.1560390878979914, + "learning_rate": 1.105052427887472e-06, + "loss": 0.6422, + "step": 7607 + }, + { + "epoch": 0.79, + "grad_norm": 2.118078967090903, + "learning_rate": 1.1039970840547464e-06, + "loss": 0.6541, + "step": 7608 + }, + { + "epoch": 0.79, + "grad_norm": 1.8508371466187719, + "learning_rate": 1.1029421818561592e-06, + "loss": 0.4905, + "step": 7609 + }, + { + "epoch": 0.79, + "grad_norm": 1.9440624024422692, + "learning_rate": 1.1018877214112883e-06, + "loss": 0.6826, + "step": 7610 + }, + { + "epoch": 0.79, + "grad_norm": 1.849950642377605, + "learning_rate": 1.1008337028396616e-06, + "loss": 0.5475, + "step": 7611 + }, + { + "epoch": 0.79, + "grad_norm": 1.9160204717719853, + "learning_rate": 1.0997801262607599e-06, + "loss": 0.6415, + "step": 7612 + }, + { + "epoch": 0.79, + "grad_norm": 1.9539927844434095, + "learning_rate": 1.0987269917940107e-06, + "loss": 0.6261, + "step": 7613 + }, + { + "epoch": 0.79, + "grad_norm": 2.1191670091655235, + "learning_rate": 1.0976742995587941e-06, + "loss": 0.6103, + "step": 7614 + }, + { + "epoch": 0.79, + "grad_norm": 2.2618502739095514, + "learning_rate": 1.096622049674439e-06, + "loss": 0.5837, + "step": 7615 + }, + { + "epoch": 0.79, + "grad_norm": 1.8741698346992601, + "learning_rate": 1.095570242260221e-06, + "loss": 0.6731, + "step": 7616 + }, + { + "epoch": 0.79, + "grad_norm": 1.9071579986938094, + "learning_rate": 1.094518877435372e-06, + "loss": 0.5906, + "step": 7617 + }, + { + "epoch": 0.79, + "grad_norm": 1.8615446372298599, + "learning_rate": 1.093467955319068e-06, + "loss": 0.5217, + "step": 7618 + }, + { + "epoch": 0.79, + "grad_norm": 1.9575245781215822, + "learning_rate": 1.0924174760304385e-06, + "loss": 0.5396, + "step": 7619 + }, + { + "epoch": 0.79, + "grad_norm": 1.8968632493253208, + "learning_rate": 1.0913674396885598e-06, + "loss": 0.5562, + "step": 7620 + }, + { + "epoch": 0.79, + "grad_norm": 1.9487478206224738, + "learning_rate": 1.090317846412461e-06, + "loss": 0.6184, + "step": 7621 + }, + { + "epoch": 0.79, + "grad_norm": 2.0424190189706133, + "learning_rate": 1.0892686963211191e-06, + "loss": 0.6212, + "step": 7622 + }, + { + "epoch": 0.79, + "grad_norm": 2.174634093253347, + "learning_rate": 1.0882199895334605e-06, + "loss": 0.641, + "step": 7623 + }, + { + "epoch": 0.79, + "grad_norm": 1.8967766592892383, + "learning_rate": 1.0871717261683619e-06, + "loss": 0.5959, + "step": 7624 + }, + { + "epoch": 0.79, + "grad_norm": 2.0868235703345177, + "learning_rate": 1.0861239063446511e-06, + "loss": 0.6228, + "step": 7625 + }, + { + "epoch": 0.79, + "grad_norm": 1.8227671520819473, + "learning_rate": 1.0850765301811028e-06, + "loss": 0.5928, + "step": 7626 + }, + { + "epoch": 0.79, + "grad_norm": 1.9844970398015163, + "learning_rate": 1.0840295977964454e-06, + "loss": 0.7122, + "step": 7627 + }, + { + "epoch": 0.79, + "grad_norm": 1.8711686783888133, + "learning_rate": 1.0829831093093524e-06, + "loss": 0.5506, + "step": 7628 + }, + { + "epoch": 0.79, + "grad_norm": 2.045619614830849, + "learning_rate": 1.0819370648384525e-06, + "loss": 0.7085, + "step": 7629 + }, + { + "epoch": 0.79, + "grad_norm": 1.7682681907229623, + "learning_rate": 1.080891464502316e-06, + "loss": 0.6398, + "step": 7630 + }, + { + "epoch": 0.79, + "grad_norm": 1.7360472338932105, + "learning_rate": 1.0798463084194715e-06, + "loss": 0.5867, + "step": 7631 + }, + { + "epoch": 0.79, + "grad_norm": 2.167967567642517, + "learning_rate": 1.0788015967083904e-06, + "loss": 0.693, + "step": 7632 + }, + { + "epoch": 0.79, + "grad_norm": 1.8159747786768616, + "learning_rate": 1.0777573294875005e-06, + "loss": 0.5595, + "step": 7633 + }, + { + "epoch": 0.79, + "grad_norm": 1.8337688164202244, + "learning_rate": 1.076713506875171e-06, + "loss": 0.6175, + "step": 7634 + }, + { + "epoch": 0.79, + "grad_norm": 1.7006940068561807, + "learning_rate": 1.0756701289897298e-06, + "loss": 0.5631, + "step": 7635 + }, + { + "epoch": 0.79, + "grad_norm": 1.8017376813538655, + "learning_rate": 1.0746271959494453e-06, + "loss": 0.5882, + "step": 7636 + }, + { + "epoch": 0.79, + "grad_norm": 1.789860603798531, + "learning_rate": 1.0735847078725452e-06, + "loss": 0.6337, + "step": 7637 + }, + { + "epoch": 0.79, + "grad_norm": 1.973652062162871, + "learning_rate": 1.0725426648771952e-06, + "loss": 0.6533, + "step": 7638 + }, + { + "epoch": 0.79, + "grad_norm": 1.978809763923317, + "learning_rate": 1.0715010670815212e-06, + "loss": 0.5568, + "step": 7639 + }, + { + "epoch": 0.79, + "grad_norm": 1.7844382460604202, + "learning_rate": 1.070459914603592e-06, + "loss": 0.5602, + "step": 7640 + }, + { + "epoch": 0.79, + "grad_norm": 1.8403111784857593, + "learning_rate": 1.0694192075614302e-06, + "loss": 0.6194, + "step": 7641 + }, + { + "epoch": 0.79, + "grad_norm": 1.7334240782019665, + "learning_rate": 1.0683789460730037e-06, + "loss": 0.5426, + "step": 7642 + }, + { + "epoch": 0.79, + "grad_norm": 1.685316731739376, + "learning_rate": 1.0673391302562342e-06, + "loss": 0.4805, + "step": 7643 + }, + { + "epoch": 0.79, + "grad_norm": 1.869559308757824, + "learning_rate": 1.0662997602289899e-06, + "loss": 0.7268, + "step": 7644 + }, + { + "epoch": 0.79, + "grad_norm": 2.16910751822174, + "learning_rate": 1.0652608361090877e-06, + "loss": 0.6145, + "step": 7645 + }, + { + "epoch": 0.79, + "grad_norm": 1.923836692448067, + "learning_rate": 1.0642223580142985e-06, + "loss": 0.5563, + "step": 7646 + }, + { + "epoch": 0.79, + "grad_norm": 1.9053312003550178, + "learning_rate": 1.0631843260623382e-06, + "loss": 0.5602, + "step": 7647 + }, + { + "epoch": 0.8, + "grad_norm": 1.9624951568475208, + "learning_rate": 1.0621467403708718e-06, + "loss": 0.5808, + "step": 7648 + }, + { + "epoch": 0.8, + "grad_norm": 1.887605733437194, + "learning_rate": 1.0611096010575196e-06, + "loss": 0.632, + "step": 7649 + }, + { + "epoch": 0.8, + "grad_norm": 1.9203647513784456, + "learning_rate": 1.0600729082398425e-06, + "loss": 0.6598, + "step": 7650 + }, + { + "epoch": 0.8, + "grad_norm": 1.8767373807747643, + "learning_rate": 1.0590366620353604e-06, + "loss": 0.598, + "step": 7651 + }, + { + "epoch": 0.8, + "grad_norm": 1.8056123438665657, + "learning_rate": 1.058000862561535e-06, + "loss": 0.5871, + "step": 7652 + }, + { + "epoch": 0.8, + "grad_norm": 2.125561482859971, + "learning_rate": 1.0569655099357795e-06, + "loss": 0.6351, + "step": 7653 + }, + { + "epoch": 0.8, + "grad_norm": 2.0398054017931213, + "learning_rate": 1.0559306042754591e-06, + "loss": 0.6699, + "step": 7654 + }, + { + "epoch": 0.8, + "grad_norm": 2.0576078182108186, + "learning_rate": 1.0548961456978835e-06, + "loss": 0.6127, + "step": 7655 + }, + { + "epoch": 0.8, + "grad_norm": 1.9026327912259209, + "learning_rate": 1.0538621343203176e-06, + "loss": 0.5675, + "step": 7656 + }, + { + "epoch": 0.8, + "grad_norm": 2.0639922430174535, + "learning_rate": 1.05282857025997e-06, + "loss": 0.6958, + "step": 7657 + }, + { + "epoch": 0.8, + "grad_norm": 2.112511747733857, + "learning_rate": 1.051795453634003e-06, + "loss": 0.6881, + "step": 7658 + }, + { + "epoch": 0.8, + "grad_norm": 2.0806728442724602, + "learning_rate": 1.0507627845595259e-06, + "loss": 0.6517, + "step": 7659 + }, + { + "epoch": 0.8, + "grad_norm": 1.8655768403232256, + "learning_rate": 1.049730563153597e-06, + "loss": 0.5419, + "step": 7660 + }, + { + "epoch": 0.8, + "grad_norm": 2.107389670056977, + "learning_rate": 1.0486987895332229e-06, + "loss": 0.6122, + "step": 7661 + }, + { + "epoch": 0.8, + "grad_norm": 2.001585005845773, + "learning_rate": 1.0476674638153638e-06, + "loss": 0.6874, + "step": 7662 + }, + { + "epoch": 0.8, + "grad_norm": 1.8022025145340081, + "learning_rate": 1.0466365861169242e-06, + "loss": 0.5743, + "step": 7663 + }, + { + "epoch": 0.8, + "grad_norm": 2.25061615492626, + "learning_rate": 1.045606156554762e-06, + "loss": 0.6585, + "step": 7664 + }, + { + "epoch": 0.8, + "grad_norm": 1.8060804364597933, + "learning_rate": 1.0445761752456806e-06, + "loss": 0.5762, + "step": 7665 + }, + { + "epoch": 0.8, + "grad_norm": 1.9562870642355572, + "learning_rate": 1.0435466423064373e-06, + "loss": 0.5267, + "step": 7666 + }, + { + "epoch": 0.8, + "grad_norm": 1.8288657824591275, + "learning_rate": 1.04251755785373e-06, + "loss": 0.5505, + "step": 7667 + }, + { + "epoch": 0.8, + "grad_norm": 1.9565933117918024, + "learning_rate": 1.0414889220042163e-06, + "loss": 0.6889, + "step": 7668 + }, + { + "epoch": 0.8, + "grad_norm": 2.0077080060650956, + "learning_rate": 1.0404607348744943e-06, + "loss": 0.6004, + "step": 7669 + }, + { + "epoch": 0.8, + "grad_norm": 1.8100825225512873, + "learning_rate": 1.0394329965811178e-06, + "loss": 0.5825, + "step": 7670 + }, + { + "epoch": 0.8, + "grad_norm": 1.9809824856030736, + "learning_rate": 1.038405707240585e-06, + "loss": 0.5185, + "step": 7671 + }, + { + "epoch": 0.8, + "grad_norm": 1.9217501283416114, + "learning_rate": 1.0373788669693464e-06, + "loss": 0.5785, + "step": 7672 + }, + { + "epoch": 0.8, + "grad_norm": 2.056980482387888, + "learning_rate": 1.0363524758837984e-06, + "loss": 0.7201, + "step": 7673 + }, + { + "epoch": 0.8, + "grad_norm": 2.0266442614164544, + "learning_rate": 1.0353265341002916e-06, + "loss": 0.5002, + "step": 7674 + }, + { + "epoch": 0.8, + "grad_norm": 2.0009831878190654, + "learning_rate": 1.034301041735118e-06, + "loss": 0.5331, + "step": 7675 + }, + { + "epoch": 0.8, + "grad_norm": 2.095824252013779, + "learning_rate": 1.0332759989045254e-06, + "loss": 0.6838, + "step": 7676 + }, + { + "epoch": 0.8, + "grad_norm": 1.7209335276516577, + "learning_rate": 1.0322514057247075e-06, + "loss": 0.6151, + "step": 7677 + }, + { + "epoch": 0.8, + "grad_norm": 1.9503500468360193, + "learning_rate": 1.031227262311809e-06, + "loss": 0.6328, + "step": 7678 + }, + { + "epoch": 0.8, + "grad_norm": 1.9867515395161024, + "learning_rate": 1.0302035687819202e-06, + "loss": 0.6623, + "step": 7679 + }, + { + "epoch": 0.8, + "grad_norm": 1.9160205255434155, + "learning_rate": 1.0291803252510857e-06, + "loss": 0.6412, + "step": 7680 + }, + { + "epoch": 0.8, + "grad_norm": 1.9044848440465423, + "learning_rate": 1.0281575318352937e-06, + "loss": 0.5959, + "step": 7681 + }, + { + "epoch": 0.8, + "grad_norm": 1.9017102445371714, + "learning_rate": 1.0271351886504832e-06, + "loss": 0.6255, + "step": 7682 + }, + { + "epoch": 0.8, + "grad_norm": 1.9617797412488593, + "learning_rate": 1.0261132958125452e-06, + "loss": 0.5663, + "step": 7683 + }, + { + "epoch": 0.8, + "grad_norm": 1.8326899544229418, + "learning_rate": 1.025091853437314e-06, + "loss": 0.5486, + "step": 7684 + }, + { + "epoch": 0.8, + "grad_norm": 1.9429118963302403, + "learning_rate": 1.0240708616405788e-06, + "loss": 0.6459, + "step": 7685 + }, + { + "epoch": 0.8, + "grad_norm": 1.9982778451386498, + "learning_rate": 1.0230503205380732e-06, + "loss": 0.6218, + "step": 7686 + }, + { + "epoch": 0.8, + "grad_norm": 2.0367479976721032, + "learning_rate": 1.0220302302454804e-06, + "loss": 0.6262, + "step": 7687 + }, + { + "epoch": 0.8, + "grad_norm": 2.1004913104077994, + "learning_rate": 1.0210105908784362e-06, + "loss": 0.6363, + "step": 7688 + }, + { + "epoch": 0.8, + "grad_norm": 2.196885005810578, + "learning_rate": 1.019991402552521e-06, + "loss": 0.6211, + "step": 7689 + }, + { + "epoch": 0.8, + "grad_norm": 2.0510349626748896, + "learning_rate": 1.0189726653832637e-06, + "loss": 0.5913, + "step": 7690 + }, + { + "epoch": 0.8, + "grad_norm": 2.062389403716988, + "learning_rate": 1.017954379486148e-06, + "loss": 0.6006, + "step": 7691 + }, + { + "epoch": 0.8, + "grad_norm": 2.000958354903393, + "learning_rate": 1.0169365449765982e-06, + "loss": 0.6191, + "step": 7692 + }, + { + "epoch": 0.8, + "grad_norm": 2.1648733827702418, + "learning_rate": 1.0159191619699955e-06, + "loss": 0.6194, + "step": 7693 + }, + { + "epoch": 0.8, + "grad_norm": 1.7312236576166993, + "learning_rate": 1.014902230581663e-06, + "loss": 0.5829, + "step": 7694 + }, + { + "epoch": 0.8, + "grad_norm": 1.9281553571908774, + "learning_rate": 1.0138857509268784e-06, + "loss": 0.6326, + "step": 7695 + }, + { + "epoch": 0.8, + "grad_norm": 1.6412211399878196, + "learning_rate": 1.012869723120864e-06, + "loss": 0.5582, + "step": 7696 + }, + { + "epoch": 0.8, + "grad_norm": 2.005045612911496, + "learning_rate": 1.0118541472787918e-06, + "loss": 0.5659, + "step": 7697 + }, + { + "epoch": 0.8, + "grad_norm": 1.9229217799978204, + "learning_rate": 1.0108390235157828e-06, + "loss": 0.5267, + "step": 7698 + }, + { + "epoch": 0.8, + "grad_norm": 2.000719179387221, + "learning_rate": 1.0098243519469091e-06, + "loss": 0.5253, + "step": 7699 + }, + { + "epoch": 0.8, + "grad_norm": 2.0842973123188866, + "learning_rate": 1.0088101326871873e-06, + "loss": 0.6451, + "step": 7700 + }, + { + "epoch": 0.8, + "grad_norm": 1.6323359685648406, + "learning_rate": 1.0077963658515872e-06, + "loss": 0.531, + "step": 7701 + }, + { + "epoch": 0.8, + "grad_norm": 2.0540576166204314, + "learning_rate": 1.0067830515550224e-06, + "loss": 0.6194, + "step": 7702 + }, + { + "epoch": 0.8, + "grad_norm": 1.8564346985220948, + "learning_rate": 1.0057701899123622e-06, + "loss": 0.7364, + "step": 7703 + }, + { + "epoch": 0.8, + "grad_norm": 1.7651410118055617, + "learning_rate": 1.0047577810384146e-06, + "loss": 0.6406, + "step": 7704 + }, + { + "epoch": 0.8, + "grad_norm": 1.7861292875990769, + "learning_rate": 1.003745825047946e-06, + "loss": 0.5525, + "step": 7705 + }, + { + "epoch": 0.8, + "grad_norm": 1.7860974870586726, + "learning_rate": 1.002734322055664e-06, + "loss": 0.604, + "step": 7706 + }, + { + "epoch": 0.8, + "grad_norm": 1.9245464161068735, + "learning_rate": 1.0017232721762322e-06, + "loss": 0.6887, + "step": 7707 + }, + { + "epoch": 0.8, + "grad_norm": 2.245870208702652, + "learning_rate": 1.0007126755242557e-06, + "loss": 0.5527, + "step": 7708 + }, + { + "epoch": 0.8, + "grad_norm": 1.9829272032721494, + "learning_rate": 9.997025322142934e-07, + "loss": 0.5582, + "step": 7709 + }, + { + "epoch": 0.8, + "grad_norm": 1.9923017209495424, + "learning_rate": 9.986928423608493e-07, + "loss": 0.647, + "step": 7710 + }, + { + "epoch": 0.8, + "grad_norm": 2.027249908466499, + "learning_rate": 9.976836060783806e-07, + "loss": 0.6092, + "step": 7711 + }, + { + "epoch": 0.8, + "grad_norm": 1.8182521544541843, + "learning_rate": 9.966748234812845e-07, + "loss": 0.5714, + "step": 7712 + }, + { + "epoch": 0.8, + "grad_norm": 2.1289844169817194, + "learning_rate": 9.956664946839173e-07, + "loss": 0.5971, + "step": 7713 + }, + { + "epoch": 0.8, + "grad_norm": 2.2228604368044356, + "learning_rate": 9.946586198005754e-07, + "loss": 0.6114, + "step": 7714 + }, + { + "epoch": 0.8, + "grad_norm": 1.736074800532946, + "learning_rate": 9.9365119894551e-07, + "loss": 0.6053, + "step": 7715 + }, + { + "epoch": 0.8, + "grad_norm": 2.0883144500735913, + "learning_rate": 9.92644232232915e-07, + "loss": 0.6102, + "step": 7716 + }, + { + "epoch": 0.8, + "grad_norm": 2.0457525870447477, + "learning_rate": 9.91637719776939e-07, + "loss": 0.7014, + "step": 7717 + }, + { + "epoch": 0.8, + "grad_norm": 1.9544938159774565, + "learning_rate": 9.906316616916745e-07, + "loss": 0.5763, + "step": 7718 + }, + { + "epoch": 0.8, + "grad_norm": 1.7888980454640862, + "learning_rate": 9.89626058091162e-07, + "loss": 0.5796, + "step": 7719 + }, + { + "epoch": 0.8, + "grad_norm": 2.138621337538482, + "learning_rate": 9.886209090893955e-07, + "loss": 0.6714, + "step": 7720 + }, + { + "epoch": 0.8, + "grad_norm": 1.8656774932599596, + "learning_rate": 9.876162148003121e-07, + "loss": 0.5886, + "step": 7721 + }, + { + "epoch": 0.8, + "grad_norm": 2.0270285492211597, + "learning_rate": 9.866119753378018e-07, + "loss": 0.6527, + "step": 7722 + }, + { + "epoch": 0.8, + "grad_norm": 2.0333059598856047, + "learning_rate": 9.856081908156984e-07, + "loss": 0.6542, + "step": 7723 + }, + { + "epoch": 0.8, + "grad_norm": 1.764098539500526, + "learning_rate": 9.846048613477894e-07, + "loss": 0.5076, + "step": 7724 + }, + { + "epoch": 0.8, + "grad_norm": 1.8576540431009838, + "learning_rate": 9.836019870478058e-07, + "loss": 0.5937, + "step": 7725 + }, + { + "epoch": 0.8, + "grad_norm": 1.8921832430445682, + "learning_rate": 9.825995680294298e-07, + "loss": 0.5207, + "step": 7726 + }, + { + "epoch": 0.8, + "grad_norm": 1.8915147722807397, + "learning_rate": 9.815976044062902e-07, + "loss": 0.5271, + "step": 7727 + }, + { + "epoch": 0.8, + "grad_norm": 1.8331960578185176, + "learning_rate": 9.80596096291967e-07, + "loss": 0.6478, + "step": 7728 + }, + { + "epoch": 0.8, + "grad_norm": 1.7613187657217821, + "learning_rate": 9.795950437999852e-07, + "loss": 0.5245, + "step": 7729 + }, + { + "epoch": 0.8, + "grad_norm": 2.0307503138525873, + "learning_rate": 9.785944470438218e-07, + "loss": 0.6444, + "step": 7730 + }, + { + "epoch": 0.8, + "grad_norm": 1.7686848346745405, + "learning_rate": 9.775943061368982e-07, + "loss": 0.5941, + "step": 7731 + }, + { + "epoch": 0.8, + "grad_norm": 2.345642126924032, + "learning_rate": 9.765946211925882e-07, + "loss": 0.6443, + "step": 7732 + }, + { + "epoch": 0.8, + "grad_norm": 1.9272931213689597, + "learning_rate": 9.755953923242102e-07, + "loss": 0.5868, + "step": 7733 + }, + { + "epoch": 0.8, + "grad_norm": 1.9544735196473395, + "learning_rate": 9.74596619645033e-07, + "loss": 0.6951, + "step": 7734 + }, + { + "epoch": 0.8, + "grad_norm": 1.9460700117324157, + "learning_rate": 9.735983032682716e-07, + "loss": 0.6072, + "step": 7735 + }, + { + "epoch": 0.8, + "grad_norm": 2.1090367377616435, + "learning_rate": 9.726004433070935e-07, + "loss": 0.7047, + "step": 7736 + }, + { + "epoch": 0.8, + "grad_norm": 2.1982409577233133, + "learning_rate": 9.716030398746096e-07, + "loss": 0.6184, + "step": 7737 + }, + { + "epoch": 0.8, + "grad_norm": 1.9379260663865387, + "learning_rate": 9.706060930838834e-07, + "loss": 0.6641, + "step": 7738 + }, + { + "epoch": 0.8, + "grad_norm": 2.3158402348464384, + "learning_rate": 9.69609603047922e-07, + "loss": 0.6658, + "step": 7739 + }, + { + "epoch": 0.8, + "grad_norm": 2.202051506082379, + "learning_rate": 9.686135698796866e-07, + "loss": 0.6234, + "step": 7740 + }, + { + "epoch": 0.8, + "grad_norm": 1.946984139913474, + "learning_rate": 9.676179936920793e-07, + "loss": 0.7025, + "step": 7741 + }, + { + "epoch": 0.8, + "grad_norm": 1.9651438174470215, + "learning_rate": 9.666228745979571e-07, + "loss": 0.6672, + "step": 7742 + }, + { + "epoch": 0.8, + "grad_norm": 1.821834253966292, + "learning_rate": 9.656282127101208e-07, + "loss": 0.5473, + "step": 7743 + }, + { + "epoch": 0.8, + "grad_norm": 1.8646490281696781, + "learning_rate": 9.646340081413225e-07, + "loss": 0.6667, + "step": 7744 + }, + { + "epoch": 0.81, + "grad_norm": 1.9663132430420378, + "learning_rate": 9.636402610042589e-07, + "loss": 0.6145, + "step": 7745 + }, + { + "epoch": 0.81, + "grad_norm": 1.8591207036266417, + "learning_rate": 9.6264697141158e-07, + "loss": 0.5758, + "step": 7746 + }, + { + "epoch": 0.81, + "grad_norm": 1.8749424175544835, + "learning_rate": 9.61654139475877e-07, + "loss": 0.5613, + "step": 7747 + }, + { + "epoch": 0.81, + "grad_norm": 1.8187123900362623, + "learning_rate": 9.606617653096967e-07, + "loss": 0.5405, + "step": 7748 + }, + { + "epoch": 0.81, + "grad_norm": 2.0773843186976726, + "learning_rate": 9.59669849025529e-07, + "loss": 0.6071, + "step": 7749 + }, + { + "epoch": 0.81, + "grad_norm": 1.9075752017328036, + "learning_rate": 9.586783907358126e-07, + "loss": 0.5405, + "step": 7750 + }, + { + "epoch": 0.81, + "grad_norm": 1.9368497704701095, + "learning_rate": 9.57687390552935e-07, + "loss": 0.6222, + "step": 7751 + }, + { + "epoch": 0.81, + "grad_norm": 1.894016291586957, + "learning_rate": 9.566968485892324e-07, + "loss": 0.6088, + "step": 7752 + }, + { + "epoch": 0.81, + "grad_norm": 1.8925582435192947, + "learning_rate": 9.557067649569873e-07, + "loss": 0.5613, + "step": 7753 + }, + { + "epoch": 0.81, + "grad_norm": 1.9135127751222611, + "learning_rate": 9.54717139768433e-07, + "loss": 0.6596, + "step": 7754 + }, + { + "epoch": 0.81, + "grad_norm": 2.2074368490586944, + "learning_rate": 9.537279731357485e-07, + "loss": 0.6295, + "step": 7755 + }, + { + "epoch": 0.81, + "grad_norm": 1.9786253685527062, + "learning_rate": 9.527392651710598e-07, + "loss": 0.6196, + "step": 7756 + }, + { + "epoch": 0.81, + "grad_norm": 1.8644737769919695, + "learning_rate": 9.517510159864452e-07, + "loss": 0.5505, + "step": 7757 + }, + { + "epoch": 0.81, + "grad_norm": 2.037734457999557, + "learning_rate": 9.507632256939264e-07, + "loss": 0.5701, + "step": 7758 + }, + { + "epoch": 0.81, + "grad_norm": 1.8913067735074787, + "learning_rate": 9.497758944054769e-07, + "loss": 0.5228, + "step": 7759 + }, + { + "epoch": 0.81, + "grad_norm": 1.961114075237607, + "learning_rate": 9.487890222330137e-07, + "loss": 0.5865, + "step": 7760 + }, + { + "epoch": 0.81, + "grad_norm": 2.1418281069781733, + "learning_rate": 9.478026092884074e-07, + "loss": 0.7004, + "step": 7761 + }, + { + "epoch": 0.81, + "grad_norm": 2.031618259141534, + "learning_rate": 9.468166556834724e-07, + "loss": 0.6048, + "step": 7762 + }, + { + "epoch": 0.81, + "grad_norm": 1.983872334557737, + "learning_rate": 9.458311615299714e-07, + "loss": 0.6104, + "step": 7763 + }, + { + "epoch": 0.81, + "grad_norm": 1.966621625986287, + "learning_rate": 9.448461269396148e-07, + "loss": 0.6129, + "step": 7764 + }, + { + "epoch": 0.81, + "grad_norm": 1.8456010848415982, + "learning_rate": 9.438615520240651e-07, + "loss": 0.6677, + "step": 7765 + }, + { + "epoch": 0.81, + "grad_norm": 1.9795646084529916, + "learning_rate": 9.428774368949262e-07, + "loss": 0.488, + "step": 7766 + }, + { + "epoch": 0.81, + "grad_norm": 1.778709075372669, + "learning_rate": 9.418937816637558e-07, + "loss": 0.5943, + "step": 7767 + }, + { + "epoch": 0.81, + "grad_norm": 2.0339698933276407, + "learning_rate": 9.409105864420548e-07, + "loss": 0.6256, + "step": 7768 + }, + { + "epoch": 0.81, + "grad_norm": 2.1716627541691045, + "learning_rate": 9.399278513412757e-07, + "loss": 0.6345, + "step": 7769 + }, + { + "epoch": 0.81, + "grad_norm": 2.099308376029622, + "learning_rate": 9.389455764728167e-07, + "loss": 0.5985, + "step": 7770 + }, + { + "epoch": 0.81, + "grad_norm": 1.952829993448853, + "learning_rate": 9.379637619480236e-07, + "loss": 0.6046, + "step": 7771 + }, + { + "epoch": 0.81, + "grad_norm": 1.9217150291138596, + "learning_rate": 9.369824078781897e-07, + "loss": 0.5854, + "step": 7772 + }, + { + "epoch": 0.81, + "grad_norm": 2.0430771905247034, + "learning_rate": 9.360015143745599e-07, + "loss": 0.6265, + "step": 7773 + }, + { + "epoch": 0.81, + "grad_norm": 1.9586997014673917, + "learning_rate": 9.350210815483207e-07, + "loss": 0.7064, + "step": 7774 + }, + { + "epoch": 0.81, + "grad_norm": 2.0477443669373177, + "learning_rate": 9.340411095106128e-07, + "loss": 0.6626, + "step": 7775 + }, + { + "epoch": 0.81, + "grad_norm": 1.8187859005943139, + "learning_rate": 9.330615983725194e-07, + "loss": 0.6062, + "step": 7776 + }, + { + "epoch": 0.81, + "grad_norm": 1.8676725602236373, + "learning_rate": 9.320825482450769e-07, + "loss": 0.6194, + "step": 7777 + }, + { + "epoch": 0.81, + "grad_norm": 2.009081149555021, + "learning_rate": 9.311039592392612e-07, + "loss": 0.6296, + "step": 7778 + }, + { + "epoch": 0.81, + "grad_norm": 1.9506352943100582, + "learning_rate": 9.30125831466005e-07, + "loss": 0.6273, + "step": 7779 + }, + { + "epoch": 0.81, + "grad_norm": 1.8634198547031477, + "learning_rate": 9.291481650361822e-07, + "loss": 0.58, + "step": 7780 + }, + { + "epoch": 0.81, + "grad_norm": 2.2028182575661557, + "learning_rate": 9.281709600606193e-07, + "loss": 0.6037, + "step": 7781 + }, + { + "epoch": 0.81, + "grad_norm": 1.9124910064744183, + "learning_rate": 9.271942166500853e-07, + "loss": 0.5988, + "step": 7782 + }, + { + "epoch": 0.81, + "grad_norm": 2.0032674668960504, + "learning_rate": 9.262179349153022e-07, + "loss": 0.6078, + "step": 7783 + }, + { + "epoch": 0.81, + "grad_norm": 2.0402604629089893, + "learning_rate": 9.252421149669349e-07, + "loss": 0.5139, + "step": 7784 + }, + { + "epoch": 0.81, + "grad_norm": 1.9476800993967762, + "learning_rate": 9.242667569156006e-07, + "loss": 0.578, + "step": 7785 + }, + { + "epoch": 0.81, + "grad_norm": 2.127765015120673, + "learning_rate": 9.232918608718599e-07, + "loss": 0.6612, + "step": 7786 + }, + { + "epoch": 0.81, + "grad_norm": 1.963807687238336, + "learning_rate": 9.223174269462237e-07, + "loss": 0.5882, + "step": 7787 + }, + { + "epoch": 0.81, + "grad_norm": 1.9031394484343827, + "learning_rate": 9.213434552491479e-07, + "loss": 0.5933, + "step": 7788 + }, + { + "epoch": 0.81, + "grad_norm": 1.8294547712072886, + "learning_rate": 9.203699458910397e-07, + "loss": 0.6555, + "step": 7789 + }, + { + "epoch": 0.81, + "grad_norm": 1.8309553228658446, + "learning_rate": 9.193968989822504e-07, + "loss": 0.5514, + "step": 7790 + }, + { + "epoch": 0.81, + "grad_norm": 1.9206616651299844, + "learning_rate": 9.184243146330829e-07, + "loss": 0.6373, + "step": 7791 + }, + { + "epoch": 0.81, + "grad_norm": 2.1123393182042958, + "learning_rate": 9.174521929537827e-07, + "loss": 0.6415, + "step": 7792 + }, + { + "epoch": 0.81, + "grad_norm": 2.1317560935213113, + "learning_rate": 9.164805340545457e-07, + "loss": 0.6525, + "step": 7793 + }, + { + "epoch": 0.81, + "grad_norm": 2.0224002996549735, + "learning_rate": 9.15509338045516e-07, + "loss": 0.6894, + "step": 7794 + }, + { + "epoch": 0.81, + "grad_norm": 2.042260851573768, + "learning_rate": 9.145386050367827e-07, + "loss": 0.5762, + "step": 7795 + }, + { + "epoch": 0.81, + "grad_norm": 1.9891557355289593, + "learning_rate": 9.135683351383862e-07, + "loss": 0.6294, + "step": 7796 + }, + { + "epoch": 0.81, + "grad_norm": 2.022029699436418, + "learning_rate": 9.125985284603095e-07, + "loss": 0.5819, + "step": 7797 + }, + { + "epoch": 0.81, + "grad_norm": 2.1109280193657596, + "learning_rate": 9.116291851124887e-07, + "loss": 0.6766, + "step": 7798 + }, + { + "epoch": 0.81, + "grad_norm": 1.9502117247555344, + "learning_rate": 9.106603052048019e-07, + "loss": 0.5748, + "step": 7799 + }, + { + "epoch": 0.81, + "grad_norm": 1.7462106726187883, + "learning_rate": 9.096918888470785e-07, + "loss": 0.4553, + "step": 7800 + }, + { + "epoch": 0.81, + "grad_norm": 2.0378937226681795, + "learning_rate": 9.087239361490919e-07, + "loss": 0.677, + "step": 7801 + }, + { + "epoch": 0.81, + "grad_norm": 1.8627644962745071, + "learning_rate": 9.07756447220568e-07, + "loss": 0.5913, + "step": 7802 + }, + { + "epoch": 0.81, + "grad_norm": 1.9507831523660102, + "learning_rate": 9.067894221711748e-07, + "loss": 0.5372, + "step": 7803 + }, + { + "epoch": 0.81, + "grad_norm": 1.9434714525311196, + "learning_rate": 9.058228611105319e-07, + "loss": 0.5585, + "step": 7804 + }, + { + "epoch": 0.81, + "grad_norm": 1.784512235108847, + "learning_rate": 9.048567641482031e-07, + "loss": 0.6526, + "step": 7805 + }, + { + "epoch": 0.81, + "grad_norm": 1.7646771163534876, + "learning_rate": 9.038911313937021e-07, + "loss": 0.5878, + "step": 7806 + }, + { + "epoch": 0.81, + "grad_norm": 1.9503722707113982, + "learning_rate": 9.02925962956489e-07, + "loss": 0.6089, + "step": 7807 + }, + { + "epoch": 0.81, + "grad_norm": 1.9933524346721043, + "learning_rate": 9.019612589459703e-07, + "loss": 0.7098, + "step": 7808 + }, + { + "epoch": 0.81, + "grad_norm": 1.684179267546423, + "learning_rate": 9.009970194714995e-07, + "loss": 0.6127, + "step": 7809 + }, + { + "epoch": 0.81, + "grad_norm": 1.947028893634746, + "learning_rate": 9.00033244642382e-07, + "loss": 0.571, + "step": 7810 + }, + { + "epoch": 0.81, + "grad_norm": 1.8394267571336589, + "learning_rate": 8.990699345678633e-07, + "loss": 0.5227, + "step": 7811 + }, + { + "epoch": 0.81, + "grad_norm": 2.1484606234393557, + "learning_rate": 8.981070893571436e-07, + "loss": 0.6421, + "step": 7812 + }, + { + "epoch": 0.81, + "grad_norm": 2.310340099058539, + "learning_rate": 8.971447091193641e-07, + "loss": 0.6349, + "step": 7813 + }, + { + "epoch": 0.81, + "grad_norm": 2.2442612121075705, + "learning_rate": 8.961827939636198e-07, + "loss": 0.6153, + "step": 7814 + }, + { + "epoch": 0.81, + "grad_norm": 2.0223880288341167, + "learning_rate": 8.952213439989443e-07, + "loss": 0.5675, + "step": 7815 + }, + { + "epoch": 0.81, + "grad_norm": 1.7330228615175547, + "learning_rate": 8.942603593343269e-07, + "loss": 0.5218, + "step": 7816 + }, + { + "epoch": 0.81, + "grad_norm": 1.9915324011404267, + "learning_rate": 8.932998400786985e-07, + "loss": 0.6024, + "step": 7817 + }, + { + "epoch": 0.81, + "grad_norm": 2.098901888535553, + "learning_rate": 8.923397863409422e-07, + "loss": 0.6626, + "step": 7818 + }, + { + "epoch": 0.81, + "grad_norm": 2.0703368108294784, + "learning_rate": 8.913801982298825e-07, + "loss": 0.6209, + "step": 7819 + }, + { + "epoch": 0.81, + "grad_norm": 2.0340207412306706, + "learning_rate": 8.90421075854297e-07, + "loss": 0.744, + "step": 7820 + }, + { + "epoch": 0.81, + "grad_norm": 2.0633123671861373, + "learning_rate": 8.894624193229051e-07, + "loss": 0.6967, + "step": 7821 + }, + { + "epoch": 0.81, + "grad_norm": 2.2559417118692306, + "learning_rate": 8.885042287443785e-07, + "loss": 0.5755, + "step": 7822 + }, + { + "epoch": 0.81, + "grad_norm": 2.0898486974483483, + "learning_rate": 8.875465042273323e-07, + "loss": 0.5747, + "step": 7823 + }, + { + "epoch": 0.81, + "grad_norm": 1.8473521886926423, + "learning_rate": 8.865892458803288e-07, + "loss": 0.5424, + "step": 7824 + }, + { + "epoch": 0.81, + "grad_norm": 2.00638075054233, + "learning_rate": 8.856324538118815e-07, + "loss": 0.6036, + "step": 7825 + }, + { + "epoch": 0.81, + "grad_norm": 2.1426291684833663, + "learning_rate": 8.846761281304461e-07, + "loss": 0.5502, + "step": 7826 + }, + { + "epoch": 0.81, + "grad_norm": 1.9203586483594455, + "learning_rate": 8.837202689444274e-07, + "loss": 0.5921, + "step": 7827 + }, + { + "epoch": 0.81, + "grad_norm": 1.9063737713261275, + "learning_rate": 8.827648763621793e-07, + "loss": 0.5702, + "step": 7828 + }, + { + "epoch": 0.81, + "grad_norm": 1.7596477523436753, + "learning_rate": 8.818099504919997e-07, + "loss": 0.5492, + "step": 7829 + }, + { + "epoch": 0.81, + "grad_norm": 1.878655835611806, + "learning_rate": 8.808554914421341e-07, + "loss": 0.5626, + "step": 7830 + }, + { + "epoch": 0.81, + "grad_norm": 1.7387383408855732, + "learning_rate": 8.799014993207783e-07, + "loss": 0.5791, + "step": 7831 + }, + { + "epoch": 0.81, + "grad_norm": 1.825422980516178, + "learning_rate": 8.789479742360696e-07, + "loss": 0.6566, + "step": 7832 + }, + { + "epoch": 0.81, + "grad_norm": 1.836576312128298, + "learning_rate": 8.779949162960988e-07, + "loss": 0.4287, + "step": 7833 + }, + { + "epoch": 0.81, + "grad_norm": 1.9235214625697135, + "learning_rate": 8.770423256088978e-07, + "loss": 0.648, + "step": 7834 + }, + { + "epoch": 0.81, + "grad_norm": 1.8400134406808073, + "learning_rate": 8.760902022824502e-07, + "loss": 0.5477, + "step": 7835 + }, + { + "epoch": 0.81, + "grad_norm": 2.095020666362253, + "learning_rate": 8.751385464246836e-07, + "loss": 0.5633, + "step": 7836 + }, + { + "epoch": 0.81, + "grad_norm": 2.2320814488356078, + "learning_rate": 8.74187358143474e-07, + "loss": 0.6631, + "step": 7837 + }, + { + "epoch": 0.81, + "grad_norm": 2.326172772622547, + "learning_rate": 8.732366375466422e-07, + "loss": 0.6657, + "step": 7838 + }, + { + "epoch": 0.81, + "grad_norm": 2.1461369278054625, + "learning_rate": 8.722863847419605e-07, + "loss": 0.563, + "step": 7839 + }, + { + "epoch": 0.81, + "grad_norm": 2.014512726119495, + "learning_rate": 8.713365998371431e-07, + "loss": 0.674, + "step": 7840 + }, + { + "epoch": 0.82, + "grad_norm": 1.673982067794888, + "learning_rate": 8.703872829398563e-07, + "loss": 0.4947, + "step": 7841 + }, + { + "epoch": 0.82, + "grad_norm": 1.7725031865905028, + "learning_rate": 8.694384341577072e-07, + "loss": 0.5893, + "step": 7842 + }, + { + "epoch": 0.82, + "grad_norm": 1.84444502749441, + "learning_rate": 8.684900535982566e-07, + "loss": 0.6629, + "step": 7843 + }, + { + "epoch": 0.82, + "grad_norm": 1.7431264743321058, + "learning_rate": 8.675421413690072e-07, + "loss": 0.6112, + "step": 7844 + }, + { + "epoch": 0.82, + "grad_norm": 1.88630416944583, + "learning_rate": 8.665946975774103e-07, + "loss": 0.686, + "step": 7845 + }, + { + "epoch": 0.82, + "grad_norm": 1.8938332117324725, + "learning_rate": 8.656477223308623e-07, + "loss": 0.596, + "step": 7846 + }, + { + "epoch": 0.82, + "grad_norm": 1.856990540011655, + "learning_rate": 8.647012157367118e-07, + "loss": 0.6462, + "step": 7847 + }, + { + "epoch": 0.82, + "grad_norm": 1.9573712099806473, + "learning_rate": 8.63755177902248e-07, + "loss": 0.5727, + "step": 7848 + }, + { + "epoch": 0.82, + "grad_norm": 1.7953197325036485, + "learning_rate": 8.62809608934711e-07, + "loss": 0.5912, + "step": 7849 + }, + { + "epoch": 0.82, + "grad_norm": 1.9467543418309012, + "learning_rate": 8.618645089412852e-07, + "loss": 0.643, + "step": 7850 + }, + { + "epoch": 0.82, + "grad_norm": 1.9747829769385832, + "learning_rate": 8.609198780291067e-07, + "loss": 0.6153, + "step": 7851 + }, + { + "epoch": 0.82, + "grad_norm": 1.8468696120697057, + "learning_rate": 8.599757163052491e-07, + "loss": 0.5527, + "step": 7852 + }, + { + "epoch": 0.82, + "grad_norm": 2.0237754704208495, + "learning_rate": 8.590320238767425e-07, + "loss": 0.6137, + "step": 7853 + }, + { + "epoch": 0.82, + "grad_norm": 1.9018135574384365, + "learning_rate": 8.580888008505578e-07, + "loss": 0.6224, + "step": 7854 + }, + { + "epoch": 0.82, + "grad_norm": 1.9945743056461949, + "learning_rate": 8.571460473336168e-07, + "loss": 0.6345, + "step": 7855 + }, + { + "epoch": 0.82, + "grad_norm": 1.8275906820507346, + "learning_rate": 8.562037634327836e-07, + "loss": 0.6913, + "step": 7856 + }, + { + "epoch": 0.82, + "grad_norm": 1.7781481219644808, + "learning_rate": 8.552619492548736e-07, + "loss": 0.6142, + "step": 7857 + }, + { + "epoch": 0.82, + "grad_norm": 2.101917955513664, + "learning_rate": 8.543206049066461e-07, + "loss": 0.5878, + "step": 7858 + }, + { + "epoch": 0.82, + "grad_norm": 1.7860451310801435, + "learning_rate": 8.533797304948066e-07, + "loss": 0.6194, + "step": 7859 + }, + { + "epoch": 0.82, + "grad_norm": 2.053297227977161, + "learning_rate": 8.524393261260106e-07, + "loss": 0.5577, + "step": 7860 + }, + { + "epoch": 0.82, + "grad_norm": 1.8999012436315257, + "learning_rate": 8.51499391906856e-07, + "loss": 0.5324, + "step": 7861 + }, + { + "epoch": 0.82, + "grad_norm": 1.9403626400781433, + "learning_rate": 8.50559927943892e-07, + "loss": 0.6044, + "step": 7862 + }, + { + "epoch": 0.82, + "grad_norm": 1.9202525596061424, + "learning_rate": 8.496209343436101e-07, + "loss": 0.6422, + "step": 7863 + }, + { + "epoch": 0.82, + "grad_norm": 1.9258908691490637, + "learning_rate": 8.486824112124531e-07, + "loss": 0.5877, + "step": 7864 + }, + { + "epoch": 0.82, + "grad_norm": 2.139813896668843, + "learning_rate": 8.477443586568068e-07, + "loss": 0.6723, + "step": 7865 + }, + { + "epoch": 0.82, + "grad_norm": 1.8864183561566554, + "learning_rate": 8.46806776783004e-07, + "loss": 0.5684, + "step": 7866 + }, + { + "epoch": 0.82, + "grad_norm": 2.086021703795221, + "learning_rate": 8.458696656973242e-07, + "loss": 0.6466, + "step": 7867 + }, + { + "epoch": 0.82, + "grad_norm": 1.7670328138359852, + "learning_rate": 8.449330255059974e-07, + "loss": 0.5655, + "step": 7868 + }, + { + "epoch": 0.82, + "grad_norm": 2.109669089522648, + "learning_rate": 8.439968563151935e-07, + "loss": 0.5716, + "step": 7869 + }, + { + "epoch": 0.82, + "grad_norm": 1.9731160176083082, + "learning_rate": 8.430611582310355e-07, + "loss": 0.6779, + "step": 7870 + }, + { + "epoch": 0.82, + "grad_norm": 1.6833598941120023, + "learning_rate": 8.421259313595881e-07, + "loss": 0.5651, + "step": 7871 + }, + { + "epoch": 0.82, + "grad_norm": 1.9782640426514457, + "learning_rate": 8.411911758068664e-07, + "loss": 0.5027, + "step": 7872 + }, + { + "epoch": 0.82, + "grad_norm": 1.7638609114287134, + "learning_rate": 8.402568916788295e-07, + "loss": 0.6116, + "step": 7873 + }, + { + "epoch": 0.82, + "grad_norm": 2.0287539947205198, + "learning_rate": 8.393230790813834e-07, + "loss": 0.6667, + "step": 7874 + }, + { + "epoch": 0.82, + "grad_norm": 1.8293808842129964, + "learning_rate": 8.383897381203804e-07, + "loss": 0.6778, + "step": 7875 + }, + { + "epoch": 0.82, + "grad_norm": 1.889493686769746, + "learning_rate": 8.374568689016222e-07, + "loss": 0.5777, + "step": 7876 + }, + { + "epoch": 0.82, + "grad_norm": 2.0618821665052263, + "learning_rate": 8.365244715308524e-07, + "loss": 0.6482, + "step": 7877 + }, + { + "epoch": 0.82, + "grad_norm": 1.979835255191372, + "learning_rate": 8.355925461137659e-07, + "loss": 0.5515, + "step": 7878 + }, + { + "epoch": 0.82, + "grad_norm": 1.8234234080540386, + "learning_rate": 8.346610927559995e-07, + "loss": 0.5685, + "step": 7879 + }, + { + "epoch": 0.82, + "grad_norm": 2.114466124057432, + "learning_rate": 8.337301115631408e-07, + "loss": 0.6513, + "step": 7880 + }, + { + "epoch": 0.82, + "grad_norm": 2.0450819662924973, + "learning_rate": 8.327996026407215e-07, + "loss": 0.582, + "step": 7881 + }, + { + "epoch": 0.82, + "grad_norm": 1.9534975150914224, + "learning_rate": 8.318695660942188e-07, + "loss": 0.6081, + "step": 7882 + }, + { + "epoch": 0.82, + "grad_norm": 1.766548803762076, + "learning_rate": 8.309400020290576e-07, + "loss": 0.5923, + "step": 7883 + }, + { + "epoch": 0.82, + "grad_norm": 1.8376364935303842, + "learning_rate": 8.30010910550611e-07, + "loss": 0.5752, + "step": 7884 + }, + { + "epoch": 0.82, + "grad_norm": 2.1740334973065294, + "learning_rate": 8.29082291764195e-07, + "loss": 0.6675, + "step": 7885 + }, + { + "epoch": 0.82, + "grad_norm": 1.955516410752621, + "learning_rate": 8.281541457750752e-07, + "loss": 0.6615, + "step": 7886 + }, + { + "epoch": 0.82, + "grad_norm": 2.0513285670740427, + "learning_rate": 8.272264726884611e-07, + "loss": 0.6483, + "step": 7887 + }, + { + "epoch": 0.82, + "grad_norm": 1.9204631948112747, + "learning_rate": 8.262992726095126e-07, + "loss": 0.63, + "step": 7888 + }, + { + "epoch": 0.82, + "grad_norm": 1.920799123113018, + "learning_rate": 8.253725456433281e-07, + "loss": 0.574, + "step": 7889 + }, + { + "epoch": 0.82, + "grad_norm": 2.0544728279229307, + "learning_rate": 8.244462918949613e-07, + "loss": 0.6066, + "step": 7890 + }, + { + "epoch": 0.82, + "grad_norm": 1.8312075580282576, + "learning_rate": 8.235205114694067e-07, + "loss": 0.6475, + "step": 7891 + }, + { + "epoch": 0.82, + "grad_norm": 2.138681797382035, + "learning_rate": 8.225952044716079e-07, + "loss": 0.6259, + "step": 7892 + }, + { + "epoch": 0.82, + "grad_norm": 1.895533332438548, + "learning_rate": 8.216703710064516e-07, + "loss": 0.5747, + "step": 7893 + }, + { + "epoch": 0.82, + "grad_norm": 1.904821942247295, + "learning_rate": 8.207460111787763e-07, + "loss": 0.589, + "step": 7894 + }, + { + "epoch": 0.82, + "grad_norm": 1.9668496729404117, + "learning_rate": 8.198221250933613e-07, + "loss": 0.6433, + "step": 7895 + }, + { + "epoch": 0.82, + "grad_norm": 1.9284587389594263, + "learning_rate": 8.188987128549336e-07, + "loss": 0.6273, + "step": 7896 + }, + { + "epoch": 0.82, + "grad_norm": 2.1416050872893946, + "learning_rate": 8.179757745681693e-07, + "loss": 0.6704, + "step": 7897 + }, + { + "epoch": 0.82, + "grad_norm": 1.7866996337890955, + "learning_rate": 8.170533103376865e-07, + "loss": 0.6691, + "step": 7898 + }, + { + "epoch": 0.82, + "grad_norm": 1.8456024182305906, + "learning_rate": 8.161313202680543e-07, + "loss": 0.6148, + "step": 7899 + }, + { + "epoch": 0.82, + "grad_norm": 1.7492541132835926, + "learning_rate": 8.15209804463783e-07, + "loss": 0.5606, + "step": 7900 + }, + { + "epoch": 0.82, + "grad_norm": 1.9516406633526484, + "learning_rate": 8.142887630293339e-07, + "loss": 0.5595, + "step": 7901 + }, + { + "epoch": 0.82, + "grad_norm": 1.7358148718068005, + "learning_rate": 8.133681960691098e-07, + "loss": 0.4818, + "step": 7902 + }, + { + "epoch": 0.82, + "grad_norm": 1.963868509775501, + "learning_rate": 8.124481036874665e-07, + "loss": 0.5443, + "step": 7903 + }, + { + "epoch": 0.82, + "grad_norm": 2.0190989920317866, + "learning_rate": 8.115284859886963e-07, + "loss": 0.5788, + "step": 7904 + }, + { + "epoch": 0.82, + "grad_norm": 1.9193826384008805, + "learning_rate": 8.106093430770473e-07, + "loss": 0.7062, + "step": 7905 + }, + { + "epoch": 0.82, + "grad_norm": 2.048467386932486, + "learning_rate": 8.096906750567063e-07, + "loss": 0.659, + "step": 7906 + }, + { + "epoch": 0.82, + "grad_norm": 2.1066721308629646, + "learning_rate": 8.087724820318127e-07, + "loss": 0.6521, + "step": 7907 + }, + { + "epoch": 0.82, + "grad_norm": 2.113774039359718, + "learning_rate": 8.07854764106446e-07, + "loss": 0.6454, + "step": 7908 + }, + { + "epoch": 0.82, + "grad_norm": 1.9652147888264364, + "learning_rate": 8.069375213846381e-07, + "loss": 0.5461, + "step": 7909 + }, + { + "epoch": 0.82, + "grad_norm": 1.8955929743963291, + "learning_rate": 8.060207539703613e-07, + "loss": 0.5916, + "step": 7910 + }, + { + "epoch": 0.82, + "grad_norm": 2.202212850241475, + "learning_rate": 8.051044619675368e-07, + "loss": 0.7072, + "step": 7911 + }, + { + "epoch": 0.82, + "grad_norm": 1.9825746509636721, + "learning_rate": 8.041886454800307e-07, + "loss": 0.6028, + "step": 7912 + }, + { + "epoch": 0.82, + "grad_norm": 1.743284921250136, + "learning_rate": 8.032733046116581e-07, + "loss": 0.5952, + "step": 7913 + }, + { + "epoch": 0.82, + "grad_norm": 1.9131860315901044, + "learning_rate": 8.023584394661754e-07, + "loss": 0.5394, + "step": 7914 + }, + { + "epoch": 0.82, + "grad_norm": 2.08391122810736, + "learning_rate": 8.014440501472909e-07, + "loss": 0.6065, + "step": 7915 + }, + { + "epoch": 0.82, + "grad_norm": 1.9814471916087633, + "learning_rate": 8.005301367586532e-07, + "loss": 0.5821, + "step": 7916 + }, + { + "epoch": 0.82, + "grad_norm": 2.3000295637313712, + "learning_rate": 7.996166994038618e-07, + "loss": 0.558, + "step": 7917 + }, + { + "epoch": 0.82, + "grad_norm": 2.0762593164954706, + "learning_rate": 7.987037381864587e-07, + "loss": 0.7169, + "step": 7918 + }, + { + "epoch": 0.82, + "grad_norm": 1.7514422787011197, + "learning_rate": 7.977912532099336e-07, + "loss": 0.558, + "step": 7919 + }, + { + "epoch": 0.82, + "grad_norm": 1.76970855988798, + "learning_rate": 7.968792445777207e-07, + "loss": 0.5681, + "step": 7920 + }, + { + "epoch": 0.82, + "grad_norm": 1.9054210564705762, + "learning_rate": 7.95967712393203e-07, + "loss": 0.6098, + "step": 7921 + }, + { + "epoch": 0.82, + "grad_norm": 1.9968109035396673, + "learning_rate": 7.950566567597067e-07, + "loss": 0.5548, + "step": 7922 + }, + { + "epoch": 0.82, + "grad_norm": 2.0679678927457252, + "learning_rate": 7.941460777805071e-07, + "loss": 0.6346, + "step": 7923 + }, + { + "epoch": 0.82, + "grad_norm": 1.771436468079132, + "learning_rate": 7.932359755588204e-07, + "loss": 0.6228, + "step": 7924 + }, + { + "epoch": 0.82, + "grad_norm": 2.028353877337731, + "learning_rate": 7.923263501978151e-07, + "loss": 0.6094, + "step": 7925 + }, + { + "epoch": 0.82, + "grad_norm": 1.9541780264877802, + "learning_rate": 7.914172018006006e-07, + "loss": 0.6535, + "step": 7926 + }, + { + "epoch": 0.82, + "grad_norm": 2.033311941728741, + "learning_rate": 7.905085304702348e-07, + "loss": 0.6615, + "step": 7927 + }, + { + "epoch": 0.82, + "grad_norm": 1.9664209507744381, + "learning_rate": 7.896003363097194e-07, + "loss": 0.613, + "step": 7928 + }, + { + "epoch": 0.82, + "grad_norm": 2.047467628372388, + "learning_rate": 7.886926194220051e-07, + "loss": 0.6435, + "step": 7929 + }, + { + "epoch": 0.82, + "grad_norm": 2.2322887628362986, + "learning_rate": 7.87785379909985e-07, + "loss": 0.4776, + "step": 7930 + }, + { + "epoch": 0.82, + "grad_norm": 1.9161552547216953, + "learning_rate": 7.86878617876502e-07, + "loss": 0.5047, + "step": 7931 + }, + { + "epoch": 0.82, + "grad_norm": 1.9666675466994157, + "learning_rate": 7.859723334243414e-07, + "loss": 0.6392, + "step": 7932 + }, + { + "epoch": 0.82, + "grad_norm": 2.0673189448510128, + "learning_rate": 7.850665266562352e-07, + "loss": 0.697, + "step": 7933 + }, + { + "epoch": 0.82, + "grad_norm": 2.02537722417873, + "learning_rate": 7.841611976748637e-07, + "loss": 0.6193, + "step": 7934 + }, + { + "epoch": 0.82, + "grad_norm": 1.8026876495600388, + "learning_rate": 7.832563465828486e-07, + "loss": 0.6503, + "step": 7935 + }, + { + "epoch": 0.82, + "grad_norm": 1.9214738333018813, + "learning_rate": 7.823519734827623e-07, + "loss": 0.5923, + "step": 7936 + }, + { + "epoch": 0.83, + "grad_norm": 2.0886931902273393, + "learning_rate": 7.814480784771184e-07, + "loss": 0.6328, + "step": 7937 + }, + { + "epoch": 0.83, + "grad_norm": 1.7978925952821656, + "learning_rate": 7.805446616683815e-07, + "loss": 0.6205, + "step": 7938 + }, + { + "epoch": 0.83, + "grad_norm": 2.2547069940717863, + "learning_rate": 7.796417231589553e-07, + "loss": 0.5602, + "step": 7939 + }, + { + "epoch": 0.83, + "grad_norm": 2.0421838146360134, + "learning_rate": 7.78739263051198e-07, + "loss": 0.6162, + "step": 7940 + }, + { + "epoch": 0.83, + "grad_norm": 1.8776854409493586, + "learning_rate": 7.778372814474028e-07, + "loss": 0.5241, + "step": 7941 + }, + { + "epoch": 0.83, + "grad_norm": 1.9343979647373022, + "learning_rate": 7.769357784498189e-07, + "loss": 0.6138, + "step": 7942 + }, + { + "epoch": 0.83, + "grad_norm": 1.9126629837911133, + "learning_rate": 7.760347541606339e-07, + "loss": 0.5349, + "step": 7943 + }, + { + "epoch": 0.83, + "grad_norm": 2.013112677164787, + "learning_rate": 7.751342086819864e-07, + "loss": 0.5842, + "step": 7944 + }, + { + "epoch": 0.83, + "grad_norm": 2.1023021606726475, + "learning_rate": 7.742341421159561e-07, + "loss": 0.5927, + "step": 7945 + }, + { + "epoch": 0.83, + "grad_norm": 1.9172493579642078, + "learning_rate": 7.733345545645726e-07, + "loss": 0.4841, + "step": 7946 + }, + { + "epoch": 0.83, + "grad_norm": 1.8852024848426656, + "learning_rate": 7.724354461298089e-07, + "loss": 0.6012, + "step": 7947 + }, + { + "epoch": 0.83, + "grad_norm": 1.8875527210328427, + "learning_rate": 7.71536816913584e-07, + "loss": 0.5591, + "step": 7948 + }, + { + "epoch": 0.83, + "grad_norm": 2.1986943184940437, + "learning_rate": 7.706386670177606e-07, + "loss": 0.6477, + "step": 7949 + }, + { + "epoch": 0.83, + "grad_norm": 1.9792660395007868, + "learning_rate": 7.697409965441527e-07, + "loss": 0.5966, + "step": 7950 + }, + { + "epoch": 0.83, + "grad_norm": 2.0633054153372186, + "learning_rate": 7.68843805594513e-07, + "loss": 0.5333, + "step": 7951 + }, + { + "epoch": 0.83, + "grad_norm": 2.0106706139040496, + "learning_rate": 7.679470942705459e-07, + "loss": 0.6114, + "step": 7952 + }, + { + "epoch": 0.83, + "grad_norm": 1.9411577381023806, + "learning_rate": 7.670508626738959e-07, + "loss": 0.6204, + "step": 7953 + }, + { + "epoch": 0.83, + "grad_norm": 1.8717425506845027, + "learning_rate": 7.661551109061593e-07, + "loss": 0.6533, + "step": 7954 + }, + { + "epoch": 0.83, + "grad_norm": 1.957638620055607, + "learning_rate": 7.652598390688731e-07, + "loss": 0.61, + "step": 7955 + }, + { + "epoch": 0.83, + "grad_norm": 1.7361028180084892, + "learning_rate": 7.643650472635211e-07, + "loss": 0.5245, + "step": 7956 + }, + { + "epoch": 0.83, + "grad_norm": 2.062695795851483, + "learning_rate": 7.634707355915321e-07, + "loss": 0.5406, + "step": 7957 + }, + { + "epoch": 0.83, + "grad_norm": 1.7556940200609303, + "learning_rate": 7.625769041542841e-07, + "loss": 0.6456, + "step": 7958 + }, + { + "epoch": 0.83, + "grad_norm": 2.1244482890856484, + "learning_rate": 7.616835530530947e-07, + "loss": 0.5838, + "step": 7959 + }, + { + "epoch": 0.83, + "grad_norm": 2.007384668816092, + "learning_rate": 7.607906823892341e-07, + "loss": 0.5308, + "step": 7960 + }, + { + "epoch": 0.83, + "grad_norm": 1.8335296792574647, + "learning_rate": 7.598982922639109e-07, + "loss": 0.5689, + "step": 7961 + }, + { + "epoch": 0.83, + "grad_norm": 1.89865804594614, + "learning_rate": 7.590063827782851e-07, + "loss": 0.6264, + "step": 7962 + }, + { + "epoch": 0.83, + "grad_norm": 2.101400701382922, + "learning_rate": 7.581149540334587e-07, + "loss": 0.6344, + "step": 7963 + }, + { + "epoch": 0.83, + "grad_norm": 1.8929227815753287, + "learning_rate": 7.572240061304786e-07, + "loss": 0.6135, + "step": 7964 + }, + { + "epoch": 0.83, + "grad_norm": 1.9995425057991527, + "learning_rate": 7.563335391703424e-07, + "loss": 0.6244, + "step": 7965 + }, + { + "epoch": 0.83, + "grad_norm": 1.8371114068334875, + "learning_rate": 7.554435532539872e-07, + "loss": 0.6308, + "step": 7966 + }, + { + "epoch": 0.83, + "grad_norm": 1.85970602929272, + "learning_rate": 7.545540484822972e-07, + "loss": 0.5694, + "step": 7967 + }, + { + "epoch": 0.83, + "grad_norm": 2.0786161787069113, + "learning_rate": 7.536650249561056e-07, + "loss": 0.7314, + "step": 7968 + }, + { + "epoch": 0.83, + "grad_norm": 1.7588934796892286, + "learning_rate": 7.527764827761863e-07, + "loss": 0.5897, + "step": 7969 + }, + { + "epoch": 0.83, + "grad_norm": 2.2854940957417815, + "learning_rate": 7.518884220432599e-07, + "loss": 0.6597, + "step": 7970 + }, + { + "epoch": 0.83, + "grad_norm": 1.9764113169266813, + "learning_rate": 7.510008428579956e-07, + "loss": 0.5809, + "step": 7971 + }, + { + "epoch": 0.83, + "grad_norm": 2.026543602570681, + "learning_rate": 7.501137453210027e-07, + "loss": 0.7416, + "step": 7972 + }, + { + "epoch": 0.83, + "grad_norm": 2.177075001218227, + "learning_rate": 7.492271295328419e-07, + "loss": 0.57, + "step": 7973 + }, + { + "epoch": 0.83, + "grad_norm": 1.8721218522062755, + "learning_rate": 7.483409955940136e-07, + "loss": 0.5692, + "step": 7974 + }, + { + "epoch": 0.83, + "grad_norm": 1.8713817493079272, + "learning_rate": 7.474553436049675e-07, + "loss": 0.5742, + "step": 7975 + }, + { + "epoch": 0.83, + "grad_norm": 1.8123154657334932, + "learning_rate": 7.465701736660963e-07, + "loss": 0.5795, + "step": 7976 + }, + { + "epoch": 0.83, + "grad_norm": 1.8267308046975017, + "learning_rate": 7.456854858777418e-07, + "loss": 0.5542, + "step": 7977 + }, + { + "epoch": 0.83, + "grad_norm": 2.124509762504682, + "learning_rate": 7.448012803401843e-07, + "loss": 0.5719, + "step": 7978 + }, + { + "epoch": 0.83, + "grad_norm": 1.7888627753037367, + "learning_rate": 7.43917557153656e-07, + "loss": 0.6277, + "step": 7979 + }, + { + "epoch": 0.83, + "grad_norm": 1.8413228879270656, + "learning_rate": 7.430343164183312e-07, + "loss": 0.5997, + "step": 7980 + }, + { + "epoch": 0.83, + "grad_norm": 1.8432625198692136, + "learning_rate": 7.421515582343308e-07, + "loss": 0.6055, + "step": 7981 + }, + { + "epoch": 0.83, + "grad_norm": 2.0281622706959856, + "learning_rate": 7.412692827017193e-07, + "loss": 0.5958, + "step": 7982 + }, + { + "epoch": 0.83, + "grad_norm": 1.9798861010712308, + "learning_rate": 7.4038748992051e-07, + "loss": 0.5568, + "step": 7983 + }, + { + "epoch": 0.83, + "grad_norm": 1.9779665626638276, + "learning_rate": 7.395061799906578e-07, + "loss": 0.6735, + "step": 7984 + }, + { + "epoch": 0.83, + "grad_norm": 2.105242899193206, + "learning_rate": 7.386253530120635e-07, + "loss": 0.6218, + "step": 7985 + }, + { + "epoch": 0.83, + "grad_norm": 1.792392367170328, + "learning_rate": 7.377450090845733e-07, + "loss": 0.6366, + "step": 7986 + }, + { + "epoch": 0.83, + "grad_norm": 1.9121163417362366, + "learning_rate": 7.368651483079819e-07, + "loss": 0.5815, + "step": 7987 + }, + { + "epoch": 0.83, + "grad_norm": 1.848675634361415, + "learning_rate": 7.35985770782024e-07, + "loss": 0.6224, + "step": 7988 + }, + { + "epoch": 0.83, + "grad_norm": 1.9301143939260674, + "learning_rate": 7.35106876606384e-07, + "loss": 0.6844, + "step": 7989 + }, + { + "epoch": 0.83, + "grad_norm": 1.9834997145749633, + "learning_rate": 7.342284658806875e-07, + "loss": 0.7157, + "step": 7990 + }, + { + "epoch": 0.83, + "grad_norm": 1.8168292987948385, + "learning_rate": 7.333505387045108e-07, + "loss": 0.5979, + "step": 7991 + }, + { + "epoch": 0.83, + "grad_norm": 1.8488952297008012, + "learning_rate": 7.324730951773673e-07, + "loss": 0.478, + "step": 7992 + }, + { + "epoch": 0.83, + "grad_norm": 1.8987998955323417, + "learning_rate": 7.315961353987234e-07, + "loss": 0.6196, + "step": 7993 + }, + { + "epoch": 0.83, + "grad_norm": 1.9061861574645693, + "learning_rate": 7.307196594679855e-07, + "loss": 0.5492, + "step": 7994 + }, + { + "epoch": 0.83, + "grad_norm": 2.026536733296861, + "learning_rate": 7.298436674845099e-07, + "loss": 0.6364, + "step": 7995 + }, + { + "epoch": 0.83, + "grad_norm": 2.4322053014985077, + "learning_rate": 7.289681595475922e-07, + "loss": 0.7198, + "step": 7996 + }, + { + "epoch": 0.83, + "grad_norm": 1.851711818464298, + "learning_rate": 7.280931357564791e-07, + "loss": 0.6029, + "step": 7997 + }, + { + "epoch": 0.83, + "grad_norm": 2.003585262968676, + "learning_rate": 7.272185962103567e-07, + "loss": 0.5255, + "step": 7998 + }, + { + "epoch": 0.83, + "grad_norm": 1.808289594625113, + "learning_rate": 7.263445410083614e-07, + "loss": 0.6749, + "step": 7999 + }, + { + "epoch": 0.83, + "grad_norm": 1.8692318647300077, + "learning_rate": 7.254709702495721e-07, + "loss": 0.5514, + "step": 8000 + }, + { + "epoch": 0.83, + "grad_norm": 1.996444861038339, + "learning_rate": 7.245978840330103e-07, + "loss": 0.575, + "step": 8001 + }, + { + "epoch": 0.83, + "grad_norm": 2.0228741290262593, + "learning_rate": 7.23725282457649e-07, + "loss": 0.6426, + "step": 8002 + }, + { + "epoch": 0.83, + "grad_norm": 2.0017135428709207, + "learning_rate": 7.228531656223997e-07, + "loss": 0.6279, + "step": 8003 + }, + { + "epoch": 0.83, + "grad_norm": 1.894666164788839, + "learning_rate": 7.219815336261243e-07, + "loss": 0.5261, + "step": 8004 + }, + { + "epoch": 0.83, + "grad_norm": 2.1004694991805155, + "learning_rate": 7.211103865676255e-07, + "loss": 0.5761, + "step": 8005 + }, + { + "epoch": 0.83, + "grad_norm": 1.9736439337576523, + "learning_rate": 7.202397245456539e-07, + "loss": 0.6219, + "step": 8006 + }, + { + "epoch": 0.83, + "grad_norm": 1.9906929570826677, + "learning_rate": 7.193695476589019e-07, + "loss": 0.5685, + "step": 8007 + }, + { + "epoch": 0.83, + "grad_norm": 1.7979796734065363, + "learning_rate": 7.184998560060114e-07, + "loss": 0.5869, + "step": 8008 + }, + { + "epoch": 0.83, + "grad_norm": 1.9042889426569207, + "learning_rate": 7.176306496855651e-07, + "loss": 0.6146, + "step": 8009 + }, + { + "epoch": 0.83, + "grad_norm": 1.9254016912472625, + "learning_rate": 7.167619287960942e-07, + "loss": 0.581, + "step": 8010 + }, + { + "epoch": 0.83, + "grad_norm": 1.9512998168719646, + "learning_rate": 7.158936934360711e-07, + "loss": 0.6168, + "step": 8011 + }, + { + "epoch": 0.83, + "grad_norm": 2.046608135870485, + "learning_rate": 7.150259437039175e-07, + "loss": 0.7063, + "step": 8012 + }, + { + "epoch": 0.83, + "grad_norm": 1.878222927702504, + "learning_rate": 7.14158679697996e-07, + "loss": 0.6154, + "step": 8013 + }, + { + "epoch": 0.83, + "grad_norm": 1.881994367788152, + "learning_rate": 7.13291901516619e-07, + "loss": 0.5284, + "step": 8014 + }, + { + "epoch": 0.83, + "grad_norm": 2.123034281457884, + "learning_rate": 7.124256092580357e-07, + "loss": 0.6879, + "step": 8015 + }, + { + "epoch": 0.83, + "grad_norm": 1.9677989441827224, + "learning_rate": 7.11559803020449e-07, + "loss": 0.6354, + "step": 8016 + }, + { + "epoch": 0.83, + "grad_norm": 2.0902633305259357, + "learning_rate": 7.106944829020013e-07, + "loss": 0.5253, + "step": 8017 + }, + { + "epoch": 0.83, + "grad_norm": 2.038512209808121, + "learning_rate": 7.098296490007828e-07, + "loss": 0.6774, + "step": 8018 + }, + { + "epoch": 0.83, + "grad_norm": 1.7981807892646227, + "learning_rate": 7.089653014148263e-07, + "loss": 0.6563, + "step": 8019 + }, + { + "epoch": 0.83, + "grad_norm": 2.004711187157496, + "learning_rate": 7.081014402421115e-07, + "loss": 0.6193, + "step": 8020 + }, + { + "epoch": 0.83, + "grad_norm": 1.984447214093464, + "learning_rate": 7.072380655805617e-07, + "loss": 0.6576, + "step": 8021 + }, + { + "epoch": 0.83, + "grad_norm": 1.9593774644521333, + "learning_rate": 7.063751775280448e-07, + "loss": 0.5683, + "step": 8022 + }, + { + "epoch": 0.83, + "grad_norm": 2.012476005916973, + "learning_rate": 7.055127761823732e-07, + "loss": 0.6759, + "step": 8023 + }, + { + "epoch": 0.83, + "grad_norm": 2.445817481217792, + "learning_rate": 7.046508616413078e-07, + "loss": 0.6529, + "step": 8024 + }, + { + "epoch": 0.83, + "grad_norm": 2.1997155940021216, + "learning_rate": 7.037894340025487e-07, + "loss": 0.6053, + "step": 8025 + }, + { + "epoch": 0.83, + "grad_norm": 1.5564007641049542, + "learning_rate": 7.029284933637454e-07, + "loss": 0.5903, + "step": 8026 + }, + { + "epoch": 0.83, + "grad_norm": 2.1252616955848116, + "learning_rate": 7.020680398224893e-07, + "loss": 0.6022, + "step": 8027 + }, + { + "epoch": 0.83, + "grad_norm": 1.877781753370922, + "learning_rate": 7.012080734763205e-07, + "loss": 0.6165, + "step": 8028 + }, + { + "epoch": 0.83, + "grad_norm": 1.908376242028954, + "learning_rate": 7.003485944227162e-07, + "loss": 0.516, + "step": 8029 + }, + { + "epoch": 0.83, + "grad_norm": 1.8883975626092215, + "learning_rate": 6.994896027591074e-07, + "loss": 0.5823, + "step": 8030 + }, + { + "epoch": 0.83, + "grad_norm": 1.893765363384913, + "learning_rate": 6.986310985828626e-07, + "loss": 0.6621, + "step": 8031 + }, + { + "epoch": 0.83, + "grad_norm": 1.939654375998763, + "learning_rate": 6.977730819913015e-07, + "loss": 0.609, + "step": 8032 + }, + { + "epoch": 0.84, + "grad_norm": 1.9859890115855503, + "learning_rate": 6.969155530816824e-07, + "loss": 0.6512, + "step": 8033 + }, + { + "epoch": 0.84, + "grad_norm": 2.036896841652315, + "learning_rate": 6.960585119512125e-07, + "loss": 0.5567, + "step": 8034 + }, + { + "epoch": 0.84, + "grad_norm": 1.919130726074716, + "learning_rate": 6.952019586970416e-07, + "loss": 0.612, + "step": 8035 + }, + { + "epoch": 0.84, + "grad_norm": 1.928540621573469, + "learning_rate": 6.943458934162656e-07, + "loss": 0.6911, + "step": 8036 + }, + { + "epoch": 0.84, + "grad_norm": 1.9840568093800386, + "learning_rate": 6.934903162059242e-07, + "loss": 0.6367, + "step": 8037 + }, + { + "epoch": 0.84, + "grad_norm": 2.005740659612198, + "learning_rate": 6.92635227163001e-07, + "loss": 0.6262, + "step": 8038 + }, + { + "epoch": 0.84, + "grad_norm": 2.1474820104829084, + "learning_rate": 6.917806263844268e-07, + "loss": 0.6138, + "step": 8039 + }, + { + "epoch": 0.84, + "grad_norm": 2.1811307190465885, + "learning_rate": 6.909265139670735e-07, + "loss": 0.5507, + "step": 8040 + }, + { + "epoch": 0.84, + "grad_norm": 1.9714652581473788, + "learning_rate": 6.900728900077619e-07, + "loss": 0.6353, + "step": 8041 + }, + { + "epoch": 0.84, + "grad_norm": 2.335366259929589, + "learning_rate": 6.89219754603253e-07, + "loss": 0.7156, + "step": 8042 + }, + { + "epoch": 0.84, + "grad_norm": 2.0117714750139672, + "learning_rate": 6.883671078502574e-07, + "loss": 0.5447, + "step": 8043 + }, + { + "epoch": 0.84, + "grad_norm": 2.117046336034858, + "learning_rate": 6.875149498454237e-07, + "loss": 0.7257, + "step": 8044 + }, + { + "epoch": 0.84, + "grad_norm": 2.1060454141945257, + "learning_rate": 6.866632806853518e-07, + "loss": 0.7056, + "step": 8045 + }, + { + "epoch": 0.84, + "grad_norm": 1.8680810716410823, + "learning_rate": 6.858121004665813e-07, + "loss": 0.5665, + "step": 8046 + }, + { + "epoch": 0.84, + "grad_norm": 1.969897563001284, + "learning_rate": 6.849614092856005e-07, + "loss": 0.6128, + "step": 8047 + }, + { + "epoch": 0.84, + "grad_norm": 2.038296389027259, + "learning_rate": 6.841112072388373e-07, + "loss": 0.6069, + "step": 8048 + }, + { + "epoch": 0.84, + "grad_norm": 2.12047751623645, + "learning_rate": 6.832614944226695e-07, + "loss": 0.6651, + "step": 8049 + }, + { + "epoch": 0.84, + "grad_norm": 1.78106945111783, + "learning_rate": 6.824122709334152e-07, + "loss": 0.6632, + "step": 8050 + }, + { + "epoch": 0.84, + "grad_norm": 1.7715123778299606, + "learning_rate": 6.815635368673418e-07, + "loss": 0.5322, + "step": 8051 + }, + { + "epoch": 0.84, + "grad_norm": 1.9202887091570335, + "learning_rate": 6.807152923206528e-07, + "loss": 0.6464, + "step": 8052 + }, + { + "epoch": 0.84, + "grad_norm": 1.9938870648476452, + "learning_rate": 6.798675373895064e-07, + "loss": 0.6621, + "step": 8053 + }, + { + "epoch": 0.84, + "grad_norm": 2.0986027995623178, + "learning_rate": 6.790202721699968e-07, + "loss": 0.6684, + "step": 8054 + }, + { + "epoch": 0.84, + "grad_norm": 2.0705757135608573, + "learning_rate": 6.781734967581699e-07, + "loss": 0.5709, + "step": 8055 + }, + { + "epoch": 0.84, + "grad_norm": 2.001524224677035, + "learning_rate": 6.77327211250009e-07, + "loss": 0.5792, + "step": 8056 + }, + { + "epoch": 0.84, + "grad_norm": 1.7953776708987224, + "learning_rate": 6.764814157414484e-07, + "loss": 0.5446, + "step": 8057 + }, + { + "epoch": 0.84, + "grad_norm": 1.9216694263397198, + "learning_rate": 6.756361103283626e-07, + "loss": 0.5869, + "step": 8058 + }, + { + "epoch": 0.84, + "grad_norm": 2.23119073549652, + "learning_rate": 6.747912951065722e-07, + "loss": 0.72, + "step": 8059 + }, + { + "epoch": 0.84, + "grad_norm": 1.887099107100426, + "learning_rate": 6.739469701718398e-07, + "loss": 0.5626, + "step": 8060 + }, + { + "epoch": 0.84, + "grad_norm": 1.7724551449703356, + "learning_rate": 6.731031356198769e-07, + "loss": 0.5565, + "step": 8061 + }, + { + "epoch": 0.84, + "grad_norm": 1.8592960000825143, + "learning_rate": 6.722597915463352e-07, + "loss": 0.6031, + "step": 8062 + }, + { + "epoch": 0.84, + "grad_norm": 2.000586160391033, + "learning_rate": 6.714169380468144e-07, + "loss": 0.5987, + "step": 8063 + }, + { + "epoch": 0.84, + "grad_norm": 1.8188177079291683, + "learning_rate": 6.705745752168552e-07, + "loss": 0.5919, + "step": 8064 + }, + { + "epoch": 0.84, + "grad_norm": 1.8460808208474586, + "learning_rate": 6.697327031519452e-07, + "loss": 0.5777, + "step": 8065 + }, + { + "epoch": 0.84, + "grad_norm": 2.0220325407867525, + "learning_rate": 6.688913219475158e-07, + "loss": 0.6628, + "step": 8066 + }, + { + "epoch": 0.84, + "grad_norm": 1.9653708715581029, + "learning_rate": 6.680504316989405e-07, + "loss": 0.6537, + "step": 8067 + }, + { + "epoch": 0.84, + "grad_norm": 2.1103633643710333, + "learning_rate": 6.672100325015396e-07, + "loss": 0.7001, + "step": 8068 + }, + { + "epoch": 0.84, + "grad_norm": 1.930693121411337, + "learning_rate": 6.663701244505788e-07, + "loss": 0.6294, + "step": 8069 + }, + { + "epoch": 0.84, + "grad_norm": 2.0081188554424685, + "learning_rate": 6.655307076412637e-07, + "loss": 0.6435, + "step": 8070 + }, + { + "epoch": 0.84, + "grad_norm": 1.9797486027365578, + "learning_rate": 6.646917821687504e-07, + "loss": 0.6492, + "step": 8071 + }, + { + "epoch": 0.84, + "grad_norm": 1.8818227613869194, + "learning_rate": 6.638533481281323e-07, + "loss": 0.5762, + "step": 8072 + }, + { + "epoch": 0.84, + "grad_norm": 2.05125972802814, + "learning_rate": 6.630154056144533e-07, + "loss": 0.673, + "step": 8073 + }, + { + "epoch": 0.84, + "grad_norm": 1.7769660589729557, + "learning_rate": 6.621779547226986e-07, + "loss": 0.6233, + "step": 8074 + }, + { + "epoch": 0.84, + "grad_norm": 2.305973486593778, + "learning_rate": 6.613409955477962e-07, + "loss": 0.6659, + "step": 8075 + }, + { + "epoch": 0.84, + "grad_norm": 1.8901669506607903, + "learning_rate": 6.605045281846222e-07, + "loss": 0.6144, + "step": 8076 + }, + { + "epoch": 0.84, + "grad_norm": 2.0978112467258616, + "learning_rate": 6.596685527279939e-07, + "loss": 0.6885, + "step": 8077 + }, + { + "epoch": 0.84, + "grad_norm": 2.004337979444083, + "learning_rate": 6.588330692726747e-07, + "loss": 0.6176, + "step": 8078 + }, + { + "epoch": 0.84, + "grad_norm": 2.1383294050312145, + "learning_rate": 6.579980779133705e-07, + "loss": 0.674, + "step": 8079 + }, + { + "epoch": 0.84, + "grad_norm": 1.8627858607830665, + "learning_rate": 6.571635787447339e-07, + "loss": 0.5736, + "step": 8080 + }, + { + "epoch": 0.84, + "grad_norm": 2.059976230713306, + "learning_rate": 6.563295718613577e-07, + "loss": 0.7069, + "step": 8081 + }, + { + "epoch": 0.84, + "grad_norm": 2.0325541362148702, + "learning_rate": 6.554960573577834e-07, + "loss": 0.586, + "step": 8082 + }, + { + "epoch": 0.84, + "grad_norm": 1.9740764701873206, + "learning_rate": 6.546630353284927e-07, + "loss": 0.5326, + "step": 8083 + }, + { + "epoch": 0.84, + "grad_norm": 1.9563646646591624, + "learning_rate": 6.538305058679156e-07, + "loss": 0.6275, + "step": 8084 + }, + { + "epoch": 0.84, + "grad_norm": 2.0898251948049698, + "learning_rate": 6.529984690704222e-07, + "loss": 0.6423, + "step": 8085 + }, + { + "epoch": 0.84, + "grad_norm": 2.2789120695548575, + "learning_rate": 6.521669250303303e-07, + "loss": 0.6563, + "step": 8086 + }, + { + "epoch": 0.84, + "grad_norm": 1.9959617661791869, + "learning_rate": 6.51335873841899e-07, + "loss": 0.5937, + "step": 8087 + }, + { + "epoch": 0.84, + "grad_norm": 2.0252537703122244, + "learning_rate": 6.505053155993335e-07, + "loss": 0.6138, + "step": 8088 + }, + { + "epoch": 0.84, + "grad_norm": 1.9790505498454918, + "learning_rate": 6.496752503967801e-07, + "loss": 0.635, + "step": 8089 + }, + { + "epoch": 0.84, + "grad_norm": 2.223228305084311, + "learning_rate": 6.488456783283343e-07, + "loss": 0.6743, + "step": 8090 + }, + { + "epoch": 0.84, + "grad_norm": 2.2335882128652336, + "learning_rate": 6.480165994880311e-07, + "loss": 0.6312, + "step": 8091 + }, + { + "epoch": 0.84, + "grad_norm": 2.153211792356647, + "learning_rate": 6.471880139698523e-07, + "loss": 0.6157, + "step": 8092 + }, + { + "epoch": 0.84, + "grad_norm": 2.1742719609091137, + "learning_rate": 6.463599218677214e-07, + "loss": 0.6432, + "step": 8093 + }, + { + "epoch": 0.84, + "grad_norm": 2.0043189581847036, + "learning_rate": 6.455323232755095e-07, + "loss": 0.6517, + "step": 8094 + }, + { + "epoch": 0.84, + "grad_norm": 2.135564020927073, + "learning_rate": 6.447052182870284e-07, + "loss": 0.642, + "step": 8095 + }, + { + "epoch": 0.84, + "grad_norm": 1.9084088820683192, + "learning_rate": 6.438786069960345e-07, + "loss": 0.6292, + "step": 8096 + }, + { + "epoch": 0.84, + "grad_norm": 1.9232430703411438, + "learning_rate": 6.430524894962292e-07, + "loss": 0.6953, + "step": 8097 + }, + { + "epoch": 0.84, + "grad_norm": 1.9925604766847647, + "learning_rate": 6.422268658812591e-07, + "loss": 0.511, + "step": 8098 + }, + { + "epoch": 0.84, + "grad_norm": 2.0022550095449083, + "learning_rate": 6.414017362447106e-07, + "loss": 0.7423, + "step": 8099 + }, + { + "epoch": 0.84, + "grad_norm": 1.761768263429008, + "learning_rate": 6.405771006801198e-07, + "loss": 0.5593, + "step": 8100 + }, + { + "epoch": 0.84, + "grad_norm": 1.8505005255126203, + "learning_rate": 6.397529592809615e-07, + "loss": 0.6013, + "step": 8101 + }, + { + "epoch": 0.84, + "grad_norm": 2.0629170850220375, + "learning_rate": 6.389293121406592e-07, + "loss": 0.6523, + "step": 8102 + }, + { + "epoch": 0.84, + "grad_norm": 1.8557315352311599, + "learning_rate": 6.381061593525762e-07, + "loss": 0.6192, + "step": 8103 + }, + { + "epoch": 0.84, + "grad_norm": 1.8596773536895244, + "learning_rate": 6.372835010100215e-07, + "loss": 0.5742, + "step": 8104 + }, + { + "epoch": 0.84, + "grad_norm": 1.8124890471220407, + "learning_rate": 6.364613372062489e-07, + "loss": 0.5756, + "step": 8105 + }, + { + "epoch": 0.84, + "grad_norm": 1.9876410029659157, + "learning_rate": 6.356396680344556e-07, + "loss": 0.6143, + "step": 8106 + }, + { + "epoch": 0.84, + "grad_norm": 1.8949001243139065, + "learning_rate": 6.34818493587781e-07, + "loss": 0.6156, + "step": 8107 + }, + { + "epoch": 0.84, + "grad_norm": 2.449001211705069, + "learning_rate": 6.339978139593117e-07, + "loss": 0.6886, + "step": 8108 + }, + { + "epoch": 0.84, + "grad_norm": 1.9631147045408033, + "learning_rate": 6.331776292420744e-07, + "loss": 0.6128, + "step": 8109 + }, + { + "epoch": 0.84, + "grad_norm": 1.9368516611472004, + "learning_rate": 6.323579395290435e-07, + "loss": 0.5734, + "step": 8110 + }, + { + "epoch": 0.84, + "grad_norm": 1.8401589565954515, + "learning_rate": 6.315387449131355e-07, + "loss": 0.6432, + "step": 8111 + }, + { + "epoch": 0.84, + "grad_norm": 1.8297334426235043, + "learning_rate": 6.307200454872093e-07, + "loss": 0.5988, + "step": 8112 + }, + { + "epoch": 0.84, + "grad_norm": 1.703991110615374, + "learning_rate": 6.299018413440705e-07, + "loss": 0.5624, + "step": 8113 + }, + { + "epoch": 0.84, + "grad_norm": 2.036173943589799, + "learning_rate": 6.290841325764662e-07, + "loss": 0.6786, + "step": 8114 + }, + { + "epoch": 0.84, + "grad_norm": 1.9534205152629416, + "learning_rate": 6.282669192770896e-07, + "loss": 0.6305, + "step": 8115 + }, + { + "epoch": 0.84, + "grad_norm": 2.203379643878614, + "learning_rate": 6.274502015385747e-07, + "loss": 0.61, + "step": 8116 + }, + { + "epoch": 0.84, + "grad_norm": 2.0103680975969453, + "learning_rate": 6.266339794535043e-07, + "loss": 0.6402, + "step": 8117 + }, + { + "epoch": 0.84, + "grad_norm": 1.916711741876715, + "learning_rate": 6.258182531143975e-07, + "loss": 0.56, + "step": 8118 + }, + { + "epoch": 0.84, + "grad_norm": 1.941936713481425, + "learning_rate": 6.250030226137249e-07, + "loss": 0.6702, + "step": 8119 + }, + { + "epoch": 0.84, + "grad_norm": 2.0225054472897175, + "learning_rate": 6.241882880438949e-07, + "loss": 0.6005, + "step": 8120 + }, + { + "epoch": 0.84, + "grad_norm": 1.8856948884191638, + "learning_rate": 6.233740494972651e-07, + "loss": 0.5536, + "step": 8121 + }, + { + "epoch": 0.84, + "grad_norm": 1.9042258310577747, + "learning_rate": 6.225603070661318e-07, + "loss": 0.5368, + "step": 8122 + }, + { + "epoch": 0.84, + "grad_norm": 2.004419780870736, + "learning_rate": 6.217470608427395e-07, + "loss": 0.5644, + "step": 8123 + }, + { + "epoch": 0.84, + "grad_norm": 1.9291607571745164, + "learning_rate": 6.209343109192728e-07, + "loss": 0.5955, + "step": 8124 + }, + { + "epoch": 0.84, + "grad_norm": 2.125008453554657, + "learning_rate": 6.201220573878613e-07, + "loss": 0.694, + "step": 8125 + }, + { + "epoch": 0.84, + "grad_norm": 2.0979216517156836, + "learning_rate": 6.193103003405787e-07, + "loss": 0.7127, + "step": 8126 + }, + { + "epoch": 0.84, + "grad_norm": 2.134679203517581, + "learning_rate": 6.184990398694435e-07, + "loss": 0.6655, + "step": 8127 + }, + { + "epoch": 0.84, + "grad_norm": 1.8835391406410493, + "learning_rate": 6.176882760664149e-07, + "loss": 0.545, + "step": 8128 + }, + { + "epoch": 0.85, + "grad_norm": 1.9960800018697822, + "learning_rate": 6.168780090233994e-07, + "loss": 0.6728, + "step": 8129 + }, + { + "epoch": 0.85, + "grad_norm": 2.419923774205455, + "learning_rate": 6.160682388322436e-07, + "loss": 0.5715, + "step": 8130 + }, + { + "epoch": 0.85, + "grad_norm": 2.0042116275814434, + "learning_rate": 6.152589655847413e-07, + "loss": 0.5711, + "step": 8131 + }, + { + "epoch": 0.85, + "grad_norm": 1.9220772781429358, + "learning_rate": 6.14450189372628e-07, + "loss": 0.5321, + "step": 8132 + }, + { + "epoch": 0.85, + "grad_norm": 2.015869273943935, + "learning_rate": 6.136419102875818e-07, + "loss": 0.6413, + "step": 8133 + }, + { + "epoch": 0.85, + "grad_norm": 1.7662095296346332, + "learning_rate": 6.128341284212258e-07, + "loss": 0.5262, + "step": 8134 + }, + { + "epoch": 0.85, + "grad_norm": 2.303831477540973, + "learning_rate": 6.120268438651283e-07, + "loss": 0.7143, + "step": 8135 + }, + { + "epoch": 0.85, + "grad_norm": 2.3042755907621832, + "learning_rate": 6.112200567107978e-07, + "loss": 0.7399, + "step": 8136 + }, + { + "epoch": 0.85, + "grad_norm": 2.071953461037566, + "learning_rate": 6.104137670496901e-07, + "loss": 0.5517, + "step": 8137 + }, + { + "epoch": 0.85, + "grad_norm": 1.9353300309979724, + "learning_rate": 6.096079749732009e-07, + "loss": 0.6453, + "step": 8138 + }, + { + "epoch": 0.85, + "grad_norm": 1.986671989877575, + "learning_rate": 6.088026805726727e-07, + "loss": 0.6424, + "step": 8139 + }, + { + "epoch": 0.85, + "grad_norm": 1.9911146195693263, + "learning_rate": 6.079978839393896e-07, + "loss": 0.5474, + "step": 8140 + }, + { + "epoch": 0.85, + "grad_norm": 2.0317851662272974, + "learning_rate": 6.071935851645794e-07, + "loss": 0.5149, + "step": 8141 + }, + { + "epoch": 0.85, + "grad_norm": 1.9665291558339688, + "learning_rate": 6.063897843394151e-07, + "loss": 0.6124, + "step": 8142 + }, + { + "epoch": 0.85, + "grad_norm": 1.7078468937680698, + "learning_rate": 6.055864815550106e-07, + "loss": 0.5798, + "step": 8143 + }, + { + "epoch": 0.85, + "grad_norm": 1.5752220742547929, + "learning_rate": 6.047836769024268e-07, + "loss": 0.5167, + "step": 8144 + }, + { + "epoch": 0.85, + "grad_norm": 1.9074047548237727, + "learning_rate": 6.03981370472665e-07, + "loss": 0.6603, + "step": 8145 + }, + { + "epoch": 0.85, + "grad_norm": 2.1307330401924562, + "learning_rate": 6.031795623566705e-07, + "loss": 0.6601, + "step": 8146 + }, + { + "epoch": 0.85, + "grad_norm": 1.987581419382653, + "learning_rate": 6.023782526453347e-07, + "loss": 0.5877, + "step": 8147 + }, + { + "epoch": 0.85, + "grad_norm": 1.9220353371587902, + "learning_rate": 6.015774414294894e-07, + "loss": 0.5671, + "step": 8148 + }, + { + "epoch": 0.85, + "grad_norm": 1.8531354376553826, + "learning_rate": 6.007771287999104e-07, + "loss": 0.6311, + "step": 8149 + }, + { + "epoch": 0.85, + "grad_norm": 2.0038979840409037, + "learning_rate": 5.999773148473193e-07, + "loss": 0.5818, + "step": 8150 + }, + { + "epoch": 0.85, + "grad_norm": 1.7687166304424737, + "learning_rate": 5.991779996623781e-07, + "loss": 0.6035, + "step": 8151 + }, + { + "epoch": 0.85, + "grad_norm": 2.126672984023645, + "learning_rate": 5.983791833356955e-07, + "loss": 0.5741, + "step": 8152 + }, + { + "epoch": 0.85, + "grad_norm": 1.7741173112022408, + "learning_rate": 5.975808659578197e-07, + "loss": 0.5839, + "step": 8153 + }, + { + "epoch": 0.85, + "grad_norm": 2.0262142221276727, + "learning_rate": 5.967830476192476e-07, + "loss": 0.5856, + "step": 8154 + }, + { + "epoch": 0.85, + "grad_norm": 1.8676247618096367, + "learning_rate": 5.959857284104132e-07, + "loss": 0.6758, + "step": 8155 + }, + { + "epoch": 0.85, + "grad_norm": 2.059672622044571, + "learning_rate": 5.951889084216989e-07, + "loss": 0.4721, + "step": 8156 + }, + { + "epoch": 0.85, + "grad_norm": 2.1535823631847846, + "learning_rate": 5.943925877434276e-07, + "loss": 0.6405, + "step": 8157 + }, + { + "epoch": 0.85, + "grad_norm": 2.1292794880898303, + "learning_rate": 5.935967664658682e-07, + "loss": 0.5985, + "step": 8158 + }, + { + "epoch": 0.85, + "grad_norm": 1.960817142560252, + "learning_rate": 5.928014446792308e-07, + "loss": 0.5587, + "step": 8159 + }, + { + "epoch": 0.85, + "grad_norm": 1.8036953737602626, + "learning_rate": 5.920066224736703e-07, + "loss": 0.5876, + "step": 8160 + }, + { + "epoch": 0.85, + "grad_norm": 2.017494671286078, + "learning_rate": 5.912122999392838e-07, + "loss": 0.6468, + "step": 8161 + }, + { + "epoch": 0.85, + "grad_norm": 1.9745326076384333, + "learning_rate": 5.904184771661126e-07, + "loss": 0.6042, + "step": 8162 + }, + { + "epoch": 0.85, + "grad_norm": 2.115718656081801, + "learning_rate": 5.896251542441395e-07, + "loss": 0.6424, + "step": 8163 + }, + { + "epoch": 0.85, + "grad_norm": 2.152383828453354, + "learning_rate": 5.888323312632948e-07, + "loss": 0.5845, + "step": 8164 + }, + { + "epoch": 0.85, + "grad_norm": 1.8618214723348339, + "learning_rate": 5.880400083134469e-07, + "loss": 0.5711, + "step": 8165 + }, + { + "epoch": 0.85, + "grad_norm": 1.9004306913700046, + "learning_rate": 5.872481854844126e-07, + "loss": 0.6468, + "step": 8166 + }, + { + "epoch": 0.85, + "grad_norm": 2.079916826005283, + "learning_rate": 5.864568628659473e-07, + "loss": 0.5972, + "step": 8167 + }, + { + "epoch": 0.85, + "grad_norm": 1.9150425887906342, + "learning_rate": 5.856660405477538e-07, + "loss": 0.5951, + "step": 8168 + }, + { + "epoch": 0.85, + "grad_norm": 1.9768982180845163, + "learning_rate": 5.848757186194753e-07, + "loss": 0.661, + "step": 8169 + }, + { + "epoch": 0.85, + "grad_norm": 2.0685273523691246, + "learning_rate": 5.840858971707003e-07, + "loss": 0.6187, + "step": 8170 + }, + { + "epoch": 0.85, + "grad_norm": 2.20740535387764, + "learning_rate": 5.83296576290957e-07, + "loss": 0.61, + "step": 8171 + }, + { + "epoch": 0.85, + "grad_norm": 2.077062544429413, + "learning_rate": 5.825077560697224e-07, + "loss": 0.6323, + "step": 8172 + }, + { + "epoch": 0.85, + "grad_norm": 2.2229980382125083, + "learning_rate": 5.817194365964113e-07, + "loss": 0.6538, + "step": 8173 + }, + { + "epoch": 0.85, + "grad_norm": 2.0776525637885204, + "learning_rate": 5.809316179603863e-07, + "loss": 0.6899, + "step": 8174 + }, + { + "epoch": 0.85, + "grad_norm": 2.060688398838161, + "learning_rate": 5.801443002509493e-07, + "loss": 0.6166, + "step": 8175 + }, + { + "epoch": 0.85, + "grad_norm": 1.9882609174919457, + "learning_rate": 5.793574835573495e-07, + "loss": 0.6165, + "step": 8176 + }, + { + "epoch": 0.85, + "grad_norm": 2.1539749506952197, + "learning_rate": 5.785711679687756e-07, + "loss": 0.6126, + "step": 8177 + }, + { + "epoch": 0.85, + "grad_norm": 2.009397380360969, + "learning_rate": 5.777853535743605e-07, + "loss": 0.539, + "step": 8178 + }, + { + "epoch": 0.85, + "grad_norm": 1.701887467794953, + "learning_rate": 5.770000404631815e-07, + "loss": 0.5223, + "step": 8179 + }, + { + "epoch": 0.85, + "grad_norm": 1.8337089423095534, + "learning_rate": 5.762152287242578e-07, + "loss": 0.523, + "step": 8180 + }, + { + "epoch": 0.85, + "grad_norm": 2.190954607822149, + "learning_rate": 5.754309184465534e-07, + "loss": 0.5291, + "step": 8181 + }, + { + "epoch": 0.85, + "grad_norm": 1.9339239124719618, + "learning_rate": 5.746471097189727e-07, + "loss": 0.6148, + "step": 8182 + }, + { + "epoch": 0.85, + "grad_norm": 1.8286911849784362, + "learning_rate": 5.738638026303672e-07, + "loss": 0.6589, + "step": 8183 + }, + { + "epoch": 0.85, + "grad_norm": 1.763185449223713, + "learning_rate": 5.730809972695272e-07, + "loss": 0.6251, + "step": 8184 + }, + { + "epoch": 0.85, + "grad_norm": 2.144097951689894, + "learning_rate": 5.72298693725189e-07, + "loss": 0.6271, + "step": 8185 + }, + { + "epoch": 0.85, + "grad_norm": 2.1576327396917168, + "learning_rate": 5.715168920860298e-07, + "loss": 0.5955, + "step": 8186 + }, + { + "epoch": 0.85, + "grad_norm": 2.2777450538296677, + "learning_rate": 5.707355924406738e-07, + "loss": 0.7751, + "step": 8187 + }, + { + "epoch": 0.85, + "grad_norm": 2.2047094872367006, + "learning_rate": 5.699547948776829e-07, + "loss": 0.6282, + "step": 8188 + }, + { + "epoch": 0.85, + "grad_norm": 1.9091741539508933, + "learning_rate": 5.691744994855675e-07, + "loss": 0.6249, + "step": 8189 + }, + { + "epoch": 0.85, + "grad_norm": 2.021439937726755, + "learning_rate": 5.683947063527762e-07, + "loss": 0.565, + "step": 8190 + }, + { + "epoch": 0.85, + "grad_norm": 1.729258043483863, + "learning_rate": 5.676154155677066e-07, + "loss": 0.5358, + "step": 8191 + }, + { + "epoch": 0.85, + "grad_norm": 1.929628451697072, + "learning_rate": 5.668366272186915e-07, + "loss": 0.5293, + "step": 8192 + }, + { + "epoch": 0.85, + "grad_norm": 1.8951688227059889, + "learning_rate": 5.660583413940135e-07, + "loss": 0.6399, + "step": 8193 + }, + { + "epoch": 0.85, + "grad_norm": 2.0138368029617912, + "learning_rate": 5.652805581818943e-07, + "loss": 0.5991, + "step": 8194 + }, + { + "epoch": 0.85, + "grad_norm": 1.9355511215261672, + "learning_rate": 5.645032776705023e-07, + "loss": 0.5545, + "step": 8195 + }, + { + "epoch": 0.85, + "grad_norm": 2.068165286932841, + "learning_rate": 5.637264999479436e-07, + "loss": 0.6252, + "step": 8196 + }, + { + "epoch": 0.85, + "grad_norm": 1.8040737962816091, + "learning_rate": 5.629502251022734e-07, + "loss": 0.6947, + "step": 8197 + }, + { + "epoch": 0.85, + "grad_norm": 2.1167954493638685, + "learning_rate": 5.621744532214856e-07, + "loss": 0.6703, + "step": 8198 + }, + { + "epoch": 0.85, + "grad_norm": 1.8845230887915438, + "learning_rate": 5.613991843935179e-07, + "loss": 0.6548, + "step": 8199 + }, + { + "epoch": 0.85, + "grad_norm": 1.8029172725787435, + "learning_rate": 5.606244187062509e-07, + "loss": 0.6001, + "step": 8200 + }, + { + "epoch": 0.85, + "grad_norm": 1.9588614842084962, + "learning_rate": 5.598501562475111e-07, + "loss": 0.5883, + "step": 8201 + }, + { + "epoch": 0.85, + "grad_norm": 1.7162881782665964, + "learning_rate": 5.590763971050628e-07, + "loss": 0.5193, + "step": 8202 + }, + { + "epoch": 0.85, + "grad_norm": 2.048313840022909, + "learning_rate": 5.583031413666185e-07, + "loss": 0.6895, + "step": 8203 + }, + { + "epoch": 0.85, + "grad_norm": 1.9457017560438867, + "learning_rate": 5.575303891198286e-07, + "loss": 0.5658, + "step": 8204 + }, + { + "epoch": 0.85, + "grad_norm": 1.880442705012502, + "learning_rate": 5.567581404522914e-07, + "loss": 0.5756, + "step": 8205 + }, + { + "epoch": 0.85, + "grad_norm": 1.8335074588378506, + "learning_rate": 5.559863954515448e-07, + "loss": 0.554, + "step": 8206 + }, + { + "epoch": 0.85, + "grad_norm": 1.957864056813018, + "learning_rate": 5.552151542050699e-07, + "loss": 0.7151, + "step": 8207 + }, + { + "epoch": 0.85, + "grad_norm": 2.0410162015627153, + "learning_rate": 5.54444416800291e-07, + "loss": 0.6489, + "step": 8208 + }, + { + "epoch": 0.85, + "grad_norm": 1.966686920765713, + "learning_rate": 5.536741833245773e-07, + "loss": 0.5739, + "step": 8209 + }, + { + "epoch": 0.85, + "grad_norm": 2.097614872666767, + "learning_rate": 5.529044538652373e-07, + "loss": 0.6079, + "step": 8210 + }, + { + "epoch": 0.85, + "grad_norm": 1.8126670428732163, + "learning_rate": 5.521352285095261e-07, + "loss": 0.5025, + "step": 8211 + }, + { + "epoch": 0.85, + "grad_norm": 1.9132465488593897, + "learning_rate": 5.513665073446372e-07, + "loss": 0.6297, + "step": 8212 + }, + { + "epoch": 0.85, + "grad_norm": 1.9350220871821595, + "learning_rate": 5.505982904577123e-07, + "loss": 0.615, + "step": 8213 + }, + { + "epoch": 0.85, + "grad_norm": 2.1347049899077284, + "learning_rate": 5.49830577935832e-07, + "loss": 0.5208, + "step": 8214 + }, + { + "epoch": 0.85, + "grad_norm": 1.8637329068323893, + "learning_rate": 5.490633698660197e-07, + "loss": 0.5808, + "step": 8215 + }, + { + "epoch": 0.85, + "grad_norm": 1.948838872573589, + "learning_rate": 5.482966663352451e-07, + "loss": 0.621, + "step": 8216 + }, + { + "epoch": 0.85, + "grad_norm": 2.0493579656488325, + "learning_rate": 5.47530467430416e-07, + "loss": 0.622, + "step": 8217 + }, + { + "epoch": 0.85, + "grad_norm": 1.918223916486134, + "learning_rate": 5.467647732383879e-07, + "loss": 0.608, + "step": 8218 + }, + { + "epoch": 0.85, + "grad_norm": 1.7803045728773015, + "learning_rate": 5.459995838459542e-07, + "loss": 0.5959, + "step": 8219 + }, + { + "epoch": 0.85, + "grad_norm": 1.9266062790106375, + "learning_rate": 5.452348993398566e-07, + "loss": 0.5599, + "step": 8220 + }, + { + "epoch": 0.85, + "grad_norm": 1.7707431361868795, + "learning_rate": 5.444707198067722e-07, + "loss": 0.6657, + "step": 8221 + }, + { + "epoch": 0.85, + "grad_norm": 1.8932789462090363, + "learning_rate": 5.437070453333288e-07, + "loss": 0.6384, + "step": 8222 + }, + { + "epoch": 0.85, + "grad_norm": 2.236832426733744, + "learning_rate": 5.429438760060906e-07, + "loss": 0.5938, + "step": 8223 + }, + { + "epoch": 0.85, + "grad_norm": 1.9219044021119076, + "learning_rate": 5.421812119115699e-07, + "loss": 0.6044, + "step": 8224 + }, + { + "epoch": 0.85, + "grad_norm": 2.2466433945686757, + "learning_rate": 5.414190531362162e-07, + "loss": 0.6077, + "step": 8225 + }, + { + "epoch": 0.86, + "grad_norm": 1.8069595964244884, + "learning_rate": 5.406573997664267e-07, + "loss": 0.5446, + "step": 8226 + }, + { + "epoch": 0.86, + "grad_norm": 1.8885534423348074, + "learning_rate": 5.398962518885375e-07, + "loss": 0.5408, + "step": 8227 + }, + { + "epoch": 0.86, + "grad_norm": 2.093745092026466, + "learning_rate": 5.391356095888323e-07, + "loss": 0.6371, + "step": 8228 + }, + { + "epoch": 0.86, + "grad_norm": 1.9475019077484441, + "learning_rate": 5.38375472953529e-07, + "loss": 0.6218, + "step": 8229 + }, + { + "epoch": 0.86, + "grad_norm": 2.050764295266713, + "learning_rate": 5.376158420687977e-07, + "loss": 0.7155, + "step": 8230 + }, + { + "epoch": 0.86, + "grad_norm": 2.039172342224332, + "learning_rate": 5.368567170207445e-07, + "loss": 0.6407, + "step": 8231 + }, + { + "epoch": 0.86, + "grad_norm": 1.9356989051958875, + "learning_rate": 5.360980978954223e-07, + "loss": 0.5751, + "step": 8232 + }, + { + "epoch": 0.86, + "grad_norm": 1.908719869422782, + "learning_rate": 5.353399847788233e-07, + "loss": 0.6832, + "step": 8233 + }, + { + "epoch": 0.86, + "grad_norm": 1.9156768167886171, + "learning_rate": 5.345823777568859e-07, + "loss": 0.6937, + "step": 8234 + }, + { + "epoch": 0.86, + "grad_norm": 2.094528279582242, + "learning_rate": 5.338252769154878e-07, + "loss": 0.5975, + "step": 8235 + }, + { + "epoch": 0.86, + "grad_norm": 1.9826801963733196, + "learning_rate": 5.330686823404507e-07, + "loss": 0.5109, + "step": 8236 + }, + { + "epoch": 0.86, + "grad_norm": 2.1642191778237656, + "learning_rate": 5.323125941175383e-07, + "loss": 0.6296, + "step": 8237 + }, + { + "epoch": 0.86, + "grad_norm": 2.1050313683713204, + "learning_rate": 5.315570123324593e-07, + "loss": 0.5841, + "step": 8238 + }, + { + "epoch": 0.86, + "grad_norm": 1.86255900999276, + "learning_rate": 5.308019370708612e-07, + "loss": 0.5258, + "step": 8239 + }, + { + "epoch": 0.86, + "grad_norm": 2.0814563238442214, + "learning_rate": 5.300473684183382e-07, + "loss": 0.6346, + "step": 8240 + }, + { + "epoch": 0.86, + "grad_norm": 2.004366161593308, + "learning_rate": 5.292933064604228e-07, + "loss": 0.6448, + "step": 8241 + }, + { + "epoch": 0.86, + "grad_norm": 2.0814251969657196, + "learning_rate": 5.28539751282594e-07, + "loss": 0.7272, + "step": 8242 + }, + { + "epoch": 0.86, + "grad_norm": 1.8919312569321771, + "learning_rate": 5.277867029702716e-07, + "loss": 0.5276, + "step": 8243 + }, + { + "epoch": 0.86, + "grad_norm": 1.9044934817666754, + "learning_rate": 5.270341616088153e-07, + "loss": 0.5631, + "step": 8244 + }, + { + "epoch": 0.86, + "grad_norm": 1.8961844755275268, + "learning_rate": 5.262821272835334e-07, + "loss": 0.5554, + "step": 8245 + }, + { + "epoch": 0.86, + "grad_norm": 2.1041404998731177, + "learning_rate": 5.255306000796717e-07, + "loss": 0.5704, + "step": 8246 + }, + { + "epoch": 0.86, + "grad_norm": 2.039862049268198, + "learning_rate": 5.24779580082419e-07, + "loss": 0.5524, + "step": 8247 + }, + { + "epoch": 0.86, + "grad_norm": 1.8011189836643542, + "learning_rate": 5.240290673769099e-07, + "loss": 0.5643, + "step": 8248 + }, + { + "epoch": 0.86, + "grad_norm": 1.7674719230033473, + "learning_rate": 5.23279062048217e-07, + "loss": 0.5861, + "step": 8249 + }, + { + "epoch": 0.86, + "grad_norm": 1.6875423096614102, + "learning_rate": 5.225295641813599e-07, + "loss": 0.5266, + "step": 8250 + }, + { + "epoch": 0.86, + "grad_norm": 2.1684829276074384, + "learning_rate": 5.217805738612975e-07, + "loss": 0.563, + "step": 8251 + }, + { + "epoch": 0.86, + "grad_norm": 2.008210226253471, + "learning_rate": 5.210320911729311e-07, + "loss": 0.5562, + "step": 8252 + }, + { + "epoch": 0.86, + "grad_norm": 2.146514487983114, + "learning_rate": 5.202841162011074e-07, + "loss": 0.6285, + "step": 8253 + }, + { + "epoch": 0.86, + "grad_norm": 2.070368443828562, + "learning_rate": 5.195366490306114e-07, + "loss": 0.5777, + "step": 8254 + }, + { + "epoch": 0.86, + "grad_norm": 1.9585353530387317, + "learning_rate": 5.187896897461752e-07, + "loss": 0.5651, + "step": 8255 + }, + { + "epoch": 0.86, + "grad_norm": 2.179156757136343, + "learning_rate": 5.180432384324691e-07, + "loss": 0.6963, + "step": 8256 + }, + { + "epoch": 0.86, + "grad_norm": 1.9217899295161627, + "learning_rate": 5.172972951741096e-07, + "loss": 0.5194, + "step": 8257 + }, + { + "epoch": 0.86, + "grad_norm": 1.7949492707964125, + "learning_rate": 5.165518600556507e-07, + "loss": 0.4705, + "step": 8258 + }, + { + "epoch": 0.86, + "grad_norm": 2.089497265943696, + "learning_rate": 5.158069331615939e-07, + "loss": 0.6488, + "step": 8259 + }, + { + "epoch": 0.86, + "grad_norm": 1.925905784209994, + "learning_rate": 5.150625145763794e-07, + "loss": 0.523, + "step": 8260 + }, + { + "epoch": 0.86, + "grad_norm": 1.8012988786439579, + "learning_rate": 5.143186043843934e-07, + "loss": 0.5625, + "step": 8261 + }, + { + "epoch": 0.86, + "grad_norm": 2.073140541720193, + "learning_rate": 5.135752026699597e-07, + "loss": 0.776, + "step": 8262 + }, + { + "epoch": 0.86, + "grad_norm": 2.0938632113238547, + "learning_rate": 5.128323095173498e-07, + "loss": 0.6787, + "step": 8263 + }, + { + "epoch": 0.86, + "grad_norm": 1.8505470192539382, + "learning_rate": 5.12089925010773e-07, + "loss": 0.6027, + "step": 8264 + }, + { + "epoch": 0.86, + "grad_norm": 1.9385142627925132, + "learning_rate": 5.113480492343847e-07, + "loss": 0.5588, + "step": 8265 + }, + { + "epoch": 0.86, + "grad_norm": 2.375733866717919, + "learning_rate": 5.106066822722782e-07, + "loss": 0.66, + "step": 8266 + }, + { + "epoch": 0.86, + "grad_norm": 1.9370422187030516, + "learning_rate": 5.098658242084937e-07, + "loss": 0.6152, + "step": 8267 + }, + { + "epoch": 0.86, + "grad_norm": 1.9627422041290383, + "learning_rate": 5.091254751270097e-07, + "loss": 0.6244, + "step": 8268 + }, + { + "epoch": 0.86, + "grad_norm": 1.9673460760767303, + "learning_rate": 5.083856351117511e-07, + "loss": 0.6486, + "step": 8269 + }, + { + "epoch": 0.86, + "grad_norm": 1.9677413688817145, + "learning_rate": 5.076463042465812e-07, + "loss": 0.6466, + "step": 8270 + }, + { + "epoch": 0.86, + "grad_norm": 1.9076042255087213, + "learning_rate": 5.069074826153097e-07, + "loss": 0.7091, + "step": 8271 + }, + { + "epoch": 0.86, + "grad_norm": 2.1522781967814955, + "learning_rate": 5.061691703016841e-07, + "loss": 0.5958, + "step": 8272 + }, + { + "epoch": 0.86, + "grad_norm": 1.9512598833988453, + "learning_rate": 5.054313673893979e-07, + "loss": 0.6494, + "step": 8273 + }, + { + "epoch": 0.86, + "grad_norm": 1.8889340369572525, + "learning_rate": 5.046940739620826e-07, + "loss": 0.5436, + "step": 8274 + }, + { + "epoch": 0.86, + "grad_norm": 2.1089818787586467, + "learning_rate": 5.039572901033179e-07, + "loss": 0.6802, + "step": 8275 + }, + { + "epoch": 0.86, + "grad_norm": 1.9862877189958716, + "learning_rate": 5.0322101589662e-07, + "loss": 0.6129, + "step": 8276 + }, + { + "epoch": 0.86, + "grad_norm": 1.8141324404301526, + "learning_rate": 5.024852514254513e-07, + "loss": 0.5729, + "step": 8277 + }, + { + "epoch": 0.86, + "grad_norm": 2.0791012063858805, + "learning_rate": 5.017499967732137e-07, + "loss": 0.6168, + "step": 8278 + }, + { + "epoch": 0.86, + "grad_norm": 2.2094091772410938, + "learning_rate": 5.010152520232536e-07, + "loss": 0.6785, + "step": 8279 + }, + { + "epoch": 0.86, + "grad_norm": 2.0704609733536175, + "learning_rate": 5.002810172588584e-07, + "loss": 0.6584, + "step": 8280 + }, + { + "epoch": 0.86, + "grad_norm": 1.9853255752421886, + "learning_rate": 4.995472925632567e-07, + "loss": 0.6358, + "step": 8281 + }, + { + "epoch": 0.86, + "grad_norm": 1.9531326047376758, + "learning_rate": 4.988140780196221e-07, + "loss": 0.652, + "step": 8282 + }, + { + "epoch": 0.86, + "grad_norm": 2.057213242161284, + "learning_rate": 4.980813737110662e-07, + "loss": 0.601, + "step": 8283 + }, + { + "epoch": 0.86, + "grad_norm": 2.105331445169244, + "learning_rate": 4.973491797206481e-07, + "loss": 0.5704, + "step": 8284 + }, + { + "epoch": 0.86, + "grad_norm": 1.9364053789589217, + "learning_rate": 4.966174961313646e-07, + "loss": 0.6009, + "step": 8285 + }, + { + "epoch": 0.86, + "grad_norm": 2.0882062233429544, + "learning_rate": 4.958863230261551e-07, + "loss": 0.5458, + "step": 8286 + }, + { + "epoch": 0.86, + "grad_norm": 1.9799264378167545, + "learning_rate": 4.951556604879049e-07, + "loss": 0.582, + "step": 8287 + }, + { + "epoch": 0.86, + "grad_norm": 2.021495582372774, + "learning_rate": 4.94425508599437e-07, + "loss": 0.5471, + "step": 8288 + }, + { + "epoch": 0.86, + "grad_norm": 1.8753911264845755, + "learning_rate": 4.936958674435178e-07, + "loss": 0.6074, + "step": 8289 + }, + { + "epoch": 0.86, + "grad_norm": 2.066023846671249, + "learning_rate": 4.929667371028579e-07, + "loss": 0.6597, + "step": 8290 + }, + { + "epoch": 0.86, + "grad_norm": 2.150677404952387, + "learning_rate": 4.922381176601066e-07, + "loss": 0.5113, + "step": 8291 + }, + { + "epoch": 0.86, + "grad_norm": 2.054898643812894, + "learning_rate": 4.915100091978591e-07, + "loss": 0.6124, + "step": 8292 + }, + { + "epoch": 0.86, + "grad_norm": 1.6742532360960376, + "learning_rate": 4.907824117986487e-07, + "loss": 0.5931, + "step": 8293 + }, + { + "epoch": 0.86, + "grad_norm": 1.8695315925969587, + "learning_rate": 4.900553255449553e-07, + "loss": 0.5876, + "step": 8294 + }, + { + "epoch": 0.86, + "grad_norm": 1.8359415649511754, + "learning_rate": 4.893287505191946e-07, + "loss": 0.5155, + "step": 8295 + }, + { + "epoch": 0.86, + "grad_norm": 2.06769544820391, + "learning_rate": 4.886026868037313e-07, + "loss": 0.5753, + "step": 8296 + }, + { + "epoch": 0.86, + "grad_norm": 2.0138686917169335, + "learning_rate": 4.878771344808664e-07, + "loss": 0.5813, + "step": 8297 + }, + { + "epoch": 0.86, + "grad_norm": 1.9528925103015966, + "learning_rate": 4.871520936328478e-07, + "loss": 0.5845, + "step": 8298 + }, + { + "epoch": 0.86, + "grad_norm": 1.8439522634772305, + "learning_rate": 4.864275643418603e-07, + "loss": 0.502, + "step": 8299 + }, + { + "epoch": 0.86, + "grad_norm": 2.2204098320288224, + "learning_rate": 4.857035466900361e-07, + "loss": 0.6992, + "step": 8300 + }, + { + "epoch": 0.86, + "grad_norm": 2.3494412708615866, + "learning_rate": 4.849800407594446e-07, + "loss": 0.5811, + "step": 8301 + }, + { + "epoch": 0.86, + "grad_norm": 2.3615193836986235, + "learning_rate": 4.842570466321023e-07, + "loss": 0.6805, + "step": 8302 + }, + { + "epoch": 0.86, + "grad_norm": 1.9959876511710395, + "learning_rate": 4.835345643899609e-07, + "loss": 0.6178, + "step": 8303 + }, + { + "epoch": 0.86, + "grad_norm": 2.1046472978265878, + "learning_rate": 4.828125941149197e-07, + "loss": 0.524, + "step": 8304 + }, + { + "epoch": 0.86, + "grad_norm": 1.8352720574795676, + "learning_rate": 4.820911358888181e-07, + "loss": 0.6001, + "step": 8305 + }, + { + "epoch": 0.86, + "grad_norm": 1.7599800381591997, + "learning_rate": 4.813701897934375e-07, + "loss": 0.5603, + "step": 8306 + }, + { + "epoch": 0.86, + "grad_norm": 2.268456220684496, + "learning_rate": 4.806497559105011e-07, + "loss": 0.5543, + "step": 8307 + }, + { + "epoch": 0.86, + "grad_norm": 1.8005414284190293, + "learning_rate": 4.799298343216746e-07, + "loss": 0.6011, + "step": 8308 + }, + { + "epoch": 0.86, + "grad_norm": 1.925175771444624, + "learning_rate": 4.792104251085655e-07, + "loss": 0.5503, + "step": 8309 + }, + { + "epoch": 0.86, + "grad_norm": 2.3450122517710894, + "learning_rate": 4.784915283527219e-07, + "loss": 0.6776, + "step": 8310 + }, + { + "epoch": 0.86, + "grad_norm": 2.1628264585537806, + "learning_rate": 4.777731441356342e-07, + "loss": 0.5846, + "step": 8311 + }, + { + "epoch": 0.86, + "grad_norm": 1.6450519414376317, + "learning_rate": 4.770552725387378e-07, + "loss": 0.5043, + "step": 8312 + }, + { + "epoch": 0.86, + "grad_norm": 1.7482690149775968, + "learning_rate": 4.763379136434054e-07, + "loss": 0.6179, + "step": 8313 + }, + { + "epoch": 0.86, + "grad_norm": 1.9801836667225938, + "learning_rate": 4.7562106753095527e-07, + "loss": 0.5958, + "step": 8314 + }, + { + "epoch": 0.86, + "grad_norm": 1.9733287042306242, + "learning_rate": 4.7490473428264406e-07, + "loss": 0.7162, + "step": 8315 + }, + { + "epoch": 0.86, + "grad_norm": 1.8725031647315058, + "learning_rate": 4.741889139796746e-07, + "loss": 0.5833, + "step": 8316 + }, + { + "epoch": 0.86, + "grad_norm": 1.799784467033496, + "learning_rate": 4.7347360670318756e-07, + "loss": 0.6006, + "step": 8317 + }, + { + "epoch": 0.86, + "grad_norm": 1.9366110237287402, + "learning_rate": 4.727588125342669e-07, + "loss": 0.5255, + "step": 8318 + }, + { + "epoch": 0.86, + "grad_norm": 2.234824200192645, + "learning_rate": 4.7204453155394013e-07, + "loss": 0.6816, + "step": 8319 + }, + { + "epoch": 0.86, + "grad_norm": 2.5122506337921116, + "learning_rate": 4.7133076384317354e-07, + "loss": 0.6069, + "step": 8320 + }, + { + "epoch": 0.86, + "grad_norm": 2.119122687155672, + "learning_rate": 4.70617509482878e-07, + "loss": 0.6328, + "step": 8321 + }, + { + "epoch": 0.87, + "grad_norm": 2.004192201331832, + "learning_rate": 4.699047685539038e-07, + "loss": 0.5006, + "step": 8322 + }, + { + "epoch": 0.87, + "grad_norm": 1.916590258992686, + "learning_rate": 4.6919254113704515e-07, + "loss": 0.5811, + "step": 8323 + }, + { + "epoch": 0.87, + "grad_norm": 1.6538686628596007, + "learning_rate": 4.68480827313037e-07, + "loss": 0.5711, + "step": 8324 + }, + { + "epoch": 0.87, + "grad_norm": 1.6500225555034003, + "learning_rate": 4.6776962716255593e-07, + "loss": 0.5336, + "step": 8325 + }, + { + "epoch": 0.87, + "grad_norm": 1.9802573141561102, + "learning_rate": 4.670589407662196e-07, + "loss": 0.5819, + "step": 8326 + }, + { + "epoch": 0.87, + "grad_norm": 2.0088327902438525, + "learning_rate": 4.663487682045903e-07, + "loss": 0.5182, + "step": 8327 + }, + { + "epoch": 0.87, + "grad_norm": 2.017583462703393, + "learning_rate": 4.656391095581675e-07, + "loss": 0.6915, + "step": 8328 + }, + { + "epoch": 0.87, + "grad_norm": 2.1208095680699808, + "learning_rate": 4.6492996490739796e-07, + "loss": 0.6893, + "step": 8329 + }, + { + "epoch": 0.87, + "grad_norm": 1.9534148806244596, + "learning_rate": 4.6422133433266513e-07, + "loss": 0.5895, + "step": 8330 + }, + { + "epoch": 0.87, + "grad_norm": 1.8000895668108274, + "learning_rate": 4.6351321791429924e-07, + "loss": 0.5725, + "step": 8331 + }, + { + "epoch": 0.87, + "grad_norm": 1.9340943599774802, + "learning_rate": 4.62805615732565e-07, + "loss": 0.6447, + "step": 8332 + }, + { + "epoch": 0.87, + "grad_norm": 1.8956460910814394, + "learning_rate": 4.6209852786767593e-07, + "loss": 0.5395, + "step": 8333 + }, + { + "epoch": 0.87, + "grad_norm": 1.709032151810743, + "learning_rate": 4.613919543997836e-07, + "loss": 0.5488, + "step": 8334 + }, + { + "epoch": 0.87, + "grad_norm": 2.113498349240345, + "learning_rate": 4.606858954089827e-07, + "loss": 0.6202, + "step": 8335 + }, + { + "epoch": 0.87, + "grad_norm": 2.034226927445555, + "learning_rate": 4.599803509753081e-07, + "loss": 0.6481, + "step": 8336 + }, + { + "epoch": 0.87, + "grad_norm": 1.853159457928419, + "learning_rate": 4.592753211787393e-07, + "loss": 0.6445, + "step": 8337 + }, + { + "epoch": 0.87, + "grad_norm": 2.1242559862625416, + "learning_rate": 4.585708060991928e-07, + "loss": 0.6081, + "step": 8338 + }, + { + "epoch": 0.87, + "grad_norm": 2.2173927759277388, + "learning_rate": 4.578668058165325e-07, + "loss": 0.6526, + "step": 8339 + }, + { + "epoch": 0.87, + "grad_norm": 1.9183380234550136, + "learning_rate": 4.571633204105574e-07, + "loss": 0.5716, + "step": 8340 + }, + { + "epoch": 0.87, + "grad_norm": 1.852559147624212, + "learning_rate": 4.564603499610143e-07, + "loss": 0.6098, + "step": 8341 + }, + { + "epoch": 0.87, + "grad_norm": 1.9970457306785048, + "learning_rate": 4.5575789454758656e-07, + "loss": 0.6656, + "step": 8342 + }, + { + "epoch": 0.87, + "grad_norm": 2.102913775343231, + "learning_rate": 4.5505595424990446e-07, + "loss": 0.6814, + "step": 8343 + }, + { + "epoch": 0.87, + "grad_norm": 2.1924316214071427, + "learning_rate": 4.5435452914753377e-07, + "loss": 0.5918, + "step": 8344 + }, + { + "epoch": 0.87, + "grad_norm": 2.16432717318907, + "learning_rate": 4.5365361931998696e-07, + "loss": 0.6633, + "step": 8345 + }, + { + "epoch": 0.87, + "grad_norm": 2.0535418697441377, + "learning_rate": 4.5295322484671667e-07, + "loss": 0.5843, + "step": 8346 + }, + { + "epoch": 0.87, + "grad_norm": 1.9299352635441742, + "learning_rate": 4.522533458071149e-07, + "loss": 0.6031, + "step": 8347 + }, + { + "epoch": 0.87, + "grad_norm": 1.8036773453246178, + "learning_rate": 4.5155398228051707e-07, + "loss": 0.5917, + "step": 8348 + }, + { + "epoch": 0.87, + "grad_norm": 1.9397877871030635, + "learning_rate": 4.508551343462014e-07, + "loss": 0.6233, + "step": 8349 + }, + { + "epoch": 0.87, + "grad_norm": 2.252675458486445, + "learning_rate": 4.501568020833846e-07, + "loss": 0.5963, + "step": 8350 + }, + { + "epoch": 0.87, + "grad_norm": 2.04584672503017, + "learning_rate": 4.4945898557122893e-07, + "loss": 0.592, + "step": 8351 + }, + { + "epoch": 0.87, + "grad_norm": 2.1792705337695466, + "learning_rate": 4.4876168488883267e-07, + "loss": 0.6744, + "step": 8352 + }, + { + "epoch": 0.87, + "grad_norm": 1.852796108199507, + "learning_rate": 4.4806490011524205e-07, + "loss": 0.5102, + "step": 8353 + }, + { + "epoch": 0.87, + "grad_norm": 1.5734956707152514, + "learning_rate": 4.473686313294401e-07, + "loss": 0.5494, + "step": 8354 + }, + { + "epoch": 0.87, + "grad_norm": 1.7716365972878156, + "learning_rate": 4.466728786103519e-07, + "loss": 0.5574, + "step": 8355 + }, + { + "epoch": 0.87, + "grad_norm": 1.9458084993387372, + "learning_rate": 4.4597764203684725e-07, + "loss": 0.5328, + "step": 8356 + }, + { + "epoch": 0.87, + "grad_norm": 2.197405580309809, + "learning_rate": 4.4528292168773303e-07, + "loss": 0.6607, + "step": 8357 + }, + { + "epoch": 0.87, + "grad_norm": 2.0426609447432154, + "learning_rate": 4.445887176417613e-07, + "loss": 0.6092, + "step": 8358 + }, + { + "epoch": 0.87, + "grad_norm": 1.9637826343530485, + "learning_rate": 4.4389502997762236e-07, + "loss": 0.5815, + "step": 8359 + }, + { + "epoch": 0.87, + "grad_norm": 1.9580038792141665, + "learning_rate": 4.432018587739517e-07, + "loss": 0.6124, + "step": 8360 + }, + { + "epoch": 0.87, + "grad_norm": 1.8027490658262888, + "learning_rate": 4.425092041093237e-07, + "loss": 0.5564, + "step": 8361 + }, + { + "epoch": 0.87, + "grad_norm": 2.890109404627932, + "learning_rate": 4.418170660622539e-07, + "loss": 0.6641, + "step": 8362 + }, + { + "epoch": 0.87, + "grad_norm": 1.9496169997143404, + "learning_rate": 4.4112544471119954e-07, + "loss": 0.6565, + "step": 8363 + }, + { + "epoch": 0.87, + "grad_norm": 2.1410985482384, + "learning_rate": 4.404343401345612e-07, + "loss": 0.7114, + "step": 8364 + }, + { + "epoch": 0.87, + "grad_norm": 1.7444177282452187, + "learning_rate": 4.39743752410679e-07, + "loss": 0.6074, + "step": 8365 + }, + { + "epoch": 0.87, + "grad_norm": 1.9722413049276115, + "learning_rate": 4.390536816178353e-07, + "loss": 0.6795, + "step": 8366 + }, + { + "epoch": 0.87, + "grad_norm": 1.8931068079905056, + "learning_rate": 4.3836412783425265e-07, + "loss": 0.6789, + "step": 8367 + }, + { + "epoch": 0.87, + "grad_norm": 1.956276084763643, + "learning_rate": 4.3767509113809836e-07, + "loss": 0.6538, + "step": 8368 + }, + { + "epoch": 0.87, + "grad_norm": 1.7819181415754366, + "learning_rate": 4.3698657160747504e-07, + "loss": 0.6105, + "step": 8369 + }, + { + "epoch": 0.87, + "grad_norm": 1.852985794251653, + "learning_rate": 4.36298569320433e-07, + "loss": 0.4922, + "step": 8370 + }, + { + "epoch": 0.87, + "grad_norm": 2.074863975049127, + "learning_rate": 4.3561108435495936e-07, + "loss": 0.5791, + "step": 8371 + }, + { + "epoch": 0.87, + "grad_norm": 1.8898011683002591, + "learning_rate": 4.349241167889867e-07, + "loss": 0.6205, + "step": 8372 + }, + { + "epoch": 0.87, + "grad_norm": 1.859913410648297, + "learning_rate": 4.342376667003845e-07, + "loss": 0.6641, + "step": 8373 + }, + { + "epoch": 0.87, + "grad_norm": 1.835582843864355, + "learning_rate": 4.335517341669676e-07, + "loss": 0.5306, + "step": 8374 + }, + { + "epoch": 0.87, + "grad_norm": 2.1789774219199276, + "learning_rate": 4.3286631926648834e-07, + "loss": 0.6437, + "step": 8375 + }, + { + "epoch": 0.87, + "grad_norm": 2.1690881296258766, + "learning_rate": 4.321814220766457e-07, + "loss": 0.5847, + "step": 8376 + }, + { + "epoch": 0.87, + "grad_norm": 1.9311374676014503, + "learning_rate": 4.3149704267507254e-07, + "loss": 0.5837, + "step": 8377 + }, + { + "epoch": 0.87, + "grad_norm": 1.8601406230446484, + "learning_rate": 4.3081318113935013e-07, + "loss": 0.5966, + "step": 8378 + }, + { + "epoch": 0.87, + "grad_norm": 1.9117613858119065, + "learning_rate": 4.3012983754699645e-07, + "loss": 0.5844, + "step": 8379 + }, + { + "epoch": 0.87, + "grad_norm": 2.1005104659206735, + "learning_rate": 4.29447011975474e-07, + "loss": 0.5632, + "step": 8380 + }, + { + "epoch": 0.87, + "grad_norm": 1.8160285873217523, + "learning_rate": 4.2876470450218254e-07, + "loss": 0.5465, + "step": 8381 + }, + { + "epoch": 0.87, + "grad_norm": 1.8764970657380244, + "learning_rate": 4.2808291520446856e-07, + "loss": 0.6718, + "step": 8382 + }, + { + "epoch": 0.87, + "grad_norm": 1.9302392899798297, + "learning_rate": 4.274016441596146e-07, + "loss": 0.6263, + "step": 8383 + }, + { + "epoch": 0.87, + "grad_norm": 2.380086794220247, + "learning_rate": 4.267208914448467e-07, + "loss": 0.6717, + "step": 8384 + }, + { + "epoch": 0.87, + "grad_norm": 1.8467666239151335, + "learning_rate": 4.2604065713733376e-07, + "loss": 0.4995, + "step": 8385 + }, + { + "epoch": 0.87, + "grad_norm": 2.1309058059197357, + "learning_rate": 4.253609413141824e-07, + "loss": 0.6071, + "step": 8386 + }, + { + "epoch": 0.87, + "grad_norm": 2.056063519146599, + "learning_rate": 4.2468174405244255e-07, + "loss": 0.6373, + "step": 8387 + }, + { + "epoch": 0.87, + "grad_norm": 1.997005731740509, + "learning_rate": 4.240030654291061e-07, + "loss": 0.6215, + "step": 8388 + }, + { + "epoch": 0.87, + "grad_norm": 1.8805146093287572, + "learning_rate": 4.2332490552110363e-07, + "loss": 0.6959, + "step": 8389 + }, + { + "epoch": 0.87, + "grad_norm": 1.9311754319444367, + "learning_rate": 4.2264726440531036e-07, + "loss": 0.6109, + "step": 8390 + }, + { + "epoch": 0.87, + "grad_norm": 1.9639047760658772, + "learning_rate": 4.2197014215853926e-07, + "loss": 0.6419, + "step": 8391 + }, + { + "epoch": 0.87, + "grad_norm": 1.9795175845484239, + "learning_rate": 4.2129353885754564e-07, + "loss": 0.6291, + "step": 8392 + }, + { + "epoch": 0.87, + "grad_norm": 2.249186746122543, + "learning_rate": 4.206174545790281e-07, + "loss": 0.5673, + "step": 8393 + }, + { + "epoch": 0.87, + "grad_norm": 2.253664003461183, + "learning_rate": 4.199418893996232e-07, + "loss": 0.6916, + "step": 8394 + }, + { + "epoch": 0.87, + "grad_norm": 2.1217256531704485, + "learning_rate": 4.192668433959113e-07, + "loss": 0.6001, + "step": 8395 + }, + { + "epoch": 0.87, + "grad_norm": 1.9901519915823938, + "learning_rate": 4.1859231664441115e-07, + "loss": 0.5294, + "step": 8396 + }, + { + "epoch": 0.87, + "grad_norm": 1.9997277010717542, + "learning_rate": 4.179183092215855e-07, + "loss": 0.7411, + "step": 8397 + }, + { + "epoch": 0.87, + "grad_norm": 2.165446184188909, + "learning_rate": 4.172448212038371e-07, + "loss": 0.6355, + "step": 8398 + }, + { + "epoch": 0.87, + "grad_norm": 2.11255812027217, + "learning_rate": 4.165718526675083e-07, + "loss": 0.5731, + "step": 8399 + }, + { + "epoch": 0.87, + "grad_norm": 1.919796366514845, + "learning_rate": 4.158994036888847e-07, + "loss": 0.5309, + "step": 8400 + }, + { + "epoch": 0.87, + "grad_norm": 2.390639655248388, + "learning_rate": 4.152274743441925e-07, + "loss": 0.5966, + "step": 8401 + }, + { + "epoch": 0.87, + "grad_norm": 1.738681794830981, + "learning_rate": 4.1455606470959755e-07, + "loss": 0.4943, + "step": 8402 + }, + { + "epoch": 0.87, + "grad_norm": 2.2208990337370897, + "learning_rate": 4.138851748612099e-07, + "loss": 0.6778, + "step": 8403 + }, + { + "epoch": 0.87, + "grad_norm": 2.0630590778482705, + "learning_rate": 4.132148048750767e-07, + "loss": 0.5795, + "step": 8404 + }, + { + "epoch": 0.87, + "grad_norm": 1.9018986833626161, + "learning_rate": 4.125449548271909e-07, + "loss": 0.5599, + "step": 8405 + }, + { + "epoch": 0.87, + "grad_norm": 2.0210569711520066, + "learning_rate": 4.118756247934802e-07, + "loss": 0.6431, + "step": 8406 + }, + { + "epoch": 0.87, + "grad_norm": 2.0602854396292396, + "learning_rate": 4.112068148498199e-07, + "loss": 0.7268, + "step": 8407 + }, + { + "epoch": 0.87, + "grad_norm": 1.9460298623096413, + "learning_rate": 4.1053852507202117e-07, + "loss": 0.6629, + "step": 8408 + }, + { + "epoch": 0.87, + "grad_norm": 1.7595563936601968, + "learning_rate": 4.098707555358411e-07, + "loss": 0.5799, + "step": 8409 + }, + { + "epoch": 0.87, + "grad_norm": 2.0271997118305483, + "learning_rate": 4.092035063169725e-07, + "loss": 0.6046, + "step": 8410 + }, + { + "epoch": 0.87, + "grad_norm": 2.0289865415330928, + "learning_rate": 4.0853677749105426e-07, + "loss": 0.5695, + "step": 8411 + }, + { + "epoch": 0.87, + "grad_norm": 1.9559717154468896, + "learning_rate": 4.078705691336621e-07, + "loss": 0.6128, + "step": 8412 + }, + { + "epoch": 0.87, + "grad_norm": 1.8769271149503086, + "learning_rate": 4.072048813203161e-07, + "loss": 0.6361, + "step": 8413 + }, + { + "epoch": 0.87, + "grad_norm": 2.0028628194390103, + "learning_rate": 4.065397141264737e-07, + "loss": 0.6202, + "step": 8414 + }, + { + "epoch": 0.87, + "grad_norm": 1.9916985701401626, + "learning_rate": 4.0587506762753747e-07, + "loss": 0.6754, + "step": 8415 + }, + { + "epoch": 0.87, + "grad_norm": 1.9127873478665742, + "learning_rate": 4.05210941898847e-07, + "loss": 0.5592, + "step": 8416 + }, + { + "epoch": 0.87, + "grad_norm": 2.0282432551440217, + "learning_rate": 4.045473370156866e-07, + "loss": 0.7055, + "step": 8417 + }, + { + "epoch": 0.88, + "grad_norm": 1.8459979565416385, + "learning_rate": 4.038842530532777e-07, + "loss": 0.5877, + "step": 8418 + }, + { + "epoch": 0.88, + "grad_norm": 1.8860989137846904, + "learning_rate": 4.032216900867869e-07, + "loss": 0.5879, + "step": 8419 + }, + { + "epoch": 0.88, + "grad_norm": 2.303077578313458, + "learning_rate": 4.02559648191318e-07, + "loss": 0.6251, + "step": 8420 + }, + { + "epoch": 0.88, + "grad_norm": 2.0308831657626274, + "learning_rate": 4.0189812744191647e-07, + "loss": 0.6051, + "step": 8421 + }, + { + "epoch": 0.88, + "grad_norm": 1.784559200096564, + "learning_rate": 4.0123712791357185e-07, + "loss": 0.5577, + "step": 8422 + }, + { + "epoch": 0.88, + "grad_norm": 2.0297559541852874, + "learning_rate": 4.005766496812097e-07, + "loss": 0.6529, + "step": 8423 + }, + { + "epoch": 0.88, + "grad_norm": 1.9111708728025478, + "learning_rate": 3.999166928197007e-07, + "loss": 0.5998, + "step": 8424 + }, + { + "epoch": 0.88, + "grad_norm": 2.132095921466122, + "learning_rate": 3.9925725740385503e-07, + "loss": 0.6852, + "step": 8425 + }, + { + "epoch": 0.88, + "grad_norm": 1.8697129123540708, + "learning_rate": 3.985983435084212e-07, + "loss": 0.5694, + "step": 8426 + }, + { + "epoch": 0.88, + "grad_norm": 1.9320332116706223, + "learning_rate": 3.979399512080928e-07, + "loss": 0.6194, + "step": 8427 + }, + { + "epoch": 0.88, + "grad_norm": 1.7865258361708811, + "learning_rate": 3.972820805775024e-07, + "loss": 0.5196, + "step": 8428 + }, + { + "epoch": 0.88, + "grad_norm": 1.9635188121091862, + "learning_rate": 3.966247316912214e-07, + "loss": 0.6628, + "step": 8429 + }, + { + "epoch": 0.88, + "grad_norm": 2.052755454266565, + "learning_rate": 3.959679046237663e-07, + "loss": 0.5529, + "step": 8430 + }, + { + "epoch": 0.88, + "grad_norm": 2.2660073623651082, + "learning_rate": 3.9531159944959094e-07, + "loss": 0.5532, + "step": 8431 + }, + { + "epoch": 0.88, + "grad_norm": 2.1619277389271128, + "learning_rate": 3.9465581624309245e-07, + "loss": 0.6274, + "step": 8432 + }, + { + "epoch": 0.88, + "grad_norm": 1.9488850141266936, + "learning_rate": 3.9400055507860525e-07, + "loss": 0.5921, + "step": 8433 + }, + { + "epoch": 0.88, + "grad_norm": 1.9310958922073735, + "learning_rate": 3.933458160304099e-07, + "loss": 0.5115, + "step": 8434 + }, + { + "epoch": 0.88, + "grad_norm": 1.7377699202843322, + "learning_rate": 3.9269159917272313e-07, + "loss": 0.5034, + "step": 8435 + }, + { + "epoch": 0.88, + "grad_norm": 2.020931667657547, + "learning_rate": 3.920379045797046e-07, + "loss": 0.6726, + "step": 8436 + }, + { + "epoch": 0.88, + "grad_norm": 2.045919126231745, + "learning_rate": 3.9138473232545326e-07, + "loss": 0.6392, + "step": 8437 + }, + { + "epoch": 0.88, + "grad_norm": 1.9556100117349695, + "learning_rate": 3.907320824840111e-07, + "loss": 0.6564, + "step": 8438 + }, + { + "epoch": 0.88, + "grad_norm": 1.830463491913251, + "learning_rate": 3.900799551293588e-07, + "loss": 0.5916, + "step": 8439 + }, + { + "epoch": 0.88, + "grad_norm": 2.117119952335453, + "learning_rate": 3.8942835033542013e-07, + "loss": 0.6425, + "step": 8440 + }, + { + "epoch": 0.88, + "grad_norm": 2.351242053992765, + "learning_rate": 3.8877726817605655e-07, + "loss": 0.6671, + "step": 8441 + }, + { + "epoch": 0.88, + "grad_norm": 1.9606052679245647, + "learning_rate": 3.8812670872507454e-07, + "loss": 0.572, + "step": 8442 + }, + { + "epoch": 0.88, + "grad_norm": 2.001344741894956, + "learning_rate": 3.874766720562151e-07, + "loss": 0.6716, + "step": 8443 + }, + { + "epoch": 0.88, + "grad_norm": 1.8404479443713242, + "learning_rate": 3.8682715824316594e-07, + "loss": 0.588, + "step": 8444 + }, + { + "epoch": 0.88, + "grad_norm": 1.9912689065492977, + "learning_rate": 3.861781673595516e-07, + "loss": 0.5291, + "step": 8445 + }, + { + "epoch": 0.88, + "grad_norm": 1.8709075400500266, + "learning_rate": 3.855296994789415e-07, + "loss": 0.6012, + "step": 8446 + }, + { + "epoch": 0.88, + "grad_norm": 2.1625558589548106, + "learning_rate": 3.8488175467484015e-07, + "loss": 0.5877, + "step": 8447 + }, + { + "epoch": 0.88, + "grad_norm": 2.0313618701656835, + "learning_rate": 3.8423433302069824e-07, + "loss": 0.6104, + "step": 8448 + }, + { + "epoch": 0.88, + "grad_norm": 1.76595693209394, + "learning_rate": 3.8358743458990267e-07, + "loss": 0.5309, + "step": 8449 + }, + { + "epoch": 0.88, + "grad_norm": 1.9556132619540931, + "learning_rate": 3.829410594557859e-07, + "loss": 0.5177, + "step": 8450 + }, + { + "epoch": 0.88, + "grad_norm": 1.9424701856527413, + "learning_rate": 3.8229520769161474e-07, + "loss": 0.5511, + "step": 8451 + }, + { + "epoch": 0.88, + "grad_norm": 2.0982651348249606, + "learning_rate": 3.816498793706025e-07, + "loss": 0.6457, + "step": 8452 + }, + { + "epoch": 0.88, + "grad_norm": 2.1999610052684497, + "learning_rate": 3.810050745658994e-07, + "loss": 0.7338, + "step": 8453 + }, + { + "epoch": 0.88, + "grad_norm": 1.9720801346355596, + "learning_rate": 3.803607933505993e-07, + "loss": 0.5504, + "step": 8454 + }, + { + "epoch": 0.88, + "grad_norm": 1.9568723610962957, + "learning_rate": 3.797170357977337e-07, + "loss": 0.633, + "step": 8455 + }, + { + "epoch": 0.88, + "grad_norm": 1.9100529824005883, + "learning_rate": 3.790738019802775e-07, + "loss": 0.6134, + "step": 8456 + }, + { + "epoch": 0.88, + "grad_norm": 1.997788576607131, + "learning_rate": 3.7843109197114426e-07, + "loss": 0.7004, + "step": 8457 + }, + { + "epoch": 0.88, + "grad_norm": 2.0031467913611256, + "learning_rate": 3.7778890584318773e-07, + "loss": 0.6788, + "step": 8458 + }, + { + "epoch": 0.88, + "grad_norm": 1.9267261626502268, + "learning_rate": 3.771472436692053e-07, + "loss": 0.6607, + "step": 8459 + }, + { + "epoch": 0.88, + "grad_norm": 2.0280693159384757, + "learning_rate": 3.7650610552193157e-07, + "loss": 0.6408, + "step": 8460 + }, + { + "epoch": 0.88, + "grad_norm": 2.047634744310128, + "learning_rate": 3.758654914740445e-07, + "loss": 0.5969, + "step": 8461 + }, + { + "epoch": 0.88, + "grad_norm": 2.1014846487101164, + "learning_rate": 3.752254015981599e-07, + "loss": 0.6298, + "step": 8462 + }, + { + "epoch": 0.88, + "grad_norm": 1.9895447225484795, + "learning_rate": 3.745858359668375e-07, + "loss": 0.6054, + "step": 8463 + }, + { + "epoch": 0.88, + "grad_norm": 1.90717225882143, + "learning_rate": 3.7394679465257435e-07, + "loss": 0.5842, + "step": 8464 + }, + { + "epoch": 0.88, + "grad_norm": 2.051788494418991, + "learning_rate": 3.7330827772780967e-07, + "loss": 0.5711, + "step": 8465 + }, + { + "epoch": 0.88, + "grad_norm": 1.9343377474924703, + "learning_rate": 3.726702852649222e-07, + "loss": 0.5839, + "step": 8466 + }, + { + "epoch": 0.88, + "grad_norm": 2.3496780116095097, + "learning_rate": 3.720328173362331e-07, + "loss": 0.6441, + "step": 8467 + }, + { + "epoch": 0.88, + "grad_norm": 2.050871079546467, + "learning_rate": 3.713958740140022e-07, + "loss": 0.6083, + "step": 8468 + }, + { + "epoch": 0.88, + "grad_norm": 1.9364655434741551, + "learning_rate": 3.7075945537043244e-07, + "loss": 0.5007, + "step": 8469 + }, + { + "epoch": 0.88, + "grad_norm": 2.1012266768594476, + "learning_rate": 3.701235614776627e-07, + "loss": 0.7012, + "step": 8470 + }, + { + "epoch": 0.88, + "grad_norm": 2.0691007203153036, + "learning_rate": 3.6948819240777756e-07, + "loss": 0.5571, + "step": 8471 + }, + { + "epoch": 0.88, + "grad_norm": 1.7160709095761875, + "learning_rate": 3.688533482327994e-07, + "loss": 0.5777, + "step": 8472 + }, + { + "epoch": 0.88, + "grad_norm": 1.8206494989587712, + "learning_rate": 3.6821902902469066e-07, + "loss": 0.525, + "step": 8473 + }, + { + "epoch": 0.88, + "grad_norm": 1.711893505353368, + "learning_rate": 3.675852348553538e-07, + "loss": 0.5737, + "step": 8474 + }, + { + "epoch": 0.88, + "grad_norm": 2.033288025220472, + "learning_rate": 3.6695196579663583e-07, + "loss": 0.534, + "step": 8475 + }, + { + "epoch": 0.88, + "grad_norm": 2.0240144326187415, + "learning_rate": 3.6631922192031934e-07, + "loss": 0.7105, + "step": 8476 + }, + { + "epoch": 0.88, + "grad_norm": 2.2155007112046166, + "learning_rate": 3.656870032981308e-07, + "loss": 0.627, + "step": 8477 + }, + { + "epoch": 0.88, + "grad_norm": 2.021651389737251, + "learning_rate": 3.65055310001734e-07, + "loss": 0.6434, + "step": 8478 + }, + { + "epoch": 0.88, + "grad_norm": 2.350132802250893, + "learning_rate": 3.6442414210273834e-07, + "loss": 0.6545, + "step": 8479 + }, + { + "epoch": 0.88, + "grad_norm": 2.043985363928126, + "learning_rate": 3.637934996726861e-07, + "loss": 0.6513, + "step": 8480 + }, + { + "epoch": 0.88, + "grad_norm": 1.817926490943509, + "learning_rate": 3.6316338278306715e-07, + "loss": 0.621, + "step": 8481 + }, + { + "epoch": 0.88, + "grad_norm": 2.1668932428840284, + "learning_rate": 3.6253379150530676e-07, + "loss": 0.6419, + "step": 8482 + }, + { + "epoch": 0.88, + "grad_norm": 2.0924027965702496, + "learning_rate": 3.619047259107744e-07, + "loss": 0.7059, + "step": 8483 + }, + { + "epoch": 0.88, + "grad_norm": 1.973897319602242, + "learning_rate": 3.6127618607077754e-07, + "loss": 0.6232, + "step": 8484 + }, + { + "epoch": 0.88, + "grad_norm": 2.1019528532461513, + "learning_rate": 3.606481720565652e-07, + "loss": 0.6515, + "step": 8485 + }, + { + "epoch": 0.88, + "grad_norm": 1.8078476936519057, + "learning_rate": 3.600206839393261e-07, + "loss": 0.5487, + "step": 8486 + }, + { + "epoch": 0.88, + "grad_norm": 2.6102160702115826, + "learning_rate": 3.593937217901894e-07, + "loss": 0.6213, + "step": 8487 + }, + { + "epoch": 0.88, + "grad_norm": 1.7697681408557449, + "learning_rate": 3.587672856802238e-07, + "loss": 0.6497, + "step": 8488 + }, + { + "epoch": 0.88, + "grad_norm": 1.9259481594858332, + "learning_rate": 3.581413756804414e-07, + "loss": 0.6138, + "step": 8489 + }, + { + "epoch": 0.88, + "grad_norm": 2.078149202383723, + "learning_rate": 3.5751599186179155e-07, + "loss": 0.604, + "step": 8490 + }, + { + "epoch": 0.88, + "grad_norm": 2.014467331458678, + "learning_rate": 3.568911342951653e-07, + "loss": 0.7161, + "step": 8491 + }, + { + "epoch": 0.88, + "grad_norm": 1.9429447928577988, + "learning_rate": 3.5626680305139383e-07, + "loss": 0.5646, + "step": 8492 + }, + { + "epoch": 0.88, + "grad_norm": 1.8978526166540983, + "learning_rate": 3.5564299820124883e-07, + "loss": 0.6737, + "step": 8493 + }, + { + "epoch": 0.88, + "grad_norm": 1.78340609310645, + "learning_rate": 3.5501971981544206e-07, + "loss": 0.6035, + "step": 8494 + }, + { + "epoch": 0.88, + "grad_norm": 2.1841185692701757, + "learning_rate": 3.5439696796462474e-07, + "loss": 0.687, + "step": 8495 + }, + { + "epoch": 0.88, + "grad_norm": 1.9069053525425088, + "learning_rate": 3.5377474271939103e-07, + "loss": 0.5794, + "step": 8496 + }, + { + "epoch": 0.88, + "grad_norm": 1.9781368239694577, + "learning_rate": 3.531530441502723e-07, + "loss": 0.6754, + "step": 8497 + }, + { + "epoch": 0.88, + "grad_norm": 1.787260799174934, + "learning_rate": 3.525318723277427e-07, + "loss": 0.621, + "step": 8498 + }, + { + "epoch": 0.88, + "grad_norm": 1.8797198389956506, + "learning_rate": 3.519112273222142e-07, + "loss": 0.5571, + "step": 8499 + }, + { + "epoch": 0.88, + "grad_norm": 2.0387277170307674, + "learning_rate": 3.512911092040422e-07, + "loss": 0.6346, + "step": 8500 + }, + { + "epoch": 0.88, + "grad_norm": 1.9635088913195, + "learning_rate": 3.506715180435194e-07, + "loss": 0.6559, + "step": 8501 + }, + { + "epoch": 0.88, + "grad_norm": 1.8491276195881368, + "learning_rate": 3.500524539108807e-07, + "loss": 0.5446, + "step": 8502 + }, + { + "epoch": 0.88, + "grad_norm": 1.9939506343422873, + "learning_rate": 3.4943391687629946e-07, + "loss": 0.4936, + "step": 8503 + }, + { + "epoch": 0.88, + "grad_norm": 1.9671648301858564, + "learning_rate": 3.4881590700989175e-07, + "loss": 0.5706, + "step": 8504 + }, + { + "epoch": 0.88, + "grad_norm": 2.044394803943145, + "learning_rate": 3.48198424381711e-07, + "loss": 0.625, + "step": 8505 + }, + { + "epoch": 0.88, + "grad_norm": 1.9877868545973922, + "learning_rate": 3.475814690617541e-07, + "loss": 0.5645, + "step": 8506 + }, + { + "epoch": 0.88, + "grad_norm": 2.052299829237452, + "learning_rate": 3.469650411199543e-07, + "loss": 0.6363, + "step": 8507 + }, + { + "epoch": 0.88, + "grad_norm": 2.07731152157618, + "learning_rate": 3.4634914062618984e-07, + "loss": 0.5551, + "step": 8508 + }, + { + "epoch": 0.88, + "grad_norm": 1.8411429732595614, + "learning_rate": 3.457337676502753e-07, + "loss": 0.6741, + "step": 8509 + }, + { + "epoch": 0.88, + "grad_norm": 1.7952870625461101, + "learning_rate": 3.451189222619661e-07, + "loss": 0.5644, + "step": 8510 + }, + { + "epoch": 0.88, + "grad_norm": 2.024486086455206, + "learning_rate": 3.445046045309586e-07, + "loss": 0.6004, + "step": 8511 + }, + { + "epoch": 0.88, + "grad_norm": 1.812082428978913, + "learning_rate": 3.438908145268904e-07, + "loss": 0.5468, + "step": 8512 + }, + { + "epoch": 0.88, + "grad_norm": 1.9415016178715299, + "learning_rate": 3.4327755231933603e-07, + "loss": 0.6659, + "step": 8513 + }, + { + "epoch": 0.89, + "grad_norm": 2.4030823748882164, + "learning_rate": 3.426648179778147e-07, + "loss": 0.6063, + "step": 8514 + }, + { + "epoch": 0.89, + "grad_norm": 1.8938865020813795, + "learning_rate": 3.420526115717815e-07, + "loss": 0.5623, + "step": 8515 + }, + { + "epoch": 0.89, + "grad_norm": 1.851080556637924, + "learning_rate": 3.4144093317063586e-07, + "loss": 0.5629, + "step": 8516 + }, + { + "epoch": 0.89, + "grad_norm": 1.8727554350128697, + "learning_rate": 3.4082978284371127e-07, + "loss": 0.5925, + "step": 8517 + }, + { + "epoch": 0.89, + "grad_norm": 1.9815146666422891, + "learning_rate": 3.4021916066028837e-07, + "loss": 0.5842, + "step": 8518 + }, + { + "epoch": 0.89, + "grad_norm": 1.7584842117350237, + "learning_rate": 3.396090666895829e-07, + "loss": 0.7568, + "step": 8519 + }, + { + "epoch": 0.89, + "grad_norm": 1.9915690900587948, + "learning_rate": 3.3899950100075354e-07, + "loss": 0.6972, + "step": 8520 + }, + { + "epoch": 0.89, + "grad_norm": 1.7809472800894863, + "learning_rate": 3.383904636628965e-07, + "loss": 0.5753, + "step": 8521 + }, + { + "epoch": 0.89, + "grad_norm": 1.8229838348029193, + "learning_rate": 3.377819547450517e-07, + "loss": 0.5927, + "step": 8522 + }, + { + "epoch": 0.89, + "grad_norm": 1.8899181595589158, + "learning_rate": 3.3717397431619614e-07, + "loss": 0.6775, + "step": 8523 + }, + { + "epoch": 0.89, + "grad_norm": 2.111746304571928, + "learning_rate": 3.365665224452469e-07, + "loss": 0.7002, + "step": 8524 + }, + { + "epoch": 0.89, + "grad_norm": 1.7960693428634793, + "learning_rate": 3.359595992010639e-07, + "loss": 0.5953, + "step": 8525 + }, + { + "epoch": 0.89, + "grad_norm": 2.0102795188304614, + "learning_rate": 3.3535320465244494e-07, + "loss": 0.6059, + "step": 8526 + }, + { + "epoch": 0.89, + "grad_norm": 1.8455053001836659, + "learning_rate": 3.3474733886812606e-07, + "loss": 0.6622, + "step": 8527 + }, + { + "epoch": 0.89, + "grad_norm": 1.9846483394387668, + "learning_rate": 3.3414200191678903e-07, + "loss": 0.6186, + "step": 8528 + }, + { + "epoch": 0.89, + "grad_norm": 1.864288965670221, + "learning_rate": 3.3353719386704954e-07, + "loss": 0.54, + "step": 8529 + }, + { + "epoch": 0.89, + "grad_norm": 1.8117014185107374, + "learning_rate": 3.329329147874677e-07, + "loss": 0.5355, + "step": 8530 + }, + { + "epoch": 0.89, + "grad_norm": 2.050544467198889, + "learning_rate": 3.3232916474654154e-07, + "loss": 0.6408, + "step": 8531 + }, + { + "epoch": 0.89, + "grad_norm": 1.9753592793290733, + "learning_rate": 3.317259438127085e-07, + "loss": 0.6121, + "step": 8532 + }, + { + "epoch": 0.89, + "grad_norm": 1.9685029438749013, + "learning_rate": 3.3112325205434834e-07, + "loss": 0.5375, + "step": 8533 + }, + { + "epoch": 0.89, + "grad_norm": 2.0390676005965145, + "learning_rate": 3.305210895397792e-07, + "loss": 0.6119, + "step": 8534 + }, + { + "epoch": 0.89, + "grad_norm": 2.147082033891267, + "learning_rate": 3.299194563372604e-07, + "loss": 0.6216, + "step": 8535 + }, + { + "epoch": 0.89, + "grad_norm": 1.9294865710173905, + "learning_rate": 3.2931835251498845e-07, + "loss": 0.5659, + "step": 8536 + }, + { + "epoch": 0.89, + "grad_norm": 2.091471816118534, + "learning_rate": 3.287177781411044e-07, + "loss": 0.7131, + "step": 8537 + }, + { + "epoch": 0.89, + "grad_norm": 1.9174669992475244, + "learning_rate": 3.2811773328368604e-07, + "loss": 0.6418, + "step": 8538 + }, + { + "epoch": 0.89, + "grad_norm": 2.00937920247632, + "learning_rate": 3.2751821801075055e-07, + "loss": 0.5425, + "step": 8539 + }, + { + "epoch": 0.89, + "grad_norm": 1.9361390403260308, + "learning_rate": 3.26919232390257e-07, + "loss": 0.6752, + "step": 8540 + }, + { + "epoch": 0.89, + "grad_norm": 1.976383613072552, + "learning_rate": 3.2632077649010495e-07, + "loss": 0.6174, + "step": 8541 + }, + { + "epoch": 0.89, + "grad_norm": 2.0442738816418555, + "learning_rate": 3.2572285037813123e-07, + "loss": 0.6375, + "step": 8542 + }, + { + "epoch": 0.89, + "grad_norm": 1.9529903619318187, + "learning_rate": 3.251254541221155e-07, + "loss": 0.6939, + "step": 8543 + }, + { + "epoch": 0.89, + "grad_norm": 1.8756515993312053, + "learning_rate": 3.245285877897747e-07, + "loss": 0.6838, + "step": 8544 + }, + { + "epoch": 0.89, + "grad_norm": 2.0880957169144088, + "learning_rate": 3.239322514487686e-07, + "loss": 0.5843, + "step": 8545 + }, + { + "epoch": 0.89, + "grad_norm": 1.9669044853905082, + "learning_rate": 3.233364451666948e-07, + "loss": 0.633, + "step": 8546 + }, + { + "epoch": 0.89, + "grad_norm": 2.1809013639310924, + "learning_rate": 3.227411690110904e-07, + "loss": 0.7172, + "step": 8547 + }, + { + "epoch": 0.89, + "grad_norm": 1.7695203042424918, + "learning_rate": 3.2214642304943364e-07, + "loss": 0.5523, + "step": 8548 + }, + { + "epoch": 0.89, + "grad_norm": 2.427593226307166, + "learning_rate": 3.215522073491434e-07, + "loss": 0.5712, + "step": 8549 + }, + { + "epoch": 0.89, + "grad_norm": 1.816262167078391, + "learning_rate": 3.2095852197757625e-07, + "loss": 0.5903, + "step": 8550 + }, + { + "epoch": 0.89, + "grad_norm": 2.0096160380666808, + "learning_rate": 3.203653670020307e-07, + "loss": 0.6423, + "step": 8551 + }, + { + "epoch": 0.89, + "grad_norm": 1.9964846253698296, + "learning_rate": 3.1977274248974286e-07, + "loss": 0.5813, + "step": 8552 + }, + { + "epoch": 0.89, + "grad_norm": 2.057566069204673, + "learning_rate": 3.1918064850789297e-07, + "loss": 0.5512, + "step": 8553 + }, + { + "epoch": 0.89, + "grad_norm": 1.9850613079984498, + "learning_rate": 3.1858908512359456e-07, + "loss": 0.5835, + "step": 8554 + }, + { + "epoch": 0.89, + "grad_norm": 1.847173933051401, + "learning_rate": 3.1799805240390723e-07, + "loss": 0.6499, + "step": 8555 + }, + { + "epoch": 0.89, + "grad_norm": 1.8814036275877406, + "learning_rate": 3.1740755041582694e-07, + "loss": 0.5895, + "step": 8556 + }, + { + "epoch": 0.89, + "grad_norm": 2.072777684061676, + "learning_rate": 3.1681757922629063e-07, + "loss": 0.6682, + "step": 8557 + }, + { + "epoch": 0.89, + "grad_norm": 1.8957016203434267, + "learning_rate": 3.1622813890217483e-07, + "loss": 0.5977, + "step": 8558 + }, + { + "epoch": 0.89, + "grad_norm": 1.9201797550472164, + "learning_rate": 3.156392295102967e-07, + "loss": 0.4838, + "step": 8559 + }, + { + "epoch": 0.89, + "grad_norm": 1.8415451231635673, + "learning_rate": 3.1505085111741165e-07, + "loss": 0.6179, + "step": 8560 + }, + { + "epoch": 0.89, + "grad_norm": 1.7549802487786756, + "learning_rate": 3.144630037902152e-07, + "loss": 0.5285, + "step": 8561 + }, + { + "epoch": 0.89, + "grad_norm": 1.982659812561418, + "learning_rate": 3.1387568759534523e-07, + "loss": 0.6163, + "step": 8562 + }, + { + "epoch": 0.89, + "grad_norm": 1.875249224583908, + "learning_rate": 3.132889025993746e-07, + "loss": 0.6487, + "step": 8563 + }, + { + "epoch": 0.89, + "grad_norm": 2.117308403382591, + "learning_rate": 3.127026488688217e-07, + "loss": 0.6205, + "step": 8564 + }, + { + "epoch": 0.89, + "grad_norm": 2.1811566929908777, + "learning_rate": 3.121169264701396e-07, + "loss": 0.5575, + "step": 8565 + }, + { + "epoch": 0.89, + "grad_norm": 1.7310740844214478, + "learning_rate": 3.1153173546972395e-07, + "loss": 0.5113, + "step": 8566 + }, + { + "epoch": 0.89, + "grad_norm": 2.0363528766549575, + "learning_rate": 3.109470759339095e-07, + "loss": 0.6399, + "step": 8567 + }, + { + "epoch": 0.89, + "grad_norm": 1.808682458516396, + "learning_rate": 3.1036294792897103e-07, + "loss": 0.5567, + "step": 8568 + }, + { + "epoch": 0.89, + "grad_norm": 2.0304155594704945, + "learning_rate": 3.097793515211211e-07, + "loss": 0.638, + "step": 8569 + }, + { + "epoch": 0.89, + "grad_norm": 1.8611948646613983, + "learning_rate": 3.0919628677651636e-07, + "loss": 0.5643, + "step": 8570 + }, + { + "epoch": 0.89, + "grad_norm": 1.8861149189282882, + "learning_rate": 3.086137537612488e-07, + "loss": 0.5412, + "step": 8571 + }, + { + "epoch": 0.89, + "grad_norm": 1.9998343579666824, + "learning_rate": 3.080317525413523e-07, + "loss": 0.6044, + "step": 8572 + }, + { + "epoch": 0.89, + "grad_norm": 1.9620610866969879, + "learning_rate": 3.074502831827997e-07, + "loss": 0.658, + "step": 8573 + }, + { + "epoch": 0.89, + "grad_norm": 1.9372850625188789, + "learning_rate": 3.0686934575150484e-07, + "loss": 0.6131, + "step": 8574 + }, + { + "epoch": 0.89, + "grad_norm": 2.010086692177107, + "learning_rate": 3.06288940313319e-07, + "loss": 0.5936, + "step": 8575 + }, + { + "epoch": 0.89, + "grad_norm": 1.7050273978062502, + "learning_rate": 3.057090669340357e-07, + "loss": 0.4815, + "step": 8576 + }, + { + "epoch": 0.89, + "grad_norm": 2.011687551457418, + "learning_rate": 3.0512972567938505e-07, + "loss": 0.6161, + "step": 8577 + }, + { + "epoch": 0.89, + "grad_norm": 1.9730483163807309, + "learning_rate": 3.045509166150412e-07, + "loss": 0.5797, + "step": 8578 + }, + { + "epoch": 0.89, + "grad_norm": 1.9574359302513729, + "learning_rate": 3.0397263980661283e-07, + "loss": 0.6348, + "step": 8579 + }, + { + "epoch": 0.89, + "grad_norm": 1.8261271087547564, + "learning_rate": 3.0339489531965307e-07, + "loss": 0.5946, + "step": 8580 + }, + { + "epoch": 0.89, + "grad_norm": 2.01638250418493, + "learning_rate": 3.028176832196511e-07, + "loss": 0.6325, + "step": 8581 + }, + { + "epoch": 0.89, + "grad_norm": 2.0561733418751396, + "learning_rate": 3.022410035720391e-07, + "loss": 0.657, + "step": 8582 + }, + { + "epoch": 0.89, + "grad_norm": 1.8801117833119756, + "learning_rate": 3.0166485644218423e-07, + "loss": 0.579, + "step": 8583 + }, + { + "epoch": 0.89, + "grad_norm": 2.17562854791616, + "learning_rate": 3.010892418953981e-07, + "loss": 0.627, + "step": 8584 + }, + { + "epoch": 0.89, + "grad_norm": 1.8988445529001572, + "learning_rate": 3.005141599969286e-07, + "loss": 0.559, + "step": 8585 + }, + { + "epoch": 0.89, + "grad_norm": 2.0209929465985708, + "learning_rate": 2.999396108119662e-07, + "loss": 0.5639, + "step": 8586 + }, + { + "epoch": 0.89, + "grad_norm": 1.7706616500453063, + "learning_rate": 2.9936559440563727e-07, + "loss": 0.577, + "step": 8587 + }, + { + "epoch": 0.89, + "grad_norm": 1.7695880019783534, + "learning_rate": 2.9879211084301194e-07, + "loss": 0.4723, + "step": 8588 + }, + { + "epoch": 0.89, + "grad_norm": 1.8372393183117093, + "learning_rate": 2.9821916018909603e-07, + "loss": 0.5891, + "step": 8589 + }, + { + "epoch": 0.89, + "grad_norm": 2.323982778463408, + "learning_rate": 2.9764674250883917e-07, + "loss": 0.6652, + "step": 8590 + }, + { + "epoch": 0.89, + "grad_norm": 1.850804961341406, + "learning_rate": 2.970748578671251e-07, + "loss": 0.6241, + "step": 8591 + }, + { + "epoch": 0.89, + "grad_norm": 1.947215875031301, + "learning_rate": 2.9650350632878246e-07, + "loss": 0.6403, + "step": 8592 + }, + { + "epoch": 0.89, + "grad_norm": 2.0757901778312218, + "learning_rate": 2.959326879585755e-07, + "loss": 0.6949, + "step": 8593 + }, + { + "epoch": 0.89, + "grad_norm": 2.3184331772097204, + "learning_rate": 2.953624028212115e-07, + "loss": 0.6928, + "step": 8594 + }, + { + "epoch": 0.89, + "grad_norm": 2.3251534514180134, + "learning_rate": 2.947926509813337e-07, + "loss": 0.6444, + "step": 8595 + }, + { + "epoch": 0.89, + "grad_norm": 1.9132798847854695, + "learning_rate": 2.942234325035287e-07, + "loss": 0.584, + "step": 8596 + }, + { + "epoch": 0.89, + "grad_norm": 1.9965630415954567, + "learning_rate": 2.9365474745231935e-07, + "loss": 0.6394, + "step": 8597 + }, + { + "epoch": 0.89, + "grad_norm": 2.044198645987738, + "learning_rate": 2.9308659589216913e-07, + "loss": 0.5541, + "step": 8598 + }, + { + "epoch": 0.89, + "grad_norm": 1.897414457067917, + "learning_rate": 2.925189778874826e-07, + "loss": 0.5109, + "step": 8599 + }, + { + "epoch": 0.89, + "grad_norm": 1.956046702740908, + "learning_rate": 2.9195189350260055e-07, + "loss": 0.6672, + "step": 8600 + }, + { + "epoch": 0.89, + "grad_norm": 1.9022595348835627, + "learning_rate": 2.9138534280180706e-07, + "loss": 0.6785, + "step": 8601 + }, + { + "epoch": 0.89, + "grad_norm": 2.026765034791194, + "learning_rate": 2.908193258493236e-07, + "loss": 0.6166, + "step": 8602 + }, + { + "epoch": 0.89, + "grad_norm": 2.127506565979339, + "learning_rate": 2.9025384270931043e-07, + "loss": 0.5703, + "step": 8603 + }, + { + "epoch": 0.89, + "grad_norm": 2.213452785659447, + "learning_rate": 2.8968889344586915e-07, + "loss": 0.6445, + "step": 8604 + }, + { + "epoch": 0.89, + "grad_norm": 1.9799657762514415, + "learning_rate": 2.8912447812303956e-07, + "loss": 0.6423, + "step": 8605 + }, + { + "epoch": 0.89, + "grad_norm": 2.1495360937337464, + "learning_rate": 2.885605968048011e-07, + "loss": 0.6764, + "step": 8606 + }, + { + "epoch": 0.89, + "grad_norm": 1.715241435422298, + "learning_rate": 2.8799724955507367e-07, + "loss": 0.555, + "step": 8607 + }, + { + "epoch": 0.89, + "grad_norm": 2.00052872630302, + "learning_rate": 2.874344364377152e-07, + "loss": 0.643, + "step": 8608 + }, + { + "epoch": 0.89, + "grad_norm": 1.9284952108877513, + "learning_rate": 2.868721575165245e-07, + "loss": 0.6311, + "step": 8609 + }, + { + "epoch": 0.9, + "grad_norm": 1.690339382498523, + "learning_rate": 2.86310412855238e-07, + "loss": 0.5235, + "step": 8610 + }, + { + "epoch": 0.9, + "grad_norm": 1.9565147307410093, + "learning_rate": 2.85749202517534e-07, + "loss": 0.6184, + "step": 8611 + }, + { + "epoch": 0.9, + "grad_norm": 2.0904675537845843, + "learning_rate": 2.8518852656702845e-07, + "loss": 0.6863, + "step": 8612 + }, + { + "epoch": 0.9, + "grad_norm": 2.106472750498347, + "learning_rate": 2.8462838506727707e-07, + "loss": 0.697, + "step": 8613 + }, + { + "epoch": 0.9, + "grad_norm": 1.961157989135241, + "learning_rate": 2.8406877808177414e-07, + "loss": 0.5415, + "step": 8614 + }, + { + "epoch": 0.9, + "grad_norm": 1.954339439444753, + "learning_rate": 2.8350970567395555e-07, + "loss": 0.6069, + "step": 8615 + }, + { + "epoch": 0.9, + "grad_norm": 2.020476422284541, + "learning_rate": 2.8295116790719444e-07, + "loss": 0.6706, + "step": 8616 + }, + { + "epoch": 0.9, + "grad_norm": 2.092819502393807, + "learning_rate": 2.8239316484480527e-07, + "loss": 0.5515, + "step": 8617 + }, + { + "epoch": 0.9, + "grad_norm": 1.835575899833264, + "learning_rate": 2.8183569655003963e-07, + "loss": 0.4899, + "step": 8618 + }, + { + "epoch": 0.9, + "grad_norm": 2.0289917891127454, + "learning_rate": 2.812787630860919e-07, + "loss": 0.5401, + "step": 8619 + }, + { + "epoch": 0.9, + "grad_norm": 1.811975427170239, + "learning_rate": 2.807223645160906e-07, + "loss": 0.5122, + "step": 8620 + }, + { + "epoch": 0.9, + "grad_norm": 2.200065793760229, + "learning_rate": 2.801665009031096e-07, + "loss": 0.536, + "step": 8621 + }, + { + "epoch": 0.9, + "grad_norm": 1.8723693676317084, + "learning_rate": 2.796111723101563e-07, + "loss": 0.5162, + "step": 8622 + }, + { + "epoch": 0.9, + "grad_norm": 1.9167058977018545, + "learning_rate": 2.7905637880018324e-07, + "loss": 0.6457, + "step": 8623 + }, + { + "epoch": 0.9, + "grad_norm": 1.9368506827247765, + "learning_rate": 2.7850212043607715e-07, + "loss": 0.6408, + "step": 8624 + }, + { + "epoch": 0.9, + "grad_norm": 1.8101557694487689, + "learning_rate": 2.779483972806685e-07, + "loss": 0.6135, + "step": 8625 + }, + { + "epoch": 0.9, + "grad_norm": 2.1315207329438612, + "learning_rate": 2.773952093967225e-07, + "loss": 0.5771, + "step": 8626 + }, + { + "epoch": 0.9, + "grad_norm": 2.0848835082063646, + "learning_rate": 2.7684255684694903e-07, + "loss": 0.6573, + "step": 8627 + }, + { + "epoch": 0.9, + "grad_norm": 1.91807737163709, + "learning_rate": 2.7629043969399193e-07, + "loss": 0.6022, + "step": 8628 + }, + { + "epoch": 0.9, + "grad_norm": 2.118468230505594, + "learning_rate": 2.7573885800043775e-07, + "loss": 0.6365, + "step": 8629 + }, + { + "epoch": 0.9, + "grad_norm": 1.9790619469804935, + "learning_rate": 2.7518781182881096e-07, + "loss": 0.6301, + "step": 8630 + }, + { + "epoch": 0.9, + "grad_norm": 2.022098894804944, + "learning_rate": 2.7463730124157706e-07, + "loss": 0.6485, + "step": 8631 + }, + { + "epoch": 0.9, + "grad_norm": 2.056757846364307, + "learning_rate": 2.7408732630113787e-07, + "loss": 0.565, + "step": 8632 + }, + { + "epoch": 0.9, + "grad_norm": 1.9947893769894807, + "learning_rate": 2.73537887069838e-07, + "loss": 0.5146, + "step": 8633 + }, + { + "epoch": 0.9, + "grad_norm": 2.231051088541484, + "learning_rate": 2.729889836099581e-07, + "loss": 0.5901, + "step": 8634 + }, + { + "epoch": 0.9, + "grad_norm": 1.9660039530430966, + "learning_rate": 2.7244061598371953e-07, + "loss": 0.7301, + "step": 8635 + }, + { + "epoch": 0.9, + "grad_norm": 1.9431122518352824, + "learning_rate": 2.7189278425328426e-07, + "loss": 0.6849, + "step": 8636 + }, + { + "epoch": 0.9, + "grad_norm": 1.8396886613553558, + "learning_rate": 2.713454884807504e-07, + "loss": 0.5825, + "step": 8637 + }, + { + "epoch": 0.9, + "grad_norm": 1.8078269413999202, + "learning_rate": 2.707987287281583e-07, + "loss": 0.6539, + "step": 8638 + }, + { + "epoch": 0.9, + "grad_norm": 1.862585778610424, + "learning_rate": 2.70252505057485e-07, + "loss": 0.5778, + "step": 8639 + }, + { + "epoch": 0.9, + "grad_norm": 1.894036420630567, + "learning_rate": 2.6970681753065e-07, + "loss": 0.5502, + "step": 8640 + }, + { + "epoch": 0.9, + "grad_norm": 1.8819699326290948, + "learning_rate": 2.691616662095092e-07, + "loss": 0.6226, + "step": 8641 + }, + { + "epoch": 0.9, + "grad_norm": 1.9204845961184525, + "learning_rate": 2.686170511558578e-07, + "loss": 0.637, + "step": 8642 + }, + { + "epoch": 0.9, + "grad_norm": 1.8825697272527415, + "learning_rate": 2.680729724314313e-07, + "loss": 0.6391, + "step": 8643 + }, + { + "epoch": 0.9, + "grad_norm": 1.8943724619603919, + "learning_rate": 2.675294300979053e-07, + "loss": 0.5944, + "step": 8644 + }, + { + "epoch": 0.9, + "grad_norm": 2.259795468206444, + "learning_rate": 2.6698642421689124e-07, + "loss": 0.6657, + "step": 8645 + }, + { + "epoch": 0.9, + "grad_norm": 1.9778845403528227, + "learning_rate": 2.664439548499448e-07, + "loss": 0.6692, + "step": 8646 + }, + { + "epoch": 0.9, + "grad_norm": 2.0445734899171097, + "learning_rate": 2.6590202205855506e-07, + "loss": 0.6772, + "step": 8647 + }, + { + "epoch": 0.9, + "grad_norm": 2.3171034519987823, + "learning_rate": 2.6536062590415577e-07, + "loss": 0.5824, + "step": 8648 + }, + { + "epoch": 0.9, + "grad_norm": 1.8081871008672612, + "learning_rate": 2.648197664481156e-07, + "loss": 0.6396, + "step": 8649 + }, + { + "epoch": 0.9, + "grad_norm": 2.1307670329224937, + "learning_rate": 2.6427944375174484e-07, + "loss": 0.6678, + "step": 8650 + }, + { + "epoch": 0.9, + "grad_norm": 2.253854108832021, + "learning_rate": 2.637396578762913e-07, + "loss": 0.5559, + "step": 8651 + }, + { + "epoch": 0.9, + "grad_norm": 2.094905790658024, + "learning_rate": 2.6320040888294373e-07, + "loss": 0.6969, + "step": 8652 + }, + { + "epoch": 0.9, + "grad_norm": 1.8667934315030559, + "learning_rate": 2.6266169683282827e-07, + "loss": 0.5351, + "step": 8653 + }, + { + "epoch": 0.9, + "grad_norm": 1.9530819540310358, + "learning_rate": 2.621235217870116e-07, + "loss": 0.6533, + "step": 8654 + }, + { + "epoch": 0.9, + "grad_norm": 1.9977273869765968, + "learning_rate": 2.615858838064983e-07, + "loss": 0.6239, + "step": 8655 + }, + { + "epoch": 0.9, + "grad_norm": 1.9377945345738095, + "learning_rate": 2.6104878295223455e-07, + "loss": 0.6392, + "step": 8656 + }, + { + "epoch": 0.9, + "grad_norm": 2.0652175471062133, + "learning_rate": 2.605122192851012e-07, + "loss": 0.6128, + "step": 8657 + }, + { + "epoch": 0.9, + "grad_norm": 2.0175796302069418, + "learning_rate": 2.5997619286592224e-07, + "loss": 0.6744, + "step": 8658 + }, + { + "epoch": 0.9, + "grad_norm": 1.747368958393763, + "learning_rate": 2.594407037554586e-07, + "loss": 0.6059, + "step": 8659 + }, + { + "epoch": 0.9, + "grad_norm": 2.2493432755306895, + "learning_rate": 2.5890575201441224e-07, + "loss": 0.6867, + "step": 8660 + }, + { + "epoch": 0.9, + "grad_norm": 2.0016522845047566, + "learning_rate": 2.5837133770342135e-07, + "loss": 0.6622, + "step": 8661 + }, + { + "epoch": 0.9, + "grad_norm": 2.2223145338114767, + "learning_rate": 2.578374608830664e-07, + "loss": 0.6139, + "step": 8662 + }, + { + "epoch": 0.9, + "grad_norm": 2.004690950521899, + "learning_rate": 2.5730412161386386e-07, + "loss": 0.6133, + "step": 8663 + }, + { + "epoch": 0.9, + "grad_norm": 1.9682711738064742, + "learning_rate": 2.567713199562727e-07, + "loss": 0.6053, + "step": 8664 + }, + { + "epoch": 0.9, + "grad_norm": 1.9739531150421892, + "learning_rate": 2.562390559706879e-07, + "loss": 0.6154, + "step": 8665 + }, + { + "epoch": 0.9, + "grad_norm": 2.1119152599912105, + "learning_rate": 2.5570732971744395e-07, + "loss": 0.6286, + "step": 8666 + }, + { + "epoch": 0.9, + "grad_norm": 1.893301025135361, + "learning_rate": 2.55176141256816e-07, + "loss": 0.6225, + "step": 8667 + }, + { + "epoch": 0.9, + "grad_norm": 2.1130422748129, + "learning_rate": 2.54645490649017e-07, + "loss": 0.6161, + "step": 8668 + }, + { + "epoch": 0.9, + "grad_norm": 2.1581512462565833, + "learning_rate": 2.5411537795419883e-07, + "loss": 0.7077, + "step": 8669 + }, + { + "epoch": 0.9, + "grad_norm": 1.9273252313199596, + "learning_rate": 2.5358580323245396e-07, + "loss": 0.6358, + "step": 8670 + }, + { + "epoch": 0.9, + "grad_norm": 1.9499885610352081, + "learning_rate": 2.530567665438116e-07, + "loss": 0.5547, + "step": 8671 + }, + { + "epoch": 0.9, + "grad_norm": 2.111913389915149, + "learning_rate": 2.52528267948241e-07, + "loss": 0.54, + "step": 8672 + }, + { + "epoch": 0.9, + "grad_norm": 2.0260864755391164, + "learning_rate": 2.520003075056521e-07, + "loss": 0.5447, + "step": 8673 + }, + { + "epoch": 0.9, + "grad_norm": 1.8671832739571868, + "learning_rate": 2.5147288527588964e-07, + "loss": 0.6723, + "step": 8674 + }, + { + "epoch": 0.9, + "grad_norm": 1.7405827829670664, + "learning_rate": 2.5094600131874205e-07, + "loss": 0.5051, + "step": 8675 + }, + { + "epoch": 0.9, + "grad_norm": 2.0584072380487015, + "learning_rate": 2.5041965569393366e-07, + "loss": 0.59, + "step": 8676 + }, + { + "epoch": 0.9, + "grad_norm": 2.0569408420447046, + "learning_rate": 2.498938484611296e-07, + "loss": 0.6142, + "step": 8677 + }, + { + "epoch": 0.9, + "grad_norm": 1.8546671023261774, + "learning_rate": 2.4936857967993166e-07, + "loss": 0.6434, + "step": 8678 + }, + { + "epoch": 0.9, + "grad_norm": 2.012713259045472, + "learning_rate": 2.4884384940988436e-07, + "loss": 0.6429, + "step": 8679 + }, + { + "epoch": 0.9, + "grad_norm": 2.1338257948479336, + "learning_rate": 2.4831965771046574e-07, + "loss": 0.6217, + "step": 8680 + }, + { + "epoch": 0.9, + "grad_norm": 1.978217762866347, + "learning_rate": 2.4779600464109874e-07, + "loss": 0.6405, + "step": 8681 + }, + { + "epoch": 0.9, + "grad_norm": 2.3616307774259555, + "learning_rate": 2.4727289026114043e-07, + "loss": 0.6803, + "step": 8682 + }, + { + "epoch": 0.9, + "grad_norm": 2.1877070740639097, + "learning_rate": 2.4675031462988995e-07, + "loss": 0.6009, + "step": 8683 + }, + { + "epoch": 0.9, + "grad_norm": 1.9569526648473758, + "learning_rate": 2.462282778065839e-07, + "loss": 0.6932, + "step": 8684 + }, + { + "epoch": 0.9, + "grad_norm": 1.8683437289001, + "learning_rate": 2.4570677985039817e-07, + "loss": 0.5918, + "step": 8685 + }, + { + "epoch": 0.9, + "grad_norm": 1.89796611020582, + "learning_rate": 2.451858208204477e-07, + "loss": 0.61, + "step": 8686 + }, + { + "epoch": 0.9, + "grad_norm": 1.9025712390079088, + "learning_rate": 2.446654007757865e-07, + "loss": 0.5998, + "step": 8687 + }, + { + "epoch": 0.9, + "grad_norm": 1.9360903945505326, + "learning_rate": 2.441455197754056e-07, + "loss": 0.5562, + "step": 8688 + }, + { + "epoch": 0.9, + "grad_norm": 1.8305256442447795, + "learning_rate": 2.436261778782378e-07, + "loss": 0.6337, + "step": 8689 + }, + { + "epoch": 0.9, + "grad_norm": 1.9413740891789482, + "learning_rate": 2.431073751431529e-07, + "loss": 0.6631, + "step": 8690 + }, + { + "epoch": 0.9, + "grad_norm": 1.8320514002798973, + "learning_rate": 2.4258911162896083e-07, + "loss": 0.6427, + "step": 8691 + }, + { + "epoch": 0.9, + "grad_norm": 1.6997980036645046, + "learning_rate": 2.4207138739440914e-07, + "loss": 0.5699, + "step": 8692 + }, + { + "epoch": 0.9, + "grad_norm": 1.768129171675751, + "learning_rate": 2.4155420249818596e-07, + "loss": 0.5332, + "step": 8693 + }, + { + "epoch": 0.9, + "grad_norm": 1.866393250711023, + "learning_rate": 2.4103755699891487e-07, + "loss": 0.5497, + "step": 8694 + }, + { + "epoch": 0.9, + "grad_norm": 2.1352325058003974, + "learning_rate": 2.405214509551623e-07, + "loss": 0.5664, + "step": 8695 + }, + { + "epoch": 0.9, + "grad_norm": 1.9594544194885912, + "learning_rate": 2.40005884425431e-07, + "loss": 0.5781, + "step": 8696 + }, + { + "epoch": 0.9, + "grad_norm": 1.7619113591533766, + "learning_rate": 2.3949085746816424e-07, + "loss": 0.5066, + "step": 8697 + }, + { + "epoch": 0.9, + "grad_norm": 2.1093914969198413, + "learning_rate": 2.389763701417419e-07, + "loss": 0.7102, + "step": 8698 + }, + { + "epoch": 0.9, + "grad_norm": 2.10399786419913, + "learning_rate": 2.3846242250448624e-07, + "loss": 0.7246, + "step": 8699 + }, + { + "epoch": 0.9, + "grad_norm": 2.0803726938672593, + "learning_rate": 2.3794901461465402e-07, + "loss": 0.7128, + "step": 8700 + }, + { + "epoch": 0.9, + "grad_norm": 1.8698045186841377, + "learning_rate": 2.3743614653044423e-07, + "loss": 0.5957, + "step": 8701 + }, + { + "epoch": 0.9, + "grad_norm": 1.6970683847366228, + "learning_rate": 2.3692381830999255e-07, + "loss": 0.5625, + "step": 8702 + }, + { + "epoch": 0.9, + "grad_norm": 2.096324173910222, + "learning_rate": 2.364120300113748e-07, + "loss": 0.667, + "step": 8703 + }, + { + "epoch": 0.9, + "grad_norm": 1.975751309515821, + "learning_rate": 2.3590078169260512e-07, + "loss": 0.5752, + "step": 8704 + }, + { + "epoch": 0.9, + "grad_norm": 1.934365429629889, + "learning_rate": 2.3539007341163656e-07, + "loss": 0.6003, + "step": 8705 + }, + { + "epoch": 0.9, + "grad_norm": 2.049087859270842, + "learning_rate": 2.3487990522636005e-07, + "loss": 0.6966, + "step": 8706 + }, + { + "epoch": 0.91, + "grad_norm": 2.1228999756456597, + "learning_rate": 2.3437027719460659e-07, + "loss": 0.5632, + "step": 8707 + }, + { + "epoch": 0.91, + "grad_norm": 1.988718489834557, + "learning_rate": 2.3386118937414602e-07, + "loss": 0.6215, + "step": 8708 + }, + { + "epoch": 0.91, + "grad_norm": 1.9566653933319493, + "learning_rate": 2.3335264182268502e-07, + "loss": 0.4998, + "step": 8709 + }, + { + "epoch": 0.91, + "grad_norm": 1.8116059731020802, + "learning_rate": 2.328446345978713e-07, + "loss": 0.505, + "step": 8710 + }, + { + "epoch": 0.91, + "grad_norm": 1.9242008346185047, + "learning_rate": 2.3233716775728943e-07, + "loss": 0.5418, + "step": 8711 + }, + { + "epoch": 0.91, + "grad_norm": 1.8515564888972094, + "learning_rate": 2.3183024135846554e-07, + "loss": 0.6389, + "step": 8712 + }, + { + "epoch": 0.91, + "grad_norm": 1.8985578066102073, + "learning_rate": 2.3132385545886037e-07, + "loss": 0.5512, + "step": 8713 + }, + { + "epoch": 0.91, + "grad_norm": 1.9046098103072053, + "learning_rate": 2.3081801011587745e-07, + "loss": 0.5628, + "step": 8714 + }, + { + "epoch": 0.91, + "grad_norm": 2.1917228952182097, + "learning_rate": 2.3031270538685701e-07, + "loss": 0.664, + "step": 8715 + }, + { + "epoch": 0.91, + "grad_norm": 2.5641131401356265, + "learning_rate": 2.2980794132907713e-07, + "loss": 0.7537, + "step": 8716 + }, + { + "epoch": 0.91, + "grad_norm": 1.8083432847429621, + "learning_rate": 2.2930371799975593e-07, + "loss": 0.5948, + "step": 8717 + }, + { + "epoch": 0.91, + "grad_norm": 2.157666039891891, + "learning_rate": 2.2880003545605101e-07, + "loss": 0.6228, + "step": 8718 + }, + { + "epoch": 0.91, + "grad_norm": 1.9598888616221026, + "learning_rate": 2.2829689375505616e-07, + "loss": 0.6904, + "step": 8719 + }, + { + "epoch": 0.91, + "grad_norm": 2.0529866203222418, + "learning_rate": 2.2779429295380683e-07, + "loss": 0.7202, + "step": 8720 + }, + { + "epoch": 0.91, + "grad_norm": 2.025400351121733, + "learning_rate": 2.2729223310927473e-07, + "loss": 0.6518, + "step": 8721 + }, + { + "epoch": 0.91, + "grad_norm": 1.643070030489791, + "learning_rate": 2.267907142783715e-07, + "loss": 0.5422, + "step": 8722 + }, + { + "epoch": 0.91, + "grad_norm": 2.168941535240748, + "learning_rate": 2.2628973651794783e-07, + "loss": 0.6902, + "step": 8723 + }, + { + "epoch": 0.91, + "grad_norm": 2.025938493633533, + "learning_rate": 2.257892998847916e-07, + "loss": 0.6059, + "step": 8724 + }, + { + "epoch": 0.91, + "grad_norm": 1.9086079703312495, + "learning_rate": 2.2528940443562964e-07, + "loss": 0.5982, + "step": 8725 + }, + { + "epoch": 0.91, + "grad_norm": 1.8506109120515981, + "learning_rate": 2.247900502271294e-07, + "loss": 0.5936, + "step": 8726 + }, + { + "epoch": 0.91, + "grad_norm": 2.129293598828574, + "learning_rate": 2.2429123731589397e-07, + "loss": 0.4808, + "step": 8727 + }, + { + "epoch": 0.91, + "grad_norm": 2.199598134027323, + "learning_rate": 2.2379296575846809e-07, + "loss": 0.6588, + "step": 8728 + }, + { + "epoch": 0.91, + "grad_norm": 1.9581770705902077, + "learning_rate": 2.2329523561133215e-07, + "loss": 0.5429, + "step": 8729 + }, + { + "epoch": 0.91, + "grad_norm": 2.070975877443173, + "learning_rate": 2.2279804693090934e-07, + "loss": 0.6249, + "step": 8730 + }, + { + "epoch": 0.91, + "grad_norm": 1.8912896810784905, + "learning_rate": 2.223013997735557e-07, + "loss": 0.6175, + "step": 8731 + }, + { + "epoch": 0.91, + "grad_norm": 2.104502168982993, + "learning_rate": 2.2180529419557062e-07, + "loss": 0.6044, + "step": 8732 + }, + { + "epoch": 0.91, + "grad_norm": 1.868832405954279, + "learning_rate": 2.213097302531897e-07, + "loss": 0.5568, + "step": 8733 + }, + { + "epoch": 0.91, + "grad_norm": 1.8040318964006492, + "learning_rate": 2.208147080025891e-07, + "loss": 0.5683, + "step": 8734 + }, + { + "epoch": 0.91, + "grad_norm": 1.9255176394764801, + "learning_rate": 2.2032022749988113e-07, + "loss": 0.6989, + "step": 8735 + }, + { + "epoch": 0.91, + "grad_norm": 1.8424660754915125, + "learning_rate": 2.1982628880111988e-07, + "loss": 0.578, + "step": 8736 + }, + { + "epoch": 0.91, + "grad_norm": 1.9013342427890227, + "learning_rate": 2.1933289196229335e-07, + "loss": 0.6747, + "step": 8737 + }, + { + "epoch": 0.91, + "grad_norm": 1.947218569687289, + "learning_rate": 2.1884003703933343e-07, + "loss": 0.5696, + "step": 8738 + }, + { + "epoch": 0.91, + "grad_norm": 2.124368808695498, + "learning_rate": 2.1834772408810712e-07, + "loss": 0.6159, + "step": 8739 + }, + { + "epoch": 0.91, + "grad_norm": 2.140229776900523, + "learning_rate": 2.1785595316442032e-07, + "loss": 0.6732, + "step": 8740 + }, + { + "epoch": 0.91, + "grad_norm": 1.9381452957689793, + "learning_rate": 2.17364724324019e-07, + "loss": 0.5212, + "step": 8741 + }, + { + "epoch": 0.91, + "grad_norm": 2.1126927729884417, + "learning_rate": 2.1687403762258585e-07, + "loss": 0.6332, + "step": 8742 + }, + { + "epoch": 0.91, + "grad_norm": 2.0856561623513987, + "learning_rate": 2.1638389311574358e-07, + "loss": 0.7432, + "step": 8743 + }, + { + "epoch": 0.91, + "grad_norm": 2.05971362182267, + "learning_rate": 2.1589429085905278e-07, + "loss": 0.6012, + "step": 8744 + }, + { + "epoch": 0.91, + "grad_norm": 2.0239460998799848, + "learning_rate": 2.1540523090801292e-07, + "loss": 0.5419, + "step": 8745 + }, + { + "epoch": 0.91, + "grad_norm": 1.9997879159734424, + "learning_rate": 2.149167133180602e-07, + "loss": 0.5633, + "step": 8746 + }, + { + "epoch": 0.91, + "grad_norm": 2.059015466218665, + "learning_rate": 2.1442873814457364e-07, + "loss": 0.6194, + "step": 8747 + }, + { + "epoch": 0.91, + "grad_norm": 2.525897595654169, + "learning_rate": 2.1394130544286519e-07, + "loss": 0.6233, + "step": 8748 + }, + { + "epoch": 0.91, + "grad_norm": 1.9910958324419423, + "learning_rate": 2.1345441526819e-07, + "loss": 0.5732, + "step": 8749 + }, + { + "epoch": 0.91, + "grad_norm": 1.9316768520667007, + "learning_rate": 2.1296806767573897e-07, + "loss": 0.667, + "step": 8750 + }, + { + "epoch": 0.91, + "grad_norm": 2.1742785881902753, + "learning_rate": 2.1248226272064299e-07, + "loss": 0.6773, + "step": 8751 + }, + { + "epoch": 0.91, + "grad_norm": 2.129579988218092, + "learning_rate": 2.1199700045797077e-07, + "loss": 0.5753, + "step": 8752 + }, + { + "epoch": 0.91, + "grad_norm": 2.188883794073873, + "learning_rate": 2.115122809427289e-07, + "loss": 0.6846, + "step": 8753 + }, + { + "epoch": 0.91, + "grad_norm": 1.8777793148808293, + "learning_rate": 2.1102810422986286e-07, + "loss": 0.5493, + "step": 8754 + }, + { + "epoch": 0.91, + "grad_norm": 1.9743047341702247, + "learning_rate": 2.105444703742582e-07, + "loss": 0.608, + "step": 8755 + }, + { + "epoch": 0.91, + "grad_norm": 1.95744377141077, + "learning_rate": 2.1006137943073602e-07, + "loss": 0.6769, + "step": 8756 + }, + { + "epoch": 0.91, + "grad_norm": 1.8324319226472476, + "learning_rate": 2.095788314540592e-07, + "loss": 0.5666, + "step": 8757 + }, + { + "epoch": 0.91, + "grad_norm": 1.6567842512967919, + "learning_rate": 2.0909682649892516e-07, + "loss": 0.5137, + "step": 8758 + }, + { + "epoch": 0.91, + "grad_norm": 1.7813909272995272, + "learning_rate": 2.08615364619974e-07, + "loss": 0.6392, + "step": 8759 + }, + { + "epoch": 0.91, + "grad_norm": 1.9961485822604668, + "learning_rate": 2.0813444587178156e-07, + "loss": 0.6946, + "step": 8760 + }, + { + "epoch": 0.91, + "grad_norm": 1.9287070838365437, + "learning_rate": 2.0765407030886197e-07, + "loss": 0.6178, + "step": 8761 + }, + { + "epoch": 0.91, + "grad_norm": 2.0168575757502505, + "learning_rate": 2.0717423798566838e-07, + "loss": 0.5699, + "step": 8762 + }, + { + "epoch": 0.91, + "grad_norm": 1.940310038251417, + "learning_rate": 2.0669494895659391e-07, + "loss": 0.5678, + "step": 8763 + }, + { + "epoch": 0.91, + "grad_norm": 1.8342339019571705, + "learning_rate": 2.0621620327596735e-07, + "loss": 0.6365, + "step": 8764 + }, + { + "epoch": 0.91, + "grad_norm": 1.9270699483483476, + "learning_rate": 2.057380009980581e-07, + "loss": 0.5998, + "step": 8765 + }, + { + "epoch": 0.91, + "grad_norm": 2.1596691648652593, + "learning_rate": 2.0526034217707213e-07, + "loss": 0.654, + "step": 8766 + }, + { + "epoch": 0.91, + "grad_norm": 2.0471564825995765, + "learning_rate": 2.0478322686715735e-07, + "loss": 0.6248, + "step": 8767 + }, + { + "epoch": 0.91, + "grad_norm": 2.009135979527017, + "learning_rate": 2.0430665512239377e-07, + "loss": 0.6103, + "step": 8768 + }, + { + "epoch": 0.91, + "grad_norm": 2.0477920499162057, + "learning_rate": 2.0383062699680601e-07, + "loss": 0.7414, + "step": 8769 + }, + { + "epoch": 0.91, + "grad_norm": 1.8535142344799098, + "learning_rate": 2.0335514254435363e-07, + "loss": 0.6597, + "step": 8770 + }, + { + "epoch": 0.91, + "grad_norm": 1.7529105422442415, + "learning_rate": 2.028802018189363e-07, + "loss": 0.5557, + "step": 8771 + }, + { + "epoch": 0.91, + "grad_norm": 2.064542466324043, + "learning_rate": 2.0240580487438988e-07, + "loss": 0.5888, + "step": 8772 + }, + { + "epoch": 0.91, + "grad_norm": 2.1972196941591506, + "learning_rate": 2.0193195176449188e-07, + "loss": 0.7178, + "step": 8773 + }, + { + "epoch": 0.91, + "grad_norm": 1.9128197640557918, + "learning_rate": 2.0145864254295434e-07, + "loss": 0.6155, + "step": 8774 + }, + { + "epoch": 0.91, + "grad_norm": 1.9423327842910736, + "learning_rate": 2.0098587726343156e-07, + "loss": 0.5647, + "step": 8775 + }, + { + "epoch": 0.91, + "grad_norm": 1.8146626640382046, + "learning_rate": 2.0051365597951233e-07, + "loss": 0.5116, + "step": 8776 + }, + { + "epoch": 0.91, + "grad_norm": 2.1038513954027906, + "learning_rate": 2.000419787447261e-07, + "loss": 0.6498, + "step": 8777 + }, + { + "epoch": 0.91, + "grad_norm": 1.9053349463179003, + "learning_rate": 1.9957084561254114e-07, + "loss": 0.6542, + "step": 8778 + }, + { + "epoch": 0.91, + "grad_norm": 2.0831840015435117, + "learning_rate": 1.9910025663636146e-07, + "loss": 0.5748, + "step": 8779 + }, + { + "epoch": 0.91, + "grad_norm": 1.997692900719902, + "learning_rate": 1.9863021186953268e-07, + "loss": 0.6376, + "step": 8780 + }, + { + "epoch": 0.91, + "grad_norm": 2.1968215047753756, + "learning_rate": 1.9816071136533665e-07, + "loss": 0.6415, + "step": 8781 + }, + { + "epoch": 0.91, + "grad_norm": 2.068923142726295, + "learning_rate": 1.9769175517699302e-07, + "loss": 0.6246, + "step": 8782 + }, + { + "epoch": 0.91, + "grad_norm": 2.047076265368446, + "learning_rate": 1.9722334335766092e-07, + "loss": 0.5755, + "step": 8783 + }, + { + "epoch": 0.91, + "grad_norm": 1.9159180146333123, + "learning_rate": 1.9675547596043787e-07, + "loss": 0.5303, + "step": 8784 + }, + { + "epoch": 0.91, + "grad_norm": 2.054746023991903, + "learning_rate": 1.962881530383587e-07, + "loss": 0.6917, + "step": 8785 + }, + { + "epoch": 0.91, + "grad_norm": 1.8723658262962497, + "learning_rate": 1.9582137464439876e-07, + "loss": 0.5934, + "step": 8786 + }, + { + "epoch": 0.91, + "grad_norm": 1.9256110172198415, + "learning_rate": 1.95355140831468e-07, + "loss": 0.5799, + "step": 8787 + }, + { + "epoch": 0.91, + "grad_norm": 2.0613496981654134, + "learning_rate": 1.94889451652418e-07, + "loss": 0.64, + "step": 8788 + }, + { + "epoch": 0.91, + "grad_norm": 1.8928277560645934, + "learning_rate": 1.9442430716003713e-07, + "loss": 0.5309, + "step": 8789 + }, + { + "epoch": 0.91, + "grad_norm": 2.211268371166142, + "learning_rate": 1.9395970740705205e-07, + "loss": 0.6969, + "step": 8790 + }, + { + "epoch": 0.91, + "grad_norm": 2.0242471939323763, + "learning_rate": 1.9349565244612678e-07, + "loss": 0.6129, + "step": 8791 + }, + { + "epoch": 0.91, + "grad_norm": 2.055751552611855, + "learning_rate": 1.9303214232986588e-07, + "loss": 0.6545, + "step": 8792 + }, + { + "epoch": 0.91, + "grad_norm": 1.936122063029454, + "learning_rate": 1.9256917711081014e-07, + "loss": 0.5843, + "step": 8793 + }, + { + "epoch": 0.91, + "grad_norm": 1.995976353696018, + "learning_rate": 1.921067568414403e-07, + "loss": 0.5792, + "step": 8794 + }, + { + "epoch": 0.91, + "grad_norm": 1.9787829732592972, + "learning_rate": 1.9164488157417284e-07, + "loss": 0.6806, + "step": 8795 + }, + { + "epoch": 0.91, + "grad_norm": 1.812060229047015, + "learning_rate": 1.9118355136136523e-07, + "loss": 0.587, + "step": 8796 + }, + { + "epoch": 0.91, + "grad_norm": 2.0037208498765238, + "learning_rate": 1.9072276625531127e-07, + "loss": 0.5948, + "step": 8797 + }, + { + "epoch": 0.91, + "grad_norm": 1.9635031676753756, + "learning_rate": 1.9026252630824415e-07, + "loss": 0.5545, + "step": 8798 + }, + { + "epoch": 0.91, + "grad_norm": 1.910679902637944, + "learning_rate": 1.898028315723338e-07, + "loss": 0.5776, + "step": 8799 + }, + { + "epoch": 0.91, + "grad_norm": 1.9739607133635115, + "learning_rate": 1.8934368209969023e-07, + "loss": 0.5948, + "step": 8800 + }, + { + "epoch": 0.91, + "grad_norm": 1.9046769737170033, + "learning_rate": 1.88885077942359e-07, + "loss": 0.6231, + "step": 8801 + }, + { + "epoch": 0.91, + "grad_norm": 1.9551105878545139, + "learning_rate": 1.8842701915232743e-07, + "loss": 0.52, + "step": 8802 + }, + { + "epoch": 0.92, + "grad_norm": 1.8166528029787048, + "learning_rate": 1.8796950578151785e-07, + "loss": 0.4986, + "step": 8803 + }, + { + "epoch": 0.92, + "grad_norm": 1.794656916003373, + "learning_rate": 1.8751253788179325e-07, + "loss": 0.6422, + "step": 8804 + }, + { + "epoch": 0.92, + "grad_norm": 2.173722308812529, + "learning_rate": 1.8705611550495218e-07, + "loss": 0.6271, + "step": 8805 + }, + { + "epoch": 0.92, + "grad_norm": 1.730807391557464, + "learning_rate": 1.866002387027338e-07, + "loss": 0.509, + "step": 8806 + }, + { + "epoch": 0.92, + "grad_norm": 1.7958190844792186, + "learning_rate": 1.8614490752681292e-07, + "loss": 0.5214, + "step": 8807 + }, + { + "epoch": 0.92, + "grad_norm": 1.9894503413864093, + "learning_rate": 1.8569012202880599e-07, + "loss": 0.5939, + "step": 8808 + }, + { + "epoch": 0.92, + "grad_norm": 2.1598352658993623, + "learning_rate": 1.852358822602629e-07, + "loss": 0.5809, + "step": 8809 + }, + { + "epoch": 0.92, + "grad_norm": 2.0071051604662937, + "learning_rate": 1.847821882726769e-07, + "loss": 0.605, + "step": 8810 + }, + { + "epoch": 0.92, + "grad_norm": 2.0832891855842615, + "learning_rate": 1.843290401174752e-07, + "loss": 0.6761, + "step": 8811 + }, + { + "epoch": 0.92, + "grad_norm": 2.0663532941669702, + "learning_rate": 1.838764378460256e-07, + "loss": 0.6676, + "step": 8812 + }, + { + "epoch": 0.92, + "grad_norm": 2.0405747575233977, + "learning_rate": 1.834243815096326e-07, + "loss": 0.5634, + "step": 8813 + }, + { + "epoch": 0.92, + "grad_norm": 2.086360893875116, + "learning_rate": 1.829728711595391e-07, + "loss": 0.6271, + "step": 8814 + }, + { + "epoch": 0.92, + "grad_norm": 1.9649083555496234, + "learning_rate": 1.825219068469275e-07, + "loss": 0.6017, + "step": 8815 + }, + { + "epoch": 0.92, + "grad_norm": 2.006301857023903, + "learning_rate": 1.820714886229158e-07, + "loss": 0.6402, + "step": 8816 + }, + { + "epoch": 0.92, + "grad_norm": 2.0959766376734814, + "learning_rate": 1.8162161653856257e-07, + "loss": 0.5846, + "step": 8817 + }, + { + "epoch": 0.92, + "grad_norm": 1.9703517347002897, + "learning_rate": 1.8117229064486264e-07, + "loss": 0.6658, + "step": 8818 + }, + { + "epoch": 0.92, + "grad_norm": 1.9458890586613302, + "learning_rate": 1.8072351099275077e-07, + "loss": 0.5241, + "step": 8819 + }, + { + "epoch": 0.92, + "grad_norm": 2.96184039789254, + "learning_rate": 1.8027527763309682e-07, + "loss": 0.5714, + "step": 8820 + }, + { + "epoch": 0.92, + "grad_norm": 1.8509877835265842, + "learning_rate": 1.798275906167124e-07, + "loss": 0.621, + "step": 8821 + }, + { + "epoch": 0.92, + "grad_norm": 1.8661444783986674, + "learning_rate": 1.7938044999434412e-07, + "loss": 0.6553, + "step": 8822 + }, + { + "epoch": 0.92, + "grad_norm": 1.8170939072394554, + "learning_rate": 1.7893385581667866e-07, + "loss": 0.6345, + "step": 8823 + }, + { + "epoch": 0.92, + "grad_norm": 1.9101906587424282, + "learning_rate": 1.7848780813433942e-07, + "loss": 0.604, + "step": 8824 + }, + { + "epoch": 0.92, + "grad_norm": 2.2378093679248, + "learning_rate": 1.7804230699788983e-07, + "loss": 0.6194, + "step": 8825 + }, + { + "epoch": 0.92, + "grad_norm": 2.28535490117752, + "learning_rate": 1.7759735245782838e-07, + "loss": 0.6506, + "step": 8826 + }, + { + "epoch": 0.92, + "grad_norm": 1.969139478413972, + "learning_rate": 1.7715294456459419e-07, + "loss": 0.5687, + "step": 8827 + }, + { + "epoch": 0.92, + "grad_norm": 1.9704670196931522, + "learning_rate": 1.7670908336856252e-07, + "loss": 0.6122, + "step": 8828 + }, + { + "epoch": 0.92, + "grad_norm": 2.046800006905025, + "learning_rate": 1.762657689200481e-07, + "loss": 0.6601, + "step": 8829 + }, + { + "epoch": 0.92, + "grad_norm": 2.0533236305561267, + "learning_rate": 1.7582300126930296e-07, + "loss": 0.6144, + "step": 8830 + }, + { + "epoch": 0.92, + "grad_norm": 1.8968834934718648, + "learning_rate": 1.7538078046651808e-07, + "loss": 0.6129, + "step": 8831 + }, + { + "epoch": 0.92, + "grad_norm": 1.8440975962291353, + "learning_rate": 1.7493910656182056e-07, + "loss": 0.5748, + "step": 8832 + }, + { + "epoch": 0.92, + "grad_norm": 2.054011675246738, + "learning_rate": 1.744979796052776e-07, + "loss": 0.6494, + "step": 8833 + }, + { + "epoch": 0.92, + "grad_norm": 2.350762103488185, + "learning_rate": 1.7405739964689362e-07, + "loss": 0.6574, + "step": 8834 + }, + { + "epoch": 0.92, + "grad_norm": 2.194253819345266, + "learning_rate": 1.7361736673660978e-07, + "loss": 0.582, + "step": 8835 + }, + { + "epoch": 0.92, + "grad_norm": 1.9923276521726647, + "learning_rate": 1.7317788092430676e-07, + "loss": 0.5446, + "step": 8836 + }, + { + "epoch": 0.92, + "grad_norm": 1.9545991453195062, + "learning_rate": 1.7273894225980303e-07, + "loss": 0.5959, + "step": 8837 + }, + { + "epoch": 0.92, + "grad_norm": 1.8931944180409803, + "learning_rate": 1.7230055079285435e-07, + "loss": 0.6128, + "step": 8838 + }, + { + "epoch": 0.92, + "grad_norm": 2.064916087058621, + "learning_rate": 1.7186270657315596e-07, + "loss": 0.5658, + "step": 8839 + }, + { + "epoch": 0.92, + "grad_norm": 2.014461259471736, + "learning_rate": 1.7142540965033815e-07, + "loss": 0.6944, + "step": 8840 + }, + { + "epoch": 0.92, + "grad_norm": 2.0937362350654034, + "learning_rate": 1.7098866007397296e-07, + "loss": 0.6447, + "step": 8841 + }, + { + "epoch": 0.92, + "grad_norm": 1.6339582774602281, + "learning_rate": 1.70552457893568e-07, + "loss": 0.5278, + "step": 8842 + }, + { + "epoch": 0.92, + "grad_norm": 1.9130656985961463, + "learning_rate": 1.701168031585676e-07, + "loss": 0.594, + "step": 8843 + }, + { + "epoch": 0.92, + "grad_norm": 1.8312506205126324, + "learning_rate": 1.696816959183578e-07, + "loss": 0.6467, + "step": 8844 + }, + { + "epoch": 0.92, + "grad_norm": 2.241570074288019, + "learning_rate": 1.6924713622225975e-07, + "loss": 0.6756, + "step": 8845 + }, + { + "epoch": 0.92, + "grad_norm": 2.1124221106992396, + "learning_rate": 1.6881312411953288e-07, + "loss": 0.6671, + "step": 8846 + }, + { + "epoch": 0.92, + "grad_norm": 1.9213107738949424, + "learning_rate": 1.6837965965937564e-07, + "loss": 0.6519, + "step": 8847 + }, + { + "epoch": 0.92, + "grad_norm": 1.8382074321131783, + "learning_rate": 1.6794674289092316e-07, + "loss": 0.5749, + "step": 8848 + }, + { + "epoch": 0.92, + "grad_norm": 1.860478511195977, + "learning_rate": 1.6751437386324842e-07, + "loss": 0.5574, + "step": 8849 + }, + { + "epoch": 0.92, + "grad_norm": 2.0995492889812497, + "learning_rate": 1.6708255262536443e-07, + "loss": 0.5592, + "step": 8850 + }, + { + "epoch": 0.92, + "grad_norm": 2.0108036150741375, + "learning_rate": 1.6665127922621927e-07, + "loss": 0.5698, + "step": 8851 + }, + { + "epoch": 0.92, + "grad_norm": 2.029768719174924, + "learning_rate": 1.6622055371470103e-07, + "loss": 0.5616, + "step": 8852 + }, + { + "epoch": 0.92, + "grad_norm": 2.069671016263712, + "learning_rate": 1.6579037613963456e-07, + "loss": 0.7072, + "step": 8853 + }, + { + "epoch": 0.92, + "grad_norm": 2.0304812240208134, + "learning_rate": 1.6536074654978307e-07, + "loss": 0.568, + "step": 8854 + }, + { + "epoch": 0.92, + "grad_norm": 1.9320514761894227, + "learning_rate": 1.6493166499384762e-07, + "loss": 0.6442, + "step": 8855 + }, + { + "epoch": 0.92, + "grad_norm": 2.0939593574383397, + "learning_rate": 1.645031315204676e-07, + "loss": 0.6313, + "step": 8856 + }, + { + "epoch": 0.92, + "grad_norm": 2.0131860014511966, + "learning_rate": 1.6407514617821752e-07, + "loss": 0.7299, + "step": 8857 + }, + { + "epoch": 0.92, + "grad_norm": 1.9602967302745016, + "learning_rate": 1.6364770901561467e-07, + "loss": 0.5883, + "step": 8858 + }, + { + "epoch": 0.92, + "grad_norm": 2.029561620284226, + "learning_rate": 1.6322082008110974e-07, + "loss": 0.58, + "step": 8859 + }, + { + "epoch": 0.92, + "grad_norm": 2.016812776940906, + "learning_rate": 1.6279447942309345e-07, + "loss": 0.687, + "step": 8860 + }, + { + "epoch": 0.92, + "grad_norm": 2.138409567531603, + "learning_rate": 1.6236868708989438e-07, + "loss": 0.686, + "step": 8861 + }, + { + "epoch": 0.92, + "grad_norm": 1.7290146664194534, + "learning_rate": 1.619434431297784e-07, + "loss": 0.559, + "step": 8862 + }, + { + "epoch": 0.92, + "grad_norm": 1.9890454463913607, + "learning_rate": 1.615187475909491e-07, + "loss": 0.6277, + "step": 8863 + }, + { + "epoch": 0.92, + "grad_norm": 1.9194698017431477, + "learning_rate": 1.6109460052154802e-07, + "loss": 0.5385, + "step": 8864 + }, + { + "epoch": 0.92, + "grad_norm": 1.873346670966746, + "learning_rate": 1.6067100196965447e-07, + "loss": 0.548, + "step": 8865 + }, + { + "epoch": 0.92, + "grad_norm": 1.7853281470209055, + "learning_rate": 1.602479519832867e-07, + "loss": 0.5921, + "step": 8866 + }, + { + "epoch": 0.92, + "grad_norm": 2.007330702321665, + "learning_rate": 1.5982545061039855e-07, + "loss": 0.6518, + "step": 8867 + }, + { + "epoch": 0.92, + "grad_norm": 1.9895802955142883, + "learning_rate": 1.5940349789888398e-07, + "loss": 0.6133, + "step": 8868 + }, + { + "epoch": 0.92, + "grad_norm": 1.977575299921113, + "learning_rate": 1.5898209389657305e-07, + "loss": 0.6876, + "step": 8869 + }, + { + "epoch": 0.92, + "grad_norm": 1.9175713363661888, + "learning_rate": 1.585612386512353e-07, + "loss": 0.594, + "step": 8870 + }, + { + "epoch": 0.92, + "grad_norm": 2.0749683134033026, + "learning_rate": 1.5814093221057647e-07, + "loss": 0.5403, + "step": 8871 + }, + { + "epoch": 0.92, + "grad_norm": 2.1233122307163885, + "learning_rate": 1.5772117462224068e-07, + "loss": 0.6821, + "step": 8872 + }, + { + "epoch": 0.92, + "grad_norm": 2.0961791591392056, + "learning_rate": 1.5730196593380877e-07, + "loss": 0.644, + "step": 8873 + }, + { + "epoch": 0.92, + "grad_norm": 1.944886823301487, + "learning_rate": 1.5688330619280269e-07, + "loss": 0.6329, + "step": 8874 + }, + { + "epoch": 0.92, + "grad_norm": 1.8193650429809556, + "learning_rate": 1.5646519544667783e-07, + "loss": 0.5126, + "step": 8875 + }, + { + "epoch": 0.92, + "grad_norm": 1.9800998991702814, + "learning_rate": 1.5604763374283073e-07, + "loss": 0.5899, + "step": 8876 + }, + { + "epoch": 0.92, + "grad_norm": 2.0403033797812267, + "learning_rate": 1.556306211285935e-07, + "loss": 0.623, + "step": 8877 + }, + { + "epoch": 0.92, + "grad_norm": 2.035385871609747, + "learning_rate": 1.5521415765123783e-07, + "loss": 0.544, + "step": 8878 + }, + { + "epoch": 0.92, + "grad_norm": 1.8780946782463006, + "learning_rate": 1.54798243357972e-07, + "loss": 0.6026, + "step": 8879 + }, + { + "epoch": 0.92, + "grad_norm": 2.4224720214407203, + "learning_rate": 1.5438287829594113e-07, + "loss": 0.6607, + "step": 8880 + }, + { + "epoch": 0.92, + "grad_norm": 2.0498507932770895, + "learning_rate": 1.5396806251223085e-07, + "loss": 0.6207, + "step": 8881 + }, + { + "epoch": 0.92, + "grad_norm": 2.011661827763579, + "learning_rate": 1.535537960538619e-07, + "loss": 0.6032, + "step": 8882 + }, + { + "epoch": 0.92, + "grad_norm": 2.1615800677651666, + "learning_rate": 1.5314007896779393e-07, + "loss": 0.6291, + "step": 8883 + }, + { + "epoch": 0.92, + "grad_norm": 2.2134352807827535, + "learning_rate": 1.5272691130092443e-07, + "loss": 0.5684, + "step": 8884 + }, + { + "epoch": 0.92, + "grad_norm": 2.113655773007037, + "learning_rate": 1.5231429310008817e-07, + "loss": 0.6024, + "step": 8885 + }, + { + "epoch": 0.92, + "grad_norm": 1.7751520094120168, + "learning_rate": 1.5190222441205715e-07, + "loss": 0.5622, + "step": 8886 + }, + { + "epoch": 0.92, + "grad_norm": 1.9421602467307941, + "learning_rate": 1.5149070528354238e-07, + "loss": 0.6058, + "step": 8887 + }, + { + "epoch": 0.92, + "grad_norm": 1.6698253787569741, + "learning_rate": 1.510797357611915e-07, + "loss": 0.6074, + "step": 8888 + }, + { + "epoch": 0.92, + "grad_norm": 1.9009707059362406, + "learning_rate": 1.5066931589159118e-07, + "loss": 0.6354, + "step": 8889 + }, + { + "epoch": 0.92, + "grad_norm": 1.7195770528720253, + "learning_rate": 1.502594457212636e-07, + "loss": 0.6722, + "step": 8890 + }, + { + "epoch": 0.92, + "grad_norm": 1.9207288432623015, + "learning_rate": 1.4985012529667052e-07, + "loss": 0.6761, + "step": 8891 + }, + { + "epoch": 0.92, + "grad_norm": 2.139605040626212, + "learning_rate": 1.4944135466421095e-07, + "loss": 0.6148, + "step": 8892 + }, + { + "epoch": 0.92, + "grad_norm": 2.0452258517628152, + "learning_rate": 1.4903313387022168e-07, + "loss": 0.6248, + "step": 8893 + }, + { + "epoch": 0.92, + "grad_norm": 2.107910493475442, + "learning_rate": 1.4862546296097514e-07, + "loss": 0.6257, + "step": 8894 + }, + { + "epoch": 0.92, + "grad_norm": 1.9371679169071094, + "learning_rate": 1.4821834198268493e-07, + "loss": 0.6685, + "step": 8895 + }, + { + "epoch": 0.92, + "grad_norm": 2.269015494506768, + "learning_rate": 1.4781177098149912e-07, + "loss": 0.6222, + "step": 8896 + }, + { + "epoch": 0.92, + "grad_norm": 2.0322257763626053, + "learning_rate": 1.4740575000350643e-07, + "loss": 0.6172, + "step": 8897 + }, + { + "epoch": 0.92, + "grad_norm": 1.9689096944036948, + "learning_rate": 1.4700027909473003e-07, + "loss": 0.5686, + "step": 8898 + }, + { + "epoch": 0.93, + "grad_norm": 1.8577584797794915, + "learning_rate": 1.4659535830113368e-07, + "loss": 0.7123, + "step": 8899 + }, + { + "epoch": 0.93, + "grad_norm": 2.087469919331919, + "learning_rate": 1.461909876686174e-07, + "loss": 0.6695, + "step": 8900 + }, + { + "epoch": 0.93, + "grad_norm": 2.0831701171131347, + "learning_rate": 1.4578716724301834e-07, + "loss": 0.5916, + "step": 8901 + }, + { + "epoch": 0.93, + "grad_norm": 1.765591749554606, + "learning_rate": 1.4538389707011103e-07, + "loss": 0.5457, + "step": 8902 + }, + { + "epoch": 0.93, + "grad_norm": 2.0101458451306575, + "learning_rate": 1.4498117719561056e-07, + "loss": 0.655, + "step": 8903 + }, + { + "epoch": 0.93, + "grad_norm": 2.057138304301439, + "learning_rate": 1.445790076651654e-07, + "loss": 0.6631, + "step": 8904 + }, + { + "epoch": 0.93, + "grad_norm": 2.01208335411512, + "learning_rate": 1.4417738852436523e-07, + "loss": 0.6347, + "step": 8905 + }, + { + "epoch": 0.93, + "grad_norm": 2.233841840702759, + "learning_rate": 1.4377631981873474e-07, + "loss": 0.6104, + "step": 8906 + }, + { + "epoch": 0.93, + "grad_norm": 1.992020434510179, + "learning_rate": 1.4337580159373864e-07, + "loss": 0.5969, + "step": 8907 + }, + { + "epoch": 0.93, + "grad_norm": 1.9569295294426226, + "learning_rate": 1.4297583389477675e-07, + "loss": 0.5636, + "step": 8908 + }, + { + "epoch": 0.93, + "grad_norm": 2.0519576156254904, + "learning_rate": 1.4257641676718891e-07, + "loss": 0.6166, + "step": 8909 + }, + { + "epoch": 0.93, + "grad_norm": 1.9110514363828413, + "learning_rate": 1.4217755025624946e-07, + "loss": 0.6078, + "step": 8910 + }, + { + "epoch": 0.93, + "grad_norm": 1.8948115316189942, + "learning_rate": 1.4177923440717445e-07, + "loss": 0.6172, + "step": 8911 + }, + { + "epoch": 0.93, + "grad_norm": 1.910510986217153, + "learning_rate": 1.413814692651133e-07, + "loss": 0.5804, + "step": 8912 + }, + { + "epoch": 0.93, + "grad_norm": 1.8590318355294009, + "learning_rate": 1.4098425487515665e-07, + "loss": 0.622, + "step": 8913 + }, + { + "epoch": 0.93, + "grad_norm": 1.9410471171303303, + "learning_rate": 1.4058759128232957e-07, + "loss": 0.5916, + "step": 8914 + }, + { + "epoch": 0.93, + "grad_norm": 1.866173074226064, + "learning_rate": 1.4019147853159663e-07, + "loss": 0.5734, + "step": 8915 + }, + { + "epoch": 0.93, + "grad_norm": 1.9339751206748805, + "learning_rate": 1.397959166678603e-07, + "loss": 0.5967, + "step": 8916 + }, + { + "epoch": 0.93, + "grad_norm": 2.0814425508882373, + "learning_rate": 1.3940090573595854e-07, + "loss": 0.6032, + "step": 8917 + }, + { + "epoch": 0.93, + "grad_norm": 1.7468867551652556, + "learning_rate": 1.3900644578066892e-07, + "loss": 0.5055, + "step": 8918 + }, + { + "epoch": 0.93, + "grad_norm": 1.9505230853370727, + "learning_rate": 1.386125368467045e-07, + "loss": 0.5505, + "step": 8919 + }, + { + "epoch": 0.93, + "grad_norm": 1.8138209073254565, + "learning_rate": 1.3821917897871905e-07, + "loss": 0.5381, + "step": 8920 + }, + { + "epoch": 0.93, + "grad_norm": 1.8811701807030197, + "learning_rate": 1.378263722213008e-07, + "loss": 0.5717, + "step": 8921 + }, + { + "epoch": 0.93, + "grad_norm": 1.6894404622210073, + "learning_rate": 1.3743411661897688e-07, + "loss": 0.5267, + "step": 8922 + }, + { + "epoch": 0.93, + "grad_norm": 2.045490954150234, + "learning_rate": 1.3704241221621062e-07, + "loss": 0.6505, + "step": 8923 + }, + { + "epoch": 0.93, + "grad_norm": 1.8548965079774178, + "learning_rate": 1.366512590574054e-07, + "loss": 0.5188, + "step": 8924 + }, + { + "epoch": 0.93, + "grad_norm": 2.1367292127451414, + "learning_rate": 1.3626065718689962e-07, + "loss": 0.6023, + "step": 8925 + }, + { + "epoch": 0.93, + "grad_norm": 1.9485421579702578, + "learning_rate": 1.3587060664897178e-07, + "loss": 0.5993, + "step": 8926 + }, + { + "epoch": 0.93, + "grad_norm": 1.9473491176314763, + "learning_rate": 1.3548110748783426e-07, + "loss": 0.6362, + "step": 8927 + }, + { + "epoch": 0.93, + "grad_norm": 2.150426581736567, + "learning_rate": 1.3509215974764067e-07, + "loss": 0.5799, + "step": 8928 + }, + { + "epoch": 0.93, + "grad_norm": 2.145747658974006, + "learning_rate": 1.347037634724796e-07, + "loss": 0.5796, + "step": 8929 + }, + { + "epoch": 0.93, + "grad_norm": 1.955864982526995, + "learning_rate": 1.3431591870637862e-07, + "loss": 0.7073, + "step": 8930 + }, + { + "epoch": 0.93, + "grad_norm": 2.0360528975248453, + "learning_rate": 1.3392862549330087e-07, + "loss": 0.6163, + "step": 8931 + }, + { + "epoch": 0.93, + "grad_norm": 2.066226650062881, + "learning_rate": 1.3354188387715017e-07, + "loss": 0.6395, + "step": 8932 + }, + { + "epoch": 0.93, + "grad_norm": 1.7992072253913418, + "learning_rate": 1.3315569390176364e-07, + "loss": 0.4591, + "step": 8933 + }, + { + "epoch": 0.93, + "grad_norm": 1.9422148778935013, + "learning_rate": 1.3277005561092016e-07, + "loss": 0.6084, + "step": 8934 + }, + { + "epoch": 0.93, + "grad_norm": 2.099312537538956, + "learning_rate": 1.3238496904833254e-07, + "loss": 0.6624, + "step": 8935 + }, + { + "epoch": 0.93, + "grad_norm": 2.106560416500555, + "learning_rate": 1.3200043425765364e-07, + "loss": 0.5163, + "step": 8936 + }, + { + "epoch": 0.93, + "grad_norm": 2.0539286083044805, + "learning_rate": 1.3161645128247247e-07, + "loss": 0.6933, + "step": 8937 + }, + { + "epoch": 0.93, + "grad_norm": 2.0500688798555298, + "learning_rate": 1.3123302016631477e-07, + "loss": 0.6283, + "step": 8938 + }, + { + "epoch": 0.93, + "grad_norm": 2.0385543350456485, + "learning_rate": 1.3085014095264527e-07, + "loss": 0.5445, + "step": 8939 + }, + { + "epoch": 0.93, + "grad_norm": 2.279005385123845, + "learning_rate": 1.3046781368486638e-07, + "loss": 0.6009, + "step": 8940 + }, + { + "epoch": 0.93, + "grad_norm": 4.050341700548169, + "learning_rate": 1.3008603840631516e-07, + "loss": 0.6678, + "step": 8941 + }, + { + "epoch": 0.93, + "grad_norm": 1.8506971858384356, + "learning_rate": 1.2970481516026922e-07, + "loss": 0.4761, + "step": 8942 + }, + { + "epoch": 0.93, + "grad_norm": 2.009727635049314, + "learning_rate": 1.293241439899423e-07, + "loss": 0.5926, + "step": 8943 + }, + { + "epoch": 0.93, + "grad_norm": 2.0818680696814704, + "learning_rate": 1.28944024938486e-07, + "loss": 0.6389, + "step": 8944 + }, + { + "epoch": 0.93, + "grad_norm": 1.903971138639191, + "learning_rate": 1.2856445804898866e-07, + "loss": 0.5683, + "step": 8945 + }, + { + "epoch": 0.93, + "grad_norm": 2.254377222554142, + "learning_rate": 1.2818544336447637e-07, + "loss": 0.5999, + "step": 8946 + }, + { + "epoch": 0.93, + "grad_norm": 2.0107754811059886, + "learning_rate": 1.2780698092791145e-07, + "loss": 0.5586, + "step": 8947 + }, + { + "epoch": 0.93, + "grad_norm": 1.811450767600721, + "learning_rate": 1.274290707821968e-07, + "loss": 0.5123, + "step": 8948 + }, + { + "epoch": 0.93, + "grad_norm": 2.2487747274358436, + "learning_rate": 1.2705171297016873e-07, + "loss": 0.6004, + "step": 8949 + }, + { + "epoch": 0.93, + "grad_norm": 2.0659723146256956, + "learning_rate": 1.2667490753460465e-07, + "loss": 0.6025, + "step": 8950 + }, + { + "epoch": 0.93, + "grad_norm": 1.857448937961545, + "learning_rate": 1.2629865451821656e-07, + "loss": 0.6056, + "step": 8951 + }, + { + "epoch": 0.93, + "grad_norm": 2.179553087454338, + "learning_rate": 1.2592295396365527e-07, + "loss": 0.668, + "step": 8952 + }, + { + "epoch": 0.93, + "grad_norm": 2.002595953566603, + "learning_rate": 1.2554780591350845e-07, + "loss": 0.5864, + "step": 8953 + }, + { + "epoch": 0.93, + "grad_norm": 2.0266575419282606, + "learning_rate": 1.2517321041030096e-07, + "loss": 0.6822, + "step": 8954 + }, + { + "epoch": 0.93, + "grad_norm": 1.9381132086572017, + "learning_rate": 1.2479916749649657e-07, + "loss": 0.5354, + "step": 8955 + }, + { + "epoch": 0.93, + "grad_norm": 1.877289303138945, + "learning_rate": 1.2442567721449307e-07, + "loss": 0.5831, + "step": 8956 + }, + { + "epoch": 0.93, + "grad_norm": 2.0137513658224586, + "learning_rate": 1.240527396066299e-07, + "loss": 0.5456, + "step": 8957 + }, + { + "epoch": 0.93, + "grad_norm": 1.8438678243217668, + "learning_rate": 1.2368035471517993e-07, + "loss": 0.578, + "step": 8958 + }, + { + "epoch": 0.93, + "grad_norm": 2.0226666721004958, + "learning_rate": 1.233085225823566e-07, + "loss": 0.6474, + "step": 8959 + }, + { + "epoch": 0.93, + "grad_norm": 1.9125971670943722, + "learning_rate": 1.2293724325030786e-07, + "loss": 0.6467, + "step": 8960 + }, + { + "epoch": 0.93, + "grad_norm": 2.0296936030038264, + "learning_rate": 1.2256651676112118e-07, + "loss": 0.6714, + "step": 8961 + }, + { + "epoch": 0.93, + "grad_norm": 1.9650618330501934, + "learning_rate": 1.2219634315681962e-07, + "loss": 0.6185, + "step": 8962 + }, + { + "epoch": 0.93, + "grad_norm": 1.9626286181880022, + "learning_rate": 1.2182672247936566e-07, + "loss": 0.574, + "step": 8963 + }, + { + "epoch": 0.93, + "grad_norm": 1.8374055121902448, + "learning_rate": 1.214576547706575e-07, + "loss": 0.5862, + "step": 8964 + }, + { + "epoch": 0.93, + "grad_norm": 2.0329421955743108, + "learning_rate": 1.210891400725306e-07, + "loss": 0.5647, + "step": 8965 + }, + { + "epoch": 0.93, + "grad_norm": 1.9931490923121717, + "learning_rate": 1.2072117842675867e-07, + "loss": 0.5806, + "step": 8966 + }, + { + "epoch": 0.93, + "grad_norm": 1.829680075590195, + "learning_rate": 1.2035376987505288e-07, + "loss": 0.6675, + "step": 8967 + }, + { + "epoch": 0.93, + "grad_norm": 1.847421420466505, + "learning_rate": 1.199869144590593e-07, + "loss": 0.5583, + "step": 8968 + }, + { + "epoch": 0.93, + "grad_norm": 1.8374941888305347, + "learning_rate": 1.196206122203647e-07, + "loss": 0.5854, + "step": 8969 + }, + { + "epoch": 0.93, + "grad_norm": 2.1554222663190084, + "learning_rate": 1.192548632004903e-07, + "loss": 0.7002, + "step": 8970 + }, + { + "epoch": 0.93, + "grad_norm": 1.9239746198381393, + "learning_rate": 1.188896674408968e-07, + "loss": 0.6828, + "step": 8971 + }, + { + "epoch": 0.93, + "grad_norm": 1.995032207458123, + "learning_rate": 1.1852502498298058e-07, + "loss": 0.5992, + "step": 8972 + }, + { + "epoch": 0.93, + "grad_norm": 1.8669745553621828, + "learning_rate": 1.1816093586807687e-07, + "loss": 0.5442, + "step": 8973 + }, + { + "epoch": 0.93, + "grad_norm": 2.1107807844678823, + "learning_rate": 1.1779740013745655e-07, + "loss": 0.5963, + "step": 8974 + }, + { + "epoch": 0.93, + "grad_norm": 1.8560952940963382, + "learning_rate": 1.174344178323289e-07, + "loss": 0.6525, + "step": 8975 + }, + { + "epoch": 0.93, + "grad_norm": 2.752115937083064, + "learning_rate": 1.1707198899383875e-07, + "loss": 0.6869, + "step": 8976 + }, + { + "epoch": 0.93, + "grad_norm": 1.7421395713844312, + "learning_rate": 1.1671011366307105e-07, + "loss": 0.556, + "step": 8977 + }, + { + "epoch": 0.93, + "grad_norm": 1.8270357499897079, + "learning_rate": 1.163487918810452e-07, + "loss": 0.5845, + "step": 8978 + }, + { + "epoch": 0.93, + "grad_norm": 1.740650769846731, + "learning_rate": 1.1598802368872009e-07, + "loss": 0.5365, + "step": 8979 + }, + { + "epoch": 0.93, + "grad_norm": 1.8365042275780272, + "learning_rate": 1.1562780912699023e-07, + "loss": 0.5251, + "step": 8980 + }, + { + "epoch": 0.93, + "grad_norm": 1.886097167360647, + "learning_rate": 1.1526814823668853e-07, + "loss": 0.6388, + "step": 8981 + }, + { + "epoch": 0.93, + "grad_norm": 1.8793682490212578, + "learning_rate": 1.1490904105858403e-07, + "loss": 0.6169, + "step": 8982 + }, + { + "epoch": 0.93, + "grad_norm": 1.7536349174266053, + "learning_rate": 1.1455048763338361e-07, + "loss": 0.5711, + "step": 8983 + }, + { + "epoch": 0.93, + "grad_norm": 1.898534836280984, + "learning_rate": 1.1419248800173199e-07, + "loss": 0.5357, + "step": 8984 + }, + { + "epoch": 0.93, + "grad_norm": 1.8602519762704828, + "learning_rate": 1.1383504220421004e-07, + "loss": 0.7386, + "step": 8985 + }, + { + "epoch": 0.93, + "grad_norm": 2.193413971204252, + "learning_rate": 1.1347815028133536e-07, + "loss": 0.6072, + "step": 8986 + }, + { + "epoch": 0.93, + "grad_norm": 1.885655084013139, + "learning_rate": 1.1312181227356556e-07, + "loss": 0.7008, + "step": 8987 + }, + { + "epoch": 0.93, + "grad_norm": 2.145617108224491, + "learning_rate": 1.1276602822129223e-07, + "loss": 0.6087, + "step": 8988 + }, + { + "epoch": 0.93, + "grad_norm": 2.1748127911102175, + "learning_rate": 1.1241079816484591e-07, + "loss": 0.6246, + "step": 8989 + }, + { + "epoch": 0.93, + "grad_norm": 2.0305142220354355, + "learning_rate": 1.1205612214449434e-07, + "loss": 0.6088, + "step": 8990 + }, + { + "epoch": 0.93, + "grad_norm": 1.8258785832904052, + "learning_rate": 1.1170200020044098e-07, + "loss": 0.549, + "step": 8991 + }, + { + "epoch": 0.93, + "grad_norm": 1.9969958511662391, + "learning_rate": 1.1134843237282922e-07, + "loss": 0.5544, + "step": 8992 + }, + { + "epoch": 0.93, + "grad_norm": 1.9296466850652296, + "learning_rate": 1.1099541870173591e-07, + "loss": 0.6445, + "step": 8993 + }, + { + "epoch": 0.93, + "grad_norm": 1.8055438751017052, + "learning_rate": 1.1064295922717904e-07, + "loss": 0.6468, + "step": 8994 + }, + { + "epoch": 0.94, + "grad_norm": 2.133732513612682, + "learning_rate": 1.1029105398911055e-07, + "loss": 0.5653, + "step": 8995 + }, + { + "epoch": 0.94, + "grad_norm": 1.8705602462539526, + "learning_rate": 1.0993970302742297e-07, + "loss": 0.6245, + "step": 8996 + }, + { + "epoch": 0.94, + "grad_norm": 1.8390147969929742, + "learning_rate": 1.0958890638194108e-07, + "loss": 0.6049, + "step": 8997 + }, + { + "epoch": 0.94, + "grad_norm": 1.9877021610759997, + "learning_rate": 1.0923866409243144e-07, + "loss": 0.5138, + "step": 8998 + }, + { + "epoch": 0.94, + "grad_norm": 1.9376071697368407, + "learning_rate": 1.0888897619859506e-07, + "loss": 0.5826, + "step": 8999 + }, + { + "epoch": 0.94, + "grad_norm": 1.8774347240206823, + "learning_rate": 1.0853984274007246e-07, + "loss": 0.64, + "step": 9000 + }, + { + "epoch": 0.94, + "grad_norm": 2.083899548076166, + "learning_rate": 1.0819126375643862e-07, + "loss": 0.6244, + "step": 9001 + }, + { + "epoch": 0.94, + "grad_norm": 2.158124305561164, + "learning_rate": 1.0784323928720753e-07, + "loss": 0.6079, + "step": 9002 + }, + { + "epoch": 0.94, + "grad_norm": 1.932121628840871, + "learning_rate": 1.0749576937182926e-07, + "loss": 0.5824, + "step": 9003 + }, + { + "epoch": 0.94, + "grad_norm": 2.052197137452352, + "learning_rate": 1.0714885404969288e-07, + "loss": 0.6556, + "step": 9004 + }, + { + "epoch": 0.94, + "grad_norm": 1.9618385932007807, + "learning_rate": 1.0680249336012139e-07, + "loss": 0.5538, + "step": 9005 + }, + { + "epoch": 0.94, + "grad_norm": 2.1643380059315005, + "learning_rate": 1.064566873423778e-07, + "loss": 0.5097, + "step": 9006 + }, + { + "epoch": 0.94, + "grad_norm": 2.0854664855258283, + "learning_rate": 1.0611143603566076e-07, + "loss": 0.6295, + "step": 9007 + }, + { + "epoch": 0.94, + "grad_norm": 1.9952119931589922, + "learning_rate": 1.0576673947910732e-07, + "loss": 0.599, + "step": 9008 + }, + { + "epoch": 0.94, + "grad_norm": 1.9799496601050393, + "learning_rate": 1.0542259771178898e-07, + "loss": 0.6321, + "step": 9009 + }, + { + "epoch": 0.94, + "grad_norm": 1.9816211810449393, + "learning_rate": 1.0507901077271843e-07, + "loss": 0.5686, + "step": 9010 + }, + { + "epoch": 0.94, + "grad_norm": 1.8540076930182927, + "learning_rate": 1.0473597870084174e-07, + "loss": 0.5351, + "step": 9011 + }, + { + "epoch": 0.94, + "grad_norm": 1.741984167133165, + "learning_rate": 1.0439350153504446e-07, + "loss": 0.5656, + "step": 9012 + }, + { + "epoch": 0.94, + "grad_norm": 1.9845451433362102, + "learning_rate": 1.0405157931414667e-07, + "loss": 0.596, + "step": 9013 + }, + { + "epoch": 0.94, + "grad_norm": 1.6155972863453967, + "learning_rate": 1.0371021207690957e-07, + "loss": 0.4353, + "step": 9014 + }, + { + "epoch": 0.94, + "grad_norm": 1.8418415679031528, + "learning_rate": 1.0336939986202666e-07, + "loss": 0.6544, + "step": 9015 + }, + { + "epoch": 0.94, + "grad_norm": 1.886523860804278, + "learning_rate": 1.0302914270813314e-07, + "loss": 0.6301, + "step": 9016 + }, + { + "epoch": 0.94, + "grad_norm": 1.8822214646480715, + "learning_rate": 1.0268944065379704e-07, + "loss": 0.5913, + "step": 9017 + }, + { + "epoch": 0.94, + "grad_norm": 2.0695162665439426, + "learning_rate": 1.0235029373752758e-07, + "loss": 0.629, + "step": 9018 + }, + { + "epoch": 0.94, + "grad_norm": 2.001389219375752, + "learning_rate": 1.0201170199776789e-07, + "loss": 0.5956, + "step": 9019 + }, + { + "epoch": 0.94, + "grad_norm": 2.1057510275279165, + "learning_rate": 1.0167366547289892e-07, + "loss": 0.6267, + "step": 9020 + }, + { + "epoch": 0.94, + "grad_norm": 1.9837252540821202, + "learning_rate": 1.0133618420123947e-07, + "loss": 0.6496, + "step": 9021 + }, + { + "epoch": 0.94, + "grad_norm": 2.04320201586192, + "learning_rate": 1.0099925822104562e-07, + "loss": 0.6135, + "step": 9022 + }, + { + "epoch": 0.94, + "grad_norm": 1.9760632612491782, + "learning_rate": 1.0066288757050846e-07, + "loss": 0.6513, + "step": 9023 + }, + { + "epoch": 0.94, + "grad_norm": 2.037409421200088, + "learning_rate": 1.0032707228775917e-07, + "loss": 0.5312, + "step": 9024 + }, + { + "epoch": 0.94, + "grad_norm": 2.0731308535241197, + "learning_rate": 9.999181241086231e-08, + "loss": 0.6405, + "step": 9025 + }, + { + "epoch": 0.94, + "grad_norm": 2.0034900044017414, + "learning_rate": 9.965710797782358e-08, + "loss": 0.6196, + "step": 9026 + }, + { + "epoch": 0.94, + "grad_norm": 2.0664636409457473, + "learning_rate": 9.932295902658263e-08, + "loss": 0.7305, + "step": 9027 + }, + { + "epoch": 0.94, + "grad_norm": 2.0057196873279506, + "learning_rate": 9.898936559501637e-08, + "loss": 0.6518, + "step": 9028 + }, + { + "epoch": 0.94, + "grad_norm": 1.920647903527355, + "learning_rate": 9.865632772094124e-08, + "loss": 0.568, + "step": 9029 + }, + { + "epoch": 0.94, + "grad_norm": 2.061404388610512, + "learning_rate": 9.832384544210704e-08, + "loss": 0.6644, + "step": 9030 + }, + { + "epoch": 0.94, + "grad_norm": 2.051063631357103, + "learning_rate": 9.799191879620474e-08, + "loss": 0.6747, + "step": 9031 + }, + { + "epoch": 0.94, + "grad_norm": 2.11086077130743, + "learning_rate": 9.76605478208581e-08, + "loss": 0.5537, + "step": 9032 + }, + { + "epoch": 0.94, + "grad_norm": 1.8661021852558275, + "learning_rate": 9.732973255363154e-08, + "loss": 0.5615, + "step": 9033 + }, + { + "epoch": 0.94, + "grad_norm": 1.8129931355838271, + "learning_rate": 9.699947303202339e-08, + "loss": 0.6432, + "step": 9034 + }, + { + "epoch": 0.94, + "grad_norm": 1.8858498295677275, + "learning_rate": 9.666976929347205e-08, + "loss": 0.5187, + "step": 9035 + }, + { + "epoch": 0.94, + "grad_norm": 2.167973700167487, + "learning_rate": 9.634062137534927e-08, + "loss": 0.6469, + "step": 9036 + }, + { + "epoch": 0.94, + "grad_norm": 1.8363965553392205, + "learning_rate": 9.601202931496745e-08, + "loss": 0.5113, + "step": 9037 + }, + { + "epoch": 0.94, + "grad_norm": 1.8634840550978502, + "learning_rate": 9.5683993149574e-08, + "loss": 0.6035, + "step": 9038 + }, + { + "epoch": 0.94, + "grad_norm": 2.2143595974193717, + "learning_rate": 9.535651291635362e-08, + "loss": 0.6819, + "step": 9039 + }, + { + "epoch": 0.94, + "grad_norm": 1.6812826245515171, + "learning_rate": 9.502958865242774e-08, + "loss": 0.5876, + "step": 9040 + }, + { + "epoch": 0.94, + "grad_norm": 1.8248003304779605, + "learning_rate": 9.470322039485614e-08, + "loss": 0.5248, + "step": 9041 + }, + { + "epoch": 0.94, + "grad_norm": 1.9421864031203093, + "learning_rate": 9.437740818063312e-08, + "loss": 0.6556, + "step": 9042 + }, + { + "epoch": 0.94, + "grad_norm": 1.906943046443007, + "learning_rate": 9.405215204669193e-08, + "loss": 0.5409, + "step": 9043 + }, + { + "epoch": 0.94, + "grad_norm": 2.0185391093753142, + "learning_rate": 9.372745202990253e-08, + "loss": 0.6874, + "step": 9044 + }, + { + "epoch": 0.94, + "grad_norm": 1.9843021074005947, + "learning_rate": 9.3403308167071e-08, + "loss": 0.5017, + "step": 9045 + }, + { + "epoch": 0.94, + "grad_norm": 1.9854311881929885, + "learning_rate": 9.30797204949413e-08, + "loss": 0.6676, + "step": 9046 + }, + { + "epoch": 0.94, + "grad_norm": 2.0826537011454134, + "learning_rate": 9.275668905019409e-08, + "loss": 0.6396, + "step": 9047 + }, + { + "epoch": 0.94, + "grad_norm": 2.039120363907294, + "learning_rate": 9.243421386944618e-08, + "loss": 0.65, + "step": 9048 + }, + { + "epoch": 0.94, + "grad_norm": 1.8978766367979325, + "learning_rate": 9.211229498925278e-08, + "loss": 0.5569, + "step": 9049 + }, + { + "epoch": 0.94, + "grad_norm": 1.7595071125989983, + "learning_rate": 9.179093244610415e-08, + "loss": 0.486, + "step": 9050 + }, + { + "epoch": 0.94, + "grad_norm": 1.890778877362623, + "learning_rate": 9.147012627642949e-08, + "loss": 0.6964, + "step": 9051 + }, + { + "epoch": 0.94, + "grad_norm": 2.005268335224658, + "learning_rate": 9.114987651659357e-08, + "loss": 0.5917, + "step": 9052 + }, + { + "epoch": 0.94, + "grad_norm": 2.1730014002086895, + "learning_rate": 9.083018320289849e-08, + "loss": 0.6614, + "step": 9053 + }, + { + "epoch": 0.94, + "grad_norm": 1.9735903938030195, + "learning_rate": 9.051104637158359e-08, + "loss": 0.5689, + "step": 9054 + }, + { + "epoch": 0.94, + "grad_norm": 1.8456661586038106, + "learning_rate": 9.019246605882492e-08, + "loss": 0.5551, + "step": 9055 + }, + { + "epoch": 0.94, + "grad_norm": 2.052240286399753, + "learning_rate": 8.987444230073528e-08, + "loss": 0.6064, + "step": 9056 + }, + { + "epoch": 0.94, + "grad_norm": 1.8535713753595429, + "learning_rate": 8.955697513336415e-08, + "loss": 0.5835, + "step": 9057 + }, + { + "epoch": 0.94, + "grad_norm": 1.8303132592401052, + "learning_rate": 8.924006459269886e-08, + "loss": 0.5045, + "step": 9058 + }, + { + "epoch": 0.94, + "grad_norm": 1.901318346829754, + "learning_rate": 8.892371071466176e-08, + "loss": 0.6645, + "step": 9059 + }, + { + "epoch": 0.94, + "grad_norm": 1.9257932527237893, + "learning_rate": 8.860791353511532e-08, + "loss": 0.6335, + "step": 9060 + }, + { + "epoch": 0.94, + "grad_norm": 1.9043707207031637, + "learning_rate": 8.829267308985535e-08, + "loss": 0.5067, + "step": 9061 + }, + { + "epoch": 0.94, + "grad_norm": 1.9917505790714765, + "learning_rate": 8.797798941461655e-08, + "loss": 0.6557, + "step": 9062 + }, + { + "epoch": 0.94, + "grad_norm": 1.8937462209908957, + "learning_rate": 8.766386254507043e-08, + "loss": 0.5871, + "step": 9063 + }, + { + "epoch": 0.94, + "grad_norm": 1.9406431576153542, + "learning_rate": 8.735029251682515e-08, + "loss": 0.5824, + "step": 9064 + }, + { + "epoch": 0.94, + "grad_norm": 1.969695305422505, + "learning_rate": 8.703727936542505e-08, + "loss": 0.6058, + "step": 9065 + }, + { + "epoch": 0.94, + "grad_norm": 1.9684624208295343, + "learning_rate": 8.672482312635233e-08, + "loss": 0.7302, + "step": 9066 + }, + { + "epoch": 0.94, + "grad_norm": 1.9063600516524655, + "learning_rate": 8.641292383502531e-08, + "loss": 0.5817, + "step": 9067 + }, + { + "epoch": 0.94, + "grad_norm": 1.913210044504492, + "learning_rate": 8.610158152680071e-08, + "loss": 0.6381, + "step": 9068 + }, + { + "epoch": 0.94, + "grad_norm": 1.9742629875445776, + "learning_rate": 8.579079623696917e-08, + "loss": 0.7128, + "step": 9069 + }, + { + "epoch": 0.94, + "grad_norm": 1.8004319724207307, + "learning_rate": 8.548056800076199e-08, + "loss": 0.6184, + "step": 9070 + }, + { + "epoch": 0.94, + "grad_norm": 1.8260617741245229, + "learning_rate": 8.517089685334323e-08, + "loss": 0.665, + "step": 9071 + }, + { + "epoch": 0.94, + "grad_norm": 2.0768267807130627, + "learning_rate": 8.486178282981761e-08, + "loss": 0.701, + "step": 9072 + }, + { + "epoch": 0.94, + "grad_norm": 1.8819509447944622, + "learning_rate": 8.455322596522375e-08, + "loss": 0.6165, + "step": 9073 + }, + { + "epoch": 0.94, + "grad_norm": 1.9499123138063066, + "learning_rate": 8.424522629453924e-08, + "loss": 0.6395, + "step": 9074 + }, + { + "epoch": 0.94, + "grad_norm": 1.978374390635286, + "learning_rate": 8.39377838526767e-08, + "loss": 0.597, + "step": 9075 + }, + { + "epoch": 0.94, + "grad_norm": 1.8940455664553872, + "learning_rate": 8.363089867448715e-08, + "loss": 0.5436, + "step": 9076 + }, + { + "epoch": 0.94, + "grad_norm": 1.9531936063259534, + "learning_rate": 8.332457079475831e-08, + "loss": 0.5746, + "step": 9077 + }, + { + "epoch": 0.94, + "grad_norm": 1.823839062817681, + "learning_rate": 8.301880024821296e-08, + "loss": 0.5773, + "step": 9078 + }, + { + "epoch": 0.94, + "grad_norm": 2.0376195888887345, + "learning_rate": 8.271358706951227e-08, + "loss": 0.5653, + "step": 9079 + }, + { + "epoch": 0.94, + "grad_norm": 1.999992907214282, + "learning_rate": 8.240893129325412e-08, + "loss": 0.6454, + "step": 9080 + }, + { + "epoch": 0.94, + "grad_norm": 1.8778664959112257, + "learning_rate": 8.210483295397309e-08, + "loss": 0.6124, + "step": 9081 + }, + { + "epoch": 0.94, + "grad_norm": 2.173549074135219, + "learning_rate": 8.180129208613996e-08, + "loss": 0.6335, + "step": 9082 + }, + { + "epoch": 0.94, + "grad_norm": 1.9313462323770607, + "learning_rate": 8.149830872416331e-08, + "loss": 0.6208, + "step": 9083 + }, + { + "epoch": 0.94, + "grad_norm": 2.0185542339124622, + "learning_rate": 8.119588290238845e-08, + "loss": 0.6189, + "step": 9084 + }, + { + "epoch": 0.94, + "grad_norm": 1.9778143235997587, + "learning_rate": 8.089401465509628e-08, + "loss": 0.6107, + "step": 9085 + }, + { + "epoch": 0.94, + "grad_norm": 1.8124396471473323, + "learning_rate": 8.059270401650555e-08, + "loss": 0.6052, + "step": 9086 + }, + { + "epoch": 0.94, + "grad_norm": 1.8136504927323756, + "learning_rate": 8.029195102077114e-08, + "loss": 0.6718, + "step": 9087 + }, + { + "epoch": 0.94, + "grad_norm": 1.9373249127008352, + "learning_rate": 7.999175570198526e-08, + "loss": 0.5822, + "step": 9088 + }, + { + "epoch": 0.94, + "grad_norm": 2.048051219588817, + "learning_rate": 7.969211809417732e-08, + "loss": 0.6618, + "step": 9089 + }, + { + "epoch": 0.94, + "grad_norm": 1.9640947013669483, + "learning_rate": 7.939303823131295e-08, + "loss": 0.6105, + "step": 9090 + }, + { + "epoch": 0.95, + "grad_norm": 1.8903042757750237, + "learning_rate": 7.909451614729335e-08, + "loss": 0.5093, + "step": 9091 + }, + { + "epoch": 0.95, + "grad_norm": 1.9700603234134746, + "learning_rate": 7.87965518759598e-08, + "loss": 0.6289, + "step": 9092 + }, + { + "epoch": 0.95, + "grad_norm": 2.068173773432267, + "learning_rate": 7.84991454510864e-08, + "loss": 0.6136, + "step": 9093 + }, + { + "epoch": 0.95, + "grad_norm": 1.9719085980940332, + "learning_rate": 7.820229690638615e-08, + "loss": 0.5706, + "step": 9094 + }, + { + "epoch": 0.95, + "grad_norm": 1.8669821696404074, + "learning_rate": 7.790600627550937e-08, + "loss": 0.6763, + "step": 9095 + }, + { + "epoch": 0.95, + "grad_norm": 2.0351969275458415, + "learning_rate": 7.761027359204088e-08, + "loss": 0.6411, + "step": 9096 + }, + { + "epoch": 0.95, + "grad_norm": 2.034610079179534, + "learning_rate": 7.731509888950551e-08, + "loss": 0.5367, + "step": 9097 + }, + { + "epoch": 0.95, + "grad_norm": 1.9903590314113853, + "learning_rate": 7.702048220136149e-08, + "loss": 0.6549, + "step": 9098 + }, + { + "epoch": 0.95, + "grad_norm": 2.040096970601361, + "learning_rate": 7.672642356100656e-08, + "loss": 0.5158, + "step": 9099 + }, + { + "epoch": 0.95, + "grad_norm": 1.8819140831818857, + "learning_rate": 7.643292300177296e-08, + "loss": 0.4779, + "step": 9100 + }, + { + "epoch": 0.95, + "grad_norm": 1.684999361885513, + "learning_rate": 7.613998055693073e-08, + "loss": 0.6065, + "step": 9101 + }, + { + "epoch": 0.95, + "grad_norm": 2.0139571541265853, + "learning_rate": 7.584759625968663e-08, + "loss": 0.554, + "step": 9102 + }, + { + "epoch": 0.95, + "grad_norm": 1.8729824867562832, + "learning_rate": 7.555577014318472e-08, + "loss": 0.5385, + "step": 9103 + }, + { + "epoch": 0.95, + "grad_norm": 2.080253251347515, + "learning_rate": 7.526450224050407e-08, + "loss": 0.5282, + "step": 9104 + }, + { + "epoch": 0.95, + "grad_norm": 1.8347182779960427, + "learning_rate": 7.497379258466275e-08, + "loss": 0.5965, + "step": 9105 + }, + { + "epoch": 0.95, + "grad_norm": 2.164534043831013, + "learning_rate": 7.468364120861272e-08, + "loss": 0.6141, + "step": 9106 + }, + { + "epoch": 0.95, + "grad_norm": 2.2212477876578496, + "learning_rate": 7.439404814524654e-08, + "loss": 0.5259, + "step": 9107 + }, + { + "epoch": 0.95, + "grad_norm": 1.87666751211576, + "learning_rate": 7.410501342738963e-08, + "loss": 0.5406, + "step": 9108 + }, + { + "epoch": 0.95, + "grad_norm": 2.042540734017541, + "learning_rate": 7.381653708780578e-08, + "loss": 0.6218, + "step": 9109 + }, + { + "epoch": 0.95, + "grad_norm": 1.9767662921202052, + "learning_rate": 7.352861915919607e-08, + "loss": 0.5197, + "step": 9110 + }, + { + "epoch": 0.95, + "grad_norm": 1.9563546830435936, + "learning_rate": 7.324125967419715e-08, + "loss": 0.6026, + "step": 9111 + }, + { + "epoch": 0.95, + "grad_norm": 2.0233014232164117, + "learning_rate": 7.295445866538297e-08, + "loss": 0.5401, + "step": 9112 + }, + { + "epoch": 0.95, + "grad_norm": 2.059552075508856, + "learning_rate": 7.26682161652642e-08, + "loss": 0.5435, + "step": 9113 + }, + { + "epoch": 0.95, + "grad_norm": 2.13299560125978, + "learning_rate": 7.238253220628822e-08, + "loss": 0.7473, + "step": 9114 + }, + { + "epoch": 0.95, + "grad_norm": 2.2557692843805333, + "learning_rate": 7.20974068208391e-08, + "loss": 0.5964, + "step": 9115 + }, + { + "epoch": 0.95, + "grad_norm": 1.9903729640281569, + "learning_rate": 7.181284004123601e-08, + "loss": 0.6408, + "step": 9116 + }, + { + "epoch": 0.95, + "grad_norm": 1.8199528667987253, + "learning_rate": 7.152883189973759e-08, + "loss": 0.5784, + "step": 9117 + }, + { + "epoch": 0.95, + "grad_norm": 2.35897713970099, + "learning_rate": 7.124538242853751e-08, + "loss": 0.6388, + "step": 9118 + }, + { + "epoch": 0.95, + "grad_norm": 2.1030461531404296, + "learning_rate": 7.096249165976621e-08, + "loss": 0.6247, + "step": 9119 + }, + { + "epoch": 0.95, + "grad_norm": 1.8662951848090812, + "learning_rate": 7.06801596254908e-08, + "loss": 0.6072, + "step": 9120 + }, + { + "epoch": 0.95, + "grad_norm": 1.9138440363291855, + "learning_rate": 7.039838635771623e-08, + "loss": 0.5645, + "step": 9121 + }, + { + "epoch": 0.95, + "grad_norm": 1.734258665755537, + "learning_rate": 7.011717188838196e-08, + "loss": 0.604, + "step": 9122 + }, + { + "epoch": 0.95, + "grad_norm": 2.196151154914006, + "learning_rate": 6.983651624936527e-08, + "loss": 0.5313, + "step": 9123 + }, + { + "epoch": 0.95, + "grad_norm": 1.7132829172978423, + "learning_rate": 6.955641947248127e-08, + "loss": 0.5929, + "step": 9124 + }, + { + "epoch": 0.95, + "grad_norm": 1.9418755624331163, + "learning_rate": 6.927688158947954e-08, + "loss": 0.5897, + "step": 9125 + }, + { + "epoch": 0.95, + "grad_norm": 1.9872540559183436, + "learning_rate": 6.899790263204643e-08, + "loss": 0.5944, + "step": 9126 + }, + { + "epoch": 0.95, + "grad_norm": 2.1639015797194205, + "learning_rate": 6.871948263180772e-08, + "loss": 0.6074, + "step": 9127 + }, + { + "epoch": 0.95, + "grad_norm": 1.9873225783662412, + "learning_rate": 6.844162162032265e-08, + "loss": 0.5684, + "step": 9128 + }, + { + "epoch": 0.95, + "grad_norm": 2.0519891397743266, + "learning_rate": 6.816431962908931e-08, + "loss": 0.6091, + "step": 9129 + }, + { + "epoch": 0.95, + "grad_norm": 1.9955079387546661, + "learning_rate": 6.788757668954038e-08, + "loss": 0.5752, + "step": 9130 + }, + { + "epoch": 0.95, + "grad_norm": 2.0818890242182357, + "learning_rate": 6.761139283304685e-08, + "loss": 0.6259, + "step": 9131 + }, + { + "epoch": 0.95, + "grad_norm": 1.9861722922728569, + "learning_rate": 6.73357680909159e-08, + "loss": 0.6536, + "step": 9132 + }, + { + "epoch": 0.95, + "grad_norm": 1.955473365977711, + "learning_rate": 6.706070249439034e-08, + "loss": 0.5794, + "step": 9133 + }, + { + "epoch": 0.95, + "grad_norm": 1.8569417282687593, + "learning_rate": 6.678619607465131e-08, + "loss": 0.5563, + "step": 9134 + }, + { + "epoch": 0.95, + "grad_norm": 1.9271564060152484, + "learning_rate": 6.651224886281504e-08, + "loss": 0.6414, + "step": 9135 + }, + { + "epoch": 0.95, + "grad_norm": 2.0471491330858442, + "learning_rate": 6.623886088993559e-08, + "loss": 0.5188, + "step": 9136 + }, + { + "epoch": 0.95, + "grad_norm": 1.9551415429882093, + "learning_rate": 6.59660321870026e-08, + "loss": 0.5605, + "step": 9137 + }, + { + "epoch": 0.95, + "grad_norm": 2.083480071147602, + "learning_rate": 6.5693762784943e-08, + "loss": 0.5904, + "step": 9138 + }, + { + "epoch": 0.95, + "grad_norm": 1.828002306812625, + "learning_rate": 6.542205271461988e-08, + "loss": 0.5919, + "step": 9139 + }, + { + "epoch": 0.95, + "grad_norm": 2.0114884598779534, + "learning_rate": 6.515090200683361e-08, + "loss": 0.5696, + "step": 9140 + }, + { + "epoch": 0.95, + "grad_norm": 1.8440201739093747, + "learning_rate": 6.48803106923196e-08, + "loss": 0.6082, + "step": 9141 + }, + { + "epoch": 0.95, + "grad_norm": 1.7543344260471143, + "learning_rate": 6.461027880175219e-08, + "loss": 0.6036, + "step": 9142 + }, + { + "epoch": 0.95, + "grad_norm": 1.7472069441257652, + "learning_rate": 6.434080636574025e-08, + "loss": 0.6077, + "step": 9143 + }, + { + "epoch": 0.95, + "grad_norm": 1.8148441768694168, + "learning_rate": 6.407189341483044e-08, + "loss": 0.5992, + "step": 9144 + }, + { + "epoch": 0.95, + "grad_norm": 1.8986202705276924, + "learning_rate": 6.380353997950506e-08, + "loss": 0.5562, + "step": 9145 + }, + { + "epoch": 0.95, + "grad_norm": 2.0313020160531634, + "learning_rate": 6.35357460901842e-08, + "loss": 0.5758, + "step": 9146 + }, + { + "epoch": 0.95, + "grad_norm": 1.9249252266461854, + "learning_rate": 6.326851177722304e-08, + "loss": 0.5855, + "step": 9147 + }, + { + "epoch": 0.95, + "grad_norm": 2.302195837407535, + "learning_rate": 6.300183707091457e-08, + "loss": 0.5979, + "step": 9148 + }, + { + "epoch": 0.95, + "grad_norm": 1.942121352149436, + "learning_rate": 6.273572200148792e-08, + "loss": 0.5166, + "step": 9149 + }, + { + "epoch": 0.95, + "grad_norm": 1.821356962288708, + "learning_rate": 6.247016659910842e-08, + "loss": 0.6556, + "step": 9150 + }, + { + "epoch": 0.95, + "grad_norm": 2.0069073914638875, + "learning_rate": 6.220517089387867e-08, + "loss": 0.6701, + "step": 9151 + }, + { + "epoch": 0.95, + "grad_norm": 1.7514971240436819, + "learning_rate": 6.194073491583796e-08, + "loss": 0.5036, + "step": 9152 + }, + { + "epoch": 0.95, + "grad_norm": 2.0429327714393395, + "learning_rate": 6.167685869495954e-08, + "loss": 0.6446, + "step": 9153 + }, + { + "epoch": 0.95, + "grad_norm": 2.136167526614985, + "learning_rate": 6.141354226115781e-08, + "loss": 0.6722, + "step": 9154 + }, + { + "epoch": 0.95, + "grad_norm": 2.036591943231998, + "learning_rate": 6.115078564427946e-08, + "loss": 0.6177, + "step": 9155 + }, + { + "epoch": 0.95, + "grad_norm": 1.957728748548829, + "learning_rate": 6.088858887411064e-08, + "loss": 0.5896, + "step": 9156 + }, + { + "epoch": 0.95, + "grad_norm": 1.8733656466152195, + "learning_rate": 6.06269519803715e-08, + "loss": 0.6692, + "step": 9157 + }, + { + "epoch": 0.95, + "grad_norm": 1.9703078943600634, + "learning_rate": 6.036587499272161e-08, + "loss": 0.5802, + "step": 9158 + }, + { + "epoch": 0.95, + "grad_norm": 1.6708428863614988, + "learning_rate": 6.010535794075455e-08, + "loss": 0.4799, + "step": 9159 + }, + { + "epoch": 0.95, + "grad_norm": 2.063875697905319, + "learning_rate": 5.984540085400114e-08, + "loss": 0.57, + "step": 9160 + }, + { + "epoch": 0.95, + "grad_norm": 1.85335140179406, + "learning_rate": 5.9586003761930024e-08, + "loss": 0.5223, + "step": 9161 + }, + { + "epoch": 0.95, + "grad_norm": 1.7270860909482966, + "learning_rate": 5.932716669394489e-08, + "loss": 0.5401, + "step": 9162 + }, + { + "epoch": 0.95, + "grad_norm": 2.0866778648399165, + "learning_rate": 5.9068889679385624e-08, + "loss": 0.563, + "step": 9163 + }, + { + "epoch": 0.95, + "grad_norm": 2.047048714162409, + "learning_rate": 5.8811172747530457e-08, + "loss": 0.5375, + "step": 9164 + }, + { + "epoch": 0.95, + "grad_norm": 1.8605628615428378, + "learning_rate": 5.855401592759269e-08, + "loss": 0.5913, + "step": 9165 + }, + { + "epoch": 0.95, + "grad_norm": 1.6883951918162357, + "learning_rate": 5.8297419248722345e-08, + "loss": 0.5417, + "step": 9166 + }, + { + "epoch": 0.95, + "grad_norm": 1.9874976605092245, + "learning_rate": 5.80413827400067e-08, + "loss": 0.5758, + "step": 9167 + }, + { + "epoch": 0.95, + "grad_norm": 2.095976679677836, + "learning_rate": 5.778590643046811e-08, + "loss": 0.5551, + "step": 9168 + }, + { + "epoch": 0.95, + "grad_norm": 2.368713155416316, + "learning_rate": 5.7530990349067285e-08, + "loss": 0.5212, + "step": 9169 + }, + { + "epoch": 0.95, + "grad_norm": 1.9132853451864427, + "learning_rate": 5.727663452469945e-08, + "loss": 0.6589, + "step": 9170 + }, + { + "epoch": 0.95, + "grad_norm": 1.8104655617619578, + "learning_rate": 5.702283898619765e-08, + "loss": 0.5446, + "step": 9171 + }, + { + "epoch": 0.95, + "grad_norm": 1.895549005171521, + "learning_rate": 5.6769603762331096e-08, + "loss": 0.6207, + "step": 9172 + }, + { + "epoch": 0.95, + "grad_norm": 2.010640938641115, + "learning_rate": 5.651692888180516e-08, + "loss": 0.5883, + "step": 9173 + }, + { + "epoch": 0.95, + "grad_norm": 1.9263129999779227, + "learning_rate": 5.626481437326303e-08, + "loss": 0.6767, + "step": 9174 + }, + { + "epoch": 0.95, + "grad_norm": 1.9656014437831926, + "learning_rate": 5.601326026528187e-08, + "loss": 0.6445, + "step": 9175 + }, + { + "epoch": 0.95, + "grad_norm": 2.0843236537066003, + "learning_rate": 5.5762266586377734e-08, + "loss": 0.6064, + "step": 9176 + }, + { + "epoch": 0.95, + "grad_norm": 2.2182950535700776, + "learning_rate": 5.551183336500177e-08, + "loss": 0.597, + "step": 9177 + }, + { + "epoch": 0.95, + "grad_norm": 2.0720744469238306, + "learning_rate": 5.526196062954181e-08, + "loss": 0.5568, + "step": 9178 + }, + { + "epoch": 0.95, + "grad_norm": 1.9350111511493413, + "learning_rate": 5.501264840832299e-08, + "loss": 0.6638, + "step": 9179 + }, + { + "epoch": 0.95, + "grad_norm": 1.9323446366770365, + "learning_rate": 5.4763896729606023e-08, + "loss": 0.578, + "step": 9180 + }, + { + "epoch": 0.95, + "grad_norm": 2.09711440356542, + "learning_rate": 5.451570562158892e-08, + "loss": 0.6727, + "step": 9181 + }, + { + "epoch": 0.95, + "grad_norm": 1.8486259014947013, + "learning_rate": 5.4268075112403615e-08, + "loss": 0.5326, + "step": 9182 + }, + { + "epoch": 0.95, + "grad_norm": 2.002990590014281, + "learning_rate": 5.402100523012266e-08, + "loss": 0.6319, + "step": 9183 + }, + { + "epoch": 0.95, + "grad_norm": 1.762628792862233, + "learning_rate": 5.377449600275142e-08, + "loss": 0.5471, + "step": 9184 + }, + { + "epoch": 0.95, + "grad_norm": 1.80757756492264, + "learning_rate": 5.352854745823366e-08, + "loss": 0.5654, + "step": 9185 + }, + { + "epoch": 0.95, + "grad_norm": 2.111766800819143, + "learning_rate": 5.3283159624448745e-08, + "loss": 0.6046, + "step": 9186 + }, + { + "epoch": 0.95, + "grad_norm": 2.136818353546279, + "learning_rate": 5.3038332529213865e-08, + "loss": 0.633, + "step": 9187 + }, + { + "epoch": 0.96, + "grad_norm": 1.7831004493513567, + "learning_rate": 5.279406620028016e-08, + "loss": 0.5601, + "step": 9188 + }, + { + "epoch": 0.96, + "grad_norm": 1.9671860829871322, + "learning_rate": 5.255036066533714e-08, + "loss": 0.6037, + "step": 9189 + }, + { + "epoch": 0.96, + "grad_norm": 2.2695919619215563, + "learning_rate": 5.230721595201049e-08, + "loss": 0.6201, + "step": 9190 + }, + { + "epoch": 0.96, + "grad_norm": 1.9143495780323916, + "learning_rate": 5.20646320878615e-08, + "loss": 0.6209, + "step": 9191 + }, + { + "epoch": 0.96, + "grad_norm": 1.7335234797845942, + "learning_rate": 5.182260910038928e-08, + "loss": 0.5238, + "step": 9192 + }, + { + "epoch": 0.96, + "grad_norm": 2.1520062494936623, + "learning_rate": 5.1581147017027434e-08, + "loss": 0.5854, + "step": 9193 + }, + { + "epoch": 0.96, + "grad_norm": 1.8251696136209343, + "learning_rate": 5.134024586514796e-08, + "loss": 0.5771, + "step": 9194 + }, + { + "epoch": 0.96, + "grad_norm": 1.8451479021419956, + "learning_rate": 5.109990567205792e-08, + "loss": 0.5387, + "step": 9195 + }, + { + "epoch": 0.96, + "grad_norm": 1.9829306324790406, + "learning_rate": 5.08601264650016e-08, + "loss": 0.5184, + "step": 9196 + }, + { + "epoch": 0.96, + "grad_norm": 2.0004402204722864, + "learning_rate": 5.06209082711584e-08, + "loss": 0.5989, + "step": 9197 + }, + { + "epoch": 0.96, + "grad_norm": 1.9331331565930703, + "learning_rate": 5.038225111764605e-08, + "loss": 0.6236, + "step": 9198 + }, + { + "epoch": 0.96, + "grad_norm": 2.1842830535619893, + "learning_rate": 5.0144155031517926e-08, + "loss": 0.5366, + "step": 9199 + }, + { + "epoch": 0.96, + "grad_norm": 2.0003594845759616, + "learning_rate": 4.990662003976243e-08, + "loss": 0.6162, + "step": 9200 + }, + { + "epoch": 0.96, + "grad_norm": 1.9969106571586461, + "learning_rate": 4.966964616930692e-08, + "loss": 0.6471, + "step": 9201 + }, + { + "epoch": 0.96, + "grad_norm": 1.971192026882797, + "learning_rate": 4.943323344701212e-08, + "loss": 0.6109, + "step": 9202 + }, + { + "epoch": 0.96, + "grad_norm": 2.158762303401497, + "learning_rate": 4.919738189967771e-08, + "loss": 0.6428, + "step": 9203 + }, + { + "epoch": 0.96, + "grad_norm": 1.6185547120279675, + "learning_rate": 4.8962091554039524e-08, + "loss": 0.6368, + "step": 9204 + }, + { + "epoch": 0.96, + "grad_norm": 2.015253292430434, + "learning_rate": 4.8727362436767344e-08, + "loss": 0.6126, + "step": 9205 + }, + { + "epoch": 0.96, + "grad_norm": 2.116270646369772, + "learning_rate": 4.8493194574470436e-08, + "loss": 0.6873, + "step": 9206 + }, + { + "epoch": 0.96, + "grad_norm": 1.7183519515168044, + "learning_rate": 4.825958799369201e-08, + "loss": 0.508, + "step": 9207 + }, + { + "epoch": 0.96, + "grad_norm": 1.7666831560398584, + "learning_rate": 4.802654272091367e-08, + "loss": 0.5841, + "step": 9208 + }, + { + "epoch": 0.96, + "grad_norm": 1.8230875968232338, + "learning_rate": 4.779405878255206e-08, + "loss": 0.5549, + "step": 9209 + }, + { + "epoch": 0.96, + "grad_norm": 2.152194890452504, + "learning_rate": 4.756213620496053e-08, + "loss": 0.6126, + "step": 9210 + }, + { + "epoch": 0.96, + "grad_norm": 2.015006634328195, + "learning_rate": 4.733077501442862e-08, + "loss": 0.6799, + "step": 9211 + }, + { + "epoch": 0.96, + "grad_norm": 2.256874151082544, + "learning_rate": 4.709997523718257e-08, + "loss": 0.5797, + "step": 9212 + }, + { + "epoch": 0.96, + "grad_norm": 1.9338870307485518, + "learning_rate": 4.6869736899385345e-08, + "loss": 0.5389, + "step": 9213 + }, + { + "epoch": 0.96, + "grad_norm": 1.866170129258445, + "learning_rate": 4.664006002713495e-08, + "loss": 0.6559, + "step": 9214 + }, + { + "epoch": 0.96, + "grad_norm": 2.1576898953891415, + "learning_rate": 4.641094464646667e-08, + "loss": 0.6158, + "step": 9215 + }, + { + "epoch": 0.96, + "grad_norm": 1.8488446862845445, + "learning_rate": 4.618239078335307e-08, + "loss": 0.5886, + "step": 9216 + }, + { + "epoch": 0.96, + "grad_norm": 2.046528350714155, + "learning_rate": 4.5954398463700647e-08, + "loss": 0.6249, + "step": 9217 + }, + { + "epoch": 0.96, + "grad_norm": 1.860971563552356, + "learning_rate": 4.572696771335483e-08, + "loss": 0.6386, + "step": 9218 + }, + { + "epoch": 0.96, + "grad_norm": 1.9383068774147776, + "learning_rate": 4.5500098558095565e-08, + "loss": 0.5341, + "step": 9219 + }, + { + "epoch": 0.96, + "grad_norm": 2.1917997000300655, + "learning_rate": 4.5273791023639494e-08, + "loss": 0.5904, + "step": 9220 + }, + { + "epoch": 0.96, + "grad_norm": 2.0815462160143805, + "learning_rate": 4.504804513564054e-08, + "loss": 0.6328, + "step": 9221 + }, + { + "epoch": 0.96, + "grad_norm": 1.8893378428676797, + "learning_rate": 4.482286091968768e-08, + "loss": 0.5827, + "step": 9222 + }, + { + "epoch": 0.96, + "grad_norm": 1.8432453644442008, + "learning_rate": 4.459823840130717e-08, + "loss": 0.5942, + "step": 9223 + }, + { + "epoch": 0.96, + "grad_norm": 2.0539820466658094, + "learning_rate": 4.437417760596141e-08, + "loss": 0.5402, + "step": 9224 + }, + { + "epoch": 0.96, + "grad_norm": 2.0722178972880947, + "learning_rate": 4.415067855904842e-08, + "loss": 0.6071, + "step": 9225 + }, + { + "epoch": 0.96, + "grad_norm": 2.2177273773031763, + "learning_rate": 4.392774128590349e-08, + "loss": 0.6256, + "step": 9226 + }, + { + "epoch": 0.96, + "grad_norm": 2.0694785751628038, + "learning_rate": 4.3705365811797515e-08, + "loss": 0.668, + "step": 9227 + }, + { + "epoch": 0.96, + "grad_norm": 1.8951352083608757, + "learning_rate": 4.348355216193867e-08, + "loss": 0.6042, + "step": 9228 + }, + { + "epoch": 0.96, + "grad_norm": 2.028419678421276, + "learning_rate": 4.326230036147017e-08, + "loss": 0.5914, + "step": 9229 + }, + { + "epoch": 0.96, + "grad_norm": 2.0823053199915336, + "learning_rate": 4.30416104354725e-08, + "loss": 0.5709, + "step": 9230 + }, + { + "epoch": 0.96, + "grad_norm": 1.823570228290745, + "learning_rate": 4.282148240896178e-08, + "loss": 0.599, + "step": 9231 + }, + { + "epoch": 0.96, + "grad_norm": 1.74260159641539, + "learning_rate": 4.2601916306891365e-08, + "loss": 0.5198, + "step": 9232 + }, + { + "epoch": 0.96, + "grad_norm": 1.7015132786393319, + "learning_rate": 4.2382912154150244e-08, + "loss": 0.5233, + "step": 9233 + }, + { + "epoch": 0.96, + "grad_norm": 1.961974043992251, + "learning_rate": 4.216446997556245e-08, + "loss": 0.5958, + "step": 9234 + }, + { + "epoch": 0.96, + "grad_norm": 2.013194471656937, + "learning_rate": 4.19465897958915e-08, + "loss": 0.541, + "step": 9235 + }, + { + "epoch": 0.96, + "grad_norm": 2.210012908565479, + "learning_rate": 4.1729271639834315e-08, + "loss": 0.6799, + "step": 9236 + }, + { + "epoch": 0.96, + "grad_norm": 2.0115717834978177, + "learning_rate": 4.151251553202562e-08, + "loss": 0.5222, + "step": 9237 + }, + { + "epoch": 0.96, + "grad_norm": 1.9749451130038853, + "learning_rate": 4.129632149703522e-08, + "loss": 0.6103, + "step": 9238 + }, + { + "epoch": 0.96, + "grad_norm": 1.8871785221244468, + "learning_rate": 4.1080689559370724e-08, + "loss": 0.6575, + "step": 9239 + }, + { + "epoch": 0.96, + "grad_norm": 2.039353039205422, + "learning_rate": 4.086561974347536e-08, + "loss": 0.6047, + "step": 9240 + }, + { + "epoch": 0.96, + "grad_norm": 2.191591399613144, + "learning_rate": 4.065111207372796e-08, + "loss": 0.6459, + "step": 9241 + }, + { + "epoch": 0.96, + "grad_norm": 2.071197060104028, + "learning_rate": 4.043716657444407e-08, + "loss": 0.6256, + "step": 9242 + }, + { + "epoch": 0.96, + "grad_norm": 1.920300538975992, + "learning_rate": 4.022378326987597e-08, + "loss": 0.6548, + "step": 9243 + }, + { + "epoch": 0.96, + "grad_norm": 2.0560108290708023, + "learning_rate": 4.001096218421152e-08, + "loss": 0.5192, + "step": 9244 + }, + { + "epoch": 0.96, + "grad_norm": 2.002316195882517, + "learning_rate": 3.9798703341575875e-08, + "loss": 0.5916, + "step": 9245 + }, + { + "epoch": 0.96, + "grad_norm": 2.191893706906724, + "learning_rate": 3.9587006766029225e-08, + "loss": 0.7022, + "step": 9246 + }, + { + "epoch": 0.96, + "grad_norm": 1.960519681813049, + "learning_rate": 3.937587248156904e-08, + "loss": 0.5285, + "step": 9247 + }, + { + "epoch": 0.96, + "grad_norm": 2.1307289887200067, + "learning_rate": 3.916530051212841e-08, + "loss": 0.7106, + "step": 9248 + }, + { + "epoch": 0.96, + "grad_norm": 2.0097373182537246, + "learning_rate": 3.8955290881576566e-08, + "loss": 0.6415, + "step": 9249 + }, + { + "epoch": 0.96, + "grad_norm": 2.088973178619039, + "learning_rate": 3.874584361371947e-08, + "loss": 0.619, + "step": 9250 + }, + { + "epoch": 0.96, + "grad_norm": 2.3453820853091965, + "learning_rate": 3.8536958732299234e-08, + "loss": 0.7008, + "step": 9251 + }, + { + "epoch": 0.96, + "grad_norm": 1.8925152599216863, + "learning_rate": 3.832863626099359e-08, + "loss": 0.6295, + "step": 9252 + }, + { + "epoch": 0.96, + "grad_norm": 1.9730246462677488, + "learning_rate": 3.8120876223418646e-08, + "loss": 0.6821, + "step": 9253 + }, + { + "epoch": 0.96, + "grad_norm": 1.995162008625557, + "learning_rate": 3.791367864312334e-08, + "loss": 0.5596, + "step": 9254 + }, + { + "epoch": 0.96, + "grad_norm": 1.9608437281347626, + "learning_rate": 3.770704354359611e-08, + "loss": 0.6098, + "step": 9255 + }, + { + "epoch": 0.96, + "grad_norm": 1.9119220279863318, + "learning_rate": 3.750097094825933e-08, + "loss": 0.5848, + "step": 9256 + }, + { + "epoch": 0.96, + "grad_norm": 1.8167225502830717, + "learning_rate": 3.729546088047264e-08, + "loss": 0.5353, + "step": 9257 + }, + { + "epoch": 0.96, + "grad_norm": 1.8657223958933558, + "learning_rate": 3.709051336353187e-08, + "loss": 0.5794, + "step": 9258 + }, + { + "epoch": 0.96, + "grad_norm": 2.3673185647016375, + "learning_rate": 3.688612842066952e-08, + "loss": 0.6088, + "step": 9259 + }, + { + "epoch": 0.96, + "grad_norm": 1.908061434806496, + "learning_rate": 3.6682306075052634e-08, + "loss": 0.519, + "step": 9260 + }, + { + "epoch": 0.96, + "grad_norm": 1.9989992305528361, + "learning_rate": 3.64790463497866e-08, + "loss": 0.5655, + "step": 9261 + }, + { + "epoch": 0.96, + "grad_norm": 2.077972598614312, + "learning_rate": 3.627634926791246e-08, + "loss": 0.6019, + "step": 9262 + }, + { + "epoch": 0.96, + "grad_norm": 1.9263719734688765, + "learning_rate": 3.6074214852405695e-08, + "loss": 0.6626, + "step": 9263 + }, + { + "epoch": 0.96, + "grad_norm": 1.8445050717486768, + "learning_rate": 3.587264312618022e-08, + "loss": 0.5336, + "step": 9264 + }, + { + "epoch": 0.96, + "grad_norm": 2.085353318157501, + "learning_rate": 3.567163411208552e-08, + "loss": 0.5532, + "step": 9265 + }, + { + "epoch": 0.96, + "grad_norm": 1.8421560673376067, + "learning_rate": 3.547118783290615e-08, + "loss": 0.6325, + "step": 9266 + }, + { + "epoch": 0.96, + "grad_norm": 1.9754557962477426, + "learning_rate": 3.527130431136505e-08, + "loss": 0.5936, + "step": 9267 + }, + { + "epoch": 0.96, + "grad_norm": 2.199110288587306, + "learning_rate": 3.507198357011909e-08, + "loss": 0.6782, + "step": 9268 + }, + { + "epoch": 0.96, + "grad_norm": 1.9254296087735812, + "learning_rate": 3.487322563176354e-08, + "loss": 0.5547, + "step": 9269 + }, + { + "epoch": 0.96, + "grad_norm": 2.0537058557901986, + "learning_rate": 3.467503051882815e-08, + "loss": 0.5562, + "step": 9270 + }, + { + "epoch": 0.96, + "grad_norm": 1.8764811043642, + "learning_rate": 3.4477398253778826e-08, + "loss": 0.6119, + "step": 9271 + }, + { + "epoch": 0.96, + "grad_norm": 1.9573268571815328, + "learning_rate": 3.4280328859019885e-08, + "loss": 0.6284, + "step": 9272 + }, + { + "epoch": 0.96, + "grad_norm": 2.2651084124987184, + "learning_rate": 3.408382235688845e-08, + "loss": 0.5998, + "step": 9273 + }, + { + "epoch": 0.96, + "grad_norm": 1.7190886471188342, + "learning_rate": 3.388787876966115e-08, + "loss": 0.5464, + "step": 9274 + }, + { + "epoch": 0.96, + "grad_norm": 1.898764604876328, + "learning_rate": 3.3692498119548e-08, + "loss": 0.5864, + "step": 9275 + }, + { + "epoch": 0.96, + "grad_norm": 1.9545866295420675, + "learning_rate": 3.3497680428697943e-08, + "loss": 0.6006, + "step": 9276 + }, + { + "epoch": 0.96, + "grad_norm": 1.9882529876550887, + "learning_rate": 3.330342571919332e-08, + "loss": 0.6126, + "step": 9277 + }, + { + "epoch": 0.96, + "grad_norm": 2.1666195490031344, + "learning_rate": 3.3109734013055396e-08, + "loss": 0.7462, + "step": 9278 + }, + { + "epoch": 0.96, + "grad_norm": 1.8273279290401259, + "learning_rate": 3.2916605332238284e-08, + "loss": 0.6519, + "step": 9279 + }, + { + "epoch": 0.96, + "grad_norm": 2.25238653257137, + "learning_rate": 3.2724039698636135e-08, + "loss": 0.6955, + "step": 9280 + }, + { + "epoch": 0.96, + "grad_norm": 2.1016705536415943, + "learning_rate": 3.2532037134076486e-08, + "loss": 0.6079, + "step": 9281 + }, + { + "epoch": 0.96, + "grad_norm": 2.0163167940465256, + "learning_rate": 3.234059766032416e-08, + "loss": 0.56, + "step": 9282 + }, + { + "epoch": 0.96, + "grad_norm": 1.9318858697010097, + "learning_rate": 3.214972129907956e-08, + "loss": 0.6073, + "step": 9283 + }, + { + "epoch": 0.97, + "grad_norm": 1.7757658787840196, + "learning_rate": 3.195940807198039e-08, + "loss": 0.5379, + "step": 9284 + }, + { + "epoch": 0.97, + "grad_norm": 1.8460998635082828, + "learning_rate": 3.1769658000598835e-08, + "loss": 0.5544, + "step": 9285 + }, + { + "epoch": 0.97, + "grad_norm": 2.0236445760390427, + "learning_rate": 3.158047110644436e-08, + "loss": 0.5358, + "step": 9286 + }, + { + "epoch": 0.97, + "grad_norm": 1.9270061530827873, + "learning_rate": 3.1391847410962573e-08, + "loss": 0.6229, + "step": 9287 + }, + { + "epoch": 0.97, + "grad_norm": 1.9604700327535756, + "learning_rate": 3.1203786935535275e-08, + "loss": 0.6332, + "step": 9288 + }, + { + "epoch": 0.97, + "grad_norm": 1.9975031878953207, + "learning_rate": 3.1016289701479296e-08, + "loss": 0.552, + "step": 9289 + }, + { + "epoch": 0.97, + "grad_norm": 1.8939971445756167, + "learning_rate": 3.082935573004986e-08, + "loss": 0.5689, + "step": 9290 + }, + { + "epoch": 0.97, + "grad_norm": 1.804739917206502, + "learning_rate": 3.064298504243612e-08, + "loss": 0.6185, + "step": 9291 + }, + { + "epoch": 0.97, + "grad_norm": 1.9170101195891174, + "learning_rate": 3.0457177659764524e-08, + "loss": 0.6434, + "step": 9292 + }, + { + "epoch": 0.97, + "grad_norm": 1.934719583858853, + "learning_rate": 3.02719336030971e-08, + "loss": 0.6144, + "step": 9293 + }, + { + "epoch": 0.97, + "grad_norm": 1.929080277217315, + "learning_rate": 3.008725289343206e-08, + "loss": 0.6113, + "step": 9294 + }, + { + "epoch": 0.97, + "grad_norm": 2.0128393803689226, + "learning_rate": 2.990313555170488e-08, + "loss": 0.6372, + "step": 9295 + }, + { + "epoch": 0.97, + "grad_norm": 1.9044953243775646, + "learning_rate": 2.9719581598786072e-08, + "loss": 0.6269, + "step": 9296 + }, + { + "epoch": 0.97, + "grad_norm": 1.987130561174613, + "learning_rate": 2.953659105548179e-08, + "loss": 0.5736, + "step": 9297 + }, + { + "epoch": 0.97, + "grad_norm": 1.9601612132663782, + "learning_rate": 2.9354163942535983e-08, + "loss": 0.5485, + "step": 9298 + }, + { + "epoch": 0.97, + "grad_norm": 1.8276899215854248, + "learning_rate": 2.9172300280627674e-08, + "loss": 0.5822, + "step": 9299 + }, + { + "epoch": 0.97, + "grad_norm": 1.9585863100809016, + "learning_rate": 2.899100009037148e-08, + "loss": 0.6306, + "step": 9300 + }, + { + "epoch": 0.97, + "grad_norm": 1.8717322423573106, + "learning_rate": 2.8810263392319293e-08, + "loss": 0.5468, + "step": 9301 + }, + { + "epoch": 0.97, + "grad_norm": 1.8433778506189058, + "learning_rate": 2.863009020695917e-08, + "loss": 0.6548, + "step": 9302 + }, + { + "epoch": 0.97, + "grad_norm": 1.8375022349395036, + "learning_rate": 2.845048055471311e-08, + "loss": 0.635, + "step": 9303 + }, + { + "epoch": 0.97, + "grad_norm": 1.986408985770117, + "learning_rate": 2.8271434455942604e-08, + "loss": 0.6994, + "step": 9304 + }, + { + "epoch": 0.97, + "grad_norm": 1.7962105054371804, + "learning_rate": 2.809295193094308e-08, + "loss": 0.6022, + "step": 9305 + }, + { + "epoch": 0.97, + "grad_norm": 2.129112709564285, + "learning_rate": 2.7915032999946133e-08, + "loss": 0.6974, + "step": 9306 + }, + { + "epoch": 0.97, + "grad_norm": 2.2015618631548834, + "learning_rate": 2.7737677683120077e-08, + "loss": 0.5899, + "step": 9307 + }, + { + "epoch": 0.97, + "grad_norm": 2.032080111074871, + "learning_rate": 2.7560886000569386e-08, + "loss": 0.6206, + "step": 9308 + }, + { + "epoch": 0.97, + "grad_norm": 1.9551756649376424, + "learning_rate": 2.7384657972334137e-08, + "loss": 0.5469, + "step": 9309 + }, + { + "epoch": 0.97, + "grad_norm": 1.957396990331527, + "learning_rate": 2.7208993618390578e-08, + "loss": 0.6458, + "step": 9310 + }, + { + "epoch": 0.97, + "grad_norm": 1.9599537127158781, + "learning_rate": 2.7033892958651665e-08, + "loss": 0.6791, + "step": 9311 + }, + { + "epoch": 0.97, + "grad_norm": 1.7920889911825337, + "learning_rate": 2.6859356012965964e-08, + "loss": 0.4747, + "step": 9312 + }, + { + "epoch": 0.97, + "grad_norm": 1.944794784351862, + "learning_rate": 2.6685382801118765e-08, + "loss": 0.5882, + "step": 9313 + }, + { + "epoch": 0.97, + "grad_norm": 1.8238895229633358, + "learning_rate": 2.6511973342829843e-08, + "loss": 0.5857, + "step": 9314 + }, + { + "epoch": 0.97, + "grad_norm": 1.772948928337245, + "learning_rate": 2.6339127657756814e-08, + "loss": 0.5756, + "step": 9315 + }, + { + "epoch": 0.97, + "grad_norm": 2.1942165661966677, + "learning_rate": 2.6166845765492333e-08, + "loss": 0.5701, + "step": 9316 + }, + { + "epoch": 0.97, + "grad_norm": 1.980825317874205, + "learning_rate": 2.5995127685566335e-08, + "loss": 0.579, + "step": 9317 + }, + { + "epoch": 0.97, + "grad_norm": 1.9101493425224827, + "learning_rate": 2.5823973437442696e-08, + "loss": 0.6031, + "step": 9318 + }, + { + "epoch": 0.97, + "grad_norm": 1.8369726379682219, + "learning_rate": 2.5653383040524228e-08, + "loss": 0.6034, + "step": 9319 + }, + { + "epoch": 0.97, + "grad_norm": 1.8791588215949244, + "learning_rate": 2.5483356514147128e-08, + "loss": 0.6301, + "step": 9320 + }, + { + "epoch": 0.97, + "grad_norm": 1.9328706087321703, + "learning_rate": 2.531389387758598e-08, + "loss": 0.6878, + "step": 9321 + }, + { + "epoch": 0.97, + "grad_norm": 2.0484986499553894, + "learning_rate": 2.5144995150049312e-08, + "loss": 0.6561, + "step": 9322 + }, + { + "epoch": 0.97, + "grad_norm": 1.8432600727248396, + "learning_rate": 2.497666035068347e-08, + "loss": 0.5751, + "step": 9323 + }, + { + "epoch": 0.97, + "grad_norm": 2.221330921501575, + "learning_rate": 2.48088894985693e-08, + "loss": 0.6359, + "step": 9324 + }, + { + "epoch": 0.97, + "grad_norm": 2.0728270839250973, + "learning_rate": 2.464168261272548e-08, + "loss": 0.5583, + "step": 9325 + }, + { + "epoch": 0.97, + "grad_norm": 2.0001901715742494, + "learning_rate": 2.4475039712105742e-08, + "loss": 0.6303, + "step": 9326 + }, + { + "epoch": 0.97, + "grad_norm": 1.9665561149483282, + "learning_rate": 2.4308960815599412e-08, + "loss": 0.6741, + "step": 9327 + }, + { + "epoch": 0.97, + "grad_norm": 2.008847781228865, + "learning_rate": 2.4143445942033105e-08, + "loss": 0.5627, + "step": 9328 + }, + { + "epoch": 0.97, + "grad_norm": 1.9899789683173035, + "learning_rate": 2.3978495110168477e-08, + "loss": 0.6163, + "step": 9329 + }, + { + "epoch": 0.97, + "grad_norm": 1.9583106013630442, + "learning_rate": 2.38141083387039e-08, + "loss": 0.6204, + "step": 9330 + }, + { + "epoch": 0.97, + "grad_norm": 2.1674235835611495, + "learning_rate": 2.3650285646273362e-08, + "loss": 0.5559, + "step": 9331 + }, + { + "epoch": 0.97, + "grad_norm": 1.8975429563101518, + "learning_rate": 2.348702705144701e-08, + "loss": 0.615, + "step": 9332 + }, + { + "epoch": 0.97, + "grad_norm": 2.090178727546794, + "learning_rate": 2.33243325727317e-08, + "loss": 0.7192, + "step": 9333 + }, + { + "epoch": 0.97, + "grad_norm": 1.881027148542671, + "learning_rate": 2.3162202228569353e-08, + "loss": 0.5326, + "step": 9334 + }, + { + "epoch": 0.97, + "grad_norm": 3.9837370957943103, + "learning_rate": 2.300063603733804e-08, + "loss": 0.6979, + "step": 9335 + }, + { + "epoch": 0.97, + "grad_norm": 1.7547614413184414, + "learning_rate": 2.2839634017353118e-08, + "loss": 0.5466, + "step": 9336 + }, + { + "epoch": 0.97, + "grad_norm": 2.0185544324841307, + "learning_rate": 2.267919618686443e-08, + "loss": 0.6056, + "step": 9337 + }, + { + "epoch": 0.97, + "grad_norm": 2.0292071357538766, + "learning_rate": 2.251932256405853e-08, + "loss": 0.6469, + "step": 9338 + }, + { + "epoch": 0.97, + "grad_norm": 1.804027643043366, + "learning_rate": 2.2360013167057602e-08, + "loss": 0.5343, + "step": 9339 + }, + { + "epoch": 0.97, + "grad_norm": 1.9766515344332558, + "learning_rate": 2.220126801392164e-08, + "loss": 0.5715, + "step": 9340 + }, + { + "epoch": 0.97, + "grad_norm": 1.9002895224444987, + "learning_rate": 2.2043087122644023e-08, + "loss": 0.7093, + "step": 9341 + }, + { + "epoch": 0.97, + "grad_norm": 2.1821936473764985, + "learning_rate": 2.188547051115597e-08, + "loss": 0.5981, + "step": 9342 + }, + { + "epoch": 0.97, + "grad_norm": 1.9245969157170688, + "learning_rate": 2.1728418197323742e-08, + "loss": 0.5908, + "step": 9343 + }, + { + "epoch": 0.97, + "grad_norm": 2.12228430228684, + "learning_rate": 2.1571930198950874e-08, + "loss": 0.5723, + "step": 9344 + }, + { + "epoch": 0.97, + "grad_norm": 1.8732070599355801, + "learning_rate": 2.1416006533775957e-08, + "loss": 0.5704, + "step": 9345 + }, + { + "epoch": 0.97, + "grad_norm": 1.897499138291424, + "learning_rate": 2.1260647219473742e-08, + "loss": 0.5922, + "step": 9346 + }, + { + "epoch": 0.97, + "grad_norm": 2.1515440733884086, + "learning_rate": 2.110585227365458e-08, + "loss": 0.6734, + "step": 9347 + }, + { + "epoch": 0.97, + "grad_norm": 2.0743929440733906, + "learning_rate": 2.095162171386611e-08, + "loss": 0.6869, + "step": 9348 + }, + { + "epoch": 0.97, + "grad_norm": 1.9802962077153008, + "learning_rate": 2.0797955557590454e-08, + "loss": 0.5603, + "step": 9349 + }, + { + "epoch": 0.97, + "grad_norm": 1.902902250037822, + "learning_rate": 2.064485382224757e-08, + "loss": 0.6033, + "step": 9350 + }, + { + "epoch": 0.97, + "grad_norm": 1.9135159924715113, + "learning_rate": 2.04923165251919e-08, + "loss": 0.6096, + "step": 9351 + }, + { + "epoch": 0.97, + "grad_norm": 2.001111887938867, + "learning_rate": 2.0340343683714624e-08, + "loss": 0.66, + "step": 9352 + }, + { + "epoch": 0.97, + "grad_norm": 2.2845616405345233, + "learning_rate": 2.018893531504196e-08, + "loss": 0.7283, + "step": 9353 + }, + { + "epoch": 0.97, + "grad_norm": 2.0236378220572275, + "learning_rate": 2.0038091436337392e-08, + "loss": 0.5974, + "step": 9354 + }, + { + "epoch": 0.97, + "grad_norm": 1.7188227459501022, + "learning_rate": 1.9887812064700028e-08, + "loss": 0.5671, + "step": 9355 + }, + { + "epoch": 0.97, + "grad_norm": 1.9222401255258874, + "learning_rate": 1.973809721716513e-08, + "loss": 0.5832, + "step": 9356 + }, + { + "epoch": 0.97, + "grad_norm": 2.0028373714720367, + "learning_rate": 1.9588946910703567e-08, + "loss": 0.5924, + "step": 9357 + }, + { + "epoch": 0.97, + "grad_norm": 1.8852559821091195, + "learning_rate": 1.9440361162222367e-08, + "loss": 0.6009, + "step": 9358 + }, + { + "epoch": 0.97, + "grad_norm": 1.6645381563062454, + "learning_rate": 1.929233998856417e-08, + "loss": 0.5973, + "step": 9359 + }, + { + "epoch": 0.97, + "grad_norm": 1.8784683299762617, + "learning_rate": 1.914488340650833e-08, + "loss": 0.6166, + "step": 9360 + }, + { + "epoch": 0.97, + "grad_norm": 1.9423516733751873, + "learning_rate": 1.8997991432769812e-08, + "loss": 0.5375, + "step": 9361 + }, + { + "epoch": 0.97, + "grad_norm": 1.896972762313844, + "learning_rate": 1.8851664083999742e-08, + "loss": 0.5419, + "step": 9362 + }, + { + "epoch": 0.97, + "grad_norm": 1.9254153543066406, + "learning_rate": 1.8705901376784852e-08, + "loss": 0.5956, + "step": 9363 + }, + { + "epoch": 0.97, + "grad_norm": 1.837387913964868, + "learning_rate": 1.8560703327649144e-08, + "loss": 0.5569, + "step": 9364 + }, + { + "epoch": 0.97, + "grad_norm": 2.1808835722204445, + "learning_rate": 1.8416069953050565e-08, + "loss": 0.6779, + "step": 9365 + }, + { + "epoch": 0.97, + "grad_norm": 2.1182269084331415, + "learning_rate": 1.8272001269384886e-08, + "loss": 0.6208, + "step": 9366 + }, + { + "epoch": 0.97, + "grad_norm": 1.8505257750618553, + "learning_rate": 1.812849729298238e-08, + "loss": 0.5383, + "step": 9367 + }, + { + "epoch": 0.97, + "grad_norm": 1.85357421039629, + "learning_rate": 1.7985558040110594e-08, + "loss": 0.657, + "step": 9368 + }, + { + "epoch": 0.97, + "grad_norm": 2.0404214691662395, + "learning_rate": 1.784318352697212e-08, + "loss": 0.6548, + "step": 9369 + }, + { + "epoch": 0.97, + "grad_norm": 2.0413228816834392, + "learning_rate": 1.7701373769706265e-08, + "loss": 0.6042, + "step": 9370 + }, + { + "epoch": 0.97, + "grad_norm": 2.192428086651773, + "learning_rate": 1.7560128784387953e-08, + "loss": 0.6355, + "step": 9371 + }, + { + "epoch": 0.97, + "grad_norm": 1.693491337066328, + "learning_rate": 1.741944858702771e-08, + "loss": 0.5473, + "step": 9372 + }, + { + "epoch": 0.97, + "grad_norm": 1.9447757214859827, + "learning_rate": 1.7279333193573332e-08, + "loss": 0.5628, + "step": 9373 + }, + { + "epoch": 0.97, + "grad_norm": 1.9521877617740033, + "learning_rate": 1.7139782619906565e-08, + "loss": 0.5326, + "step": 9374 + }, + { + "epoch": 0.97, + "grad_norm": 2.1318514544476956, + "learning_rate": 1.700079688184697e-08, + "loss": 0.5991, + "step": 9375 + }, + { + "epoch": 0.97, + "grad_norm": 2.111380016447978, + "learning_rate": 1.686237599514917e-08, + "loss": 0.5793, + "step": 9376 + }, + { + "epoch": 0.97, + "grad_norm": 2.0061463420649823, + "learning_rate": 1.672451997550395e-08, + "loss": 0.5981, + "step": 9377 + }, + { + "epoch": 0.97, + "grad_norm": 1.9289965935272066, + "learning_rate": 1.658722883853825e-08, + "loss": 0.6348, + "step": 9378 + }, + { + "epoch": 0.97, + "grad_norm": 1.759822966934597, + "learning_rate": 1.6450502599814622e-08, + "loss": 0.5902, + "step": 9379 + }, + { + "epoch": 0.98, + "grad_norm": 1.830609332103879, + "learning_rate": 1.631434127483178e-08, + "loss": 0.6012, + "step": 9380 + }, + { + "epoch": 0.98, + "grad_norm": 1.903223961744007, + "learning_rate": 1.6178744879024045e-08, + "loss": 0.5514, + "step": 9381 + }, + { + "epoch": 0.98, + "grad_norm": 1.9105186920277744, + "learning_rate": 1.604371342776301e-08, + "loss": 0.528, + "step": 9382 + }, + { + "epoch": 0.98, + "grad_norm": 1.7738895886516455, + "learning_rate": 1.5909246936354205e-08, + "loss": 0.5702, + "step": 9383 + }, + { + "epoch": 0.98, + "grad_norm": 1.9203976490248245, + "learning_rate": 1.5775345420041e-08, + "loss": 0.509, + "step": 9384 + }, + { + "epoch": 0.98, + "grad_norm": 1.981620797130497, + "learning_rate": 1.564200889400125e-08, + "loss": 0.6442, + "step": 9385 + }, + { + "epoch": 0.98, + "grad_norm": 1.686120661097064, + "learning_rate": 1.5509237373349527e-08, + "loss": 0.5022, + "step": 9386 + }, + { + "epoch": 0.98, + "grad_norm": 2.011187623061088, + "learning_rate": 1.537703087313658e-08, + "loss": 0.6468, + "step": 9387 + }, + { + "epoch": 0.98, + "grad_norm": 1.7837135872441217, + "learning_rate": 1.5245389408348744e-08, + "loss": 0.5189, + "step": 9388 + }, + { + "epoch": 0.98, + "grad_norm": 1.9655077598433057, + "learning_rate": 1.5114312993908532e-08, + "loss": 0.5999, + "step": 9389 + }, + { + "epoch": 0.98, + "grad_norm": 2.052384135182507, + "learning_rate": 1.4983801644672948e-08, + "loss": 0.5486, + "step": 9390 + }, + { + "epoch": 0.98, + "grad_norm": 2.1277424515467844, + "learning_rate": 1.4853855375437377e-08, + "loss": 0.6428, + "step": 9391 + }, + { + "epoch": 0.98, + "grad_norm": 1.977814419207284, + "learning_rate": 1.4724474200931704e-08, + "loss": 0.6306, + "step": 9392 + }, + { + "epoch": 0.98, + "grad_norm": 2.138743859165059, + "learning_rate": 1.4595658135822533e-08, + "loss": 0.7269, + "step": 9393 + }, + { + "epoch": 0.98, + "grad_norm": 1.7904427754103556, + "learning_rate": 1.4467407194710958e-08, + "loss": 0.5781, + "step": 9394 + }, + { + "epoch": 0.98, + "grad_norm": 1.9993046254554308, + "learning_rate": 1.4339721392135352e-08, + "loss": 0.6969, + "step": 9395 + }, + { + "epoch": 0.98, + "grad_norm": 2.0825943027963394, + "learning_rate": 1.4212600742569694e-08, + "loss": 0.6154, + "step": 9396 + }, + { + "epoch": 0.98, + "grad_norm": 1.9421694999551333, + "learning_rate": 1.4086045260423564e-08, + "loss": 0.6173, + "step": 9397 + }, + { + "epoch": 0.98, + "grad_norm": 2.0324173008481954, + "learning_rate": 1.3960054960043267e-08, + "loss": 0.6327, + "step": 9398 + }, + { + "epoch": 0.98, + "grad_norm": 2.3108147574118845, + "learning_rate": 1.3834629855710158e-08, + "loss": 0.7125, + "step": 9399 + }, + { + "epoch": 0.98, + "grad_norm": 1.973482473276303, + "learning_rate": 1.370976996164175e-08, + "loss": 0.6279, + "step": 9400 + }, + { + "epoch": 0.98, + "grad_norm": 2.086224009235821, + "learning_rate": 1.3585475291991724e-08, + "loss": 0.5979, + "step": 9401 + }, + { + "epoch": 0.98, + "grad_norm": 1.9357123351662384, + "learning_rate": 1.3461745860849917e-08, + "loss": 0.6264, + "step": 9402 + }, + { + "epoch": 0.98, + "grad_norm": 1.7861483587494011, + "learning_rate": 1.333858168224178e-08, + "loss": 0.561, + "step": 9403 + }, + { + "epoch": 0.98, + "grad_norm": 2.0311118954318155, + "learning_rate": 1.3215982770128366e-08, + "loss": 0.7493, + "step": 9404 + }, + { + "epoch": 0.98, + "grad_norm": 2.045942396521214, + "learning_rate": 1.3093949138406892e-08, + "loss": 0.661, + "step": 9405 + }, + { + "epoch": 0.98, + "grad_norm": 2.07625840929344, + "learning_rate": 1.2972480800910181e-08, + "loss": 0.6479, + "step": 9406 + }, + { + "epoch": 0.98, + "grad_norm": 2.067265450826643, + "learning_rate": 1.285157777140833e-08, + "loss": 0.587, + "step": 9407 + }, + { + "epoch": 0.98, + "grad_norm": 2.119763936274861, + "learning_rate": 1.2731240063605931e-08, + "loss": 0.5975, + "step": 9408 + }, + { + "epoch": 0.98, + "grad_norm": 2.024916147212216, + "learning_rate": 1.2611467691144297e-08, + "loss": 0.6936, + "step": 9409 + }, + { + "epoch": 0.98, + "grad_norm": 1.983127501150537, + "learning_rate": 1.2492260667599232e-08, + "loss": 0.6452, + "step": 9410 + }, + { + "epoch": 0.98, + "grad_norm": 1.8649494084823937, + "learning_rate": 1.2373619006484927e-08, + "loss": 0.5512, + "step": 9411 + }, + { + "epoch": 0.98, + "grad_norm": 2.0159592835650364, + "learning_rate": 1.2255542721248959e-08, + "loss": 0.4776, + "step": 9412 + }, + { + "epoch": 0.98, + "grad_norm": 2.1385407875276283, + "learning_rate": 1.213803182527673e-08, + "loss": 0.617, + "step": 9413 + }, + { + "epoch": 0.98, + "grad_norm": 1.834974861679934, + "learning_rate": 1.2021086331888143e-08, + "loss": 0.6695, + "step": 9414 + }, + { + "epoch": 0.98, + "grad_norm": 2.0853828764496343, + "learning_rate": 1.190470625434037e-08, + "loss": 0.6612, + "step": 9415 + }, + { + "epoch": 0.98, + "grad_norm": 1.98313825019437, + "learning_rate": 1.1788891605825081e-08, + "loss": 0.6369, + "step": 9416 + }, + { + "epoch": 0.98, + "grad_norm": 2.117767892785416, + "learning_rate": 1.1673642399470663e-08, + "loss": 0.5904, + "step": 9417 + }, + { + "epoch": 0.98, + "grad_norm": 2.003554226087581, + "learning_rate": 1.1558958648341667e-08, + "loss": 0.6737, + "step": 9418 + }, + { + "epoch": 0.98, + "grad_norm": 1.8760117400728373, + "learning_rate": 1.1444840365437692e-08, + "loss": 0.5358, + "step": 9419 + }, + { + "epoch": 0.98, + "grad_norm": 1.826652775318849, + "learning_rate": 1.1331287563695059e-08, + "loss": 0.5802, + "step": 9420 + }, + { + "epoch": 0.98, + "grad_norm": 2.210283365637128, + "learning_rate": 1.1218300255985137e-08, + "loss": 0.6695, + "step": 9421 + }, + { + "epoch": 0.98, + "grad_norm": 1.9114471453067057, + "learning_rate": 1.1105878455116015e-08, + "loss": 0.6807, + "step": 9422 + }, + { + "epoch": 0.98, + "grad_norm": 2.308622820891408, + "learning_rate": 1.0994022173831386e-08, + "loss": 0.5271, + "step": 9423 + }, + { + "epoch": 0.98, + "grad_norm": 2.154207219627266, + "learning_rate": 1.088273142481111e-08, + "loss": 0.6708, + "step": 9424 + }, + { + "epoch": 0.98, + "grad_norm": 2.062686057137944, + "learning_rate": 1.0772006220670094e-08, + "loss": 0.6164, + "step": 9425 + }, + { + "epoch": 0.98, + "grad_norm": 1.9871469414224412, + "learning_rate": 1.0661846573959412e-08, + "loss": 0.5549, + "step": 9426 + }, + { + "epoch": 0.98, + "grad_norm": 1.9153912700098923, + "learning_rate": 1.0552252497166849e-08, + "loss": 0.5732, + "step": 9427 + }, + { + "epoch": 0.98, + "grad_norm": 2.0642917207760196, + "learning_rate": 1.0443224002715801e-08, + "loss": 0.6291, + "step": 9428 + }, + { + "epoch": 0.98, + "grad_norm": 1.8687845348076741, + "learning_rate": 1.0334761102964163e-08, + "loss": 0.5605, + "step": 9429 + }, + { + "epoch": 0.98, + "grad_norm": 2.0119112889139665, + "learning_rate": 1.022686381020821e-08, + "loss": 0.6568, + "step": 9430 + }, + { + "epoch": 0.98, + "grad_norm": 2.142024130237965, + "learning_rate": 1.01195321366776e-08, + "loss": 0.6009, + "step": 9431 + }, + { + "epoch": 0.98, + "grad_norm": 1.9652097501051895, + "learning_rate": 1.0012766094539273e-08, + "loss": 0.5757, + "step": 9432 + }, + { + "epoch": 0.98, + "grad_norm": 2.11686381269212, + "learning_rate": 9.90656569589632e-09, + "loss": 0.6156, + "step": 9433 + }, + { + "epoch": 0.98, + "grad_norm": 1.8540572366176127, + "learning_rate": 9.800930952786336e-09, + "loss": 0.5143, + "step": 9434 + }, + { + "epoch": 0.98, + "grad_norm": 1.8884066141241278, + "learning_rate": 9.695861877184187e-09, + "loss": 0.6136, + "step": 9435 + }, + { + "epoch": 0.98, + "grad_norm": 1.8442181601552678, + "learning_rate": 9.59135848099979e-09, + "loss": 0.556, + "step": 9436 + }, + { + "epoch": 0.98, + "grad_norm": 2.010704790123921, + "learning_rate": 9.487420776079226e-09, + "loss": 0.6489, + "step": 9437 + }, + { + "epoch": 0.98, + "grad_norm": 2.1347997954400997, + "learning_rate": 9.38404877420418e-09, + "loss": 0.6559, + "step": 9438 + }, + { + "epoch": 0.98, + "grad_norm": 2.0640248046696636, + "learning_rate": 9.281242487093612e-09, + "loss": 0.622, + "step": 9439 + }, + { + "epoch": 0.98, + "grad_norm": 2.0903299573728393, + "learning_rate": 9.179001926399866e-09, + "loss": 0.5661, + "step": 9440 + }, + { + "epoch": 0.98, + "grad_norm": 2.011961237897929, + "learning_rate": 9.077327103713119e-09, + "loss": 0.6185, + "step": 9441 + }, + { + "epoch": 0.98, + "grad_norm": 1.9101374524807846, + "learning_rate": 8.976218030558592e-09, + "loss": 0.5676, + "step": 9442 + }, + { + "epoch": 0.98, + "grad_norm": 1.997803767273175, + "learning_rate": 8.87567471839712e-09, + "loss": 0.5351, + "step": 9443 + }, + { + "epoch": 0.98, + "grad_norm": 1.8370994226150568, + "learning_rate": 8.775697178626807e-09, + "loss": 0.5526, + "step": 9444 + }, + { + "epoch": 0.98, + "grad_norm": 1.8661873226938193, + "learning_rate": 8.676285422580255e-09, + "loss": 0.6151, + "step": 9445 + }, + { + "epoch": 0.98, + "grad_norm": 2.0505710073168952, + "learning_rate": 8.577439461526782e-09, + "loss": 0.593, + "step": 9446 + }, + { + "epoch": 0.98, + "grad_norm": 1.9162215230441964, + "learning_rate": 8.479159306670203e-09, + "loss": 0.6494, + "step": 9447 + }, + { + "epoch": 0.98, + "grad_norm": 1.9933094696955849, + "learning_rate": 8.381444969151608e-09, + "loss": 0.5787, + "step": 9448 + }, + { + "epoch": 0.98, + "grad_norm": 1.8322768596191124, + "learning_rate": 8.284296460047691e-09, + "loss": 0.6197, + "step": 9449 + }, + { + "epoch": 0.98, + "grad_norm": 1.9649119341867372, + "learning_rate": 8.1877137903702e-09, + "loss": 0.5414, + "step": 9450 + }, + { + "epoch": 0.98, + "grad_norm": 1.8174793553111006, + "learning_rate": 8.091696971068152e-09, + "loss": 0.5382, + "step": 9451 + }, + { + "epoch": 0.98, + "grad_norm": 2.024317306699993, + "learning_rate": 7.996246013025067e-09, + "loss": 0.5741, + "step": 9452 + }, + { + "epoch": 0.98, + "grad_norm": 2.293287635439528, + "learning_rate": 7.901360927061174e-09, + "loss": 0.6518, + "step": 9453 + }, + { + "epoch": 0.98, + "grad_norm": 2.086365621299315, + "learning_rate": 7.807041723931763e-09, + "loss": 0.5751, + "step": 9454 + }, + { + "epoch": 0.98, + "grad_norm": 1.936273640010896, + "learning_rate": 7.713288414328835e-09, + "loss": 0.7171, + "step": 9455 + }, + { + "epoch": 0.98, + "grad_norm": 1.768772970005881, + "learning_rate": 7.620101008879444e-09, + "loss": 0.498, + "step": 9456 + }, + { + "epoch": 0.98, + "grad_norm": 1.8866484555298333, + "learning_rate": 7.527479518147918e-09, + "loss": 0.5336, + "step": 9457 + }, + { + "epoch": 0.98, + "grad_norm": 1.9101721466577788, + "learning_rate": 7.435423952631971e-09, + "loss": 0.5666, + "step": 9458 + }, + { + "epoch": 0.98, + "grad_norm": 2.1247138345157066, + "learning_rate": 7.343934322767699e-09, + "loss": 0.636, + "step": 9459 + }, + { + "epoch": 0.98, + "grad_norm": 2.0203673144733316, + "learning_rate": 7.253010638925695e-09, + "loss": 0.4786, + "step": 9460 + }, + { + "epoch": 0.98, + "grad_norm": 2.0489168644317868, + "learning_rate": 7.1626529114127155e-09, + "loss": 0.6674, + "step": 9461 + }, + { + "epoch": 0.98, + "grad_norm": 2.014479744809945, + "learning_rate": 7.072861150471122e-09, + "loss": 0.6233, + "step": 9462 + }, + { + "epoch": 0.98, + "grad_norm": 2.1318020390134382, + "learning_rate": 6.98363536627944e-09, + "loss": 0.6574, + "step": 9463 + }, + { + "epoch": 0.98, + "grad_norm": 2.126617683494603, + "learning_rate": 6.894975568951801e-09, + "loss": 0.6201, + "step": 9464 + }, + { + "epoch": 0.98, + "grad_norm": 2.1193407018704273, + "learning_rate": 6.806881768539053e-09, + "loss": 0.5861, + "step": 9465 + }, + { + "epoch": 0.98, + "grad_norm": 2.1218760503414384, + "learning_rate": 6.719353975025989e-09, + "loss": 0.7565, + "step": 9466 + }, + { + "epoch": 0.98, + "grad_norm": 1.9224166553741697, + "learning_rate": 6.63239219833467e-09, + "loss": 0.6918, + "step": 9467 + }, + { + "epoch": 0.98, + "grad_norm": 1.90476914149826, + "learning_rate": 6.5459964483233215e-09, + "loss": 0.5775, + "step": 9468 + }, + { + "epoch": 0.98, + "grad_norm": 2.0805059353113027, + "learning_rate": 6.460166734785223e-09, + "loss": 0.6907, + "step": 9469 + }, + { + "epoch": 0.98, + "grad_norm": 2.039038658401796, + "learning_rate": 6.374903067448701e-09, + "loss": 0.5588, + "step": 9470 + }, + { + "epoch": 0.98, + "grad_norm": 1.9051077555679738, + "learning_rate": 6.290205455980469e-09, + "loss": 0.6101, + "step": 9471 + }, + { + "epoch": 0.98, + "grad_norm": 1.9114694925966835, + "learning_rate": 6.2060739099800704e-09, + "loss": 0.6417, + "step": 9472 + }, + { + "epoch": 0.98, + "grad_norm": 2.0034549196304288, + "learning_rate": 6.122508438984875e-09, + "loss": 0.6337, + "step": 9473 + }, + { + "epoch": 0.98, + "grad_norm": 1.929605593178843, + "learning_rate": 6.039509052467862e-09, + "loss": 0.6391, + "step": 9474 + }, + { + "epoch": 0.98, + "grad_norm": 2.0565609104314695, + "learning_rate": 5.957075759837061e-09, + "loss": 0.6141, + "step": 9475 + }, + { + "epoch": 0.99, + "grad_norm": 1.8833486655812495, + "learning_rate": 5.875208570436663e-09, + "loss": 0.5839, + "step": 9476 + }, + { + "epoch": 0.99, + "grad_norm": 2.023643705570554, + "learning_rate": 5.793907493546469e-09, + "loss": 0.5759, + "step": 9477 + }, + { + "epoch": 0.99, + "grad_norm": 1.8797156245993185, + "learning_rate": 5.713172538383549e-09, + "loss": 0.5925, + "step": 9478 + }, + { + "epoch": 0.99, + "grad_norm": 2.029520402534025, + "learning_rate": 5.6330037140989166e-09, + "loss": 0.5766, + "step": 9479 + }, + { + "epoch": 0.99, + "grad_norm": 2.1692397510491266, + "learning_rate": 5.5534010297803034e-09, + "loss": 0.5764, + "step": 9480 + }, + { + "epoch": 0.99, + "grad_norm": 2.101204297729523, + "learning_rate": 5.474364494451045e-09, + "loss": 0.5781, + "step": 9481 + }, + { + "epoch": 0.99, + "grad_norm": 1.9301730906644166, + "learning_rate": 5.395894117070089e-09, + "loss": 0.7224, + "step": 9482 + }, + { + "epoch": 0.99, + "grad_norm": 1.7290399911078331, + "learning_rate": 5.317989906533094e-09, + "loss": 0.5331, + "step": 9483 + }, + { + "epoch": 0.99, + "grad_norm": 1.7841568775539753, + "learning_rate": 5.2406518716707766e-09, + "loss": 0.63, + "step": 9484 + }, + { + "epoch": 0.99, + "grad_norm": 2.154539636467135, + "learning_rate": 5.1638800212494566e-09, + "loss": 0.6452, + "step": 9485 + }, + { + "epoch": 0.99, + "grad_norm": 2.0702545934630017, + "learning_rate": 5.087674363972173e-09, + "loss": 0.5551, + "step": 9486 + }, + { + "epoch": 0.99, + "grad_norm": 1.9771019143634083, + "learning_rate": 5.01203490847646e-09, + "loss": 0.5728, + "step": 9487 + }, + { + "epoch": 0.99, + "grad_norm": 1.8392551379936382, + "learning_rate": 4.93696166333768e-09, + "loss": 0.6022, + "step": 9488 + }, + { + "epoch": 0.99, + "grad_norm": 1.9594213971787073, + "learning_rate": 4.8624546370651395e-09, + "loss": 0.6141, + "step": 9489 + }, + { + "epoch": 0.99, + "grad_norm": 1.8586510143718253, + "learning_rate": 4.788513838104858e-09, + "loss": 0.6084, + "step": 9490 + }, + { + "epoch": 0.99, + "grad_norm": 1.6622364581523985, + "learning_rate": 4.7151392748379095e-09, + "loss": 0.5573, + "step": 9491 + }, + { + "epoch": 0.99, + "grad_norm": 2.07679473110362, + "learning_rate": 4.642330955582641e-09, + "loss": 0.6039, + "step": 9492 + }, + { + "epoch": 0.99, + "grad_norm": 1.8678922922176417, + "learning_rate": 4.57008888859134e-09, + "loss": 0.5118, + "step": 9493 + }, + { + "epoch": 0.99, + "grad_norm": 1.9392398900547896, + "learning_rate": 4.498413082053566e-09, + "loss": 0.6151, + "step": 9494 + }, + { + "epoch": 0.99, + "grad_norm": 1.9618678399300282, + "learning_rate": 4.42730354409393e-09, + "loss": 0.6178, + "step": 9495 + }, + { + "epoch": 0.99, + "grad_norm": 1.8062383269521096, + "learning_rate": 4.356760282773209e-09, + "loss": 0.5622, + "step": 9496 + }, + { + "epoch": 0.99, + "grad_norm": 1.8782190565478545, + "learning_rate": 4.286783306087783e-09, + "loss": 0.5159, + "step": 9497 + }, + { + "epoch": 0.99, + "grad_norm": 1.976593913395866, + "learning_rate": 4.21737262197075e-09, + "loss": 0.6743, + "step": 9498 + }, + { + "epoch": 0.99, + "grad_norm": 1.9818379691434536, + "learning_rate": 4.148528238289151e-09, + "loss": 0.6773, + "step": 9499 + }, + { + "epoch": 0.99, + "grad_norm": 2.0152612123522133, + "learning_rate": 4.080250162847299e-09, + "loss": 0.6145, + "step": 9500 + }, + { + "epoch": 0.99, + "grad_norm": 2.1683488804338475, + "learning_rate": 4.0125384033845586e-09, + "loss": 0.5787, + "step": 9501 + }, + { + "epoch": 0.99, + "grad_norm": 1.8175768439932203, + "learning_rate": 3.945392967577011e-09, + "loss": 0.555, + "step": 9502 + }, + { + "epoch": 0.99, + "grad_norm": 2.1157456006926907, + "learning_rate": 3.8788138630357905e-09, + "loss": 0.5811, + "step": 9503 + }, + { + "epoch": 0.99, + "grad_norm": 2.1523135188211087, + "learning_rate": 3.812801097308194e-09, + "loss": 0.6436, + "step": 9504 + }, + { + "epoch": 0.99, + "grad_norm": 1.9801354903550479, + "learning_rate": 3.747354677876569e-09, + "loss": 0.7153, + "step": 9505 + }, + { + "epoch": 0.99, + "grad_norm": 1.9816879347353105, + "learning_rate": 3.682474612159981e-09, + "loss": 0.561, + "step": 9506 + }, + { + "epoch": 0.99, + "grad_norm": 2.036183290464764, + "learning_rate": 3.6181609075131017e-09, + "loss": 0.7561, + "step": 9507 + }, + { + "epoch": 0.99, + "grad_norm": 2.1541774964882485, + "learning_rate": 3.5544135712262116e-09, + "loss": 0.642, + "step": 9508 + }, + { + "epoch": 0.99, + "grad_norm": 2.2831346674446213, + "learning_rate": 3.4912326105246418e-09, + "loss": 0.6434, + "step": 9509 + }, + { + "epoch": 0.99, + "grad_norm": 1.958292617985834, + "learning_rate": 3.4286180325715513e-09, + "loss": 0.6362, + "step": 9510 + }, + { + "epoch": 0.99, + "grad_norm": 1.9314782542001665, + "learning_rate": 3.3665698444640406e-09, + "loss": 0.5673, + "step": 9511 + }, + { + "epoch": 0.99, + "grad_norm": 1.991690621452874, + "learning_rate": 3.3050880532359277e-09, + "loss": 0.5816, + "step": 9512 + }, + { + "epoch": 0.99, + "grad_norm": 2.5630124336650852, + "learning_rate": 3.2441726658560825e-09, + "loss": 0.591, + "step": 9513 + }, + { + "epoch": 0.99, + "grad_norm": 1.98736244563508, + "learning_rate": 3.183823689230092e-09, + "loss": 0.661, + "step": 9514 + }, + { + "epoch": 0.99, + "grad_norm": 1.976417843136004, + "learning_rate": 3.1240411301980413e-09, + "loss": 0.6741, + "step": 9515 + }, + { + "epoch": 0.99, + "grad_norm": 2.0882239803944302, + "learning_rate": 3.0648249955378405e-09, + "loss": 0.6652, + "step": 9516 + }, + { + "epoch": 0.99, + "grad_norm": 2.163565371948662, + "learning_rate": 3.006175291960789e-09, + "loss": 0.5793, + "step": 9517 + }, + { + "epoch": 0.99, + "grad_norm": 2.00691462554776, + "learning_rate": 2.9480920261154565e-09, + "loss": 0.6054, + "step": 9518 + }, + { + "epoch": 0.99, + "grad_norm": 2.076838889221059, + "learning_rate": 2.8905752045865766e-09, + "loss": 0.6402, + "step": 9519 + }, + { + "epoch": 0.99, + "grad_norm": 2.115754866675787, + "learning_rate": 2.833624833893933e-09, + "loss": 0.6234, + "step": 9520 + }, + { + "epoch": 0.99, + "grad_norm": 2.0742777041776113, + "learning_rate": 2.7772409204923633e-09, + "loss": 0.529, + "step": 9521 + }, + { + "epoch": 0.99, + "grad_norm": 2.033700985807466, + "learning_rate": 2.721423470773421e-09, + "loss": 0.5545, + "step": 9522 + }, + { + "epoch": 0.99, + "grad_norm": 1.8799003192685757, + "learning_rate": 2.6661724910653774e-09, + "loss": 0.626, + "step": 9523 + }, + { + "epoch": 0.99, + "grad_norm": 2.276364282368124, + "learning_rate": 2.6114879876298905e-09, + "loss": 0.7034, + "step": 9524 + }, + { + "epoch": 0.99, + "grad_norm": 1.9794806333649237, + "learning_rate": 2.5573699666664455e-09, + "loss": 0.5946, + "step": 9525 + }, + { + "epoch": 0.99, + "grad_norm": 1.849392363068563, + "learning_rate": 2.5038184343101346e-09, + "loss": 0.6102, + "step": 9526 + }, + { + "epoch": 0.99, + "grad_norm": 1.9217115956324478, + "learning_rate": 2.4508333966305473e-09, + "loss": 0.5229, + "step": 9527 + }, + { + "epoch": 0.99, + "grad_norm": 2.019210354811399, + "learning_rate": 2.3984148596339907e-09, + "loss": 0.5405, + "step": 9528 + }, + { + "epoch": 0.99, + "grad_norm": 2.3043229195920816, + "learning_rate": 2.3465628292623776e-09, + "loss": 0.6815, + "step": 9529 + }, + { + "epoch": 0.99, + "grad_norm": 2.080083068675512, + "learning_rate": 2.295277311393784e-09, + "loss": 0.579, + "step": 9530 + }, + { + "epoch": 0.99, + "grad_norm": 2.1199920401449552, + "learning_rate": 2.2445583118413384e-09, + "loss": 0.6651, + "step": 9531 + }, + { + "epoch": 0.99, + "grad_norm": 2.0842626790362053, + "learning_rate": 2.19440583635433e-09, + "loss": 0.6451, + "step": 9532 + }, + { + "epoch": 0.99, + "grad_norm": 2.0915589293813173, + "learning_rate": 2.1448198906182106e-09, + "loss": 0.6338, + "step": 9533 + }, + { + "epoch": 0.99, + "grad_norm": 2.0306946472868996, + "learning_rate": 2.0958004802529297e-09, + "loss": 0.5401, + "step": 9534 + }, + { + "epoch": 0.99, + "grad_norm": 2.055855540710789, + "learning_rate": 2.047347610816819e-09, + "loss": 0.586, + "step": 9535 + }, + { + "epoch": 0.99, + "grad_norm": 1.9878400684546984, + "learning_rate": 1.999461287800486e-09, + "loss": 0.5352, + "step": 9536 + }, + { + "epoch": 0.99, + "grad_norm": 1.905336891439375, + "learning_rate": 1.9521415166329216e-09, + "loss": 0.484, + "step": 9537 + }, + { + "epoch": 0.99, + "grad_norm": 1.8815579402867484, + "learning_rate": 1.9053883026781685e-09, + "loss": 0.6296, + "step": 9538 + }, + { + "epoch": 0.99, + "grad_norm": 1.984575489338281, + "learning_rate": 1.8592016512358757e-09, + "loss": 0.5511, + "step": 9539 + }, + { + "epoch": 0.99, + "grad_norm": 2.046470877718721, + "learning_rate": 1.8135815675418556e-09, + "loss": 0.6024, + "step": 9540 + }, + { + "epoch": 0.99, + "grad_norm": 1.9722402840427116, + "learning_rate": 1.7685280567664165e-09, + "loss": 0.5257, + "step": 9541 + }, + { + "epoch": 0.99, + "grad_norm": 2.000880220808784, + "learning_rate": 1.7240411240176946e-09, + "loss": 0.5769, + "step": 9542 + }, + { + "epoch": 0.99, + "grad_norm": 2.1063839838590925, + "learning_rate": 1.680120774338323e-09, + "loss": 0.6462, + "step": 9543 + }, + { + "epoch": 0.99, + "grad_norm": 2.0056915245462923, + "learning_rate": 1.6367670127059864e-09, + "loss": 0.63, + "step": 9544 + }, + { + "epoch": 0.99, + "grad_norm": 1.8659277173450295, + "learning_rate": 1.5939798440367527e-09, + "loss": 0.5298, + "step": 9545 + }, + { + "epoch": 0.99, + "grad_norm": 1.864106975642594, + "learning_rate": 1.5517592731789654e-09, + "loss": 0.5465, + "step": 9546 + }, + { + "epoch": 0.99, + "grad_norm": 1.9575734799120101, + "learning_rate": 1.5101053049199065e-09, + "loss": 0.5879, + "step": 9547 + }, + { + "epoch": 0.99, + "grad_norm": 1.9740114393181525, + "learning_rate": 1.4690179439807993e-09, + "loss": 0.64, + "step": 9548 + }, + { + "epoch": 0.99, + "grad_norm": 1.9215329174434153, + "learning_rate": 1.4284971950195847e-09, + "loss": 0.652, + "step": 9549 + }, + { + "epoch": 0.99, + "grad_norm": 1.8879530552664527, + "learning_rate": 1.3885430626287e-09, + "loss": 0.5143, + "step": 9550 + }, + { + "epoch": 0.99, + "grad_norm": 2.0021158704501776, + "learning_rate": 1.3491555513378552e-09, + "loss": 0.6374, + "step": 9551 + }, + { + "epoch": 0.99, + "grad_norm": 1.750227327277223, + "learning_rate": 1.310334665611257e-09, + "loss": 0.6026, + "step": 9552 + }, + { + "epoch": 0.99, + "grad_norm": 1.924162397266902, + "learning_rate": 1.2720804098498297e-09, + "loss": 0.6646, + "step": 9553 + }, + { + "epoch": 0.99, + "grad_norm": 1.9347971214125042, + "learning_rate": 1.2343927883901042e-09, + "loss": 0.6712, + "step": 9554 + }, + { + "epoch": 0.99, + "grad_norm": 2.006885502405168, + "learning_rate": 1.1972718055036637e-09, + "loss": 0.5202, + "step": 9555 + }, + { + "epoch": 0.99, + "grad_norm": 1.689423303629976, + "learning_rate": 1.1607174653988085e-09, + "loss": 0.5011, + "step": 9556 + }, + { + "epoch": 0.99, + "grad_norm": 1.9127280989270514, + "learning_rate": 1.124729772219446e-09, + "loss": 0.4981, + "step": 9557 + }, + { + "epoch": 0.99, + "grad_norm": 2.0010664585250657, + "learning_rate": 1.089308730043981e-09, + "loss": 0.6541, + "step": 9558 + }, + { + "epoch": 0.99, + "grad_norm": 2.07992521114073, + "learning_rate": 1.0544543428886445e-09, + "loss": 0.6736, + "step": 9559 + }, + { + "epoch": 0.99, + "grad_norm": 1.9283583857469802, + "learning_rate": 1.0201666147041656e-09, + "loss": 0.6337, + "step": 9560 + }, + { + "epoch": 0.99, + "grad_norm": 1.9700243707520468, + "learning_rate": 9.864455493763247e-10, + "loss": 0.6305, + "step": 9561 + }, + { + "epoch": 0.99, + "grad_norm": 2.147094232957025, + "learning_rate": 9.5329115072873e-10, + "loss": 0.6686, + "step": 9562 + }, + { + "epoch": 0.99, + "grad_norm": 2.079316149178158, + "learning_rate": 9.207034225189315e-10, + "loss": 0.5308, + "step": 9563 + }, + { + "epoch": 0.99, + "grad_norm": 2.354067111579948, + "learning_rate": 8.886823684417512e-10, + "loss": 0.6647, + "step": 9564 + }, + { + "epoch": 0.99, + "grad_norm": 1.7465751308270212, + "learning_rate": 8.57227992125953e-10, + "loss": 0.5295, + "step": 9565 + }, + { + "epoch": 0.99, + "grad_norm": 1.9868861820598962, + "learning_rate": 8.263402971375734e-10, + "loss": 0.5493, + "step": 9566 + }, + { + "epoch": 0.99, + "grad_norm": 2.22374318466106, + "learning_rate": 7.960192869782557e-10, + "loss": 0.6371, + "step": 9567 + }, + { + "epoch": 0.99, + "grad_norm": 1.9923933780189522, + "learning_rate": 7.662649650841403e-10, + "loss": 0.5635, + "step": 9568 + }, + { + "epoch": 0.99, + "grad_norm": 2.038120541474471, + "learning_rate": 7.370773348286397e-10, + "loss": 0.5779, + "step": 9569 + }, + { + "epoch": 0.99, + "grad_norm": 1.9771844157809515, + "learning_rate": 7.084563995202187e-10, + "loss": 0.526, + "step": 9570 + }, + { + "epoch": 0.99, + "grad_norm": 1.812053455374471, + "learning_rate": 6.80402162403504e-10, + "loss": 0.5634, + "step": 9571 + }, + { + "epoch": 1.0, + "grad_norm": 1.9921272648147166, + "learning_rate": 6.529146266587294e-10, + "loss": 0.5553, + "step": 9572 + }, + { + "epoch": 1.0, + "grad_norm": 1.859834751177411, + "learning_rate": 6.259937954006257e-10, + "loss": 0.6282, + "step": 9573 + }, + { + "epoch": 1.0, + "grad_norm": 2.040795878230898, + "learning_rate": 5.996396716823061e-10, + "loss": 0.641, + "step": 9574 + }, + { + "epoch": 1.0, + "grad_norm": 1.9718501791261143, + "learning_rate": 5.738522584897155e-10, + "loss": 0.5137, + "step": 9575 + }, + { + "epoch": 1.0, + "grad_norm": 2.019272879708207, + "learning_rate": 5.486315587471813e-10, + "loss": 0.613, + "step": 9576 + }, + { + "epoch": 1.0, + "grad_norm": 2.0140937375266206, + "learning_rate": 5.239775753129728e-10, + "loss": 0.5492, + "step": 9577 + }, + { + "epoch": 1.0, + "grad_norm": 1.7876855888321224, + "learning_rate": 4.998903109826314e-10, + "loss": 0.5968, + "step": 9578 + }, + { + "epoch": 1.0, + "grad_norm": 1.937692711387017, + "learning_rate": 4.763697684850854e-10, + "loss": 0.5326, + "step": 9579 + }, + { + "epoch": 1.0, + "grad_norm": 1.8871781497039766, + "learning_rate": 4.5341595048764564e-10, + "loss": 0.5952, + "step": 9580 + }, + { + "epoch": 1.0, + "grad_norm": 1.657956554866962, + "learning_rate": 4.310288595921197e-10, + "loss": 0.4911, + "step": 9581 + }, + { + "epoch": 1.0, + "grad_norm": 1.953405180081528, + "learning_rate": 4.0920849833592236e-10, + "loss": 0.6131, + "step": 9582 + }, + { + "epoch": 1.0, + "grad_norm": 2.091422857268565, + "learning_rate": 3.879548691926305e-10, + "loss": 0.6022, + "step": 9583 + }, + { + "epoch": 1.0, + "grad_norm": 1.9263886894181483, + "learning_rate": 3.6726797457198314e-10, + "loss": 0.5952, + "step": 9584 + }, + { + "epoch": 1.0, + "grad_norm": 2.146763105003528, + "learning_rate": 3.471478168176612e-10, + "loss": 0.6512, + "step": 9585 + }, + { + "epoch": 1.0, + "grad_norm": 1.931113208646632, + "learning_rate": 3.27594398211728e-10, + "loss": 0.5383, + "step": 9586 + }, + { + "epoch": 1.0, + "grad_norm": 1.9413320484945569, + "learning_rate": 3.086077209701888e-10, + "loss": 0.6305, + "step": 9587 + }, + { + "epoch": 1.0, + "grad_norm": 1.917465576733899, + "learning_rate": 2.901877872452108e-10, + "loss": 0.5411, + "step": 9588 + }, + { + "epoch": 1.0, + "grad_norm": 1.9595757628143469, + "learning_rate": 2.723345991245685e-10, + "loss": 0.5646, + "step": 9589 + }, + { + "epoch": 1.0, + "grad_norm": 2.196310127527789, + "learning_rate": 2.550481586321985e-10, + "loss": 0.5639, + "step": 9590 + }, + { + "epoch": 1.0, + "grad_norm": 1.900419785333137, + "learning_rate": 2.3832846772819937e-10, + "loss": 0.7065, + "step": 9591 + }, + { + "epoch": 1.0, + "grad_norm": 1.9730652890498885, + "learning_rate": 2.2217552830716693e-10, + "loss": 0.6706, + "step": 9592 + }, + { + "epoch": 1.0, + "grad_norm": 1.9593091425150002, + "learning_rate": 2.0658934219985883e-10, + "loss": 0.5465, + "step": 9593 + }, + { + "epoch": 1.0, + "grad_norm": 2.135942361894095, + "learning_rate": 1.9156991117430523e-10, + "loss": 0.758, + "step": 9594 + }, + { + "epoch": 1.0, + "grad_norm": 1.930526843929985, + "learning_rate": 1.7711723693192296e-10, + "loss": 0.5605, + "step": 9595 + }, + { + "epoch": 1.0, + "grad_norm": 1.9256736889575217, + "learning_rate": 1.6323132111084605e-10, + "loss": 0.6454, + "step": 9596 + }, + { + "epoch": 1.0, + "grad_norm": 2.064044604856687, + "learning_rate": 1.49912165286481e-10, + "loss": 0.6168, + "step": 9597 + }, + { + "epoch": 1.0, + "grad_norm": 1.8522204350674754, + "learning_rate": 1.3715977096706578e-10, + "loss": 0.626, + "step": 9598 + }, + { + "epoch": 1.0, + "grad_norm": 2.0195678223686127, + "learning_rate": 1.24974139599221e-10, + "loss": 0.7152, + "step": 9599 + }, + { + "epoch": 1.0, + "grad_norm": 1.974776889449131, + "learning_rate": 1.1335527256350898e-10, + "loss": 0.5263, + "step": 9600 + }, + { + "epoch": 1.0, + "grad_norm": 1.9816482265824762, + "learning_rate": 1.0230317117776445e-10, + "loss": 0.5823, + "step": 9601 + }, + { + "epoch": 1.0, + "grad_norm": 1.9456288642203419, + "learning_rate": 9.181783669431898e-11, + "loss": 0.5181, + "step": 9602 + }, + { + "epoch": 1.0, + "grad_norm": 1.8505141834481915, + "learning_rate": 8.189927030222145e-11, + "loss": 0.5149, + "step": 9603 + }, + { + "epoch": 1.0, + "grad_norm": 1.9702244129196227, + "learning_rate": 7.254747312501754e-11, + "loss": 0.6123, + "step": 9604 + }, + { + "epoch": 1.0, + "grad_norm": 1.9383150259419573, + "learning_rate": 6.376244622297023e-11, + "loss": 0.6127, + "step": 9605 + }, + { + "epoch": 1.0, + "grad_norm": 1.9864553209022995, + "learning_rate": 5.554419059250471e-11, + "loss": 0.6042, + "step": 9606 + }, + { + "epoch": 1.0, + "grad_norm": 1.9772866782374987, + "learning_rate": 4.7892707164542975e-11, + "loss": 0.5077, + "step": 9607 + }, + { + "epoch": 1.0, + "grad_norm": 2.0014007844865565, + "learning_rate": 4.080799680727943e-11, + "loss": 0.6155, + "step": 9608 + }, + { + "epoch": 1.0, + "grad_norm": 2.2486787110200264, + "learning_rate": 3.42900603228502e-11, + "loss": 0.5979, + "step": 9609 + }, + { + "epoch": 1.0, + "grad_norm": 2.064265143022498, + "learning_rate": 2.833889845010873e-11, + "loss": 0.6211, + "step": 9610 + }, + { + "epoch": 1.0, + "grad_norm": 2.081375211775521, + "learning_rate": 2.2954511864625717e-11, + "loss": 0.5994, + "step": 9611 + }, + { + "epoch": 1.0, + "grad_norm": 1.7557657446910315, + "learning_rate": 1.8136901175913602e-11, + "loss": 0.6341, + "step": 9612 + }, + { + "epoch": 1.0, + "grad_norm": 1.8772616582779724, + "learning_rate": 1.3886066930202113e-11, + "loss": 0.6788, + "step": 9613 + }, + { + "epoch": 1.0, + "grad_norm": 1.6975943413231218, + "learning_rate": 1.0202009609883157e-11, + "loss": 0.4914, + "step": 9614 + }, + { + "epoch": 1.0, + "grad_norm": 1.9089824368308848, + "learning_rate": 7.084729631845477e-12, + "loss": 0.5957, + "step": 9615 + }, + { + "epoch": 1.0, + "grad_norm": 1.8543367974168876, + "learning_rate": 4.534227349695108e-12, + "loss": 0.6776, + "step": 9616 + }, + { + "epoch": 1.0, + "grad_norm": 1.842851875350606, + "learning_rate": 2.5505030532002594e-12, + "loss": 0.5382, + "step": 9617 + }, + { + "epoch": 1.0, + "grad_norm": 1.8926880273749038, + "learning_rate": 1.13355696607087e-12, + "loss": 0.6009, + "step": 9618 + }, + { + "epoch": 1.0, + "grad_norm": 1.9155764490752814, + "learning_rate": 2.8338924928927867e-13, + "loss": 0.5445, + "step": 9619 + }, + { + "epoch": 1.0, + "grad_norm": 1.8883391558284497, + "learning_rate": 0.0, + "loss": 0.6125, + "step": 9620 + }, + { + "epoch": 1.0, + "step": 9620, + "total_flos": 2995729119199232.0, + "train_loss": 0.6451899706387966, + "train_runtime": 324149.63, + "train_samples_per_second": 3.799, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 1.0, + "max_steps": 9620, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 2995729119199232.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}