diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12320 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.99786248664054, + "eval_steps": 50000, + "global_step": 17540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0057000356252226575, + "grad_norm": 0.8329347851096954, + "learning_rate": 1.1402508551881415e-07, + "loss": 1.6812, + "step": 10 + }, + { + "epoch": 0.011400071250445315, + "grad_norm": 0.8173922269493806, + "learning_rate": 2.280501710376283e-07, + "loss": 1.6858, + "step": 20 + }, + { + "epoch": 0.017100106875667972, + "grad_norm": 0.809557505770668, + "learning_rate": 3.4207525655644247e-07, + "loss": 1.6882, + "step": 30 + }, + { + "epoch": 0.02280014250089063, + "grad_norm": 0.8748268159634193, + "learning_rate": 4.561003420752566e-07, + "loss": 1.684, + "step": 40 + }, + { + "epoch": 0.028500178126113287, + "grad_norm": 0.8347602557149199, + "learning_rate": 5.701254275940708e-07, + "loss": 1.6798, + "step": 50 + }, + { + "epoch": 0.034200213751335945, + "grad_norm": 0.8430580040769771, + "learning_rate": 6.841505131128849e-07, + "loss": 1.6845, + "step": 60 + }, + { + "epoch": 0.0399002493765586, + "grad_norm": 0.8808469992546447, + "learning_rate": 7.98175598631699e-07, + "loss": 1.6805, + "step": 70 + }, + { + "epoch": 0.04560028500178126, + "grad_norm": 0.9332195994860901, + "learning_rate": 9.122006841505132e-07, + "loss": 1.6737, + "step": 80 + }, + { + "epoch": 0.05130032062700392, + "grad_norm": 0.9292762787168932, + "learning_rate": 1.0262257696693273e-06, + "loss": 1.6761, + "step": 90 + }, + { + "epoch": 0.057000356252226575, + "grad_norm": 0.8959401505265313, + "learning_rate": 1.1402508551881415e-06, + "loss": 1.672, + "step": 100 + }, + { + "epoch": 0.06270039187744923, + "grad_norm": 0.9753219116814298, + "learning_rate": 1.2542759407069557e-06, + "loss": 1.6788, + "step": 110 + }, + { + "epoch": 0.06840042750267189, + "grad_norm": 1.0355003267680642, + "learning_rate": 1.3683010262257699e-06, + "loss": 1.6521, + "step": 120 + }, + { + "epoch": 0.07410046312789455, + "grad_norm": 0.9854284833432406, + "learning_rate": 1.4823261117445838e-06, + "loss": 1.6542, + "step": 130 + }, + { + "epoch": 0.0798004987531172, + "grad_norm": 0.8995458835476428, + "learning_rate": 1.596351197263398e-06, + "loss": 1.6266, + "step": 140 + }, + { + "epoch": 0.08550053437833986, + "grad_norm": 0.8937279797895432, + "learning_rate": 1.7103762827822124e-06, + "loss": 1.6018, + "step": 150 + }, + { + "epoch": 0.09120057000356252, + "grad_norm": 0.8508463445168406, + "learning_rate": 1.8244013683010263e-06, + "loss": 1.5702, + "step": 160 + }, + { + "epoch": 0.09690060562878518, + "grad_norm": 0.826282613868742, + "learning_rate": 1.9384264538198407e-06, + "loss": 1.568, + "step": 170 + }, + { + "epoch": 0.10260064125400783, + "grad_norm": 0.7919479763516626, + "learning_rate": 2.0524515393386547e-06, + "loss": 1.5417, + "step": 180 + }, + { + "epoch": 0.10830067687923049, + "grad_norm": 0.684561861348144, + "learning_rate": 2.166476624857469e-06, + "loss": 1.5114, + "step": 190 + }, + { + "epoch": 0.11400071250445315, + "grad_norm": 0.680205774626441, + "learning_rate": 2.280501710376283e-06, + "loss": 1.4744, + "step": 200 + }, + { + "epoch": 0.11970074812967581, + "grad_norm": 0.7006502228163867, + "learning_rate": 2.394526795895097e-06, + "loss": 1.4864, + "step": 210 + }, + { + 
"epoch": 0.12540078375489846, + "grad_norm": 0.6771625316054201, + "learning_rate": 2.5085518814139114e-06, + "loss": 1.4508, + "step": 220 + }, + { + "epoch": 0.13110081938012114, + "grad_norm": 0.6383811243685706, + "learning_rate": 2.6225769669327258e-06, + "loss": 1.4298, + "step": 230 + }, + { + "epoch": 0.13680085500534378, + "grad_norm": 0.6383478063230229, + "learning_rate": 2.7366020524515397e-06, + "loss": 1.4201, + "step": 240 + }, + { + "epoch": 0.14250089063056645, + "grad_norm": 0.664438350643699, + "learning_rate": 2.8506271379703537e-06, + "loss": 1.4037, + "step": 250 + }, + { + "epoch": 0.1482009262557891, + "grad_norm": 0.6120680534196937, + "learning_rate": 2.9646522234891676e-06, + "loss": 1.3977, + "step": 260 + }, + { + "epoch": 0.15390096188101177, + "grad_norm": 0.6152217588311774, + "learning_rate": 3.078677309007982e-06, + "loss": 1.3728, + "step": 270 + }, + { + "epoch": 0.1596009975062344, + "grad_norm": 0.6131574126061529, + "learning_rate": 3.192702394526796e-06, + "loss": 1.3524, + "step": 280 + }, + { + "epoch": 0.16530103313145708, + "grad_norm": 0.626799181295027, + "learning_rate": 3.30672748004561e-06, + "loss": 1.3344, + "step": 290 + }, + { + "epoch": 0.17100106875667972, + "grad_norm": 0.5974951223625137, + "learning_rate": 3.4207525655644248e-06, + "loss": 1.3409, + "step": 300 + }, + { + "epoch": 0.1767011043819024, + "grad_norm": 0.6112769459431191, + "learning_rate": 3.5347776510832387e-06, + "loss": 1.3241, + "step": 310 + }, + { + "epoch": 0.18240114000712504, + "grad_norm": 0.6229753422183877, + "learning_rate": 3.6488027366020527e-06, + "loss": 1.3017, + "step": 320 + }, + { + "epoch": 0.1881011756323477, + "grad_norm": 0.6105785881673802, + "learning_rate": 3.762827822120867e-06, + "loss": 1.3024, + "step": 330 + }, + { + "epoch": 0.19380121125757035, + "grad_norm": 0.6244573917539824, + "learning_rate": 3.8768529076396815e-06, + "loss": 1.2862, + "step": 340 + }, + { + "epoch": 0.19950124688279303, + "grad_norm": 0.6380126761223662, + "learning_rate": 3.990877993158495e-06, + "loss": 1.2763, + "step": 350 + }, + { + "epoch": 0.20520128250801567, + "grad_norm": 0.6234244975836795, + "learning_rate": 4.104903078677309e-06, + "loss": 1.2578, + "step": 360 + }, + { + "epoch": 0.21090131813323834, + "grad_norm": 0.640686344376765, + "learning_rate": 4.218928164196123e-06, + "loss": 1.264, + "step": 370 + }, + { + "epoch": 0.21660135375846098, + "grad_norm": 0.6664146960033351, + "learning_rate": 4.332953249714938e-06, + "loss": 1.2566, + "step": 380 + }, + { + "epoch": 0.22230138938368366, + "grad_norm": 0.6791004999182745, + "learning_rate": 4.446978335233752e-06, + "loss": 1.2506, + "step": 390 + }, + { + "epoch": 0.2280014250089063, + "grad_norm": 0.660782823423085, + "learning_rate": 4.561003420752566e-06, + "loss": 1.2341, + "step": 400 + }, + { + "epoch": 0.23370146063412897, + "grad_norm": 0.6496810448682551, + "learning_rate": 4.67502850627138e-06, + "loss": 1.2303, + "step": 410 + }, + { + "epoch": 0.23940149625935161, + "grad_norm": 0.6760464986579647, + "learning_rate": 4.789053591790194e-06, + "loss": 1.2157, + "step": 420 + }, + { + "epoch": 0.2451015318845743, + "grad_norm": 0.7099546157920937, + "learning_rate": 4.903078677309008e-06, + "loss": 1.2119, + "step": 430 + }, + { + "epoch": 0.25080156750979693, + "grad_norm": 0.6985263486782688, + "learning_rate": 5.017103762827823e-06, + "loss": 1.1998, + "step": 440 + }, + { + "epoch": 0.2565016031350196, + "grad_norm": 0.7086874492220844, + "learning_rate": 5.131128848346637e-06, + 
"loss": 1.1845, + "step": 450 + }, + { + "epoch": 0.2622016387602423, + "grad_norm": 0.6952132915582485, + "learning_rate": 5.2451539338654515e-06, + "loss": 1.2045, + "step": 460 + }, + { + "epoch": 0.2679016743854649, + "grad_norm": 0.7563820343660739, + "learning_rate": 5.3591790193842655e-06, + "loss": 1.1816, + "step": 470 + }, + { + "epoch": 0.27360171001068756, + "grad_norm": 0.7383768041736016, + "learning_rate": 5.4732041049030794e-06, + "loss": 1.1867, + "step": 480 + }, + { + "epoch": 0.2793017456359102, + "grad_norm": 0.7377951397065144, + "learning_rate": 5.587229190421893e-06, + "loss": 1.1693, + "step": 490 + }, + { + "epoch": 0.2850017812611329, + "grad_norm": 0.7930492156928618, + "learning_rate": 5.701254275940707e-06, + "loss": 1.1769, + "step": 500 + }, + { + "epoch": 0.29070181688635555, + "grad_norm": 0.7804629910719232, + "learning_rate": 5.815279361459521e-06, + "loss": 1.1572, + "step": 510 + }, + { + "epoch": 0.2964018525115782, + "grad_norm": 0.7476754916217279, + "learning_rate": 5.929304446978335e-06, + "loss": 1.1559, + "step": 520 + }, + { + "epoch": 0.30210188813680083, + "grad_norm": 0.7928284277955501, + "learning_rate": 6.04332953249715e-06, + "loss": 1.1402, + "step": 530 + }, + { + "epoch": 0.30780192376202353, + "grad_norm": 0.8039246648014535, + "learning_rate": 6.157354618015964e-06, + "loss": 1.1456, + "step": 540 + }, + { + "epoch": 0.3135019593872462, + "grad_norm": 0.7914550658951943, + "learning_rate": 6.271379703534778e-06, + "loss": 1.138, + "step": 550 + }, + { + "epoch": 0.3192019950124688, + "grad_norm": 0.8477034908005336, + "learning_rate": 6.385404789053592e-06, + "loss": 1.1398, + "step": 560 + }, + { + "epoch": 0.32490203063769146, + "grad_norm": 0.8870257129031497, + "learning_rate": 6.499429874572406e-06, + "loss": 1.121, + "step": 570 + }, + { + "epoch": 0.33060206626291416, + "grad_norm": 0.8865720994018285, + "learning_rate": 6.61345496009122e-06, + "loss": 1.1116, + "step": 580 + }, + { + "epoch": 0.3363021018881368, + "grad_norm": 0.894221994899268, + "learning_rate": 6.727480045610034e-06, + "loss": 1.1134, + "step": 590 + }, + { + "epoch": 0.34200213751335945, + "grad_norm": 0.9002142031410411, + "learning_rate": 6.8415051311288495e-06, + "loss": 1.0982, + "step": 600 + }, + { + "epoch": 0.3477021731385821, + "grad_norm": 0.9405603084160153, + "learning_rate": 6.9555302166476635e-06, + "loss": 1.1104, + "step": 610 + }, + { + "epoch": 0.3534022087638048, + "grad_norm": 0.8978621968763663, + "learning_rate": 7.0695553021664774e-06, + "loss": 1.0935, + "step": 620 + }, + { + "epoch": 0.35910224438902744, + "grad_norm": 0.923135430228548, + "learning_rate": 7.183580387685291e-06, + "loss": 1.0747, + "step": 630 + }, + { + "epoch": 0.3648022800142501, + "grad_norm": 0.9637043977541311, + "learning_rate": 7.297605473204105e-06, + "loss": 1.0766, + "step": 640 + }, + { + "epoch": 0.3705023156394727, + "grad_norm": 0.9634742025386968, + "learning_rate": 7.41163055872292e-06, + "loss": 1.0817, + "step": 650 + }, + { + "epoch": 0.3762023512646954, + "grad_norm": 1.0138264576502096, + "learning_rate": 7.525655644241734e-06, + "loss": 1.0708, + "step": 660 + }, + { + "epoch": 0.38190238688991807, + "grad_norm": 0.9754277408302766, + "learning_rate": 7.639680729760547e-06, + "loss": 1.0662, + "step": 670 + }, + { + "epoch": 0.3876024225151407, + "grad_norm": 0.9997112312036316, + "learning_rate": 7.753705815279363e-06, + "loss": 1.0743, + "step": 680 + }, + { + "epoch": 0.39330245814036335, + "grad_norm": 1.0137614677943894, + 
"learning_rate": 7.867730900798177e-06, + "loss": 1.055, + "step": 690 + }, + { + "epoch": 0.39900249376558605, + "grad_norm": 1.0243398381093383, + "learning_rate": 7.98175598631699e-06, + "loss": 1.0547, + "step": 700 + }, + { + "epoch": 0.4047025293908087, + "grad_norm": 1.077627151044799, + "learning_rate": 8.095781071835805e-06, + "loss": 1.0497, + "step": 710 + }, + { + "epoch": 0.41040256501603134, + "grad_norm": 1.0659844500266695, + "learning_rate": 8.209806157354619e-06, + "loss": 1.0603, + "step": 720 + }, + { + "epoch": 0.416102600641254, + "grad_norm": 1.0891087944465083, + "learning_rate": 8.323831242873433e-06, + "loss": 1.0428, + "step": 730 + }, + { + "epoch": 0.4218026362664767, + "grad_norm": 1.155107592465822, + "learning_rate": 8.437856328392247e-06, + "loss": 1.0424, + "step": 740 + }, + { + "epoch": 0.4275026718916993, + "grad_norm": 1.0357273266673794, + "learning_rate": 8.55188141391106e-06, + "loss": 1.0419, + "step": 750 + }, + { + "epoch": 0.43320270751692197, + "grad_norm": 1.1099226858511917, + "learning_rate": 8.665906499429876e-06, + "loss": 1.0368, + "step": 760 + }, + { + "epoch": 0.4389027431421446, + "grad_norm": 1.1550259480727676, + "learning_rate": 8.77993158494869e-06, + "loss": 1.0324, + "step": 770 + }, + { + "epoch": 0.4446027787673673, + "grad_norm": 1.1766907357775453, + "learning_rate": 8.893956670467504e-06, + "loss": 1.0344, + "step": 780 + }, + { + "epoch": 0.45030281439258996, + "grad_norm": 1.086671978162032, + "learning_rate": 9.007981755986318e-06, + "loss": 1.0273, + "step": 790 + }, + { + "epoch": 0.4560028500178126, + "grad_norm": 1.186181822112197, + "learning_rate": 9.122006841505132e-06, + "loss": 1.0247, + "step": 800 + }, + { + "epoch": 0.46170288564303524, + "grad_norm": 1.150017830725986, + "learning_rate": 9.236031927023946e-06, + "loss": 1.0234, + "step": 810 + }, + { + "epoch": 0.46740292126825794, + "grad_norm": 1.169372753028457, + "learning_rate": 9.35005701254276e-06, + "loss": 1.0206, + "step": 820 + }, + { + "epoch": 0.4731029568934806, + "grad_norm": 1.1106663677739594, + "learning_rate": 9.464082098061574e-06, + "loss": 1.0159, + "step": 830 + }, + { + "epoch": 0.47880299251870323, + "grad_norm": 1.2049682047754315, + "learning_rate": 9.578107183580388e-06, + "loss": 1.0073, + "step": 840 + }, + { + "epoch": 0.4845030281439259, + "grad_norm": 1.2123026532151817, + "learning_rate": 9.692132269099202e-06, + "loss": 1.0049, + "step": 850 + }, + { + "epoch": 0.4902030637691486, + "grad_norm": 1.2675857450988537, + "learning_rate": 9.806157354618016e-06, + "loss": 1.0008, + "step": 860 + }, + { + "epoch": 0.4959030993943712, + "grad_norm": 1.2939190886660976, + "learning_rate": 9.920182440136832e-06, + "loss": 0.997, + "step": 870 + }, + { + "epoch": 0.5016031350195939, + "grad_norm": 1.367015284250686, + "learning_rate": 1.0034207525655646e-05, + "loss": 0.9979, + "step": 880 + }, + { + "epoch": 0.5073031706448166, + "grad_norm": 1.24993826544231, + "learning_rate": 1.014823261117446e-05, + "loss": 0.9898, + "step": 890 + }, + { + "epoch": 0.5130032062700391, + "grad_norm": 1.188181097585842, + "learning_rate": 1.0262257696693273e-05, + "loss": 0.9821, + "step": 900 + }, + { + "epoch": 0.5187032418952618, + "grad_norm": 1.219257967517726, + "learning_rate": 1.0376282782212087e-05, + "loss": 0.9753, + "step": 910 + }, + { + "epoch": 0.5244032775204845, + "grad_norm": 1.3131103694864974, + "learning_rate": 1.0490307867730903e-05, + "loss": 0.9907, + "step": 920 + }, + { + "epoch": 0.5301033131457071, + "grad_norm": 
1.3286474526232694, + "learning_rate": 1.0604332953249717e-05, + "loss": 0.9886, + "step": 930 + }, + { + "epoch": 0.5358033487709298, + "grad_norm": 1.3048668421776233, + "learning_rate": 1.0718358038768531e-05, + "loss": 0.9826, + "step": 940 + }, + { + "epoch": 0.5415033843961524, + "grad_norm": 1.3627327837670187, + "learning_rate": 1.0832383124287345e-05, + "loss": 0.9865, + "step": 950 + }, + { + "epoch": 0.5472034200213751, + "grad_norm": 1.3265339665857847, + "learning_rate": 1.0946408209806159e-05, + "loss": 0.9804, + "step": 960 + }, + { + "epoch": 0.5529034556465978, + "grad_norm": 1.3467055060741506, + "learning_rate": 1.1060433295324973e-05, + "loss": 0.9725, + "step": 970 + }, + { + "epoch": 0.5586034912718204, + "grad_norm": 1.3166304290161006, + "learning_rate": 1.1174458380843787e-05, + "loss": 0.9745, + "step": 980 + }, + { + "epoch": 0.5643035268970431, + "grad_norm": 1.2412125716176452, + "learning_rate": 1.12884834663626e-05, + "loss": 0.962, + "step": 990 + }, + { + "epoch": 0.5700035625222658, + "grad_norm": 1.3189399694165516, + "learning_rate": 1.1402508551881415e-05, + "loss": 0.9821, + "step": 1000 + }, + { + "epoch": 0.5757035981474884, + "grad_norm": 1.3642955662034428, + "learning_rate": 1.1516533637400229e-05, + "loss": 0.9718, + "step": 1010 + }, + { + "epoch": 0.5814036337727111, + "grad_norm": 1.322103414761333, + "learning_rate": 1.1630558722919043e-05, + "loss": 0.9831, + "step": 1020 + }, + { + "epoch": 0.5871036693979338, + "grad_norm": 1.4293389786169786, + "learning_rate": 1.1744583808437857e-05, + "loss": 0.9661, + "step": 1030 + }, + { + "epoch": 0.5928037050231564, + "grad_norm": 1.3823165883871669, + "learning_rate": 1.185860889395667e-05, + "loss": 0.973, + "step": 1040 + }, + { + "epoch": 0.5985037406483791, + "grad_norm": 1.4800355218426224, + "learning_rate": 1.1972633979475485e-05, + "loss": 0.9701, + "step": 1050 + }, + { + "epoch": 0.6042037762736017, + "grad_norm": 1.3491909788324237, + "learning_rate": 1.20866590649943e-05, + "loss": 0.9583, + "step": 1060 + }, + { + "epoch": 0.6099038118988244, + "grad_norm": 1.4377241923522286, + "learning_rate": 1.2200684150513114e-05, + "loss": 0.9578, + "step": 1070 + }, + { + "epoch": 0.6156038475240471, + "grad_norm": 1.4076537543124394, + "learning_rate": 1.2314709236031928e-05, + "loss": 0.9558, + "step": 1080 + }, + { + "epoch": 0.6213038831492697, + "grad_norm": 1.4389533917231583, + "learning_rate": 1.2428734321550742e-05, + "loss": 0.9582, + "step": 1090 + }, + { + "epoch": 0.6270039187744924, + "grad_norm": 1.414132752155285, + "learning_rate": 1.2542759407069556e-05, + "loss": 0.9552, + "step": 1100 + }, + { + "epoch": 0.632703954399715, + "grad_norm": 1.3992854580137006, + "learning_rate": 1.265678449258837e-05, + "loss": 0.9553, + "step": 1110 + }, + { + "epoch": 0.6384039900249376, + "grad_norm": 1.3690113788333766, + "learning_rate": 1.2770809578107184e-05, + "loss": 0.9371, + "step": 1120 + }, + { + "epoch": 0.6441040256501603, + "grad_norm": 1.4043265246321486, + "learning_rate": 1.2884834663625998e-05, + "loss": 0.9493, + "step": 1130 + }, + { + "epoch": 0.6498040612753829, + "grad_norm": 1.3861708246884175, + "learning_rate": 1.2998859749144812e-05, + "loss": 0.9416, + "step": 1140 + }, + { + "epoch": 0.6555040969006056, + "grad_norm": 1.4273466903674794, + "learning_rate": 1.3112884834663626e-05, + "loss": 0.9357, + "step": 1150 + }, + { + "epoch": 0.6612041325258283, + "grad_norm": 1.4006529288929217, + "learning_rate": 1.322690992018244e-05, + "loss": 0.9391, + "step": 1160 + }, 
+ { + "epoch": 0.6669041681510509, + "grad_norm": 1.441422241182402, + "learning_rate": 1.3340935005701254e-05, + "loss": 0.9352, + "step": 1170 + }, + { + "epoch": 0.6726042037762736, + "grad_norm": 1.574058398645153, + "learning_rate": 1.3454960091220068e-05, + "loss": 0.9272, + "step": 1180 + }, + { + "epoch": 0.6783042394014963, + "grad_norm": 1.4756460944297158, + "learning_rate": 1.3568985176738885e-05, + "loss": 0.9366, + "step": 1190 + }, + { + "epoch": 0.6840042750267189, + "grad_norm": 1.5008253996689045, + "learning_rate": 1.3683010262257699e-05, + "loss": 0.9337, + "step": 1200 + }, + { + "epoch": 0.6897043106519416, + "grad_norm": 1.499817814739839, + "learning_rate": 1.3797035347776513e-05, + "loss": 0.921, + "step": 1210 + }, + { + "epoch": 0.6954043462771642, + "grad_norm": 1.5055646516816286, + "learning_rate": 1.3911060433295327e-05, + "loss": 0.9332, + "step": 1220 + }, + { + "epoch": 0.7011043819023869, + "grad_norm": 1.5564454315844756, + "learning_rate": 1.4025085518814141e-05, + "loss": 0.9303, + "step": 1230 + }, + { + "epoch": 0.7068044175276096, + "grad_norm": 1.6439995518569874, + "learning_rate": 1.4139110604332955e-05, + "loss": 0.93, + "step": 1240 + }, + { + "epoch": 0.7125044531528322, + "grad_norm": 1.6715644098081408, + "learning_rate": 1.4253135689851769e-05, + "loss": 0.9216, + "step": 1250 + }, + { + "epoch": 0.7182044887780549, + "grad_norm": 1.530693928930533, + "learning_rate": 1.4367160775370583e-05, + "loss": 0.9202, + "step": 1260 + }, + { + "epoch": 0.7239045244032776, + "grad_norm": 1.501811382005542, + "learning_rate": 1.4481185860889397e-05, + "loss": 0.9241, + "step": 1270 + }, + { + "epoch": 0.7296045600285002, + "grad_norm": 1.5857176435296712, + "learning_rate": 1.459521094640821e-05, + "loss": 0.9182, + "step": 1280 + }, + { + "epoch": 0.7353045956537229, + "grad_norm": 1.6016630245706045, + "learning_rate": 1.4709236031927025e-05, + "loss": 0.9315, + "step": 1290 + }, + { + "epoch": 0.7410046312789454, + "grad_norm": 1.6390065509439025, + "learning_rate": 1.482326111744584e-05, + "loss": 0.9112, + "step": 1300 + }, + { + "epoch": 0.7467046669041681, + "grad_norm": 1.6435383861134847, + "learning_rate": 1.4937286202964654e-05, + "loss": 0.9272, + "step": 1310 + }, + { + "epoch": 0.7524047025293908, + "grad_norm": 1.6250668417933374, + "learning_rate": 1.5051311288483468e-05, + "loss": 0.9297, + "step": 1320 + }, + { + "epoch": 0.7581047381546134, + "grad_norm": 1.5301736614268264, + "learning_rate": 1.5165336374002282e-05, + "loss": 0.9342, + "step": 1330 + }, + { + "epoch": 0.7638047737798361, + "grad_norm": 1.4990905871574132, + "learning_rate": 1.5279361459521094e-05, + "loss": 0.9112, + "step": 1340 + }, + { + "epoch": 0.7695048094050588, + "grad_norm": 1.5663816234369212, + "learning_rate": 1.539338654503991e-05, + "loss": 0.9117, + "step": 1350 + }, + { + "epoch": 0.7752048450302814, + "grad_norm": 1.4930234575277266, + "learning_rate": 1.5507411630558726e-05, + "loss": 0.9004, + "step": 1360 + }, + { + "epoch": 0.7809048806555041, + "grad_norm": 1.6617358858909224, + "learning_rate": 1.5621436716077538e-05, + "loss": 0.9117, + "step": 1370 + }, + { + "epoch": 0.7866049162807267, + "grad_norm": 1.6825594646269288, + "learning_rate": 1.5735461801596354e-05, + "loss": 0.9163, + "step": 1380 + }, + { + "epoch": 0.7923049519059494, + "grad_norm": 1.755907748115666, + "learning_rate": 1.5849486887115166e-05, + "loss": 0.9172, + "step": 1390 + }, + { + "epoch": 0.7980049875311721, + "grad_norm": 1.6075387122236047, + "learning_rate": 
1.596351197263398e-05, + "loss": 0.9072, + "step": 1400 + }, + { + "epoch": 0.8037050231563947, + "grad_norm": 1.6602708406746058, + "learning_rate": 1.6077537058152794e-05, + "loss": 0.9038, + "step": 1410 + }, + { + "epoch": 0.8094050587816174, + "grad_norm": 1.6473906939147263, + "learning_rate": 1.619156214367161e-05, + "loss": 0.9078, + "step": 1420 + }, + { + "epoch": 0.8151050944068401, + "grad_norm": 1.7005886333455729, + "learning_rate": 1.6305587229190422e-05, + "loss": 0.9092, + "step": 1430 + }, + { + "epoch": 0.8208051300320627, + "grad_norm": 1.721609161880879, + "learning_rate": 1.6419612314709237e-05, + "loss": 0.8968, + "step": 1440 + }, + { + "epoch": 0.8265051656572854, + "grad_norm": 1.6866164274833975, + "learning_rate": 1.653363740022805e-05, + "loss": 0.8897, + "step": 1450 + }, + { + "epoch": 0.832205201282508, + "grad_norm": 1.699305012779236, + "learning_rate": 1.6647662485746865e-05, + "loss": 0.9091, + "step": 1460 + }, + { + "epoch": 0.8379052369077307, + "grad_norm": 1.7192783800368083, + "learning_rate": 1.6761687571265678e-05, + "loss": 0.8915, + "step": 1470 + }, + { + "epoch": 0.8436052725329534, + "grad_norm": 1.7393841580348268, + "learning_rate": 1.6875712656784493e-05, + "loss": 0.9033, + "step": 1480 + }, + { + "epoch": 0.849305308158176, + "grad_norm": 1.6214835909104202, + "learning_rate": 1.6989737742303306e-05, + "loss": 0.9074, + "step": 1490 + }, + { + "epoch": 0.8550053437833987, + "grad_norm": 1.725366806565937, + "learning_rate": 1.710376282782212e-05, + "loss": 0.8935, + "step": 1500 + }, + { + "epoch": 0.8607053794086214, + "grad_norm": 1.7651184492996312, + "learning_rate": 1.7217787913340937e-05, + "loss": 0.886, + "step": 1510 + }, + { + "epoch": 0.8664054150338439, + "grad_norm": 1.6507462754164786, + "learning_rate": 1.7331812998859753e-05, + "loss": 0.89, + "step": 1520 + }, + { + "epoch": 0.8721054506590666, + "grad_norm": 1.638093001715848, + "learning_rate": 1.7445838084378565e-05, + "loss": 0.8916, + "step": 1530 + }, + { + "epoch": 0.8778054862842892, + "grad_norm": 1.8092851839912834, + "learning_rate": 1.755986316989738e-05, + "loss": 0.8859, + "step": 1540 + }, + { + "epoch": 0.8835055219095119, + "grad_norm": 1.723062489410676, + "learning_rate": 1.7673888255416193e-05, + "loss": 0.8919, + "step": 1550 + }, + { + "epoch": 0.8892055575347346, + "grad_norm": 1.6985104237734552, + "learning_rate": 1.778791334093501e-05, + "loss": 0.8913, + "step": 1560 + }, + { + "epoch": 0.8949055931599572, + "grad_norm": 1.7350889236903257, + "learning_rate": 1.790193842645382e-05, + "loss": 0.8915, + "step": 1570 + }, + { + "epoch": 0.9006056287851799, + "grad_norm": 1.7491394404658165, + "learning_rate": 1.8015963511972636e-05, + "loss": 0.8925, + "step": 1580 + }, + { + "epoch": 0.9063056644104026, + "grad_norm": 1.7500511217245198, + "learning_rate": 1.812998859749145e-05, + "loss": 0.8763, + "step": 1590 + }, + { + "epoch": 0.9120057000356252, + "grad_norm": 1.6928141820764897, + "learning_rate": 1.8244013683010264e-05, + "loss": 0.883, + "step": 1600 + }, + { + "epoch": 0.9177057356608479, + "grad_norm": 1.80488872658001, + "learning_rate": 1.835803876852908e-05, + "loss": 0.8849, + "step": 1610 + }, + { + "epoch": 0.9234057712860705, + "grad_norm": 1.787335771913563, + "learning_rate": 1.8472063854047892e-05, + "loss": 0.8887, + "step": 1620 + }, + { + "epoch": 0.9291058069112932, + "grad_norm": 1.7052851457365026, + "learning_rate": 1.8586088939566708e-05, + "loss": 0.8876, + "step": 1630 + }, + { + "epoch": 0.9348058425365159, + 
"grad_norm": 1.6517039110275196, + "learning_rate": 1.870011402508552e-05, + "loss": 0.8719, + "step": 1640 + }, + { + "epoch": 0.9405058781617385, + "grad_norm": 1.803379001547972, + "learning_rate": 1.8814139110604336e-05, + "loss": 0.8941, + "step": 1650 + }, + { + "epoch": 0.9462059137869612, + "grad_norm": 1.7412794922786006, + "learning_rate": 1.8928164196123148e-05, + "loss": 0.8849, + "step": 1660 + }, + { + "epoch": 0.9519059494121839, + "grad_norm": 1.675503434473841, + "learning_rate": 1.9042189281641964e-05, + "loss": 0.8878, + "step": 1670 + }, + { + "epoch": 0.9576059850374065, + "grad_norm": 1.8836350723671362, + "learning_rate": 1.9156214367160776e-05, + "loss": 0.8885, + "step": 1680 + }, + { + "epoch": 0.9633060206626292, + "grad_norm": 1.714840122525118, + "learning_rate": 1.927023945267959e-05, + "loss": 0.8804, + "step": 1690 + }, + { + "epoch": 0.9690060562878517, + "grad_norm": 1.693212071429786, + "learning_rate": 1.9384264538198404e-05, + "loss": 0.8664, + "step": 1700 + }, + { + "epoch": 0.9747060919130744, + "grad_norm": 1.784064006948934, + "learning_rate": 1.949828962371722e-05, + "loss": 0.887, + "step": 1710 + }, + { + "epoch": 0.9804061275382971, + "grad_norm": 1.7674046204897094, + "learning_rate": 1.9612314709236032e-05, + "loss": 0.8792, + "step": 1720 + }, + { + "epoch": 0.9861061631635197, + "grad_norm": 1.761211622720898, + "learning_rate": 1.9726339794754847e-05, + "loss": 0.8757, + "step": 1730 + }, + { + "epoch": 0.9918061987887424, + "grad_norm": 1.8153929367013604, + "learning_rate": 1.9840364880273663e-05, + "loss": 0.8753, + "step": 1740 + }, + { + "epoch": 0.9975062344139651, + "grad_norm": 1.7814599157686348, + "learning_rate": 1.9954389965792475e-05, + "loss": 0.8631, + "step": 1750 + }, + { + "epoch": 1.0032062700391877, + "grad_norm": 1.8020950772155282, + "learning_rate": 1.999999287101006e-05, + "loss": 0.8597, + "step": 1760 + }, + { + "epoch": 1.0089063056644103, + "grad_norm": 1.8834044260626686, + "learning_rate": 1.9999949304997227e-05, + "loss": 0.8749, + "step": 1770 + }, + { + "epoch": 1.0146063412896331, + "grad_norm": 1.8317172645022874, + "learning_rate": 1.9999866133693866e-05, + "loss": 0.8701, + "step": 1780 + }, + { + "epoch": 1.0203063769148557, + "grad_norm": 1.8838111981312964, + "learning_rate": 1.999974335742938e-05, + "loss": 0.8618, + "step": 1790 + }, + { + "epoch": 1.0260064125400783, + "grad_norm": 1.757836158840337, + "learning_rate": 1.999958097669003e-05, + "loss": 0.8654, + "step": 1800 + }, + { + "epoch": 1.031706448165301, + "grad_norm": 1.6747076009804915, + "learning_rate": 1.9999378992118937e-05, + "loss": 0.8559, + "step": 1810 + }, + { + "epoch": 1.0374064837905237, + "grad_norm": 1.763665724140925, + "learning_rate": 1.9999137404516062e-05, + "loss": 0.8496, + "step": 1820 + }, + { + "epoch": 1.0431065194157463, + "grad_norm": 1.7083504545677566, + "learning_rate": 1.999885621483823e-05, + "loss": 0.85, + "step": 1830 + }, + { + "epoch": 1.048806555040969, + "grad_norm": 1.7603057444620083, + "learning_rate": 1.9998535424199112e-05, + "loss": 0.8579, + "step": 1840 + }, + { + "epoch": 1.0545065906661917, + "grad_norm": 2.013331632937321, + "learning_rate": 1.9998175033869205e-05, + "loss": 0.8644, + "step": 1850 + }, + { + "epoch": 1.0602066262914143, + "grad_norm": 1.737011009746836, + "learning_rate": 1.999777504527586e-05, + "loss": 0.8597, + "step": 1860 + }, + { + "epoch": 1.065906661916637, + "grad_norm": 1.8813873322026473, + "learning_rate": 1.9997335460003246e-05, + "loss": 0.8589, + "step": 
1870 + }, + { + "epoch": 1.0716066975418597, + "grad_norm": 1.8349067798622685, + "learning_rate": 1.9996856279792368e-05, + "loss": 0.8526, + "step": 1880 + }, + { + "epoch": 1.0773067331670823, + "grad_norm": 1.714703400856248, + "learning_rate": 1.999633750654104e-05, + "loss": 0.8605, + "step": 1890 + }, + { + "epoch": 1.083006768792305, + "grad_norm": 1.853883163691029, + "learning_rate": 1.999577914230388e-05, + "loss": 0.8582, + "step": 1900 + }, + { + "epoch": 1.0887068044175277, + "grad_norm": 1.8709920052287101, + "learning_rate": 1.9995181189292334e-05, + "loss": 0.8624, + "step": 1910 + }, + { + "epoch": 1.0944068400427502, + "grad_norm": 1.8883523943990927, + "learning_rate": 1.999454364987461e-05, + "loss": 0.8518, + "step": 1920 + }, + { + "epoch": 1.1001068756679728, + "grad_norm": 1.9780681144848358, + "learning_rate": 1.9993866526575723e-05, + "loss": 0.8561, + "step": 1930 + }, + { + "epoch": 1.1058069112931956, + "grad_norm": 1.7852231544209909, + "learning_rate": 1.999314982207745e-05, + "loss": 0.8627, + "step": 1940 + }, + { + "epoch": 1.1115069469184182, + "grad_norm": 1.7550418237943235, + "learning_rate": 1.9992393539218334e-05, + "loss": 0.8468, + "step": 1950 + }, + { + "epoch": 1.1172069825436408, + "grad_norm": 1.914656706563883, + "learning_rate": 1.999159768099367e-05, + "loss": 0.8612, + "step": 1960 + }, + { + "epoch": 1.1229070181688636, + "grad_norm": 2.030038254737503, + "learning_rate": 1.9990762250555495e-05, + "loss": 0.8515, + "step": 1970 + }, + { + "epoch": 1.1286070537940862, + "grad_norm": 1.8141661775314604, + "learning_rate": 1.9989887251212575e-05, + "loss": 0.8403, + "step": 1980 + }, + { + "epoch": 1.1343070894193088, + "grad_norm": 1.7992375971001897, + "learning_rate": 1.9988972686430382e-05, + "loss": 0.8434, + "step": 1990 + }, + { + "epoch": 1.1400071250445316, + "grad_norm": 1.8154747951291372, + "learning_rate": 1.9988018559831093e-05, + "loss": 0.8573, + "step": 2000 + }, + { + "epoch": 1.1457071606697542, + "grad_norm": 1.8823541957530339, + "learning_rate": 1.998702487519358e-05, + "loss": 0.8446, + "step": 2010 + }, + { + "epoch": 1.1514071962949768, + "grad_norm": 1.892962359232292, + "learning_rate": 1.998599163645338e-05, + "loss": 0.8471, + "step": 2020 + }, + { + "epoch": 1.1571072319201996, + "grad_norm": 1.8796275830005886, + "learning_rate": 1.9984918847702684e-05, + "loss": 0.8475, + "step": 2030 + }, + { + "epoch": 1.1628072675454222, + "grad_norm": 2.1504877623718777, + "learning_rate": 1.9983806513190323e-05, + "loss": 0.8463, + "step": 2040 + }, + { + "epoch": 1.1685073031706448, + "grad_norm": 2.1256410895372375, + "learning_rate": 1.998265463732175e-05, + "loss": 0.8406, + "step": 2050 + }, + { + "epoch": 1.1742073387958674, + "grad_norm": 1.9618985154355109, + "learning_rate": 1.9981463224659034e-05, + "loss": 0.8486, + "step": 2060 + }, + { + "epoch": 1.1799073744210902, + "grad_norm": 2.070139508836993, + "learning_rate": 1.9980232279920814e-05, + "loss": 0.836, + "step": 2070 + }, + { + "epoch": 1.1856074100463128, + "grad_norm": 2.0226054921350083, + "learning_rate": 1.9978961807982312e-05, + "loss": 0.8432, + "step": 2080 + }, + { + "epoch": 1.1913074456715353, + "grad_norm": 1.9367240553204879, + "learning_rate": 1.9977651813875293e-05, + "loss": 0.8327, + "step": 2090 + }, + { + "epoch": 1.1970074812967582, + "grad_norm": 1.9072166052236994, + "learning_rate": 1.997630230278806e-05, + "loss": 0.8362, + "step": 2100 + }, + { + "epoch": 1.2027075169219807, + "grad_norm": 1.9432404950125435, + "learning_rate": 
1.997491328006541e-05, + "loss": 0.8441, + "step": 2110 + }, + { + "epoch": 1.2084075525472033, + "grad_norm": 1.911231463693163, + "learning_rate": 1.9973484751208636e-05, + "loss": 0.8383, + "step": 2120 + }, + { + "epoch": 1.2141075881724261, + "grad_norm": 1.9483520712665299, + "learning_rate": 1.99720167218755e-05, + "loss": 0.8507, + "step": 2130 + }, + { + "epoch": 1.2198076237976487, + "grad_norm": 1.8921621676164715, + "learning_rate": 1.9970509197880204e-05, + "loss": 0.8356, + "step": 2140 + }, + { + "epoch": 1.2255076594228713, + "grad_norm": 1.9747176106356008, + "learning_rate": 1.9968962185193367e-05, + "loss": 0.8411, + "step": 2150 + }, + { + "epoch": 1.2312076950480941, + "grad_norm": 1.9777158344485668, + "learning_rate": 1.9967375689942013e-05, + "loss": 0.8319, + "step": 2160 + }, + { + "epoch": 1.2369077306733167, + "grad_norm": 1.9527547946588864, + "learning_rate": 1.9965749718409532e-05, + "loss": 0.8488, + "step": 2170 + }, + { + "epoch": 1.2426077662985393, + "grad_norm": 1.9037749552318648, + "learning_rate": 1.9964084277035668e-05, + "loss": 0.8452, + "step": 2180 + }, + { + "epoch": 1.2483078019237621, + "grad_norm": 1.8459581980245552, + "learning_rate": 1.996237937241648e-05, + "loss": 0.8414, + "step": 2190 + }, + { + "epoch": 1.2540078375489847, + "grad_norm": 2.0833571443694288, + "learning_rate": 1.9960635011304325e-05, + "loss": 0.8434, + "step": 2200 + }, + { + "epoch": 1.2597078731742073, + "grad_norm": 1.86911029908039, + "learning_rate": 1.9958851200607833e-05, + "loss": 0.8395, + "step": 2210 + }, + { + "epoch": 1.26540790879943, + "grad_norm": 1.9980243217779092, + "learning_rate": 1.9957027947391873e-05, + "loss": 0.8477, + "step": 2220 + }, + { + "epoch": 1.2711079444246527, + "grad_norm": 1.9828685448765224, + "learning_rate": 1.9955165258877534e-05, + "loss": 0.8354, + "step": 2230 + }, + { + "epoch": 1.2768079800498753, + "grad_norm": 1.9814305922927593, + "learning_rate": 1.9953263142442078e-05, + "loss": 0.8356, + "step": 2240 + }, + { + "epoch": 1.282508015675098, + "grad_norm": 2.094648535162255, + "learning_rate": 1.9951321605618932e-05, + "loss": 0.8259, + "step": 2250 + }, + { + "epoch": 1.2882080513003207, + "grad_norm": 1.9713099315168099, + "learning_rate": 1.9949340656097652e-05, + "loss": 0.8307, + "step": 2260 + }, + { + "epoch": 1.2939080869255433, + "grad_norm": 1.9388006652656193, + "learning_rate": 1.9947320301723882e-05, + "loss": 0.8431, + "step": 2270 + }, + { + "epoch": 1.299608122550766, + "grad_norm": 2.0437788441257663, + "learning_rate": 1.9945260550499337e-05, + "loss": 0.839, + "step": 2280 + }, + { + "epoch": 1.3053081581759887, + "grad_norm": 1.9200807256749128, + "learning_rate": 1.9943161410581765e-05, + "loss": 0.8401, + "step": 2290 + }, + { + "epoch": 1.3110081938012113, + "grad_norm": 1.8932159307338257, + "learning_rate": 1.994102289028491e-05, + "loss": 0.8297, + "step": 2300 + }, + { + "epoch": 1.3167082294264338, + "grad_norm": 1.8604003595627263, + "learning_rate": 1.993884499807848e-05, + "loss": 0.8322, + "step": 2310 + }, + { + "epoch": 1.3224082650516567, + "grad_norm": 2.067435850526306, + "learning_rate": 1.9936627742588136e-05, + "loss": 0.8331, + "step": 2320 + }, + { + "epoch": 1.3281083006768792, + "grad_norm": 1.9736131852274952, + "learning_rate": 1.9934371132595426e-05, + "loss": 0.8253, + "step": 2330 + }, + { + "epoch": 1.3338083363021018, + "grad_norm": 1.973431146884113, + "learning_rate": 1.9932075177037757e-05, + "loss": 0.8252, + "step": 2340 + }, + { + "epoch": 1.3395083719273244, + 
"grad_norm": 1.987364467843, + "learning_rate": 1.9929739885008375e-05, + "loss": 0.8218, + "step": 2350 + }, + { + "epoch": 1.3452084075525472, + "grad_norm": 2.0389811693345234, + "learning_rate": 1.9927365265756326e-05, + "loss": 0.834, + "step": 2360 + }, + { + "epoch": 1.3509084431777698, + "grad_norm": 1.913868930297226, + "learning_rate": 1.9924951328686398e-05, + "loss": 0.8324, + "step": 2370 + }, + { + "epoch": 1.3566084788029924, + "grad_norm": 1.8211826510498335, + "learning_rate": 1.9922498083359113e-05, + "loss": 0.8257, + "step": 2380 + }, + { + "epoch": 1.3623085144282152, + "grad_norm": 2.0385423896344537, + "learning_rate": 1.9920005539490666e-05, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 1.3680085500534378, + "grad_norm": 1.9828362611522425, + "learning_rate": 1.9917473706952905e-05, + "loss": 0.8349, + "step": 2400 + }, + { + "epoch": 1.3737085856786604, + "grad_norm": 2.016395513885357, + "learning_rate": 1.9914902595773268e-05, + "loss": 0.8306, + "step": 2410 + }, + { + "epoch": 1.3794086213038832, + "grad_norm": 1.9533372967663756, + "learning_rate": 1.9912292216134775e-05, + "loss": 0.8298, + "step": 2420 + }, + { + "epoch": 1.3851086569291058, + "grad_norm": 1.9530067606486703, + "learning_rate": 1.990964257837596e-05, + "loss": 0.8148, + "step": 2430 + }, + { + "epoch": 1.3908086925543284, + "grad_norm": 2.0379623003529295, + "learning_rate": 1.9906953692990843e-05, + "loss": 0.8277, + "step": 2440 + }, + { + "epoch": 1.3965087281795512, + "grad_norm": 1.996917345031334, + "learning_rate": 1.990422557062889e-05, + "loss": 0.8248, + "step": 2450 + }, + { + "epoch": 1.4022087638047738, + "grad_norm": 2.1834776606325508, + "learning_rate": 1.9901458222094964e-05, + "loss": 0.8291, + "step": 2460 + }, + { + "epoch": 1.4079087994299964, + "grad_norm": 1.8262603600285912, + "learning_rate": 1.9898651658349276e-05, + "loss": 0.8294, + "step": 2470 + }, + { + "epoch": 1.4136088350552192, + "grad_norm": 1.9148153734335014, + "learning_rate": 1.9895805890507368e-05, + "loss": 0.827, + "step": 2480 + }, + { + "epoch": 1.4193088706804418, + "grad_norm": 1.8109824730555837, + "learning_rate": 1.9892920929840042e-05, + "loss": 0.8256, + "step": 2490 + }, + { + "epoch": 1.4250089063056643, + "grad_norm": 1.9038427847599968, + "learning_rate": 1.988999678777332e-05, + "loss": 0.8149, + "step": 2500 + }, + { + "epoch": 1.4307089419308872, + "grad_norm": 1.9183688798514287, + "learning_rate": 1.988703347588842e-05, + "loss": 0.8219, + "step": 2510 + }, + { + "epoch": 1.4364089775561097, + "grad_norm": 2.0062935185811988, + "learning_rate": 1.988403100592168e-05, + "loss": 0.8272, + "step": 2520 + }, + { + "epoch": 1.4421090131813323, + "grad_norm": 1.916360321048196, + "learning_rate": 1.988098938976453e-05, + "loss": 0.8185, + "step": 2530 + }, + { + "epoch": 1.4478090488065551, + "grad_norm": 1.9474228376010432, + "learning_rate": 1.9877908639463438e-05, + "loss": 0.8224, + "step": 2540 + }, + { + "epoch": 1.4535090844317777, + "grad_norm": 2.030260343191728, + "learning_rate": 1.987478876721987e-05, + "loss": 0.829, + "step": 2550 + }, + { + "epoch": 1.4592091200570003, + "grad_norm": 2.0669389591243332, + "learning_rate": 1.9871629785390234e-05, + "loss": 0.823, + "step": 2560 + }, + { + "epoch": 1.4649091556822231, + "grad_norm": 1.939164521109589, + "learning_rate": 1.986843170648583e-05, + "loss": 0.8192, + "step": 2570 + }, + { + "epoch": 1.4706091913074457, + "grad_norm": 1.9340560162810296, + "learning_rate": 1.9865194543172808e-05, + "loss": 0.813, + "step": 
2580 + }, + { + "epoch": 1.4763092269326683, + "grad_norm": 1.9826253649714038, + "learning_rate": 1.986191830827211e-05, + "loss": 0.8206, + "step": 2590 + }, + { + "epoch": 1.4820092625578911, + "grad_norm": 2.086689008755194, + "learning_rate": 1.985860301475943e-05, + "loss": 0.8288, + "step": 2600 + }, + { + "epoch": 1.4877092981831137, + "grad_norm": 1.9286741203215845, + "learning_rate": 1.9855248675765146e-05, + "loss": 0.8212, + "step": 2610 + }, + { + "epoch": 1.4934093338083363, + "grad_norm": 1.9752592212958118, + "learning_rate": 1.9851855304574287e-05, + "loss": 0.8271, + "step": 2620 + }, + { + "epoch": 1.4991093694335589, + "grad_norm": 1.9086583701062574, + "learning_rate": 1.9848422914626462e-05, + "loss": 0.8287, + "step": 2630 + }, + { + "epoch": 1.5048094050587815, + "grad_norm": 1.8077255473050955, + "learning_rate": 1.984495151951582e-05, + "loss": 0.8171, + "step": 2640 + }, + { + "epoch": 1.5105094406840043, + "grad_norm": 1.9632613712236342, + "learning_rate": 1.9841441132990998e-05, + "loss": 0.8253, + "step": 2650 + }, + { + "epoch": 1.516209476309227, + "grad_norm": 1.9602391094516611, + "learning_rate": 1.983789176895505e-05, + "loss": 0.809, + "step": 2660 + }, + { + "epoch": 1.5219095119344495, + "grad_norm": 1.9977668186898982, + "learning_rate": 1.9834303441465402e-05, + "loss": 0.8264, + "step": 2670 + }, + { + "epoch": 1.5276095475596723, + "grad_norm": 2.004699978471638, + "learning_rate": 1.9830676164733808e-05, + "loss": 0.8128, + "step": 2680 + }, + { + "epoch": 1.5333095831848949, + "grad_norm": 2.0025333597676043, + "learning_rate": 1.9827009953126277e-05, + "loss": 0.8049, + "step": 2690 + }, + { + "epoch": 1.5390096188101174, + "grad_norm": 2.006348925420673, + "learning_rate": 1.982330482116301e-05, + "loss": 0.8144, + "step": 2700 + }, + { + "epoch": 1.5447096544353403, + "grad_norm": 1.8911374175722429, + "learning_rate": 1.9819560783518378e-05, + "loss": 0.8044, + "step": 2710 + }, + { + "epoch": 1.5504096900605628, + "grad_norm": 2.027861640952926, + "learning_rate": 1.9815777855020818e-05, + "loss": 0.8171, + "step": 2720 + }, + { + "epoch": 1.5561097256857854, + "grad_norm": 1.972962804971287, + "learning_rate": 1.9811956050652803e-05, + "loss": 0.8145, + "step": 2730 + }, + { + "epoch": 1.5618097613110082, + "grad_norm": 2.126768344817225, + "learning_rate": 1.9808095385550777e-05, + "loss": 0.8229, + "step": 2740 + }, + { + "epoch": 1.5675097969362308, + "grad_norm": 1.9270909648882355, + "learning_rate": 1.98041958750051e-05, + "loss": 0.8166, + "step": 2750 + }, + { + "epoch": 1.5732098325614534, + "grad_norm": 2.0062992830698794, + "learning_rate": 1.980025753445997e-05, + "loss": 0.8113, + "step": 2760 + }, + { + "epoch": 1.5789098681866762, + "grad_norm": 1.8604867203192628, + "learning_rate": 1.979628037951338e-05, + "loss": 0.818, + "step": 2770 + }, + { + "epoch": 1.5846099038118988, + "grad_norm": 2.0028993599947658, + "learning_rate": 1.9792264425917048e-05, + "loss": 0.8144, + "step": 2780 + }, + { + "epoch": 1.5903099394371214, + "grad_norm": 1.9586165011975456, + "learning_rate": 1.9788209689576356e-05, + "loss": 0.8135, + "step": 2790 + }, + { + "epoch": 1.5960099750623442, + "grad_norm": 2.0709951025768967, + "learning_rate": 1.9784116186550282e-05, + "loss": 0.8125, + "step": 2800 + }, + { + "epoch": 1.6017100106875668, + "grad_norm": 1.9884880289538327, + "learning_rate": 1.977998393305135e-05, + "loss": 0.8142, + "step": 2810 + }, + { + "epoch": 1.6074100463127894, + "grad_norm": 2.1537453722600888, + "learning_rate": 
1.977581294544555e-05, + "loss": 0.8226, + "step": 2820 + }, + { + "epoch": 1.6131100819380122, + "grad_norm": 1.9578569344740548, + "learning_rate": 1.9771603240252287e-05, + "loss": 0.8222, + "step": 2830 + }, + { + "epoch": 1.6188101175632348, + "grad_norm": 1.9762705157898854, + "learning_rate": 1.97673548341443e-05, + "loss": 0.7992, + "step": 2840 + }, + { + "epoch": 1.6245101531884574, + "grad_norm": 1.9396847013255831, + "learning_rate": 1.9763067743947618e-05, + "loss": 0.8145, + "step": 2850 + }, + { + "epoch": 1.6302101888136802, + "grad_norm": 2.049404293109037, + "learning_rate": 1.9758741986641466e-05, + "loss": 0.8206, + "step": 2860 + }, + { + "epoch": 1.6359102244389028, + "grad_norm": 1.971677311667184, + "learning_rate": 1.9754377579358222e-05, + "loss": 0.8108, + "step": 2870 + }, + { + "epoch": 1.6416102600641254, + "grad_norm": 2.053189987118361, + "learning_rate": 1.974997453938333e-05, + "loss": 0.8131, + "step": 2880 + }, + { + "epoch": 1.6473102956893482, + "grad_norm": 1.9151820373034578, + "learning_rate": 1.974553288415525e-05, + "loss": 0.8231, + "step": 2890 + }, + { + "epoch": 1.6530103313145708, + "grad_norm": 2.11765172957531, + "learning_rate": 1.974105263126538e-05, + "loss": 0.8266, + "step": 2900 + }, + { + "epoch": 1.6587103669397933, + "grad_norm": 1.9800541115222154, + "learning_rate": 1.9736533798457976e-05, + "loss": 0.8157, + "step": 2910 + }, + { + "epoch": 1.6644104025650162, + "grad_norm": 2.064806833520072, + "learning_rate": 1.9731976403630096e-05, + "loss": 0.813, + "step": 2920 + }, + { + "epoch": 1.6701104381902387, + "grad_norm": 2.1962096297446214, + "learning_rate": 1.972738046483153e-05, + "loss": 0.8019, + "step": 2930 + }, + { + "epoch": 1.6758104738154613, + "grad_norm": 1.8787057397799634, + "learning_rate": 1.972274600026472e-05, + "loss": 0.816, + "step": 2940 + }, + { + "epoch": 1.6815105094406841, + "grad_norm": 2.1077697612705575, + "learning_rate": 1.9718073028284686e-05, + "loss": 0.8182, + "step": 2950 + }, + { + "epoch": 1.6872105450659065, + "grad_norm": 1.8352913460041305, + "learning_rate": 1.971336156739897e-05, + "loss": 0.8171, + "step": 2960 + }, + { + "epoch": 1.6929105806911293, + "grad_norm": 1.9189762589507913, + "learning_rate": 1.9708611636267538e-05, + "loss": 0.8136, + "step": 2970 + }, + { + "epoch": 1.6986106163163521, + "grad_norm": 1.7917414242527039, + "learning_rate": 1.9703823253702728e-05, + "loss": 0.8137, + "step": 2980 + }, + { + "epoch": 1.7043106519415745, + "grad_norm": 1.9742494605498844, + "learning_rate": 1.9698996438669163e-05, + "loss": 0.8145, + "step": 2990 + }, + { + "epoch": 1.7100106875667973, + "grad_norm": 2.1685413353256564, + "learning_rate": 1.969413121028368e-05, + "loss": 0.8183, + "step": 3000 + }, + { + "epoch": 1.7157107231920201, + "grad_norm": 1.9866035446983392, + "learning_rate": 1.9689227587815263e-05, + "loss": 0.8097, + "step": 3010 + }, + { + "epoch": 1.7214107588172425, + "grad_norm": 2.1414779890164404, + "learning_rate": 1.968428559068494e-05, + "loss": 0.8078, + "step": 3020 + }, + { + "epoch": 1.7271107944424653, + "grad_norm": 1.9774269101559483, + "learning_rate": 1.967930523846574e-05, + "loss": 0.8117, + "step": 3030 + }, + { + "epoch": 1.7328108300676879, + "grad_norm": 2.1161328813493263, + "learning_rate": 1.9674286550882593e-05, + "loss": 0.8007, + "step": 3040 + }, + { + "epoch": 1.7385108656929105, + "grad_norm": 2.043347202651497, + "learning_rate": 1.966922954781225e-05, + "loss": 0.8103, + "step": 3050 + }, + { + "epoch": 1.7442109013181333, + 
"grad_norm": 1.9187301402180963, + "learning_rate": 1.9664134249283226e-05, + "loss": 0.8119, + "step": 3060 + }, + { + "epoch": 1.7499109369433559, + "grad_norm": 1.9921052617040738, + "learning_rate": 1.96590006754757e-05, + "loss": 0.8055, + "step": 3070 + }, + { + "epoch": 1.7556109725685785, + "grad_norm": 1.9688286652209337, + "learning_rate": 1.9653828846721447e-05, + "loss": 0.8093, + "step": 3080 + }, + { + "epoch": 1.7613110081938013, + "grad_norm": 1.979593441899211, + "learning_rate": 1.964861878350374e-05, + "loss": 0.805, + "step": 3090 + }, + { + "epoch": 1.7670110438190239, + "grad_norm": 2.089479723603799, + "learning_rate": 1.96433705064573e-05, + "loss": 0.8012, + "step": 3100 + }, + { + "epoch": 1.7727110794442464, + "grad_norm": 2.0650887106817355, + "learning_rate": 1.963808403636818e-05, + "loss": 0.8117, + "step": 3110 + }, + { + "epoch": 1.7784111150694693, + "grad_norm": 2.036314415447528, + "learning_rate": 1.9632759394173705e-05, + "loss": 0.8053, + "step": 3120 + }, + { + "epoch": 1.7841111506946918, + "grad_norm": 2.0270600386737327, + "learning_rate": 1.962739660096239e-05, + "loss": 0.8116, + "step": 3130 + }, + { + "epoch": 1.7898111863199144, + "grad_norm": 2.019257712794015, + "learning_rate": 1.9621995677973827e-05, + "loss": 0.8076, + "step": 3140 + }, + { + "epoch": 1.7955112219451372, + "grad_norm": 2.00454036884464, + "learning_rate": 1.9616556646598647e-05, + "loss": 0.8129, + "step": 3150 + }, + { + "epoch": 1.8012112575703598, + "grad_norm": 2.039343370559229, + "learning_rate": 1.9611079528378395e-05, + "loss": 0.7991, + "step": 3160 + }, + { + "epoch": 1.8069112931955824, + "grad_norm": 2.1059222924259346, + "learning_rate": 1.9605564345005473e-05, + "loss": 0.7973, + "step": 3170 + }, + { + "epoch": 1.8126113288208052, + "grad_norm": 2.053612989554001, + "learning_rate": 1.9600011118323034e-05, + "loss": 0.7968, + "step": 3180 + }, + { + "epoch": 1.8183113644460278, + "grad_norm": 2.097986114345962, + "learning_rate": 1.9594419870324902e-05, + "loss": 0.7988, + "step": 3190 + }, + { + "epoch": 1.8240114000712504, + "grad_norm": 2.02925434718376, + "learning_rate": 1.958879062315549e-05, + "loss": 0.8106, + "step": 3200 + }, + { + "epoch": 1.8297114356964732, + "grad_norm": 2.01590406369675, + "learning_rate": 1.958312339910971e-05, + "loss": 0.806, + "step": 3210 + }, + { + "epoch": 1.8354114713216958, + "grad_norm": 1.9619147724985244, + "learning_rate": 1.957741822063288e-05, + "loss": 0.7976, + "step": 3220 + }, + { + "epoch": 1.8411115069469184, + "grad_norm": 2.110272560272491, + "learning_rate": 1.9571675110320643e-05, + "loss": 0.7943, + "step": 3230 + }, + { + "epoch": 1.8468115425721412, + "grad_norm": 2.0255760286052427, + "learning_rate": 1.9565894090918865e-05, + "loss": 0.8021, + "step": 3240 + }, + { + "epoch": 1.8525115781973638, + "grad_norm": 1.99496778091942, + "learning_rate": 1.956007518532356e-05, + "loss": 0.802, + "step": 3250 + }, + { + "epoch": 1.8582116138225864, + "grad_norm": 1.994786516261079, + "learning_rate": 1.9554218416580787e-05, + "loss": 0.8038, + "step": 3260 + }, + { + "epoch": 1.8639116494478092, + "grad_norm": 2.205732684895112, + "learning_rate": 1.9548323807886568e-05, + "loss": 0.8009, + "step": 3270 + }, + { + "epoch": 1.8696116850730315, + "grad_norm": 2.084099308100839, + "learning_rate": 1.954239138258679e-05, + "loss": 0.7997, + "step": 3280 + }, + { + "epoch": 1.8753117206982544, + "grad_norm": 2.061563413281635, + "learning_rate": 1.9536421164177115e-05, + "loss": 0.8007, + "step": 3290 + }, + 
{ + "epoch": 1.8810117563234772, + "grad_norm": 2.029993447580369, + "learning_rate": 1.953041317630289e-05, + "loss": 0.8025, + "step": 3300 + }, + { + "epoch": 1.8867117919486995, + "grad_norm": 2.0214089034575022, + "learning_rate": 1.9524367442759038e-05, + "loss": 0.7985, + "step": 3310 + }, + { + "epoch": 1.8924118275739223, + "grad_norm": 1.935706084813715, + "learning_rate": 1.951828398748999e-05, + "loss": 0.7947, + "step": 3320 + }, + { + "epoch": 1.8981118631991452, + "grad_norm": 2.0906036255368834, + "learning_rate": 1.951216283458957e-05, + "loss": 0.7969, + "step": 3330 + }, + { + "epoch": 1.9038118988243675, + "grad_norm": 1.9338535149871947, + "learning_rate": 1.95060040083009e-05, + "loss": 0.7985, + "step": 3340 + }, + { + "epoch": 1.9095119344495903, + "grad_norm": 2.0268254380290665, + "learning_rate": 1.9499807533016314e-05, + "loss": 0.795, + "step": 3350 + }, + { + "epoch": 1.915211970074813, + "grad_norm": 1.9268507753120423, + "learning_rate": 1.9493573433277263e-05, + "loss": 0.7978, + "step": 3360 + }, + { + "epoch": 1.9209120057000355, + "grad_norm": 1.9415598197390873, + "learning_rate": 1.9487301733774205e-05, + "loss": 0.8048, + "step": 3370 + }, + { + "epoch": 1.9266120413252583, + "grad_norm": 2.15628713825871, + "learning_rate": 1.9480992459346506e-05, + "loss": 0.8002, + "step": 3380 + }, + { + "epoch": 1.932312076950481, + "grad_norm": 1.9609970358293323, + "learning_rate": 1.9474645634982363e-05, + "loss": 0.8063, + "step": 3390 + }, + { + "epoch": 1.9380121125757035, + "grad_norm": 1.926711494554939, + "learning_rate": 1.9468261285818686e-05, + "loss": 0.8002, + "step": 3400 + }, + { + "epoch": 1.9437121482009263, + "grad_norm": 2.0382657822085504, + "learning_rate": 1.9461839437141003e-05, + "loss": 0.7874, + "step": 3410 + }, + { + "epoch": 1.949412183826149, + "grad_norm": 2.025504138910519, + "learning_rate": 1.945538011438336e-05, + "loss": 0.7924, + "step": 3420 + }, + { + "epoch": 1.9551122194513715, + "grad_norm": 1.9730129265429237, + "learning_rate": 1.9448883343128222e-05, + "loss": 0.8058, + "step": 3430 + }, + { + "epoch": 1.9608122550765943, + "grad_norm": 2.053130371157395, + "learning_rate": 1.944234914910637e-05, + "loss": 0.7957, + "step": 3440 + }, + { + "epoch": 1.9665122907018169, + "grad_norm": 1.9496554034048525, + "learning_rate": 1.9435777558196804e-05, + "loss": 0.7956, + "step": 3450 + }, + { + "epoch": 1.9722123263270395, + "grad_norm": 2.017642077204114, + "learning_rate": 1.9429168596426635e-05, + "loss": 0.8089, + "step": 3460 + }, + { + "epoch": 1.9779123619522623, + "grad_norm": 2.0984701006035986, + "learning_rate": 1.9422522289970968e-05, + "loss": 0.7965, + "step": 3470 + }, + { + "epoch": 1.9836123975774849, + "grad_norm": 1.8720820855279117, + "learning_rate": 1.9415838665152837e-05, + "loss": 0.793, + "step": 3480 + }, + { + "epoch": 1.9893124332027075, + "grad_norm": 2.0863714679102254, + "learning_rate": 1.940911774844307e-05, + "loss": 0.806, + "step": 3490 + }, + { + "epoch": 1.9950124688279303, + "grad_norm": 2.004116422399095, + "learning_rate": 1.9402359566460175e-05, + "loss": 0.803, + "step": 3500 + }, + { + "epoch": 2.0007125044531526, + "grad_norm": 2.1353029003603106, + "learning_rate": 1.9395564145970275e-05, + "loss": 0.7912, + "step": 3510 + }, + { + "epoch": 2.0064125400783754, + "grad_norm": 1.9160373192820845, + "learning_rate": 1.9388731513886962e-05, + "loss": 0.772, + "step": 3520 + }, + { + "epoch": 2.0121125757035982, + "grad_norm": 2.059335935801628, + "learning_rate": 
1.9381861697271208e-05, + "loss": 0.7845, + "step": 3530 + }, + { + "epoch": 2.0178126113288206, + "grad_norm": 1.9026716738700493, + "learning_rate": 1.9374954723331267e-05, + "loss": 0.7835, + "step": 3540 + }, + { + "epoch": 2.0235126469540434, + "grad_norm": 1.943020845917838, + "learning_rate": 1.9368010619422542e-05, + "loss": 0.7776, + "step": 3550 + }, + { + "epoch": 2.0292126825792662, + "grad_norm": 2.067353687772421, + "learning_rate": 1.93610294130475e-05, + "loss": 0.7837, + "step": 3560 + }, + { + "epoch": 2.0349127182044886, + "grad_norm": 2.118868836061161, + "learning_rate": 1.9354011131855554e-05, + "loss": 0.7818, + "step": 3570 + }, + { + "epoch": 2.0406127538297114, + "grad_norm": 2.1143965982692343, + "learning_rate": 1.934695580364295e-05, + "loss": 0.779, + "step": 3580 + }, + { + "epoch": 2.0463127894549342, + "grad_norm": 2.040166299899192, + "learning_rate": 1.9339863456352658e-05, + "loss": 0.7794, + "step": 3590 + }, + { + "epoch": 2.0520128250801566, + "grad_norm": 2.070863013108917, + "learning_rate": 1.9332734118074274e-05, + "loss": 0.7741, + "step": 3600 + }, + { + "epoch": 2.0577128607053794, + "grad_norm": 2.165533789996265, + "learning_rate": 1.9325567817043888e-05, + "loss": 0.7891, + "step": 3610 + }, + { + "epoch": 2.063412896330602, + "grad_norm": 2.0639753331696435, + "learning_rate": 1.931836458164399e-05, + "loss": 0.7745, + "step": 3620 + }, + { + "epoch": 2.0691129319558246, + "grad_norm": 2.2393200256888615, + "learning_rate": 1.9311124440403347e-05, + "loss": 0.7744, + "step": 3630 + }, + { + "epoch": 2.0748129675810474, + "grad_norm": 2.245493711349147, + "learning_rate": 1.9303847421996895e-05, + "loss": 0.7846, + "step": 3640 + }, + { + "epoch": 2.08051300320627, + "grad_norm": 2.12712542521556, + "learning_rate": 1.929653355524562e-05, + "loss": 0.7767, + "step": 3650 + }, + { + "epoch": 2.0862130388314926, + "grad_norm": 2.2156281019048354, + "learning_rate": 1.928918286911645e-05, + "loss": 0.7786, + "step": 3660 + }, + { + "epoch": 2.0919130744567154, + "grad_norm": 1.991829895411575, + "learning_rate": 1.9281795392722146e-05, + "loss": 0.788, + "step": 3670 + }, + { + "epoch": 2.097613110081938, + "grad_norm": 2.017400209394898, + "learning_rate": 1.9274371155321167e-05, + "loss": 0.7828, + "step": 3680 + }, + { + "epoch": 2.1033131457071605, + "grad_norm": 2.050803561945968, + "learning_rate": 1.9266910186317566e-05, + "loss": 0.784, + "step": 3690 + }, + { + "epoch": 2.1090131813323834, + "grad_norm": 1.9316917498870407, + "learning_rate": 1.925941251526088e-05, + "loss": 0.7681, + "step": 3700 + }, + { + "epoch": 2.114713216957606, + "grad_norm": 1.9848496136357934, + "learning_rate": 1.9251878171846008e-05, + "loss": 0.7695, + "step": 3710 + }, + { + "epoch": 2.1204132525828285, + "grad_norm": 2.071110017635957, + "learning_rate": 1.924430718591308e-05, + "loss": 0.7841, + "step": 3720 + }, + { + "epoch": 2.1261132882080513, + "grad_norm": 1.9736490227320278, + "learning_rate": 1.9236699587447363e-05, + "loss": 0.7768, + "step": 3730 + }, + { + "epoch": 2.131813323833274, + "grad_norm": 1.9869752975975226, + "learning_rate": 1.922905540657912e-05, + "loss": 0.7785, + "step": 3740 + }, + { + "epoch": 2.1375133594584965, + "grad_norm": 2.0695040821912647, + "learning_rate": 1.922137467358351e-05, + "loss": 0.7761, + "step": 3750 + }, + { + "epoch": 2.1432133950837193, + "grad_norm": 1.9299059979513908, + "learning_rate": 1.921365741888045e-05, + "loss": 0.7856, + "step": 3760 + }, + { + "epoch": 2.1489134307089417, + "grad_norm": 
2.0076180790761153, + "learning_rate": 1.920590367303451e-05, + "loss": 0.7807, + "step": 3770 + }, + { + "epoch": 2.1546134663341645, + "grad_norm": 1.968046382602847, + "learning_rate": 1.9198113466754775e-05, + "loss": 0.7772, + "step": 3780 + }, + { + "epoch": 2.1603135019593873, + "grad_norm": 2.0927046946378285, + "learning_rate": 1.9190286830894744e-05, + "loss": 0.7753, + "step": 3790 + }, + { + "epoch": 2.16601353758461, + "grad_norm": 2.1166596531741004, + "learning_rate": 1.9182423796452196e-05, + "loss": 0.782, + "step": 3800 + }, + { + "epoch": 2.1717135732098325, + "grad_norm": 2.112843242331277, + "learning_rate": 1.9174524394569058e-05, + "loss": 0.7792, + "step": 3810 + }, + { + "epoch": 2.1774136088350553, + "grad_norm": 2.1683506945600723, + "learning_rate": 1.9166588656531305e-05, + "loss": 0.7726, + "step": 3820 + }, + { + "epoch": 2.1831136444602777, + "grad_norm": 2.054735148916789, + "learning_rate": 1.9158616613768812e-05, + "loss": 0.7743, + "step": 3830 + }, + { + "epoch": 2.1888136800855005, + "grad_norm": 2.1344363416734846, + "learning_rate": 1.915060829785525e-05, + "loss": 0.7771, + "step": 3840 + }, + { + "epoch": 2.1945137157107233, + "grad_norm": 1.984560409021633, + "learning_rate": 1.914256374050795e-05, + "loss": 0.778, + "step": 3850 + }, + { + "epoch": 2.2002137513359457, + "grad_norm": 1.972160935139131, + "learning_rate": 1.9134482973587773e-05, + "loss": 0.7782, + "step": 3860 + }, + { + "epoch": 2.2059137869611685, + "grad_norm": 1.9214738024805038, + "learning_rate": 1.912636602909899e-05, + "loss": 0.7771, + "step": 3870 + }, + { + "epoch": 2.2116138225863913, + "grad_norm": 1.9622083874450693, + "learning_rate": 1.9118212939189165e-05, + "loss": 0.7839, + "step": 3880 + }, + { + "epoch": 2.2173138582116136, + "grad_norm": 2.0208665741876035, + "learning_rate": 1.9110023736149007e-05, + "loss": 0.7681, + "step": 3890 + }, + { + "epoch": 2.2230138938368365, + "grad_norm": 2.226589059621567, + "learning_rate": 1.910179845241226e-05, + "loss": 0.779, + "step": 3900 + }, + { + "epoch": 2.2287139294620593, + "grad_norm": 2.1811770256146805, + "learning_rate": 1.9093537120555564e-05, + "loss": 0.7811, + "step": 3910 + }, + { + "epoch": 2.2344139650872816, + "grad_norm": 2.1292841346824325, + "learning_rate": 1.9085239773298324e-05, + "loss": 0.7859, + "step": 3920 + }, + { + "epoch": 2.2401140007125044, + "grad_norm": 1.9822686380206471, + "learning_rate": 1.9076906443502602e-05, + "loss": 0.7673, + "step": 3930 + }, + { + "epoch": 2.2458140363377272, + "grad_norm": 2.0426647477884043, + "learning_rate": 1.906853716417295e-05, + "loss": 0.7844, + "step": 3940 + }, + { + "epoch": 2.2515140719629496, + "grad_norm": 2.3068456301695677, + "learning_rate": 1.906013196845631e-05, + "loss": 0.7751, + "step": 3950 + }, + { + "epoch": 2.2572141075881724, + "grad_norm": 1.9668105468661568, + "learning_rate": 1.9051690889641884e-05, + "loss": 0.7792, + "step": 3960 + }, + { + "epoch": 2.2629141432133952, + "grad_norm": 2.0782727922468456, + "learning_rate": 1.904321396116097e-05, + "loss": 0.7707, + "step": 3970 + }, + { + "epoch": 2.2686141788386176, + "grad_norm": 2.0489858306617816, + "learning_rate": 1.903470121658686e-05, + "loss": 0.7848, + "step": 3980 + }, + { + "epoch": 2.2743142144638404, + "grad_norm": 2.0742582452117206, + "learning_rate": 1.90261526896347e-05, + "loss": 0.7766, + "step": 3990 + }, + { + "epoch": 2.280014250089063, + "grad_norm": 2.0272360660939164, + "learning_rate": 1.901756841416135e-05, + "loss": 0.7793, + "step": 4000 + }, + { 
+ "epoch": 2.2857142857142856, + "grad_norm": 2.3308634299194333, + "learning_rate": 1.900894842416525e-05, + "loss": 0.7686, + "step": 4010 + }, + { + "epoch": 2.2914143213395084, + "grad_norm": 1.9994773525348128, + "learning_rate": 1.9000292753786305e-05, + "loss": 0.7725, + "step": 4020 + }, + { + "epoch": 2.297114356964731, + "grad_norm": 2.2336189138329114, + "learning_rate": 1.8991601437305715e-05, + "loss": 0.7773, + "step": 4030 + }, + { + "epoch": 2.3028143925899536, + "grad_norm": 2.031657276083233, + "learning_rate": 1.8982874509145866e-05, + "loss": 0.7754, + "step": 4040 + }, + { + "epoch": 2.3085144282151764, + "grad_norm": 2.216895543159883, + "learning_rate": 1.8974112003870186e-05, + "loss": 0.7761, + "step": 4050 + }, + { + "epoch": 2.314214463840399, + "grad_norm": 2.209356529326098, + "learning_rate": 1.896531395618301e-05, + "loss": 0.7704, + "step": 4060 + }, + { + "epoch": 2.3199144994656216, + "grad_norm": 2.156183494015324, + "learning_rate": 1.8956480400929438e-05, + "loss": 0.7787, + "step": 4070 + }, + { + "epoch": 2.3256145350908444, + "grad_norm": 1.984794235924156, + "learning_rate": 1.8947611373095196e-05, + "loss": 0.7753, + "step": 4080 + }, + { + "epoch": 2.331314570716067, + "grad_norm": 1.9518963935475173, + "learning_rate": 1.893870690780651e-05, + "loss": 0.7809, + "step": 4090 + }, + { + "epoch": 2.3370146063412895, + "grad_norm": 2.033804508497074, + "learning_rate": 1.892976704032994e-05, + "loss": 0.7716, + "step": 4100 + }, + { + "epoch": 2.3427146419665124, + "grad_norm": 2.068960781647104, + "learning_rate": 1.892079180607229e-05, + "loss": 0.7754, + "step": 4110 + }, + { + "epoch": 2.3484146775917347, + "grad_norm": 2.077601045276333, + "learning_rate": 1.8911781240580402e-05, + "loss": 0.7761, + "step": 4120 + }, + { + "epoch": 2.3541147132169575, + "grad_norm": 2.0915913435617743, + "learning_rate": 1.8902735379541064e-05, + "loss": 0.7685, + "step": 4130 + }, + { + "epoch": 2.3598147488421803, + "grad_norm": 2.0680783097084112, + "learning_rate": 1.889365425878086e-05, + "loss": 0.7799, + "step": 4140 + }, + { + "epoch": 2.365514784467403, + "grad_norm": 1.9963116614489376, + "learning_rate": 1.888453791426601e-05, + "loss": 0.7735, + "step": 4150 + }, + { + "epoch": 2.3712148200926255, + "grad_norm": 2.1464671268790934, + "learning_rate": 1.8875386382102245e-05, + "loss": 0.7718, + "step": 4160 + }, + { + "epoch": 2.3769148557178483, + "grad_norm": 2.175040646679394, + "learning_rate": 1.8866199698534658e-05, + "loss": 0.7788, + "step": 4170 + }, + { + "epoch": 2.3826148913430707, + "grad_norm": 2.0706867787823096, + "learning_rate": 1.885697789994756e-05, + "loss": 0.7627, + "step": 4180 + }, + { + "epoch": 2.3883149269682935, + "grad_norm": 2.194047609109707, + "learning_rate": 1.8847721022864336e-05, + "loss": 0.7793, + "step": 4190 + }, + { + "epoch": 2.3940149625935163, + "grad_norm": 2.122030501728222, + "learning_rate": 1.883842910394731e-05, + "loss": 0.7683, + "step": 4200 + }, + { + "epoch": 2.3997149982187387, + "grad_norm": 2.023047252118629, + "learning_rate": 1.8829102179997572e-05, + "loss": 0.7831, + "step": 4210 + }, + { + "epoch": 2.4054150338439615, + "grad_norm": 2.0437679196020393, + "learning_rate": 1.8819740287954876e-05, + "loss": 0.7695, + "step": 4220 + }, + { + "epoch": 2.4111150694691843, + "grad_norm": 2.0799095278671493, + "learning_rate": 1.881034346489744e-05, + "loss": 0.7665, + "step": 4230 + }, + { + "epoch": 2.4168151050944067, + "grad_norm": 2.1746368931589086, + "learning_rate": 1.880091174804186e-05, 
+ "loss": 0.7612, + "step": 4240 + }, + { + "epoch": 2.4225151407196295, + "grad_norm": 2.282337806044003, + "learning_rate": 1.8791445174742894e-05, + "loss": 0.7766, + "step": 4250 + }, + { + "epoch": 2.4282151763448523, + "grad_norm": 2.071429297247938, + "learning_rate": 1.8781943782493392e-05, + "loss": 0.7721, + "step": 4260 + }, + { + "epoch": 2.4339152119700747, + "grad_norm": 2.0230879554093217, + "learning_rate": 1.8772407608924067e-05, + "loss": 0.7741, + "step": 4270 + }, + { + "epoch": 2.4396152475952975, + "grad_norm": 2.061387786789968, + "learning_rate": 1.8762836691803417e-05, + "loss": 0.7789, + "step": 4280 + }, + { + "epoch": 2.4453152832205203, + "grad_norm": 2.035005479237207, + "learning_rate": 1.8753231069037522e-05, + "loss": 0.7741, + "step": 4290 + }, + { + "epoch": 2.4510153188457426, + "grad_norm": 2.0348766862238947, + "learning_rate": 1.874359077866992e-05, + "loss": 0.7697, + "step": 4300 + }, + { + "epoch": 2.4567153544709655, + "grad_norm": 2.071413736213706, + "learning_rate": 1.8733915858881462e-05, + "loss": 0.7724, + "step": 4310 + }, + { + "epoch": 2.4624153900961883, + "grad_norm": 2.2246204070019595, + "learning_rate": 1.872420634799014e-05, + "loss": 0.7733, + "step": 4320 + }, + { + "epoch": 2.4681154257214106, + "grad_norm": 2.0913317389738153, + "learning_rate": 1.8714462284450948e-05, + "loss": 0.7668, + "step": 4330 + }, + { + "epoch": 2.4738154613466334, + "grad_norm": 2.143431885380468, + "learning_rate": 1.8704683706855728e-05, + "loss": 0.7758, + "step": 4340 + }, + { + "epoch": 2.4795154969718562, + "grad_norm": 2.0657978163391983, + "learning_rate": 1.869487065393302e-05, + "loss": 0.7719, + "step": 4350 + }, + { + "epoch": 2.4852155325970786, + "grad_norm": 2.2092417572395195, + "learning_rate": 1.86850231645479e-05, + "loss": 0.7751, + "step": 4360 + }, + { + "epoch": 2.4909155682223014, + "grad_norm": 2.0194258303784953, + "learning_rate": 1.8675141277701834e-05, + "loss": 0.7736, + "step": 4370 + }, + { + "epoch": 2.4966156038475242, + "grad_norm": 2.206101479863975, + "learning_rate": 1.866522503253252e-05, + "loss": 0.7672, + "step": 4380 + }, + { + "epoch": 2.5023156394727466, + "grad_norm": 2.195672996869424, + "learning_rate": 1.8655274468313732e-05, + "loss": 0.7691, + "step": 4390 + }, + { + "epoch": 2.5080156750979694, + "grad_norm": 2.068463197978191, + "learning_rate": 1.8645289624455175e-05, + "loss": 0.7696, + "step": 4400 + }, + { + "epoch": 2.5137157107231918, + "grad_norm": 2.0587484850743194, + "learning_rate": 1.8635270540502307e-05, + "loss": 0.7646, + "step": 4410 + }, + { + "epoch": 2.5194157463484146, + "grad_norm": 2.06954821802478, + "learning_rate": 1.8625217256136206e-05, + "loss": 0.7711, + "step": 4420 + }, + { + "epoch": 2.5251157819736374, + "grad_norm": 1.9723121174345877, + "learning_rate": 1.8615129811173398e-05, + "loss": 0.7805, + "step": 4430 + }, + { + "epoch": 2.53081581759886, + "grad_norm": 2.0838359377420193, + "learning_rate": 1.8605008245565704e-05, + "loss": 0.7732, + "step": 4440 + }, + { + "epoch": 2.5365158532240826, + "grad_norm": 1.995626097636077, + "learning_rate": 1.8594852599400083e-05, + "loss": 0.7645, + "step": 4450 + }, + { + "epoch": 2.5422158888493054, + "grad_norm": 2.0491043104993194, + "learning_rate": 1.8584662912898464e-05, + "loss": 0.7618, + "step": 4460 + }, + { + "epoch": 2.5479159244745277, + "grad_norm": 2.066323954109494, + "learning_rate": 1.857443922641761e-05, + "loss": 0.7721, + "step": 4470 + }, + { + "epoch": 2.5536159600997506, + "grad_norm": 
1.978390527887617, + "learning_rate": 1.856418158044893e-05, + "loss": 0.7761, + "step": 4480 + }, + { + "epoch": 2.5593159957249734, + "grad_norm": 2.0613022758343122, + "learning_rate": 1.8553890015618333e-05, + "loss": 0.7617, + "step": 4490 + }, + { + "epoch": 2.565016031350196, + "grad_norm": 1.9964157030100178, + "learning_rate": 1.8543564572686072e-05, + "loss": 0.7691, + "step": 4500 + }, + { + "epoch": 2.5707160669754185, + "grad_norm": 2.0340098122596255, + "learning_rate": 1.8533205292546567e-05, + "loss": 0.7651, + "step": 4510 + }, + { + "epoch": 2.5764161026006414, + "grad_norm": 1.9963620850267063, + "learning_rate": 1.8522812216228254e-05, + "loss": 0.7706, + "step": 4520 + }, + { + "epoch": 2.5821161382258637, + "grad_norm": 2.107374755843351, + "learning_rate": 1.851238538489343e-05, + "loss": 0.7778, + "step": 4530 + }, + { + "epoch": 2.5878161738510865, + "grad_norm": 2.2279853889692656, + "learning_rate": 1.8501924839838062e-05, + "loss": 0.7698, + "step": 4540 + }, + { + "epoch": 2.5935162094763093, + "grad_norm": 2.057671470082954, + "learning_rate": 1.8491430622491665e-05, + "loss": 0.7605, + "step": 4550 + }, + { + "epoch": 2.599216245101532, + "grad_norm": 2.169407311007384, + "learning_rate": 1.8480902774417094e-05, + "loss": 0.7661, + "step": 4560 + }, + { + "epoch": 2.6049162807267545, + "grad_norm": 2.153643481958755, + "learning_rate": 1.8470341337310407e-05, + "loss": 0.7619, + "step": 4570 + }, + { + "epoch": 2.6106163163519773, + "grad_norm": 2.0441154909224064, + "learning_rate": 1.8459746353000704e-05, + "loss": 0.7615, + "step": 4580 + }, + { + "epoch": 2.6163163519771997, + "grad_norm": 2.1378232482633157, + "learning_rate": 1.8449117863449932e-05, + "loss": 0.7688, + "step": 4590 + }, + { + "epoch": 2.6220163876024225, + "grad_norm": 2.019999621488862, + "learning_rate": 1.843845591075275e-05, + "loss": 0.7682, + "step": 4600 + }, + { + "epoch": 2.6277164232276453, + "grad_norm": 2.0517533117948568, + "learning_rate": 1.8427760537136342e-05, + "loss": 0.7576, + "step": 4610 + }, + { + "epoch": 2.6334164588528677, + "grad_norm": 2.05638099267677, + "learning_rate": 1.8417031784960267e-05, + "loss": 0.7655, + "step": 4620 + }, + { + "epoch": 2.6391164944780905, + "grad_norm": 2.1453529544420373, + "learning_rate": 1.840626969671627e-05, + "loss": 0.7752, + "step": 4630 + }, + { + "epoch": 2.6448165301033133, + "grad_norm": 1.9624857487175413, + "learning_rate": 1.8395474315028134e-05, + "loss": 0.774, + "step": 4640 + }, + { + "epoch": 2.6505165657285357, + "grad_norm": 1.9796322862726417, + "learning_rate": 1.838464568265149e-05, + "loss": 0.7722, + "step": 4650 + }, + { + "epoch": 2.6562166013537585, + "grad_norm": 2.0862799346516905, + "learning_rate": 1.837378384247368e-05, + "loss": 0.7705, + "step": 4660 + }, + { + "epoch": 2.6619166369789813, + "grad_norm": 2.0408955441271974, + "learning_rate": 1.8362888837513548e-05, + "loss": 0.7633, + "step": 4670 + }, + { + "epoch": 2.6676166726042037, + "grad_norm": 2.135018499958134, + "learning_rate": 1.83519607109213e-05, + "loss": 0.7612, + "step": 4680 + }, + { + "epoch": 2.6733167082294265, + "grad_norm": 2.049209470183393, + "learning_rate": 1.834099950597832e-05, + "loss": 0.7627, + "step": 4690 + }, + { + "epoch": 2.679016743854649, + "grad_norm": 2.0472546095057553, + "learning_rate": 1.8330005266096992e-05, + "loss": 0.7661, + "step": 4700 + }, + { + "epoch": 2.6847167794798716, + "grad_norm": 2.0189392429541306, + "learning_rate": 1.8318978034820544e-05, + "loss": 0.7581, + "step": 4710 + }, + { 
+ "epoch": 2.6904168151050945, + "grad_norm": 2.1622485501687256, + "learning_rate": 1.830791785582288e-05, + "loss": 0.7629, + "step": 4720 + }, + { + "epoch": 2.6961168507303173, + "grad_norm": 2.006323646975851, + "learning_rate": 1.8296824772908365e-05, + "loss": 0.7625, + "step": 4730 + }, + { + "epoch": 2.7018168863555396, + "grad_norm": 1.9594010919164644, + "learning_rate": 1.828569883001171e-05, + "loss": 0.7667, + "step": 4740 + }, + { + "epoch": 2.7075169219807624, + "grad_norm": 2.1349030657070474, + "learning_rate": 1.827454007119775e-05, + "loss": 0.7657, + "step": 4750 + }, + { + "epoch": 2.713216957605985, + "grad_norm": 2.0990532850307084, + "learning_rate": 1.8263348540661306e-05, + "loss": 0.7691, + "step": 4760 + }, + { + "epoch": 2.7189169932312076, + "grad_norm": 2.1746655424409345, + "learning_rate": 1.8252124282726984e-05, + "loss": 0.7635, + "step": 4770 + }, + { + "epoch": 2.7246170288564304, + "grad_norm": 2.152646261140533, + "learning_rate": 1.8240867341849e-05, + "loss": 0.7581, + "step": 4780 + }, + { + "epoch": 2.7303170644816532, + "grad_norm": 2.174162632472622, + "learning_rate": 1.8229577762611033e-05, + "loss": 0.7689, + "step": 4790 + }, + { + "epoch": 2.7360171001068756, + "grad_norm": 1.9316453580875002, + "learning_rate": 1.8218255589726007e-05, + "loss": 0.7546, + "step": 4800 + }, + { + "epoch": 2.7417171357320984, + "grad_norm": 2.1886515928391352, + "learning_rate": 1.820690086803595e-05, + "loss": 0.7674, + "step": 4810 + }, + { + "epoch": 2.7474171713573208, + "grad_norm": 2.052737570252945, + "learning_rate": 1.819551364251179e-05, + "loss": 0.7622, + "step": 4820 + }, + { + "epoch": 2.7531172069825436, + "grad_norm": 2.1180173414843906, + "learning_rate": 1.81840939582532e-05, + "loss": 0.7542, + "step": 4830 + }, + { + "epoch": 2.7588172426077664, + "grad_norm": 2.0722878980296127, + "learning_rate": 1.8172641860488393e-05, + "loss": 0.7626, + "step": 4840 + }, + { + "epoch": 2.764517278232989, + "grad_norm": 2.0043370083102685, + "learning_rate": 1.816115739457397e-05, + "loss": 0.7671, + "step": 4850 + }, + { + "epoch": 2.7702173138582116, + "grad_norm": 1.9835911307540945, + "learning_rate": 1.8149640605994722e-05, + "loss": 0.7722, + "step": 4860 + }, + { + "epoch": 2.7759173494834344, + "grad_norm": 1.9879722805165094, + "learning_rate": 1.8138091540363453e-05, + "loss": 0.7666, + "step": 4870 + }, + { + "epoch": 2.7816173851086567, + "grad_norm": 2.0295425663572026, + "learning_rate": 1.8126510243420807e-05, + "loss": 0.7553, + "step": 4880 + }, + { + "epoch": 2.7873174207338796, + "grad_norm": 2.146037273264629, + "learning_rate": 1.811489676103508e-05, + "loss": 0.7649, + "step": 4890 + }, + { + "epoch": 2.7930174563591024, + "grad_norm": 2.029168818823305, + "learning_rate": 1.8103251139202038e-05, + "loss": 0.7632, + "step": 4900 + }, + { + "epoch": 2.798717491984325, + "grad_norm": 2.0772701412050933, + "learning_rate": 1.8091573424044742e-05, + "loss": 0.7661, + "step": 4910 + }, + { + "epoch": 2.8044175276095475, + "grad_norm": 1.9717828970651405, + "learning_rate": 1.8079863661813352e-05, + "loss": 0.7709, + "step": 4920 + }, + { + "epoch": 2.8101175632347704, + "grad_norm": 2.174183368464108, + "learning_rate": 1.8068121898884955e-05, + "loss": 0.756, + "step": 4930 + }, + { + "epoch": 2.8158175988599927, + "grad_norm": 2.1486780441649547, + "learning_rate": 1.8056348181763387e-05, + "loss": 0.7537, + "step": 4940 + }, + { + "epoch": 2.8215176344852155, + "grad_norm": 1.9709706612211082, + "learning_rate": 
1.8044542557079032e-05, + "loss": 0.763, + "step": 4950 + }, + { + "epoch": 2.8272176701104383, + "grad_norm": 2.044406248858559, + "learning_rate": 1.8032705071588638e-05, + "loss": 0.7667, + "step": 4960 + }, + { + "epoch": 2.8329177057356607, + "grad_norm": 1.9551236687248852, + "learning_rate": 1.8020835772175158e-05, + "loss": 0.7632, + "step": 4970 + }, + { + "epoch": 2.8386177413608835, + "grad_norm": 1.9293855795874906, + "learning_rate": 1.8008934705847533e-05, + "loss": 0.7636, + "step": 4980 + }, + { + "epoch": 2.8443177769861063, + "grad_norm": 2.158009404352588, + "learning_rate": 1.7997001919740514e-05, + "loss": 0.7606, + "step": 4990 + }, + { + "epoch": 2.8500178126113287, + "grad_norm": 2.124078613588665, + "learning_rate": 1.7985037461114497e-05, + "loss": 0.7615, + "step": 5000 + }, + { + "epoch": 2.8557178482365515, + "grad_norm": 2.021489305585605, + "learning_rate": 1.7973041377355303e-05, + "loss": 0.7549, + "step": 5010 + }, + { + "epoch": 2.8614178838617743, + "grad_norm": 2.060442251766284, + "learning_rate": 1.7961013715974008e-05, + "loss": 0.7661, + "step": 5020 + }, + { + "epoch": 2.8671179194869967, + "grad_norm": 2.144181800187938, + "learning_rate": 1.7948954524606764e-05, + "loss": 0.756, + "step": 5030 + }, + { + "epoch": 2.8728179551122195, + "grad_norm": 2.01230696124865, + "learning_rate": 1.7936863851014585e-05, + "loss": 0.7624, + "step": 5040 + }, + { + "epoch": 2.878517990737442, + "grad_norm": 1.9818525793021162, + "learning_rate": 1.7924741743083177e-05, + "loss": 0.7731, + "step": 5050 + }, + { + "epoch": 2.8842180263626647, + "grad_norm": 1.9797990283611266, + "learning_rate": 1.7912588248822744e-05, + "loss": 0.7478, + "step": 5060 + }, + { + "epoch": 2.8899180619878875, + "grad_norm": 1.918218840573413, + "learning_rate": 1.79004034163678e-05, + "loss": 0.7632, + "step": 5070 + }, + { + "epoch": 2.8956180976131103, + "grad_norm": 2.2113523658613956, + "learning_rate": 1.7888187293976974e-05, + "loss": 0.7634, + "step": 5080 + }, + { + "epoch": 2.9013181332383327, + "grad_norm": 2.0564005525418674, + "learning_rate": 1.7875939930032817e-05, + "loss": 0.7689, + "step": 5090 + }, + { + "epoch": 2.9070181688635555, + "grad_norm": 2.111212825140381, + "learning_rate": 1.786366137304161e-05, + "loss": 0.7721, + "step": 5100 + }, + { + "epoch": 2.912718204488778, + "grad_norm": 2.0529909877524974, + "learning_rate": 1.7851351671633192e-05, + "loss": 0.7583, + "step": 5110 + }, + { + "epoch": 2.9184182401140006, + "grad_norm": 1.9683300396637016, + "learning_rate": 1.7839010874560732e-05, + "loss": 0.7587, + "step": 5120 + }, + { + "epoch": 2.9241182757392234, + "grad_norm": 2.1382803877159113, + "learning_rate": 1.782663903070057e-05, + "loss": 0.7571, + "step": 5130 + }, + { + "epoch": 2.9298183113644463, + "grad_norm": 1.9734164028059396, + "learning_rate": 1.7814236189051995e-05, + "loss": 0.7591, + "step": 5140 + }, + { + "epoch": 2.9355183469896686, + "grad_norm": 1.9907600950849165, + "learning_rate": 1.780180239873707e-05, + "loss": 0.7583, + "step": 5150 + }, + { + "epoch": 2.9412183826148914, + "grad_norm": 2.0056029041693, + "learning_rate": 1.7789337709000435e-05, + "loss": 0.7664, + "step": 5160 + }, + { + "epoch": 2.946918418240114, + "grad_norm": 2.0284767043959673, + "learning_rate": 1.777684216920911e-05, + "loss": 0.7551, + "step": 5170 + }, + { + "epoch": 2.9526184538653366, + "grad_norm": 2.0770316896500054, + "learning_rate": 1.776431582885229e-05, + "loss": 0.7574, + "step": 5180 + }, + { + "epoch": 2.9583184894905594, + 
"grad_norm": 2.101104699087831, + "learning_rate": 1.775175873754116e-05, + "loss": 0.7648, + "step": 5190 + }, + { + "epoch": 2.9640185251157822, + "grad_norm": 2.132674479499415, + "learning_rate": 1.77391709450087e-05, + "loss": 0.7638, + "step": 5200 + }, + { + "epoch": 2.9697185607410046, + "grad_norm": 2.018973293495472, + "learning_rate": 1.772655250110948e-05, + "loss": 0.7627, + "step": 5210 + }, + { + "epoch": 2.9754185963662274, + "grad_norm": 1.9980958508890732, + "learning_rate": 1.771390345581947e-05, + "loss": 0.7736, + "step": 5220 + }, + { + "epoch": 2.9811186319914498, + "grad_norm": 2.1161985121549605, + "learning_rate": 1.7701223859235828e-05, + "loss": 0.7565, + "step": 5230 + }, + { + "epoch": 2.9868186676166726, + "grad_norm": 2.258396400625071, + "learning_rate": 1.7688513761576726e-05, + "loss": 0.7582, + "step": 5240 + }, + { + "epoch": 2.9925187032418954, + "grad_norm": 2.2236728251205875, + "learning_rate": 1.7675773213181124e-05, + "loss": 0.7639, + "step": 5250 + }, + { + "epoch": 2.9982187388671178, + "grad_norm": 2.0402341818317917, + "learning_rate": 1.7663002264508598e-05, + "loss": 0.7658, + "step": 5260 + }, + { + "epoch": 3.0039187744923406, + "grad_norm": 2.1178170815499, + "learning_rate": 1.765020096613911e-05, + "loss": 0.7438, + "step": 5270 + }, + { + "epoch": 3.0096188101175634, + "grad_norm": 2.053090963514229, + "learning_rate": 1.763736936877284e-05, + "loss": 0.7439, + "step": 5280 + }, + { + "epoch": 3.0153188457427857, + "grad_norm": 2.276361717966091, + "learning_rate": 1.762450752322995e-05, + "loss": 0.7484, + "step": 5290 + }, + { + "epoch": 3.0210188813680086, + "grad_norm": 2.0728033921919606, + "learning_rate": 1.7611615480450413e-05, + "loss": 0.7481, + "step": 5300 + }, + { + "epoch": 3.0267189169932314, + "grad_norm": 2.031463911733619, + "learning_rate": 1.7598693291493804e-05, + "loss": 0.7517, + "step": 5310 + }, + { + "epoch": 3.0324189526184537, + "grad_norm": 2.0727239453344173, + "learning_rate": 1.7585741007539083e-05, + "loss": 0.7393, + "step": 5320 + }, + { + "epoch": 3.0381189882436765, + "grad_norm": 2.245371926184325, + "learning_rate": 1.7572758679884406e-05, + "loss": 0.7366, + "step": 5330 + }, + { + "epoch": 3.0438190238688994, + "grad_norm": 2.081261044660946, + "learning_rate": 1.7559746359946925e-05, + "loss": 0.7449, + "step": 5340 + }, + { + "epoch": 3.0495190594941217, + "grad_norm": 2.0012713994421487, + "learning_rate": 1.7546704099262565e-05, + "loss": 0.7397, + "step": 5350 + }, + { + "epoch": 3.0552190951193445, + "grad_norm": 2.1632139989474886, + "learning_rate": 1.7533631949485847e-05, + "loss": 0.7436, + "step": 5360 + }, + { + "epoch": 3.0609191307445673, + "grad_norm": 2.112371468079618, + "learning_rate": 1.7520529962389655e-05, + "loss": 0.741, + "step": 5370 + }, + { + "epoch": 3.0666191663697897, + "grad_norm": 2.045319617108696, + "learning_rate": 1.7507398189865057e-05, + "loss": 0.7421, + "step": 5380 + }, + { + "epoch": 3.0723192019950125, + "grad_norm": 1.9909398875734512, + "learning_rate": 1.7494236683921084e-05, + "loss": 0.7399, + "step": 5390 + }, + { + "epoch": 3.0780192376202353, + "grad_norm": 2.1478493144485165, + "learning_rate": 1.7481045496684525e-05, + "loss": 0.7425, + "step": 5400 + }, + { + "epoch": 3.0837192732454577, + "grad_norm": 1.9881565887713344, + "learning_rate": 1.7467824680399728e-05, + "loss": 0.7397, + "step": 5410 + }, + { + "epoch": 3.0894193088706805, + "grad_norm": 2.115917226840997, + "learning_rate": 1.7454574287428382e-05, + "loss": 0.7357, + "step": 
5420 + }, + { + "epoch": 3.0951193444959033, + "grad_norm": 2.003745833255236, + "learning_rate": 1.744129437024932e-05, + "loss": 0.7505, + "step": 5430 + }, + { + "epoch": 3.1008193801211257, + "grad_norm": 2.078865802580488, + "learning_rate": 1.7427984981458305e-05, + "loss": 0.7419, + "step": 5440 + }, + { + "epoch": 3.1065194157463485, + "grad_norm": 2.1288364598170664, + "learning_rate": 1.7414646173767833e-05, + "loss": 0.7298, + "step": 5450 + }, + { + "epoch": 3.112219451371571, + "grad_norm": 2.072021887298528, + "learning_rate": 1.74012780000069e-05, + "loss": 0.748, + "step": 5460 + }, + { + "epoch": 3.1179194869967937, + "grad_norm": 2.177237216421149, + "learning_rate": 1.7387880513120815e-05, + "loss": 0.7452, + "step": 5470 + }, + { + "epoch": 3.1236195226220165, + "grad_norm": 2.1683763979667843, + "learning_rate": 1.7374453766170987e-05, + "loss": 0.7436, + "step": 5480 + }, + { + "epoch": 3.129319558247239, + "grad_norm": 2.0904041608117216, + "learning_rate": 1.73609978123347e-05, + "loss": 0.7361, + "step": 5490 + }, + { + "epoch": 3.1350195938724617, + "grad_norm": 2.047553895215376, + "learning_rate": 1.734751270490493e-05, + "loss": 0.748, + "step": 5500 + }, + { + "epoch": 3.1407196294976845, + "grad_norm": 2.005927750946996, + "learning_rate": 1.7333998497290097e-05, + "loss": 0.7421, + "step": 5510 + }, + { + "epoch": 3.146419665122907, + "grad_norm": 2.1634130072879008, + "learning_rate": 1.7320455243013896e-05, + "loss": 0.751, + "step": 5520 + }, + { + "epoch": 3.1521197007481296, + "grad_norm": 2.0045457707116494, + "learning_rate": 1.730688299571504e-05, + "loss": 0.7378, + "step": 5530 + }, + { + "epoch": 3.1578197363733524, + "grad_norm": 2.136212824258042, + "learning_rate": 1.729328180914709e-05, + "loss": 0.7448, + "step": 5540 + }, + { + "epoch": 3.163519771998575, + "grad_norm": 1.9657759781442752, + "learning_rate": 1.7279651737178204e-05, + "loss": 0.7345, + "step": 5550 + }, + { + "epoch": 3.1692198076237976, + "grad_norm": 2.046679397792288, + "learning_rate": 1.726599283379096e-05, + "loss": 0.73, + "step": 5560 + }, + { + "epoch": 3.1749198432490204, + "grad_norm": 2.160754138869591, + "learning_rate": 1.7252305153082118e-05, + "loss": 0.7304, + "step": 5570 + }, + { + "epoch": 3.180619878874243, + "grad_norm": 2.149734469063933, + "learning_rate": 1.7238588749262396e-05, + "loss": 0.7405, + "step": 5580 + }, + { + "epoch": 3.1863199144994656, + "grad_norm": 2.1182922399585213, + "learning_rate": 1.72248436766563e-05, + "loss": 0.7462, + "step": 5590 + }, + { + "epoch": 3.1920199501246884, + "grad_norm": 2.0991759289002823, + "learning_rate": 1.7211069989701855e-05, + "loss": 0.7524, + "step": 5600 + }, + { + "epoch": 3.197719985749911, + "grad_norm": 2.088052832265774, + "learning_rate": 1.7197267742950435e-05, + "loss": 0.7562, + "step": 5610 + }, + { + "epoch": 3.2034200213751336, + "grad_norm": 2.116606374909782, + "learning_rate": 1.718343699106651e-05, + "loss": 0.7341, + "step": 5620 + }, + { + "epoch": 3.2091200570003564, + "grad_norm": 2.274297318117651, + "learning_rate": 1.7169577788827448e-05, + "loss": 0.751, + "step": 5630 + }, + { + "epoch": 3.2148200926255788, + "grad_norm": 2.053843021870526, + "learning_rate": 1.7155690191123313e-05, + "loss": 0.7317, + "step": 5640 + }, + { + "epoch": 3.2205201282508016, + "grad_norm": 2.014911666896998, + "learning_rate": 1.7141774252956606e-05, + "loss": 0.7426, + "step": 5650 + }, + { + "epoch": 3.2262201638760244, + "grad_norm": 2.071543993545443, + "learning_rate": 1.712783002944209e-05, 
+ "loss": 0.7472, + "step": 5660 + }, + { + "epoch": 3.2319201995012468, + "grad_norm": 2.0226426858864324, + "learning_rate": 1.7113857575806544e-05, + "loss": 0.7368, + "step": 5670 + }, + { + "epoch": 3.2376202351264696, + "grad_norm": 2.2081035387522667, + "learning_rate": 1.709985694738856e-05, + "loss": 0.7412, + "step": 5680 + }, + { + "epoch": 3.2433202707516924, + "grad_norm": 2.0173621023828967, + "learning_rate": 1.7085828199638315e-05, + "loss": 0.7318, + "step": 5690 + }, + { + "epoch": 3.2490203063769147, + "grad_norm": 2.0533146117503325, + "learning_rate": 1.707177138811735e-05, + "loss": 0.7388, + "step": 5700 + }, + { + "epoch": 3.2547203420021376, + "grad_norm": 2.042845781990925, + "learning_rate": 1.7057686568498363e-05, + "loss": 0.7369, + "step": 5710 + }, + { + "epoch": 3.2604203776273604, + "grad_norm": 2.106781789823166, + "learning_rate": 1.7043573796564966e-05, + "loss": 0.74, + "step": 5720 + }, + { + "epoch": 3.2661204132525827, + "grad_norm": 2.068664681703829, + "learning_rate": 1.7029433128211495e-05, + "loss": 0.7356, + "step": 5730 + }, + { + "epoch": 3.2718204488778055, + "grad_norm": 2.046446640636462, + "learning_rate": 1.7015264619442758e-05, + "loss": 0.737, + "step": 5740 + }, + { + "epoch": 3.277520484503028, + "grad_norm": 2.0057975588623336, + "learning_rate": 1.7001068326373827e-05, + "loss": 0.7352, + "step": 5750 + }, + { + "epoch": 3.2832205201282507, + "grad_norm": 2.00011969394344, + "learning_rate": 1.698684430522982e-05, + "loss": 0.7485, + "step": 5760 + }, + { + "epoch": 3.2889205557534735, + "grad_norm": 1.9793845313476237, + "learning_rate": 1.6972592612345673e-05, + "loss": 0.7428, + "step": 5770 + }, + { + "epoch": 3.2946205913786963, + "grad_norm": 2.0457258452237674, + "learning_rate": 1.6958313304165915e-05, + "loss": 0.7457, + "step": 5780 + }, + { + "epoch": 3.3003206270039187, + "grad_norm": 1.9844119915920377, + "learning_rate": 1.694400643724445e-05, + "loss": 0.7288, + "step": 5790 + }, + { + "epoch": 3.3060206626291415, + "grad_norm": 2.1178149861323115, + "learning_rate": 1.6929672068244325e-05, + "loss": 0.734, + "step": 5800 + }, + { + "epoch": 3.311720698254364, + "grad_norm": 2.2494684992145864, + "learning_rate": 1.691531025393751e-05, + "loss": 0.7428, + "step": 5810 + }, + { + "epoch": 3.3174207338795867, + "grad_norm": 2.103045025019933, + "learning_rate": 1.690092105120468e-05, + "loss": 0.7336, + "step": 5820 + }, + { + "epoch": 3.3231207695048095, + "grad_norm": 2.079229864541264, + "learning_rate": 1.688650451703498e-05, + "loss": 0.735, + "step": 5830 + }, + { + "epoch": 3.3288208051300323, + "grad_norm": 2.1262241995520688, + "learning_rate": 1.68720607085258e-05, + "loss": 0.7478, + "step": 5840 + }, + { + "epoch": 3.3345208407552547, + "grad_norm": 2.0672011419206746, + "learning_rate": 1.685758968288255e-05, + "loss": 0.7364, + "step": 5850 + }, + { + "epoch": 3.3402208763804775, + "grad_norm": 2.018766214415992, + "learning_rate": 1.684309149741845e-05, + "loss": 0.7459, + "step": 5860 + }, + { + "epoch": 3.3459209120057, + "grad_norm": 2.024319172839621, + "learning_rate": 1.6828566209554254e-05, + "loss": 0.7293, + "step": 5870 + }, + { + "epoch": 3.3516209476309227, + "grad_norm": 2.2232243377101732, + "learning_rate": 1.68140138768181e-05, + "loss": 0.745, + "step": 5880 + }, + { + "epoch": 3.3573209832561455, + "grad_norm": 1.9892262862948757, + "learning_rate": 1.6799434556845206e-05, + "loss": 0.74, + "step": 5890 + }, + { + "epoch": 3.363021018881368, + "grad_norm": 1.996203658365478, + 
"learning_rate": 1.678482830737769e-05, + "loss": 0.7358, + "step": 5900 + }, + { + "epoch": 3.3687210545065907, + "grad_norm": 2.0492262539429476, + "learning_rate": 1.6770195186264318e-05, + "loss": 0.7481, + "step": 5910 + }, + { + "epoch": 3.3744210901318135, + "grad_norm": 2.126332808306086, + "learning_rate": 1.6755535251460282e-05, + "loss": 0.7356, + "step": 5920 + }, + { + "epoch": 3.380121125757036, + "grad_norm": 2.1510518131386216, + "learning_rate": 1.674084856102698e-05, + "loss": 0.7369, + "step": 5930 + }, + { + "epoch": 3.3858211613822586, + "grad_norm": 1.9898038869629846, + "learning_rate": 1.6726135173131767e-05, + "loss": 0.7502, + "step": 5940 + }, + { + "epoch": 3.3915211970074814, + "grad_norm": 2.0793774775395177, + "learning_rate": 1.671139514604774e-05, + "loss": 0.743, + "step": 5950 + }, + { + "epoch": 3.397221232632704, + "grad_norm": 2.0911243940655373, + "learning_rate": 1.6696628538153498e-05, + "loss": 0.7507, + "step": 5960 + }, + { + "epoch": 3.4029212682579266, + "grad_norm": 1.9431819413883284, + "learning_rate": 1.668183540793292e-05, + "loss": 0.7361, + "step": 5970 + }, + { + "epoch": 3.4086213038831494, + "grad_norm": 2.010182507808355, + "learning_rate": 1.6667015813974928e-05, + "loss": 0.7401, + "step": 5980 + }, + { + "epoch": 3.414321339508372, + "grad_norm": 2.161496770233213, + "learning_rate": 1.6652169814973246e-05, + "loss": 0.7407, + "step": 5990 + }, + { + "epoch": 3.4200213751335946, + "grad_norm": 2.026683695690911, + "learning_rate": 1.6637297469726182e-05, + "loss": 0.745, + "step": 6000 + }, + { + "epoch": 3.4257214107588174, + "grad_norm": 2.1544959902671623, + "learning_rate": 1.6622398837136397e-05, + "loss": 0.7443, + "step": 6010 + }, + { + "epoch": 3.43142144638404, + "grad_norm": 2.0254574449350002, + "learning_rate": 1.660747397621065e-05, + "loss": 0.7441, + "step": 6020 + }, + { + "epoch": 3.4371214820092626, + "grad_norm": 2.1063512225032213, + "learning_rate": 1.6592522946059594e-05, + "loss": 0.7358, + "step": 6030 + }, + { + "epoch": 3.4428215176344854, + "grad_norm": 2.109309750652788, + "learning_rate": 1.657754580589751e-05, + "loss": 0.735, + "step": 6040 + }, + { + "epoch": 3.4485215532597078, + "grad_norm": 2.1362388241431223, + "learning_rate": 1.65625426150421e-05, + "loss": 0.7528, + "step": 6050 + }, + { + "epoch": 3.4542215888849306, + "grad_norm": 1.9534769975619974, + "learning_rate": 1.6547513432914242e-05, + "loss": 0.7418, + "step": 6060 + }, + { + "epoch": 3.4599216245101534, + "grad_norm": 2.191011795660284, + "learning_rate": 1.6532458319037748e-05, + "loss": 0.7453, + "step": 6070 + }, + { + "epoch": 3.4656216601353758, + "grad_norm": 2.0481077058223627, + "learning_rate": 1.6517377333039134e-05, + "loss": 0.7355, + "step": 6080 + }, + { + "epoch": 3.4713216957605986, + "grad_norm": 2.0539224712223794, + "learning_rate": 1.650227053464739e-05, + "loss": 0.7311, + "step": 6090 + }, + { + "epoch": 3.477021731385821, + "grad_norm": 2.1197486154272345, + "learning_rate": 1.6487137983693732e-05, + "loss": 0.7416, + "step": 6100 + }, + { + "epoch": 3.4827217670110437, + "grad_norm": 1.956319564532167, + "learning_rate": 1.647197974011137e-05, + "loss": 0.7424, + "step": 6110 + }, + { + "epoch": 3.4884218026362666, + "grad_norm": 2.0396586196184296, + "learning_rate": 1.645679586393527e-05, + "loss": 0.7387, + "step": 6120 + }, + { + "epoch": 3.4941218382614894, + "grad_norm": 2.160261201715842, + "learning_rate": 1.6441586415301928e-05, + "loss": 0.7331, + "step": 6130 + }, + { + "epoch": 
3.4998218738867117, + "grad_norm": 2.0696714508439307, + "learning_rate": 1.6426351454449102e-05, + "loss": 0.7382, + "step": 6140 + }, + { + "epoch": 3.5055219095119345, + "grad_norm": 2.0931254540417004, + "learning_rate": 1.641109104171561e-05, + "loss": 0.7386, + "step": 6150 + }, + { + "epoch": 3.511221945137157, + "grad_norm": 2.007931269061243, + "learning_rate": 1.6395805237541066e-05, + "loss": 0.7434, + "step": 6160 + }, + { + "epoch": 3.5169219807623797, + "grad_norm": 2.1952277405235647, + "learning_rate": 1.6380494102465644e-05, + "loss": 0.7403, + "step": 6170 + }, + { + "epoch": 3.5226220163876025, + "grad_norm": 2.0817367988428224, + "learning_rate": 1.6365157697129853e-05, + "loss": 0.7295, + "step": 6180 + }, + { + "epoch": 3.5283220520128253, + "grad_norm": 1.917307285187369, + "learning_rate": 1.6349796082274275e-05, + "loss": 0.7342, + "step": 6190 + }, + { + "epoch": 3.5340220876380477, + "grad_norm": 2.020375761125305, + "learning_rate": 1.6334409318739344e-05, + "loss": 0.737, + "step": 6200 + }, + { + "epoch": 3.5397221232632705, + "grad_norm": 1.9659177210333798, + "learning_rate": 1.631899746746509e-05, + "loss": 0.747, + "step": 6210 + }, + { + "epoch": 3.545422158888493, + "grad_norm": 2.135868919125441, + "learning_rate": 1.630356058949091e-05, + "loss": 0.7344, + "step": 6220 + }, + { + "epoch": 3.5511221945137157, + "grad_norm": 2.0069181029404195, + "learning_rate": 1.628809874595531e-05, + "loss": 0.7371, + "step": 6230 + }, + { + "epoch": 3.5568222301389385, + "grad_norm": 2.0680129880723492, + "learning_rate": 1.6272611998095694e-05, + "loss": 0.733, + "step": 6240 + }, + { + "epoch": 3.562522265764161, + "grad_norm": 1.968388651205215, + "learning_rate": 1.6257100407248075e-05, + "loss": 0.741, + "step": 6250 + }, + { + "epoch": 3.5682223013893837, + "grad_norm": 2.050635559497989, + "learning_rate": 1.6241564034846883e-05, + "loss": 0.7364, + "step": 6260 + }, + { + "epoch": 3.5739223370146065, + "grad_norm": 2.171816362983528, + "learning_rate": 1.622600294242467e-05, + "loss": 0.7358, + "step": 6270 + }, + { + "epoch": 3.579622372639829, + "grad_norm": 2.0930915534084744, + "learning_rate": 1.6210417191611917e-05, + "loss": 0.7519, + "step": 6280 + }, + { + "epoch": 3.5853224082650517, + "grad_norm": 2.0689257909003604, + "learning_rate": 1.6194806844136755e-05, + "loss": 0.7376, + "step": 6290 + }, + { + "epoch": 3.5910224438902745, + "grad_norm": 2.0357101666755772, + "learning_rate": 1.617917196182473e-05, + "loss": 0.7295, + "step": 6300 + }, + { + "epoch": 3.596722479515497, + "grad_norm": 1.9928260487092255, + "learning_rate": 1.616351260659856e-05, + "loss": 0.7433, + "step": 6310 + }, + { + "epoch": 3.6024225151407196, + "grad_norm": 1.9848046393102028, + "learning_rate": 1.6147828840477893e-05, + "loss": 0.7385, + "step": 6320 + }, + { + "epoch": 3.608122550765942, + "grad_norm": 2.01146751576876, + "learning_rate": 1.6132120725579057e-05, + "loss": 0.7346, + "step": 6330 + }, + { + "epoch": 3.613822586391165, + "grad_norm": 2.1500721922467694, + "learning_rate": 1.611638832411481e-05, + "loss": 0.7308, + "step": 6340 + }, + { + "epoch": 3.6195226220163876, + "grad_norm": 2.108969917572423, + "learning_rate": 1.61006316983941e-05, + "loss": 0.7402, + "step": 6350 + }, + { + "epoch": 3.6252226576416104, + "grad_norm": 2.1537890950662146, + "learning_rate": 1.6084850910821822e-05, + "loss": 0.7434, + "step": 6360 + }, + { + "epoch": 3.630922693266833, + "grad_norm": 2.0905163971990803, + "learning_rate": 1.6069046023898554e-05, + "loss": 
0.7377, + "step": 6370 + }, + { + "epoch": 3.6366227288920556, + "grad_norm": 2.1833272882788486, + "learning_rate": 1.6053217100220332e-05, + "loss": 0.7387, + "step": 6380 + }, + { + "epoch": 3.642322764517278, + "grad_norm": 1.9978536000336802, + "learning_rate": 1.6037364202478386e-05, + "loss": 0.7436, + "step": 6390 + }, + { + "epoch": 3.648022800142501, + "grad_norm": 1.9857203809905162, + "learning_rate": 1.6021487393458893e-05, + "loss": 0.741, + "step": 6400 + }, + { + "epoch": 3.6537228357677236, + "grad_norm": 1.9785607872258348, + "learning_rate": 1.600558673604274e-05, + "loss": 0.7274, + "step": 6410 + }, + { + "epoch": 3.6594228713929464, + "grad_norm": 2.0892691974189086, + "learning_rate": 1.598966229320526e-05, + "loss": 0.7357, + "step": 6420 + }, + { + "epoch": 3.665122907018169, + "grad_norm": 2.1213516026120955, + "learning_rate": 1.5973714128015987e-05, + "loss": 0.7361, + "step": 6430 + }, + { + "epoch": 3.6708229426433916, + "grad_norm": 1.958345380862227, + "learning_rate": 1.595774230363842e-05, + "loss": 0.7328, + "step": 6440 + }, + { + "epoch": 3.676522978268614, + "grad_norm": 2.019606461449015, + "learning_rate": 1.5941746883329745e-05, + "loss": 0.7342, + "step": 6450 + }, + { + "epoch": 3.6822230138938368, + "grad_norm": 2.0150832864171186, + "learning_rate": 1.5925727930440617e-05, + "loss": 0.7331, + "step": 6460 + }, + { + "epoch": 3.6879230495190596, + "grad_norm": 2.055175500413531, + "learning_rate": 1.5909685508414884e-05, + "loss": 0.7435, + "step": 6470 + }, + { + "epoch": 3.6936230851442824, + "grad_norm": 2.06353886004344, + "learning_rate": 1.589361968078935e-05, + "loss": 0.738, + "step": 6480 + }, + { + "epoch": 3.6993231207695048, + "grad_norm": 2.008466348281505, + "learning_rate": 1.587753051119351e-05, + "loss": 0.744, + "step": 6490 + }, + { + "epoch": 3.7050231563947276, + "grad_norm": 2.045367700599523, + "learning_rate": 1.586141806334931e-05, + "loss": 0.7338, + "step": 6500 + }, + { + "epoch": 3.71072319201995, + "grad_norm": 2.0550785176203594, + "learning_rate": 1.5845282401070893e-05, + "loss": 0.7381, + "step": 6510 + }, + { + "epoch": 3.7164232276451727, + "grad_norm": 2.0872693575576386, + "learning_rate": 1.5829123588264348e-05, + "loss": 0.7305, + "step": 6520 + }, + { + "epoch": 3.7221232632703956, + "grad_norm": 1.9117270867086693, + "learning_rate": 1.5812941688927435e-05, + "loss": 0.732, + "step": 6530 + }, + { + "epoch": 3.7278232988956184, + "grad_norm": 2.1528521470214215, + "learning_rate": 1.579673676714937e-05, + "loss": 0.7357, + "step": 6540 + }, + { + "epoch": 3.7335233345208407, + "grad_norm": 2.1473073455155305, + "learning_rate": 1.5780508887110543e-05, + "loss": 0.7359, + "step": 6550 + }, + { + "epoch": 3.7392233701460635, + "grad_norm": 2.0521636868365336, + "learning_rate": 1.5764258113082266e-05, + "loss": 0.733, + "step": 6560 + }, + { + "epoch": 3.744923405771286, + "grad_norm": 2.262153989248003, + "learning_rate": 1.5747984509426528e-05, + "loss": 0.7177, + "step": 6570 + }, + { + "epoch": 3.7506234413965087, + "grad_norm": 2.013177191446144, + "learning_rate": 1.5731688140595737e-05, + "loss": 0.7336, + "step": 6580 + }, + { + "epoch": 3.7563234770217315, + "grad_norm": 2.0304305461425494, + "learning_rate": 1.5715369071132462e-05, + "loss": 0.7237, + "step": 6590 + }, + { + "epoch": 3.762023512646954, + "grad_norm": 2.0998682733697356, + "learning_rate": 1.569902736566918e-05, + "loss": 0.7311, + "step": 6600 + }, + { + "epoch": 3.7677235482721767, + "grad_norm": 2.052004412180822, + 
"learning_rate": 1.5682663088928017e-05, + "loss": 0.7254, + "step": 6610 + }, + { + "epoch": 3.7734235838973995, + "grad_norm": 2.083215696482381, + "learning_rate": 1.5666276305720497e-05, + "loss": 0.7347, + "step": 6620 + }, + { + "epoch": 3.779123619522622, + "grad_norm": 2.1017569504703646, + "learning_rate": 1.564986708094728e-05, + "loss": 0.7287, + "step": 6630 + }, + { + "epoch": 3.7848236551478447, + "grad_norm": 2.1295534422279347, + "learning_rate": 1.5633435479597906e-05, + "loss": 0.7382, + "step": 6640 + }, + { + "epoch": 3.7905236907730675, + "grad_norm": 2.130311845169098, + "learning_rate": 1.561698156675054e-05, + "loss": 0.7284, + "step": 6650 + }, + { + "epoch": 3.79622372639829, + "grad_norm": 2.062585126683027, + "learning_rate": 1.5600505407571706e-05, + "loss": 0.7428, + "step": 6660 + }, + { + "epoch": 3.8019237620235127, + "grad_norm": 2.133891320255428, + "learning_rate": 1.558400706731605e-05, + "loss": 0.7382, + "step": 6670 + }, + { + "epoch": 3.807623797648735, + "grad_norm": 2.0965977555975353, + "learning_rate": 1.5567486611326058e-05, + "loss": 0.731, + "step": 6680 + }, + { + "epoch": 3.813323833273958, + "grad_norm": 2.119618624800062, + "learning_rate": 1.555094410503181e-05, + "loss": 0.7333, + "step": 6690 + }, + { + "epoch": 3.8190238688991807, + "grad_norm": 1.988282420666724, + "learning_rate": 1.5534379613950704e-05, + "loss": 0.7327, + "step": 6700 + }, + { + "epoch": 3.8247239045244035, + "grad_norm": 2.1425468665186562, + "learning_rate": 1.5517793203687232e-05, + "loss": 0.7411, + "step": 6710 + }, + { + "epoch": 3.830423940149626, + "grad_norm": 2.2461283085130828, + "learning_rate": 1.5501184939932685e-05, + "loss": 0.7392, + "step": 6720 + }, + { + "epoch": 3.8361239757748486, + "grad_norm": 2.1980537869605907, + "learning_rate": 1.54845548884649e-05, + "loss": 0.7159, + "step": 6730 + }, + { + "epoch": 3.841824011400071, + "grad_norm": 2.099025441032915, + "learning_rate": 1.5467903115148023e-05, + "loss": 0.7358, + "step": 6740 + }, + { + "epoch": 3.847524047025294, + "grad_norm": 2.144448215533206, + "learning_rate": 1.5451229685932212e-05, + "loss": 0.732, + "step": 6750 + }, + { + "epoch": 3.8532240826505166, + "grad_norm": 2.0678206225034526, + "learning_rate": 1.5434534666853406e-05, + "loss": 0.7294, + "step": 6760 + }, + { + "epoch": 3.8589241182757394, + "grad_norm": 2.3507788100600915, + "learning_rate": 1.541781812403305e-05, + "loss": 0.7368, + "step": 6770 + }, + { + "epoch": 3.864624153900962, + "grad_norm": 2.062589684642445, + "learning_rate": 1.540108012367783e-05, + "loss": 0.7359, + "step": 6780 + }, + { + "epoch": 3.8703241895261846, + "grad_norm": 2.075536540767243, + "learning_rate": 1.538432073207942e-05, + "loss": 0.7237, + "step": 6790 + }, + { + "epoch": 3.876024225151407, + "grad_norm": 2.03594095541657, + "learning_rate": 1.536754001561422e-05, + "loss": 0.7227, + "step": 6800 + }, + { + "epoch": 3.88172426077663, + "grad_norm": 1.999572280341755, + "learning_rate": 1.535073804074307e-05, + "loss": 0.7384, + "step": 6810 + }, + { + "epoch": 3.8874242964018526, + "grad_norm": 2.14843223648859, + "learning_rate": 1.5333914874011025e-05, + "loss": 0.7278, + "step": 6820 + }, + { + "epoch": 3.8931243320270754, + "grad_norm": 2.0127815978780084, + "learning_rate": 1.5317070582047066e-05, + "loss": 0.7352, + "step": 6830 + }, + { + "epoch": 3.898824367652298, + "grad_norm": 2.1449298447570624, + "learning_rate": 1.530020523156383e-05, + "loss": 0.7296, + "step": 6840 + }, + { + "epoch": 3.9045244032775206, + 
"grad_norm": 1.9861876450849654, + "learning_rate": 1.5283318889357367e-05, + "loss": 0.72, + "step": 6850 + }, + { + "epoch": 3.910224438902743, + "grad_norm": 2.1397460467468887, + "learning_rate": 1.5266411622306873e-05, + "loss": 0.7379, + "step": 6860 + }, + { + "epoch": 3.9159244745279658, + "grad_norm": 2.1057354943710958, + "learning_rate": 1.5249483497374403e-05, + "loss": 0.7317, + "step": 6870 + }, + { + "epoch": 3.9216245101531886, + "grad_norm": 1.9498837345150901, + "learning_rate": 1.5232534581604633e-05, + "loss": 0.74, + "step": 6880 + }, + { + "epoch": 3.9273245457784114, + "grad_norm": 2.7851238097023225, + "learning_rate": 1.5215564942124573e-05, + "loss": 0.7305, + "step": 6890 + }, + { + "epoch": 3.9330245814036338, + "grad_norm": 2.19368206819208, + "learning_rate": 1.5198574646143311e-05, + "loss": 0.7415, + "step": 6900 + }, + { + "epoch": 3.9387246170288566, + "grad_norm": 2.1593430708374304, + "learning_rate": 1.5181563760951754e-05, + "loss": 0.7343, + "step": 6910 + }, + { + "epoch": 3.944424652654079, + "grad_norm": 2.1260855448365366, + "learning_rate": 1.516453235392235e-05, + "loss": 0.7305, + "step": 6920 + }, + { + "epoch": 3.9501246882793017, + "grad_norm": 1.9324032551297767, + "learning_rate": 1.5147480492508817e-05, + "loss": 0.7301, + "step": 6930 + }, + { + "epoch": 3.9558247239045246, + "grad_norm": 2.1164826662513105, + "learning_rate": 1.5130408244245893e-05, + "loss": 0.7441, + "step": 6940 + }, + { + "epoch": 3.961524759529747, + "grad_norm": 2.036890857491215, + "learning_rate": 1.5113315676749056e-05, + "loss": 0.7335, + "step": 6950 + }, + { + "epoch": 3.9672247951549697, + "grad_norm": 2.1496177354465345, + "learning_rate": 1.5096202857714261e-05, + "loss": 0.736, + "step": 6960 + }, + { + "epoch": 3.9729248307801925, + "grad_norm": 2.0232753736758253, + "learning_rate": 1.5079069854917666e-05, + "loss": 0.7306, + "step": 6970 + }, + { + "epoch": 3.978624866405415, + "grad_norm": 2.005780628746836, + "learning_rate": 1.5061916736215372e-05, + "loss": 0.7335, + "step": 6980 + }, + { + "epoch": 3.9843249020306377, + "grad_norm": 1.9480858963457948, + "learning_rate": 1.5044743569543147e-05, + "loss": 0.7261, + "step": 6990 + }, + { + "epoch": 3.9900249376558605, + "grad_norm": 2.0513896063025703, + "learning_rate": 1.5027550422916164e-05, + "loss": 0.7319, + "step": 7000 + }, + { + "epoch": 3.995724973281083, + "grad_norm": 2.0474148046725933, + "learning_rate": 1.5010337364428723e-05, + "loss": 0.7394, + "step": 7010 + }, + { + "epoch": 4.001425008906305, + "grad_norm": 1.9251825898575523, + "learning_rate": 1.4993104462253987e-05, + "loss": 0.7275, + "step": 7020 + }, + { + "epoch": 4.007125044531528, + "grad_norm": 1.9744452135994894, + "learning_rate": 1.4975851784643713e-05, + "loss": 0.7152, + "step": 7030 + }, + { + "epoch": 4.012825080156751, + "grad_norm": 2.1433515486563293, + "learning_rate": 1.4958579399927977e-05, + "loss": 0.7165, + "step": 7040 + }, + { + "epoch": 4.018525115781974, + "grad_norm": 2.0710887445851345, + "learning_rate": 1.4941287376514908e-05, + "loss": 0.7102, + "step": 7050 + }, + { + "epoch": 4.0242251514071965, + "grad_norm": 2.0375282509919046, + "learning_rate": 1.4923975782890415e-05, + "loss": 0.7132, + "step": 7060 + }, + { + "epoch": 4.029925187032419, + "grad_norm": 2.097469944887364, + "learning_rate": 1.4906644687617915e-05, + "loss": 0.7147, + "step": 7070 + }, + { + "epoch": 4.035625222657641, + "grad_norm": 2.0530341428336993, + "learning_rate": 1.4889294159338061e-05, + "loss": 0.7158, + "step": 
7080 + }, + { + "epoch": 4.041325258282864, + "grad_norm": 2.142877870616465, + "learning_rate": 1.4871924266768474e-05, + "loss": 0.7045, + "step": 7090 + }, + { + "epoch": 4.047025293908087, + "grad_norm": 2.2153688485348186, + "learning_rate": 1.4854535078703466e-05, + "loss": 0.7176, + "step": 7100 + }, + { + "epoch": 4.05272532953331, + "grad_norm": 2.0916124170647645, + "learning_rate": 1.483712666401377e-05, + "loss": 0.7142, + "step": 7110 + }, + { + "epoch": 4.0584253651585325, + "grad_norm": 2.062529383171578, + "learning_rate": 1.4819699091646272e-05, + "loss": 0.7241, + "step": 7120 + }, + { + "epoch": 4.064125400783755, + "grad_norm": 2.059615898158389, + "learning_rate": 1.4802252430623725e-05, + "loss": 0.7157, + "step": 7130 + }, + { + "epoch": 4.069825436408977, + "grad_norm": 2.048208123372868, + "learning_rate": 1.4784786750044486e-05, + "loss": 0.7154, + "step": 7140 + }, + { + "epoch": 4.0755254720342, + "grad_norm": 2.167028041354583, + "learning_rate": 1.4767302119082243e-05, + "loss": 0.7118, + "step": 7150 + }, + { + "epoch": 4.081225507659423, + "grad_norm": 2.055142808371912, + "learning_rate": 1.4749798606985735e-05, + "loss": 0.7051, + "step": 7160 + }, + { + "epoch": 4.086925543284646, + "grad_norm": 2.11856154255257, + "learning_rate": 1.4732276283078484e-05, + "loss": 0.7143, + "step": 7170 + }, + { + "epoch": 4.0926255789098684, + "grad_norm": 2.103529783943278, + "learning_rate": 1.4714735216758512e-05, + "loss": 0.7151, + "step": 7180 + }, + { + "epoch": 4.098325614535091, + "grad_norm": 2.171001048115003, + "learning_rate": 1.4697175477498074e-05, + "loss": 0.7058, + "step": 7190 + }, + { + "epoch": 4.104025650160313, + "grad_norm": 2.019947806571963, + "learning_rate": 1.4679597134843382e-05, + "loss": 0.7207, + "step": 7200 + }, + { + "epoch": 4.109725685785536, + "grad_norm": 2.0337415519897255, + "learning_rate": 1.4662000258414324e-05, + "loss": 0.7229, + "step": 7210 + }, + { + "epoch": 4.115425721410759, + "grad_norm": 2.1682062034587393, + "learning_rate": 1.4644384917904195e-05, + "loss": 0.7111, + "step": 7220 + }, + { + "epoch": 4.121125757035982, + "grad_norm": 2.045117187428103, + "learning_rate": 1.4626751183079415e-05, + "loss": 0.7205, + "step": 7230 + }, + { + "epoch": 4.126825792661204, + "grad_norm": 2.037782713872047, + "learning_rate": 1.460909912377926e-05, + "loss": 0.7108, + "step": 7240 + }, + { + "epoch": 4.132525828286427, + "grad_norm": 2.1443340636277664, + "learning_rate": 1.4591428809915573e-05, + "loss": 0.7157, + "step": 7250 + }, + { + "epoch": 4.138225863911649, + "grad_norm": 2.094115051994579, + "learning_rate": 1.4573740311472506e-05, + "loss": 0.7122, + "step": 7260 + }, + { + "epoch": 4.143925899536872, + "grad_norm": 2.1551146535510606, + "learning_rate": 1.4556033698506224e-05, + "loss": 0.7211, + "step": 7270 + }, + { + "epoch": 4.149625935162095, + "grad_norm": 2.0963416387595397, + "learning_rate": 1.4538309041144636e-05, + "loss": 0.7222, + "step": 7280 + }, + { + "epoch": 4.155325970787318, + "grad_norm": 2.151941102245385, + "learning_rate": 1.4520566409587118e-05, + "loss": 0.7139, + "step": 7290 + }, + { + "epoch": 4.16102600641254, + "grad_norm": 2.2244832792640348, + "learning_rate": 1.4502805874104237e-05, + "loss": 0.7097, + "step": 7300 + }, + { + "epoch": 4.166726042037762, + "grad_norm": 2.1294605571769893, + "learning_rate": 1.4485027505037464e-05, + "loss": 0.7055, + "step": 7310 + }, + { + "epoch": 4.172426077662985, + "grad_norm": 2.1285692780070793, + "learning_rate": 1.4467231372798905e-05, + 
"loss": 0.7246, + "step": 7320 + }, + { + "epoch": 4.178126113288208, + "grad_norm": 1.963831315280734, + "learning_rate": 1.4449417547871014e-05, + "loss": 0.7241, + "step": 7330 + }, + { + "epoch": 4.183826148913431, + "grad_norm": 2.1043850167694593, + "learning_rate": 1.443158610080632e-05, + "loss": 0.7273, + "step": 7340 + }, + { + "epoch": 4.1895261845386536, + "grad_norm": 2.050815944061624, + "learning_rate": 1.441373710222715e-05, + "loss": 0.7065, + "step": 7350 + }, + { + "epoch": 4.195226220163876, + "grad_norm": 2.151648220360366, + "learning_rate": 1.439587062282533e-05, + "loss": 0.7081, + "step": 7360 + }, + { + "epoch": 4.200926255789098, + "grad_norm": 1.9835522892363455, + "learning_rate": 1.437798673336194e-05, + "loss": 0.7213, + "step": 7370 + }, + { + "epoch": 4.206626291414321, + "grad_norm": 2.0768211194392894, + "learning_rate": 1.4360085504666994e-05, + "loss": 0.7115, + "step": 7380 + }, + { + "epoch": 4.212326327039544, + "grad_norm": 2.1521860372568713, + "learning_rate": 1.4342167007639196e-05, + "loss": 0.7073, + "step": 7390 + }, + { + "epoch": 4.218026362664767, + "grad_norm": 2.05947948843524, + "learning_rate": 1.4324231313245629e-05, + "loss": 0.7124, + "step": 7400 + }, + { + "epoch": 4.2237263982899895, + "grad_norm": 2.069971597086405, + "learning_rate": 1.430627849252149e-05, + "loss": 0.7051, + "step": 7410 + }, + { + "epoch": 4.229426433915212, + "grad_norm": 2.02026677977584, + "learning_rate": 1.4288308616569811e-05, + "loss": 0.7127, + "step": 7420 + }, + { + "epoch": 4.235126469540434, + "grad_norm": 2.276712198730305, + "learning_rate": 1.4270321756561169e-05, + "loss": 0.7189, + "step": 7430 + }, + { + "epoch": 4.240826505165657, + "grad_norm": 2.0748662348756524, + "learning_rate": 1.4252317983733406e-05, + "loss": 0.7076, + "step": 7440 + }, + { + "epoch": 4.24652654079088, + "grad_norm": 2.2204058148387618, + "learning_rate": 1.4234297369391345e-05, + "loss": 0.7144, + "step": 7450 + }, + { + "epoch": 4.252226576416103, + "grad_norm": 2.122075457505512, + "learning_rate": 1.4216259984906522e-05, + "loss": 0.7106, + "step": 7460 + }, + { + "epoch": 4.2579266120413255, + "grad_norm": 2.1374680801145756, + "learning_rate": 1.4198205901716877e-05, + "loss": 0.7251, + "step": 7470 + }, + { + "epoch": 4.263626647666548, + "grad_norm": 2.052440861765808, + "learning_rate": 1.4180135191326498e-05, + "loss": 0.7147, + "step": 7480 + }, + { + "epoch": 4.26932668329177, + "grad_norm": 2.1734320912245457, + "learning_rate": 1.4162047925305318e-05, + "loss": 0.7142, + "step": 7490 + }, + { + "epoch": 4.275026718916993, + "grad_norm": 2.2361008396279543, + "learning_rate": 1.4143944175288846e-05, + "loss": 0.7152, + "step": 7500 + }, + { + "epoch": 4.280726754542216, + "grad_norm": 2.0505673843747227, + "learning_rate": 1.4125824012977871e-05, + "loss": 0.7135, + "step": 7510 + }, + { + "epoch": 4.286426790167439, + "grad_norm": 2.107861616724096, + "learning_rate": 1.4107687510138193e-05, + "loss": 0.7124, + "step": 7520 + }, + { + "epoch": 4.2921268257926615, + "grad_norm": 2.076372674933872, + "learning_rate": 1.408953473860031e-05, + "loss": 0.719, + "step": 7530 + }, + { + "epoch": 4.297826861417883, + "grad_norm": 2.1240116212346734, + "learning_rate": 1.4071365770259175e-05, + "loss": 0.7076, + "step": 7540 + }, + { + "epoch": 4.303526897043106, + "grad_norm": 2.0692593377515944, + "learning_rate": 1.4053180677073877e-05, + "loss": 0.7181, + "step": 7550 + }, + { + "epoch": 4.309226932668329, + "grad_norm": 2.0202823988722702, + 
"learning_rate": 1.403497953106737e-05, + "loss": 0.7278, + "step": 7560 + }, + { + "epoch": 4.314926968293552, + "grad_norm": 2.1221365119576254, + "learning_rate": 1.4016762404326189e-05, + "loss": 0.7179, + "step": 7570 + }, + { + "epoch": 4.320627003918775, + "grad_norm": 2.048229882442656, + "learning_rate": 1.399852936900016e-05, + "loss": 0.7209, + "step": 7580 + }, + { + "epoch": 4.326327039543997, + "grad_norm": 2.1165056652873697, + "learning_rate": 1.3980280497302113e-05, + "loss": 0.7174, + "step": 7590 + }, + { + "epoch": 4.33202707516922, + "grad_norm": 2.0241250324247955, + "learning_rate": 1.39620158615076e-05, + "loss": 0.7197, + "step": 7600 + }, + { + "epoch": 4.337727110794442, + "grad_norm": 2.198418819325663, + "learning_rate": 1.3943735533954612e-05, + "loss": 0.7134, + "step": 7610 + }, + { + "epoch": 4.343427146419665, + "grad_norm": 2.0800472200100857, + "learning_rate": 1.392543958704328e-05, + "loss": 0.7173, + "step": 7620 + }, + { + "epoch": 4.349127182044888, + "grad_norm": 1.985410025478677, + "learning_rate": 1.3907128093235604e-05, + "loss": 0.7125, + "step": 7630 + }, + { + "epoch": 4.354827217670111, + "grad_norm": 1.9651488876336438, + "learning_rate": 1.3888801125055156e-05, + "loss": 0.7149, + "step": 7640 + }, + { + "epoch": 4.360527253295333, + "grad_norm": 2.043158401774305, + "learning_rate": 1.3870458755086793e-05, + "loss": 0.7023, + "step": 7650 + }, + { + "epoch": 4.366227288920555, + "grad_norm": 2.063696570625022, + "learning_rate": 1.3852101055976367e-05, + "loss": 0.7072, + "step": 7660 + }, + { + "epoch": 4.371927324545778, + "grad_norm": 2.2378999921887535, + "learning_rate": 1.3833728100430455e-05, + "loss": 0.7185, + "step": 7670 + }, + { + "epoch": 4.377627360171001, + "grad_norm": 2.2324714292824925, + "learning_rate": 1.3815339961216046e-05, + "loss": 0.7144, + "step": 7680 + }, + { + "epoch": 4.383327395796224, + "grad_norm": 2.010917436631881, + "learning_rate": 1.3796936711160269e-05, + "loss": 0.7162, + "step": 7690 + }, + { + "epoch": 4.389027431421447, + "grad_norm": 2.0127926011934245, + "learning_rate": 1.3778518423150101e-05, + "loss": 0.7119, + "step": 7700 + }, + { + "epoch": 4.394727467046669, + "grad_norm": 2.123951318878207, + "learning_rate": 1.3760085170132076e-05, + "loss": 0.7098, + "step": 7710 + }, + { + "epoch": 4.400427502671891, + "grad_norm": 2.0649137209507504, + "learning_rate": 1.3741637025112e-05, + "loss": 0.7142, + "step": 7720 + }, + { + "epoch": 4.406127538297114, + "grad_norm": 2.1529758269316117, + "learning_rate": 1.3723174061154652e-05, + "loss": 0.7104, + "step": 7730 + }, + { + "epoch": 4.411827573922337, + "grad_norm": 2.215341160229645, + "learning_rate": 1.3704696351383516e-05, + "loss": 0.716, + "step": 7740 + }, + { + "epoch": 4.41752760954756, + "grad_norm": 2.063609107382071, + "learning_rate": 1.3686203968980465e-05, + "loss": 0.7295, + "step": 7750 + }, + { + "epoch": 4.4232276451727826, + "grad_norm": 2.168009767132644, + "learning_rate": 1.3667696987185486e-05, + "loss": 0.7153, + "step": 7760 + }, + { + "epoch": 4.428927680798005, + "grad_norm": 2.1839100011866517, + "learning_rate": 1.3649175479296393e-05, + "loss": 0.7121, + "step": 7770 + }, + { + "epoch": 4.434627716423227, + "grad_norm": 2.1865961035302273, + "learning_rate": 1.3630639518668528e-05, + "loss": 0.7179, + "step": 7780 + }, + { + "epoch": 4.44032775204845, + "grad_norm": 2.1902585443901548, + "learning_rate": 1.3612089178714473e-05, + "loss": 0.7206, + "step": 7790 + }, + { + "epoch": 4.446027787673673, + 
"grad_norm": 2.0700382912342796, + "learning_rate": 1.3593524532903757e-05, + "loss": 0.7262, + "step": 7800 + }, + { + "epoch": 4.451727823298896, + "grad_norm": 1.991885261826358, + "learning_rate": 1.357494565476258e-05, + "loss": 0.7133, + "step": 7810 + }, + { + "epoch": 4.4574278589241185, + "grad_norm": 2.075547581598166, + "learning_rate": 1.3556352617873492e-05, + "loss": 0.7216, + "step": 7820 + }, + { + "epoch": 4.463127894549341, + "grad_norm": 2.069830636023337, + "learning_rate": 1.3537745495875138e-05, + "loss": 0.7069, + "step": 7830 + }, + { + "epoch": 4.468827930174563, + "grad_norm": 2.1761013806481775, + "learning_rate": 1.3519124362461937e-05, + "loss": 0.7175, + "step": 7840 + }, + { + "epoch": 4.474527965799786, + "grad_norm": 2.1477462457811294, + "learning_rate": 1.3500489291383798e-05, + "loss": 0.7019, + "step": 7850 + }, + { + "epoch": 4.480228001425009, + "grad_norm": 2.091553579148087, + "learning_rate": 1.348184035644584e-05, + "loss": 0.7056, + "step": 7860 + }, + { + "epoch": 4.485928037050232, + "grad_norm": 2.175016873764588, + "learning_rate": 1.3463177631508079e-05, + "loss": 0.7065, + "step": 7870 + }, + { + "epoch": 4.4916280726754545, + "grad_norm": 2.0416242410436483, + "learning_rate": 1.3444501190485164e-05, + "loss": 0.7064, + "step": 7880 + }, + { + "epoch": 4.497328108300676, + "grad_norm": 2.047781511840451, + "learning_rate": 1.3425811107346052e-05, + "loss": 0.7087, + "step": 7890 + }, + { + "epoch": 4.503028143925899, + "grad_norm": 2.120940746397696, + "learning_rate": 1.3407107456113737e-05, + "loss": 0.7087, + "step": 7900 + }, + { + "epoch": 4.508728179551122, + "grad_norm": 2.182832991914863, + "learning_rate": 1.3388390310864945e-05, + "loss": 0.7146, + "step": 7910 + }, + { + "epoch": 4.514428215176345, + "grad_norm": 2.0731174610042014, + "learning_rate": 1.3369659745729854e-05, + "loss": 0.7129, + "step": 7920 + }, + { + "epoch": 4.520128250801568, + "grad_norm": 2.2501441238665967, + "learning_rate": 1.3350915834891786e-05, + "loss": 0.7174, + "step": 7930 + }, + { + "epoch": 4.5258282864267905, + "grad_norm": 2.0216070729039166, + "learning_rate": 1.333215865258692e-05, + "loss": 0.7129, + "step": 7940 + }, + { + "epoch": 4.531528322052013, + "grad_norm": 1.9716477987035963, + "learning_rate": 1.3313388273103999e-05, + "loss": 0.7152, + "step": 7950 + }, + { + "epoch": 4.537228357677235, + "grad_norm": 2.0783658817431285, + "learning_rate": 1.3294604770784035e-05, + "loss": 0.7132, + "step": 7960 + }, + { + "epoch": 4.542928393302458, + "grad_norm": 2.261698541426297, + "learning_rate": 1.3275808220020006e-05, + "loss": 0.7131, + "step": 7970 + }, + { + "epoch": 4.548628428927681, + "grad_norm": 2.1347742048994487, + "learning_rate": 1.3256998695256578e-05, + "loss": 0.7175, + "step": 7980 + }, + { + "epoch": 4.554328464552904, + "grad_norm": 2.140168139545353, + "learning_rate": 1.32381762709898e-05, + "loss": 0.7203, + "step": 7990 + }, + { + "epoch": 4.560028500178126, + "grad_norm": 2.1547186591948493, + "learning_rate": 1.3219341021766803e-05, + "loss": 0.7064, + "step": 8000 + }, + { + "epoch": 4.565728535803348, + "grad_norm": 2.1008597696102562, + "learning_rate": 1.3200493022185525e-05, + "loss": 0.7195, + "step": 8010 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 2.184478573038733, + "learning_rate": 1.3181632346894388e-05, + "loss": 0.7171, + "step": 8020 + }, + { + "epoch": 4.577128607053794, + "grad_norm": 2.0573951158453125, + "learning_rate": 1.3162759070592024e-05, + "loss": 0.7108, + "step": 8030 + }, + { 
+ "epoch": 4.582828642679017, + "grad_norm": 2.2271090428071085, + "learning_rate": 1.314387326802697e-05, + "loss": 0.7104, + "step": 8040 + }, + { + "epoch": 4.58852867830424, + "grad_norm": 2.122074817755831, + "learning_rate": 1.312497501399738e-05, + "loss": 0.712, + "step": 8050 + }, + { + "epoch": 4.594228713929462, + "grad_norm": 2.0920821005791455, + "learning_rate": 1.3106064383350715e-05, + "loss": 0.7187, + "step": 8060 + }, + { + "epoch": 4.599928749554684, + "grad_norm": 2.116249375789271, + "learning_rate": 1.308714145098345e-05, + "loss": 0.7225, + "step": 8070 + }, + { + "epoch": 4.605628785179907, + "grad_norm": 2.136803566912254, + "learning_rate": 1.3068206291840799e-05, + "loss": 0.7096, + "step": 8080 + }, + { + "epoch": 4.61132882080513, + "grad_norm": 2.124886312634362, + "learning_rate": 1.3049258980916387e-05, + "loss": 0.717, + "step": 8090 + }, + { + "epoch": 4.617028856430353, + "grad_norm": 2.121467934682898, + "learning_rate": 1.3030299593251964e-05, + "loss": 0.7188, + "step": 8100 + }, + { + "epoch": 4.622728892055576, + "grad_norm": 1.9399148435216957, + "learning_rate": 1.3011328203937121e-05, + "loss": 0.7071, + "step": 8110 + }, + { + "epoch": 4.628428927680798, + "grad_norm": 2.154407536044687, + "learning_rate": 1.2992344888108981e-05, + "loss": 0.7141, + "step": 8120 + }, + { + "epoch": 4.63412896330602, + "grad_norm": 2.032093437381081, + "learning_rate": 1.297334972095189e-05, + "loss": 0.713, + "step": 8130 + }, + { + "epoch": 4.639828998931243, + "grad_norm": 2.0495210294018436, + "learning_rate": 1.2954342777697152e-05, + "loss": 0.7026, + "step": 8140 + }, + { + "epoch": 4.645529034556466, + "grad_norm": 2.1069899723792616, + "learning_rate": 1.2935324133622688e-05, + "loss": 0.7108, + "step": 8150 + }, + { + "epoch": 4.651229070181689, + "grad_norm": 2.1144338292156792, + "learning_rate": 1.291629386405278e-05, + "loss": 0.7091, + "step": 8160 + }, + { + "epoch": 4.6569291058069116, + "grad_norm": 2.0212838390369585, + "learning_rate": 1.2897252044357745e-05, + "loss": 0.7193, + "step": 8170 + }, + { + "epoch": 4.662629141432134, + "grad_norm": 2.100544352290128, + "learning_rate": 1.2878198749953642e-05, + "loss": 0.7222, + "step": 8180 + }, + { + "epoch": 4.668329177057356, + "grad_norm": 2.004292862457442, + "learning_rate": 1.285913405630198e-05, + "loss": 0.71, + "step": 8190 + }, + { + "epoch": 4.674029212682579, + "grad_norm": 2.066805788673636, + "learning_rate": 1.2840058038909415e-05, + "loss": 0.7058, + "step": 8200 + }, + { + "epoch": 4.679729248307802, + "grad_norm": 2.1905106670861714, + "learning_rate": 1.2820970773327456e-05, + "loss": 0.7139, + "step": 8210 + }, + { + "epoch": 4.685429283933025, + "grad_norm": 2.113303850774233, + "learning_rate": 1.2801872335152152e-05, + "loss": 0.7075, + "step": 8220 + }, + { + "epoch": 4.6911293195582475, + "grad_norm": 2.1795819458323464, + "learning_rate": 1.2782762800023806e-05, + "loss": 0.7069, + "step": 8230 + }, + { + "epoch": 4.6968293551834694, + "grad_norm": 2.1289949423759658, + "learning_rate": 1.2763642243626668e-05, + "loss": 0.7087, + "step": 8240 + }, + { + "epoch": 4.702529390808692, + "grad_norm": 2.3383021717285266, + "learning_rate": 1.2744510741688648e-05, + "loss": 0.7047, + "step": 8250 + }, + { + "epoch": 4.708229426433915, + "grad_norm": 2.0055553076818424, + "learning_rate": 1.2725368369980996e-05, + "loss": 0.7204, + "step": 8260 + }, + { + "epoch": 4.713929462059138, + "grad_norm": 2.1654348631523486, + "learning_rate": 1.270621520431801e-05, + "loss": 0.7111, + 
"step": 8270 + }, + { + "epoch": 4.719629497684361, + "grad_norm": 2.1644826955051664, + "learning_rate": 1.2687051320556751e-05, + "loss": 0.7067, + "step": 8280 + }, + { + "epoch": 4.7253295333095835, + "grad_norm": 2.089047757071869, + "learning_rate": 1.2667876794596721e-05, + "loss": 0.7161, + "step": 8290 + }, + { + "epoch": 4.731029568934806, + "grad_norm": 2.0039738679317507, + "learning_rate": 1.2648691702379568e-05, + "loss": 0.7004, + "step": 8300 + }, + { + "epoch": 4.736729604560028, + "grad_norm": 2.2125009851795343, + "learning_rate": 1.2629496119888795e-05, + "loss": 0.699, + "step": 8310 + }, + { + "epoch": 4.742429640185251, + "grad_norm": 2.187645275278001, + "learning_rate": 1.2610290123149454e-05, + "loss": 0.7087, + "step": 8320 + }, + { + "epoch": 4.748129675810474, + "grad_norm": 2.1075929722847757, + "learning_rate": 1.2591073788227827e-05, + "loss": 0.7161, + "step": 8330 + }, + { + "epoch": 4.753829711435697, + "grad_norm": 2.0657586426111414, + "learning_rate": 1.257184719123117e-05, + "loss": 0.7142, + "step": 8340 + }, + { + "epoch": 4.7595297470609195, + "grad_norm": 2.085585284929549, + "learning_rate": 1.2552610408307348e-05, + "loss": 0.7126, + "step": 8350 + }, + { + "epoch": 4.765229782686141, + "grad_norm": 2.017479984454758, + "learning_rate": 1.2533363515644595e-05, + "loss": 0.7135, + "step": 8360 + }, + { + "epoch": 4.770929818311364, + "grad_norm": 2.1270569310907774, + "learning_rate": 1.2514106589471169e-05, + "loss": 0.7187, + "step": 8370 + }, + { + "epoch": 4.776629853936587, + "grad_norm": 2.08061530850382, + "learning_rate": 1.2494839706055075e-05, + "loss": 0.7017, + "step": 8380 + }, + { + "epoch": 4.78232988956181, + "grad_norm": 2.0904751797462446, + "learning_rate": 1.2475562941703755e-05, + "loss": 0.7078, + "step": 8390 + }, + { + "epoch": 4.788029925187033, + "grad_norm": 2.079199224404677, + "learning_rate": 1.2456276372763776e-05, + "loss": 0.7135, + "step": 8400 + }, + { + "epoch": 4.793729960812255, + "grad_norm": 1.9760702520656304, + "learning_rate": 1.2436980075620543e-05, + "loss": 0.7227, + "step": 8410 + }, + { + "epoch": 4.799429996437477, + "grad_norm": 2.0407360391918266, + "learning_rate": 1.2417674126697989e-05, + "loss": 0.7125, + "step": 8420 + }, + { + "epoch": 4.8051300320627, + "grad_norm": 2.0872991926867575, + "learning_rate": 1.2398358602458275e-05, + "loss": 0.7094, + "step": 8430 + }, + { + "epoch": 4.810830067687923, + "grad_norm": 2.075605380407668, + "learning_rate": 1.2379033579401483e-05, + "loss": 0.7029, + "step": 8440 + }, + { + "epoch": 4.816530103313146, + "grad_norm": 2.0695319205546863, + "learning_rate": 1.2359699134065316e-05, + "loss": 0.7028, + "step": 8450 + }, + { + "epoch": 4.822230138938369, + "grad_norm": 2.142832073636643, + "learning_rate": 1.2340355343024793e-05, + "loss": 0.709, + "step": 8460 + }, + { + "epoch": 4.8279301745635905, + "grad_norm": 2.050091950243262, + "learning_rate": 1.2321002282891952e-05, + "loss": 0.7006, + "step": 8470 + }, + { + "epoch": 4.833630210188813, + "grad_norm": 2.024439862175209, + "learning_rate": 1.2301640030315537e-05, + "loss": 0.7033, + "step": 8480 + }, + { + "epoch": 4.839330245814036, + "grad_norm": 2.0787794513253544, + "learning_rate": 1.2282268661980697e-05, + "loss": 0.7032, + "step": 8490 + }, + { + "epoch": 4.845030281439259, + "grad_norm": 2.174009548757474, + "learning_rate": 1.2262888254608691e-05, + "loss": 0.6981, + "step": 8500 + }, + { + "epoch": 4.850730317064482, + "grad_norm": 2.0765315091697043, + "learning_rate": 
1.2243498884956578e-05, + "loss": 0.7065, + "step": 8510 + }, + { + "epoch": 4.856430352689705, + "grad_norm": 2.038584770594007, + "learning_rate": 1.2224100629816905e-05, + "loss": 0.7143, + "step": 8520 + }, + { + "epoch": 4.862130388314927, + "grad_norm": 2.0005800600511168, + "learning_rate": 1.2204693566017417e-05, + "loss": 0.7185, + "step": 8530 + }, + { + "epoch": 4.867830423940149, + "grad_norm": 2.1842325091967703, + "learning_rate": 1.2185277770420739e-05, + "loss": 0.7134, + "step": 8540 + }, + { + "epoch": 4.873530459565372, + "grad_norm": 2.130842764618701, + "learning_rate": 1.2165853319924088e-05, + "loss": 0.7213, + "step": 8550 + }, + { + "epoch": 4.879230495190595, + "grad_norm": 2.1578893100012078, + "learning_rate": 1.2146420291458954e-05, + "loss": 0.7084, + "step": 8560 + }, + { + "epoch": 4.884930530815818, + "grad_norm": 2.1830068437308934, + "learning_rate": 1.21269787619908e-05, + "loss": 0.7061, + "step": 8570 + }, + { + "epoch": 4.8906305664410405, + "grad_norm": 2.0507704117164516, + "learning_rate": 1.2107528808518756e-05, + "loss": 0.7121, + "step": 8580 + }, + { + "epoch": 4.8963306020662625, + "grad_norm": 2.2296960181873975, + "learning_rate": 1.2088070508075325e-05, + "loss": 0.7134, + "step": 8590 + }, + { + "epoch": 4.902030637691485, + "grad_norm": 2.2294466692347052, + "learning_rate": 1.2068603937726057e-05, + "loss": 0.7018, + "step": 8600 + }, + { + "epoch": 4.907730673316708, + "grad_norm": 2.3162681168819486, + "learning_rate": 1.2049129174569261e-05, + "loss": 0.7129, + "step": 8610 + }, + { + "epoch": 4.913430708941931, + "grad_norm": 2.2283898023173556, + "learning_rate": 1.2029646295735694e-05, + "loss": 0.7033, + "step": 8620 + }, + { + "epoch": 4.919130744567154, + "grad_norm": 2.1019985515460213, + "learning_rate": 1.2010155378388253e-05, + "loss": 0.7102, + "step": 8630 + }, + { + "epoch": 4.9248307801923765, + "grad_norm": 2.104644017177413, + "learning_rate": 1.1990656499721673e-05, + "loss": 0.7059, + "step": 8640 + }, + { + "epoch": 4.930530815817599, + "grad_norm": 2.1752936170393844, + "learning_rate": 1.1971149736962229e-05, + "loss": 0.7019, + "step": 8650 + }, + { + "epoch": 4.936230851442821, + "grad_norm": 2.1210263601772774, + "learning_rate": 1.1951635167367403e-05, + "loss": 0.7094, + "step": 8660 + }, + { + "epoch": 4.941930887068044, + "grad_norm": 2.1174594655618657, + "learning_rate": 1.1932112868225613e-05, + "loss": 0.7186, + "step": 8670 + }, + { + "epoch": 4.947630922693267, + "grad_norm": 2.0085889940321304, + "learning_rate": 1.1912582916855883e-05, + "loss": 0.7129, + "step": 8680 + }, + { + "epoch": 4.95333095831849, + "grad_norm": 2.1847254708548682, + "learning_rate": 1.1893045390607542e-05, + "loss": 0.7101, + "step": 8690 + }, + { + "epoch": 4.9590309939437125, + "grad_norm": 2.1005181730831346, + "learning_rate": 1.1873500366859925e-05, + "loss": 0.7117, + "step": 8700 + }, + { + "epoch": 4.964731029568934, + "grad_norm": 2.197303113242465, + "learning_rate": 1.1853947923022057e-05, + "loss": 0.7124, + "step": 8710 + }, + { + "epoch": 4.970431065194157, + "grad_norm": 2.119459567257772, + "learning_rate": 1.1834388136532358e-05, + "loss": 0.7061, + "step": 8720 + }, + { + "epoch": 4.97613110081938, + "grad_norm": 2.106855530802309, + "learning_rate": 1.1814821084858315e-05, + "loss": 0.7056, + "step": 8730 + }, + { + "epoch": 4.981831136444603, + "grad_norm": 2.1509371468107608, + "learning_rate": 1.1795246845496205e-05, + "loss": 0.6997, + "step": 8740 + }, + { + "epoch": 4.987531172069826, + "grad_norm": 
2.0890300669870783, + "learning_rate": 1.1775665495970756e-05, + "loss": 0.7088, + "step": 8750 + }, + { + "epoch": 4.9932312076950485, + "grad_norm": 2.132056711702782, + "learning_rate": 1.1756077113834873e-05, + "loss": 0.705, + "step": 8760 + }, + { + "epoch": 4.99893124332027, + "grad_norm": 2.153283209947663, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.7137, + "step": 8770 + }, + { + "epoch": 5.004631278945493, + "grad_norm": 2.1163261553034682, + "learning_rate": 1.1716879562082343e-05, + "loss": 0.6993, + "step": 8780 + }, + { + "epoch": 5.010331314570716, + "grad_norm": 2.1583328617449897, + "learning_rate": 1.1697270547709527e-05, + "loss": 0.6859, + "step": 8790 + }, + { + "epoch": 5.016031350195939, + "grad_norm": 2.1781625538447145, + "learning_rate": 1.1677654811213316e-05, + "loss": 0.6986, + "step": 8800 + }, + { + "epoch": 5.021731385821162, + "grad_norm": 2.12558766978219, + "learning_rate": 1.16580324302828e-05, + "loss": 0.6982, + "step": 8810 + }, + { + "epoch": 5.027431421446384, + "grad_norm": 2.176681743446634, + "learning_rate": 1.1638403482633383e-05, + "loss": 0.7033, + "step": 8820 + }, + { + "epoch": 5.033131457071606, + "grad_norm": 2.079820413393339, + "learning_rate": 1.1618768046006476e-05, + "loss": 0.673, + "step": 8830 + }, + { + "epoch": 5.038831492696829, + "grad_norm": 2.0700636259608696, + "learning_rate": 1.1599126198169196e-05, + "loss": 0.6934, + "step": 8840 + }, + { + "epoch": 5.044531528322052, + "grad_norm": 2.151973353252644, + "learning_rate": 1.1579478016914038e-05, + "loss": 0.6989, + "step": 8850 + }, + { + "epoch": 5.050231563947275, + "grad_norm": 2.129485987400305, + "learning_rate": 1.1559823580058591e-05, + "loss": 0.6962, + "step": 8860 + }, + { + "epoch": 5.055931599572498, + "grad_norm": 2.1772520549019405, + "learning_rate": 1.1540162965445224e-05, + "loss": 0.6789, + "step": 8870 + }, + { + "epoch": 5.06163163519772, + "grad_norm": 2.1068284615452515, + "learning_rate": 1.152049625094076e-05, + "loss": 0.6963, + "step": 8880 + }, + { + "epoch": 5.067331670822942, + "grad_norm": 2.1446212460776137, + "learning_rate": 1.150082351443619e-05, + "loss": 0.6852, + "step": 8890 + }, + { + "epoch": 5.073031706448165, + "grad_norm": 2.1765018405494434, + "learning_rate": 1.1481144833846358e-05, + "loss": 0.6904, + "step": 8900 + }, + { + "epoch": 5.078731742073388, + "grad_norm": 2.2251286404486788, + "learning_rate": 1.146146028710964e-05, + "loss": 0.6896, + "step": 8910 + }, + { + "epoch": 5.084431777698611, + "grad_norm": 2.0897124016602113, + "learning_rate": 1.144176995218765e-05, + "loss": 0.6951, + "step": 8920 + }, + { + "epoch": 5.090131813323834, + "grad_norm": 2.1995547700271887, + "learning_rate": 1.1422073907064932e-05, + "loss": 0.6918, + "step": 8930 + }, + { + "epoch": 5.0958318489490555, + "grad_norm": 2.3209537414628434, + "learning_rate": 1.1402372229748635e-05, + "loss": 0.6855, + "step": 8940 + }, + { + "epoch": 5.101531884574278, + "grad_norm": 2.0555932442118343, + "learning_rate": 1.1382664998268222e-05, + "loss": 0.6899, + "step": 8950 + }, + { + "epoch": 5.107231920199501, + "grad_norm": 2.044771606602399, + "learning_rate": 1.1362952290675153e-05, + "loss": 0.6946, + "step": 8960 + }, + { + "epoch": 5.112931955824724, + "grad_norm": 2.061837650585162, + "learning_rate": 1.1343234185042575e-05, + "loss": 0.69, + "step": 8970 + }, + { + "epoch": 5.118631991449947, + "grad_norm": 2.151710810846666, + "learning_rate": 1.1323510759465012e-05, + "loss": 0.6932, + "step": 8980 + }, + { + "epoch": 
5.1243320270751695, + "grad_norm": 2.1434546309764704, + "learning_rate": 1.1303782092058062e-05, + "loss": 0.695, + "step": 8990 + }, + { + "epoch": 5.1300320627003915, + "grad_norm": 2.1143478500205597, + "learning_rate": 1.1284048260958076e-05, + "loss": 0.6916, + "step": 9000 + }, + { + "epoch": 5.135732098325614, + "grad_norm": 2.1239759708768737, + "learning_rate": 1.126430934432187e-05, + "loss": 0.7005, + "step": 9010 + }, + { + "epoch": 5.141432133950837, + "grad_norm": 2.0615509246767463, + "learning_rate": 1.1244565420326388e-05, + "loss": 0.6972, + "step": 9020 + }, + { + "epoch": 5.14713216957606, + "grad_norm": 2.2157339698853917, + "learning_rate": 1.1224816567168413e-05, + "loss": 0.6872, + "step": 9030 + }, + { + "epoch": 5.152832205201283, + "grad_norm": 2.1414826849096045, + "learning_rate": 1.1205062863064247e-05, + "loss": 0.7017, + "step": 9040 + }, + { + "epoch": 5.1585322408265055, + "grad_norm": 2.0947592953491916, + "learning_rate": 1.1185304386249405e-05, + "loss": 0.6879, + "step": 9050 + }, + { + "epoch": 5.164232276451727, + "grad_norm": 2.241240477683422, + "learning_rate": 1.1165541214978306e-05, + "loss": 0.6969, + "step": 9060 + }, + { + "epoch": 5.16993231207695, + "grad_norm": 2.195267971963587, + "learning_rate": 1.1145773427523963e-05, + "loss": 0.6879, + "step": 9070 + }, + { + "epoch": 5.175632347702173, + "grad_norm": 2.2669294865119647, + "learning_rate": 1.1126001102177667e-05, + "loss": 0.7013, + "step": 9080 + }, + { + "epoch": 5.181332383327396, + "grad_norm": 2.123451159279259, + "learning_rate": 1.1106224317248682e-05, + "loss": 0.693, + "step": 9090 + }, + { + "epoch": 5.187032418952619, + "grad_norm": 2.236333085476668, + "learning_rate": 1.108644315106394e-05, + "loss": 0.7021, + "step": 9100 + }, + { + "epoch": 5.1927324545778415, + "grad_norm": 2.092129797924252, + "learning_rate": 1.1066657681967723e-05, + "loss": 0.6828, + "step": 9110 + }, + { + "epoch": 5.198432490203063, + "grad_norm": 2.130859878723095, + "learning_rate": 1.1046867988321349e-05, + "loss": 0.6917, + "step": 9120 + }, + { + "epoch": 5.204132525828286, + "grad_norm": 2.0855014986112743, + "learning_rate": 1.102707414850287e-05, + "loss": 0.7009, + "step": 9130 + }, + { + "epoch": 5.209832561453509, + "grad_norm": 2.221078070968992, + "learning_rate": 1.100727624090677e-05, + "loss": 0.6877, + "step": 9140 + }, + { + "epoch": 5.215532597078732, + "grad_norm": 2.1677759083316617, + "learning_rate": 1.0987474343943625e-05, + "loss": 0.6892, + "step": 9150 + }, + { + "epoch": 5.221232632703955, + "grad_norm": 2.1121478588367304, + "learning_rate": 1.0967668536039828e-05, + "loss": 0.686, + "step": 9160 + }, + { + "epoch": 5.2269326683291775, + "grad_norm": 2.140684147466254, + "learning_rate": 1.0947858895637255e-05, + "loss": 0.7028, + "step": 9170 + }, + { + "epoch": 5.232632703954399, + "grad_norm": 2.1807307861135676, + "learning_rate": 1.0928045501192952e-05, + "loss": 0.6941, + "step": 9180 + }, + { + "epoch": 5.238332739579622, + "grad_norm": 2.208380922343372, + "learning_rate": 1.0908228431178847e-05, + "loss": 0.7014, + "step": 9190 + }, + { + "epoch": 5.244032775204845, + "grad_norm": 2.1946065233509287, + "learning_rate": 1.0888407764081416e-05, + "loss": 0.6871, + "step": 9200 + }, + { + "epoch": 5.249732810830068, + "grad_norm": 2.135339588212707, + "learning_rate": 1.0868583578401391e-05, + "loss": 0.6838, + "step": 9210 + }, + { + "epoch": 5.255432846455291, + "grad_norm": 2.2183442641416327, + "learning_rate": 1.0848755952653426e-05, + "loss": 0.7054, + 
"step": 9220 + }, + { + "epoch": 5.261132882080513, + "grad_norm": 2.107836660256195, + "learning_rate": 1.0828924965365814e-05, + "loss": 0.6933, + "step": 9230 + }, + { + "epoch": 5.266832917705735, + "grad_norm": 2.2166875612836248, + "learning_rate": 1.0809090695080148e-05, + "loss": 0.686, + "step": 9240 + }, + { + "epoch": 5.272532953330958, + "grad_norm": 2.104650506590125, + "learning_rate": 1.0789253220351035e-05, + "loss": 0.6979, + "step": 9250 + }, + { + "epoch": 5.278232988956181, + "grad_norm": 2.0369346584749244, + "learning_rate": 1.0769412619745762e-05, + "loss": 0.6888, + "step": 9260 + }, + { + "epoch": 5.283933024581404, + "grad_norm": 2.182316976219354, + "learning_rate": 1.0749568971844011e-05, + "loss": 0.6962, + "step": 9270 + }, + { + "epoch": 5.289633060206627, + "grad_norm": 2.08731443365099, + "learning_rate": 1.0729722355237519e-05, + "loss": 0.6888, + "step": 9280 + }, + { + "epoch": 5.2953330958318485, + "grad_norm": 2.07024796554627, + "learning_rate": 1.0709872848529787e-05, + "loss": 0.6942, + "step": 9290 + }, + { + "epoch": 5.301033131457071, + "grad_norm": 2.135528373311544, + "learning_rate": 1.0690020530335764e-05, + "loss": 0.6944, + "step": 9300 + }, + { + "epoch": 5.306733167082294, + "grad_norm": 2.0853893054463732, + "learning_rate": 1.0670165479281522e-05, + "loss": 0.6955, + "step": 9310 + }, + { + "epoch": 5.312433202707517, + "grad_norm": 2.1518235426488115, + "learning_rate": 1.065030777400398e-05, + "loss": 0.6836, + "step": 9320 + }, + { + "epoch": 5.31813323833274, + "grad_norm": 2.2237139366167478, + "learning_rate": 1.0630447493150547e-05, + "loss": 0.7044, + "step": 9330 + }, + { + "epoch": 5.323833273957963, + "grad_norm": 2.213573275816285, + "learning_rate": 1.0610584715378843e-05, + "loss": 0.6893, + "step": 9340 + }, + { + "epoch": 5.3295333095831845, + "grad_norm": 2.1970195247994906, + "learning_rate": 1.0590719519356373e-05, + "loss": 0.7029, + "step": 9350 + }, + { + "epoch": 5.335233345208407, + "grad_norm": 2.1836535830369037, + "learning_rate": 1.0570851983760228e-05, + "loss": 0.6918, + "step": 9360 + }, + { + "epoch": 5.34093338083363, + "grad_norm": 2.2205086474880513, + "learning_rate": 1.0550982187276752e-05, + "loss": 0.6962, + "step": 9370 + }, + { + "epoch": 5.346633416458853, + "grad_norm": 2.0062208831981163, + "learning_rate": 1.0531110208601254e-05, + "loss": 0.7039, + "step": 9380 + }, + { + "epoch": 5.352333452084076, + "grad_norm": 2.197549239307711, + "learning_rate": 1.0511236126437682e-05, + "loss": 0.6922, + "step": 9390 + }, + { + "epoch": 5.3580334877092985, + "grad_norm": 2.1844139260865623, + "learning_rate": 1.0491360019498312e-05, + "loss": 0.6929, + "step": 9400 + }, + { + "epoch": 5.3637335233345205, + "grad_norm": 2.2073816657140957, + "learning_rate": 1.0471481966503446e-05, + "loss": 0.6905, + "step": 9410 + }, + { + "epoch": 5.369433558959743, + "grad_norm": 2.0734746928445227, + "learning_rate": 1.0451602046181084e-05, + "loss": 0.6809, + "step": 9420 + }, + { + "epoch": 5.375133594584966, + "grad_norm": 2.0981553256945515, + "learning_rate": 1.0431720337266632e-05, + "loss": 0.6954, + "step": 9430 + }, + { + "epoch": 5.380833630210189, + "grad_norm": 1.9956443908942936, + "learning_rate": 1.0411836918502573e-05, + "loss": 0.6869, + "step": 9440 + }, + { + "epoch": 5.386533665835412, + "grad_norm": 2.160809451240416, + "learning_rate": 1.0391951868638167e-05, + "loss": 0.6908, + "step": 9450 + }, + { + "epoch": 5.3922337014606345, + "grad_norm": 2.1171495609558932, + "learning_rate": 
1.0372065266429124e-05, + "loss": 0.693, + "step": 9460 + }, + { + "epoch": 5.397933737085856, + "grad_norm": 2.1299349506902123, + "learning_rate": 1.0352177190637315e-05, + "loss": 0.6885, + "step": 9470 + }, + { + "epoch": 5.403633772711079, + "grad_norm": 2.2267815303969356, + "learning_rate": 1.0332287720030442e-05, + "loss": 0.6905, + "step": 9480 + }, + { + "epoch": 5.409333808336302, + "grad_norm": 2.173479864292032, + "learning_rate": 1.0312396933381728e-05, + "loss": 0.6978, + "step": 9490 + }, + { + "epoch": 5.415033843961525, + "grad_norm": 2.0878022848536077, + "learning_rate": 1.0292504909469612e-05, + "loss": 0.6881, + "step": 9500 + }, + { + "epoch": 5.420733879586748, + "grad_norm": 2.2809403605624077, + "learning_rate": 1.0272611727077426e-05, + "loss": 0.691, + "step": 9510 + }, + { + "epoch": 5.42643391521197, + "grad_norm": 2.09129513356794, + "learning_rate": 1.0252717464993105e-05, + "loss": 0.6909, + "step": 9520 + }, + { + "epoch": 5.432133950837192, + "grad_norm": 2.1587358103587326, + "learning_rate": 1.0232822202008845e-05, + "loss": 0.6877, + "step": 9530 + }, + { + "epoch": 5.437833986462415, + "grad_norm": 2.0969324770102014, + "learning_rate": 1.0212926016920816e-05, + "loss": 0.6854, + "step": 9540 + }, + { + "epoch": 5.443534022087638, + "grad_norm": 2.2219797664048375, + "learning_rate": 1.019302898852884e-05, + "loss": 0.6834, + "step": 9550 + }, + { + "epoch": 5.449234057712861, + "grad_norm": 2.245112065626013, + "learning_rate": 1.0173131195636068e-05, + "loss": 0.6935, + "step": 9560 + }, + { + "epoch": 5.454934093338084, + "grad_norm": 2.207368160265503, + "learning_rate": 1.0153232717048686e-05, + "loss": 0.6978, + "step": 9570 + }, + { + "epoch": 5.4606341289633065, + "grad_norm": 2.1020563924041133, + "learning_rate": 1.0133333631575606e-05, + "loss": 0.694, + "step": 9580 + }, + { + "epoch": 5.466334164588528, + "grad_norm": 2.125673123994935, + "learning_rate": 1.0113434018028124e-05, + "loss": 0.6774, + "step": 9590 + }, + { + "epoch": 5.472034200213751, + "grad_norm": 2.136834440229622, + "learning_rate": 1.0093533955219639e-05, + "loss": 0.6924, + "step": 9600 + }, + { + "epoch": 5.477734235838974, + "grad_norm": 2.158621618784402, + "learning_rate": 1.0073633521965334e-05, + "loss": 0.6998, + "step": 9610 + }, + { + "epoch": 5.483434271464197, + "grad_norm": 2.2025522074206116, + "learning_rate": 1.0053732797081843e-05, + "loss": 0.6927, + "step": 9620 + }, + { + "epoch": 5.48913430708942, + "grad_norm": 2.171821093820873, + "learning_rate": 1.003383185938697e-05, + "loss": 0.6846, + "step": 9630 + }, + { + "epoch": 5.4948343427146416, + "grad_norm": 2.384577160611531, + "learning_rate": 1.0013930787699358e-05, + "loss": 0.6924, + "step": 9640 + }, + { + "epoch": 5.500534378339864, + "grad_norm": 2.122322707377548, + "learning_rate": 9.994029660838175e-06, + "loss": 0.6988, + "step": 9650 + }, + { + "epoch": 5.506234413965087, + "grad_norm": 2.1255532435570084, + "learning_rate": 9.974128557622814e-06, + "loss": 0.6856, + "step": 9660 + }, + { + "epoch": 5.51193444959031, + "grad_norm": 2.2432791993701904, + "learning_rate": 9.95422755687257e-06, + "loss": 0.6862, + "step": 9670 + }, + { + "epoch": 5.517634485215533, + "grad_norm": 2.157704074197768, + "learning_rate": 9.934326737406338e-06, + "loss": 0.6937, + "step": 9680 + }, + { + "epoch": 5.523334520840756, + "grad_norm": 2.132689938935754, + "learning_rate": 9.91442617804229e-06, + "loss": 0.6912, + "step": 9690 + }, + { + "epoch": 5.529034556465978, + "grad_norm": 2.086298565136052, 
+ "learning_rate": 9.894525957597566e-06, + "loss": 0.703, + "step": 9700 + }, + { + "epoch": 5.5347345920912, + "grad_norm": 2.1735739979051765, + "learning_rate": 9.87462615488797e-06, + "loss": 0.6823, + "step": 9710 + }, + { + "epoch": 5.540434627716423, + "grad_norm": 2.2046428225180508, + "learning_rate": 9.854726848727645e-06, + "loss": 0.6936, + "step": 9720 + }, + { + "epoch": 5.546134663341646, + "grad_norm": 2.119617171719753, + "learning_rate": 9.834828117928776e-06, + "loss": 0.6909, + "step": 9730 + }, + { + "epoch": 5.551834698966869, + "grad_norm": 2.1771878721565594, + "learning_rate": 9.81493004130126e-06, + "loss": 0.7087, + "step": 9740 + }, + { + "epoch": 5.557534734592092, + "grad_norm": 2.097542061204044, + "learning_rate": 9.795032697652408e-06, + "loss": 0.6869, + "step": 9750 + }, + { + "epoch": 5.5632347702173135, + "grad_norm": 2.060454397168932, + "learning_rate": 9.775136165786626e-06, + "loss": 0.6975, + "step": 9760 + }, + { + "epoch": 5.568934805842536, + "grad_norm": 2.220837079068152, + "learning_rate": 9.755240524505107e-06, + "loss": 0.7019, + "step": 9770 + }, + { + "epoch": 5.574634841467759, + "grad_norm": 2.161939035304091, + "learning_rate": 9.735345852605519e-06, + "loss": 0.6906, + "step": 9780 + }, + { + "epoch": 5.580334877092982, + "grad_norm": 2.1577219592337507, + "learning_rate": 9.715452228881683e-06, + "loss": 0.7011, + "step": 9790 + }, + { + "epoch": 5.586034912718205, + "grad_norm": 2.191829870908184, + "learning_rate": 9.695559732123275e-06, + "loss": 0.6822, + "step": 9800 + }, + { + "epoch": 5.5917349483434275, + "grad_norm": 2.2162513440456593, + "learning_rate": 9.675668441115503e-06, + "loss": 0.6941, + "step": 9810 + }, + { + "epoch": 5.5974349839686495, + "grad_norm": 2.108945629779053, + "learning_rate": 9.655778434638807e-06, + "loss": 0.6934, + "step": 9820 + }, + { + "epoch": 5.603135019593872, + "grad_norm": 2.1700938789600275, + "learning_rate": 9.635889791468533e-06, + "loss": 0.6963, + "step": 9830 + }, + { + "epoch": 5.608835055219095, + "grad_norm": 2.233530335715926, + "learning_rate": 9.616002590374628e-06, + "loss": 0.6995, + "step": 9840 + }, + { + "epoch": 5.614535090844318, + "grad_norm": 2.1198785161387286, + "learning_rate": 9.596116910121328e-06, + "loss": 0.6951, + "step": 9850 + }, + { + "epoch": 5.620235126469541, + "grad_norm": 2.12074082476983, + "learning_rate": 9.57623282946685e-06, + "loss": 0.6916, + "step": 9860 + }, + { + "epoch": 5.625935162094763, + "grad_norm": 2.1419743093669585, + "learning_rate": 9.556350427163073e-06, + "loss": 0.6943, + "step": 9870 + }, + { + "epoch": 5.631635197719985, + "grad_norm": 2.21297293287053, + "learning_rate": 9.536469781955224e-06, + "loss": 0.6797, + "step": 9880 + }, + { + "epoch": 5.637335233345208, + "grad_norm": 2.080452235855697, + "learning_rate": 9.516590972581579e-06, + "loss": 0.6842, + "step": 9890 + }, + { + "epoch": 5.643035268970431, + "grad_norm": 2.1622295794258495, + "learning_rate": 9.496714077773132e-06, + "loss": 0.703, + "step": 9900 + }, + { + "epoch": 5.648735304595654, + "grad_norm": 2.2116620471484416, + "learning_rate": 9.476839176253311e-06, + "loss": 0.6915, + "step": 9910 + }, + { + "epoch": 5.654435340220877, + "grad_norm": 2.1804184869804986, + "learning_rate": 9.456966346737638e-06, + "loss": 0.6946, + "step": 9920 + }, + { + "epoch": 5.6601353758460995, + "grad_norm": 2.1894753754867984, + "learning_rate": 9.437095667933427e-06, + "loss": 0.6936, + "step": 9930 + }, + { + "epoch": 5.665835411471321, + "grad_norm": 
2.2271688400846186, + "learning_rate": 9.417227218539475e-06, + "loss": 0.6929, + "step": 9940 + }, + { + "epoch": 5.671535447096544, + "grad_norm": 2.1505497312386157, + "learning_rate": 9.397361077245762e-06, + "loss": 0.6865, + "step": 9950 + }, + { + "epoch": 5.677235482721767, + "grad_norm": 2.2384375215999195, + "learning_rate": 9.377497322733109e-06, + "loss": 0.6944, + "step": 9960 + }, + { + "epoch": 5.68293551834699, + "grad_norm": 2.2881024340389917, + "learning_rate": 9.357636033672892e-06, + "loss": 0.7028, + "step": 9970 + }, + { + "epoch": 5.688635553972213, + "grad_norm": 2.213265696372914, + "learning_rate": 9.337777288726722e-06, + "loss": 0.6857, + "step": 9980 + }, + { + "epoch": 5.694335589597435, + "grad_norm": 2.299796287206333, + "learning_rate": 9.317921166546139e-06, + "loss": 0.6923, + "step": 9990 + }, + { + "epoch": 5.700035625222657, + "grad_norm": 2.228394016754354, + "learning_rate": 9.298067745772286e-06, + "loss": 0.6904, + "step": 10000 + }, + { + "epoch": 5.70573566084788, + "grad_norm": 2.1624614203874604, + "learning_rate": 9.278217105035613e-06, + "loss": 0.689, + "step": 10010 + }, + { + "epoch": 5.711435696473103, + "grad_norm": 2.1817564157072034, + "learning_rate": 9.258369322955558e-06, + "loss": 0.6867, + "step": 10020 + }, + { + "epoch": 5.717135732098326, + "grad_norm": 2.0985906977408306, + "learning_rate": 9.238524478140231e-06, + "loss": 0.6988, + "step": 10030 + }, + { + "epoch": 5.722835767723549, + "grad_norm": 2.2148178236948817, + "learning_rate": 9.218682649186123e-06, + "loss": 0.6813, + "step": 10040 + }, + { + "epoch": 5.7285358033487705, + "grad_norm": 2.1637803609456503, + "learning_rate": 9.198843914677776e-06, + "loss": 0.6828, + "step": 10050 + }, + { + "epoch": 5.734235838973993, + "grad_norm": 2.0867931176845436, + "learning_rate": 9.17900835318746e-06, + "loss": 0.6947, + "step": 10060 + }, + { + "epoch": 5.739935874599216, + "grad_norm": 2.1688434329476047, + "learning_rate": 9.159176043274896e-06, + "loss": 0.6869, + "step": 10070 + }, + { + "epoch": 5.745635910224439, + "grad_norm": 2.154324487594934, + "learning_rate": 9.139347063486926e-06, + "loss": 0.6807, + "step": 10080 + }, + { + "epoch": 5.751335945849662, + "grad_norm": 2.2017177171263063, + "learning_rate": 9.119521492357196e-06, + "loss": 0.6905, + "step": 10090 + }, + { + "epoch": 5.757035981474885, + "grad_norm": 2.118326945408275, + "learning_rate": 9.099699408405854e-06, + "loss": 0.6914, + "step": 10100 + }, + { + "epoch": 5.7627360171001065, + "grad_norm": 2.119914772970993, + "learning_rate": 9.079880890139238e-06, + "loss": 0.6947, + "step": 10110 + }, + { + "epoch": 5.768436052725329, + "grad_norm": 2.115252049304522, + "learning_rate": 9.06006601604956e-06, + "loss": 0.6937, + "step": 10120 + }, + { + "epoch": 5.774136088350552, + "grad_norm": 2.0727940773951716, + "learning_rate": 9.040254864614608e-06, + "loss": 0.7006, + "step": 10130 + }, + { + "epoch": 5.779836123975775, + "grad_norm": 2.2180027506072326, + "learning_rate": 9.020447514297417e-06, + "loss": 0.6984, + "step": 10140 + }, + { + "epoch": 5.785536159600998, + "grad_norm": 2.1795734441999723, + "learning_rate": 9.000644043545974e-06, + "loss": 0.6908, + "step": 10150 + }, + { + "epoch": 5.791236195226221, + "grad_norm": 2.315220502416694, + "learning_rate": 8.980844530792889e-06, + "loss": 0.6913, + "step": 10160 + }, + { + "epoch": 5.7969362308514425, + "grad_norm": 2.2227835874849644, + "learning_rate": 8.96104905445512e-06, + "loss": 0.6887, + "step": 10170 + }, + { + "epoch": 
5.802636266476665, + "grad_norm": 2.148778586167764, + "learning_rate": 8.941257692933613e-06, + "loss": 0.6937, + "step": 10180 + }, + { + "epoch": 5.808336302101888, + "grad_norm": 2.135091331207858, + "learning_rate": 8.92147052461303e-06, + "loss": 0.6893, + "step": 10190 + }, + { + "epoch": 5.814036337727111, + "grad_norm": 2.2959580471537495, + "learning_rate": 8.901687627861423e-06, + "loss": 0.6976, + "step": 10200 + }, + { + "epoch": 5.819736373352334, + "grad_norm": 2.134327621747421, + "learning_rate": 8.881909081029923e-06, + "loss": 0.6935, + "step": 10210 + }, + { + "epoch": 5.825436408977556, + "grad_norm": 2.2490208608775273, + "learning_rate": 8.862134962452444e-06, + "loss": 0.7015, + "step": 10220 + }, + { + "epoch": 5.8311364446027785, + "grad_norm": 2.1876216665258577, + "learning_rate": 8.84236535044535e-06, + "loss": 0.6956, + "step": 10230 + }, + { + "epoch": 5.836836480228001, + "grad_norm": 2.1348120705699674, + "learning_rate": 8.822600323307163e-06, + "loss": 0.6904, + "step": 10240 + }, + { + "epoch": 5.842536515853224, + "grad_norm": 2.276591677956022, + "learning_rate": 8.802839959318238e-06, + "loss": 0.6876, + "step": 10250 + }, + { + "epoch": 5.848236551478447, + "grad_norm": 2.1847877639758937, + "learning_rate": 8.783084336740474e-06, + "loss": 0.6939, + "step": 10260 + }, + { + "epoch": 5.85393658710367, + "grad_norm": 2.172811243873737, + "learning_rate": 8.763333533816985e-06, + "loss": 0.6881, + "step": 10270 + }, + { + "epoch": 5.8596366227288925, + "grad_norm": 2.0077396330477715, + "learning_rate": 8.743587628771793e-06, + "loss": 0.6843, + "step": 10280 + }, + { + "epoch": 5.865336658354114, + "grad_norm": 2.1776154252978968, + "learning_rate": 8.723846699809522e-06, + "loss": 0.6834, + "step": 10290 + }, + { + "epoch": 5.871036693979337, + "grad_norm": 2.197726726366027, + "learning_rate": 8.704110825115098e-06, + "loss": 0.6918, + "step": 10300 + }, + { + "epoch": 5.87673672960456, + "grad_norm": 2.185810249326048, + "learning_rate": 8.68438008285342e-06, + "loss": 0.6928, + "step": 10310 + }, + { + "epoch": 5.882436765229783, + "grad_norm": 2.2642176568146435, + "learning_rate": 8.664654551169061e-06, + "loss": 0.6844, + "step": 10320 + }, + { + "epoch": 5.888136800855006, + "grad_norm": 2.161758512793611, + "learning_rate": 8.644934308185959e-06, + "loss": 0.6915, + "step": 10330 + }, + { + "epoch": 5.893836836480228, + "grad_norm": 2.163900058783309, + "learning_rate": 8.6252194320071e-06, + "loss": 0.6826, + "step": 10340 + }, + { + "epoch": 5.89953687210545, + "grad_norm": 2.1666177712043115, + "learning_rate": 8.605510000714228e-06, + "loss": 0.6871, + "step": 10350 + }, + { + "epoch": 5.905236907730673, + "grad_norm": 2.186489891927606, + "learning_rate": 8.585806092367513e-06, + "loss": 0.6926, + "step": 10360 + }, + { + "epoch": 5.910936943355896, + "grad_norm": 2.0842711436314687, + "learning_rate": 8.566107785005251e-06, + "loss": 0.6819, + "step": 10370 + }, + { + "epoch": 5.916636978981119, + "grad_norm": 2.2450803050111814, + "learning_rate": 8.546415156643549e-06, + "loss": 0.692, + "step": 10380 + }, + { + "epoch": 5.922337014606342, + "grad_norm": 2.2474530881350674, + "learning_rate": 8.526728285276039e-06, + "loss": 0.6984, + "step": 10390 + }, + { + "epoch": 5.928037050231564, + "grad_norm": 2.2904276116665474, + "learning_rate": 8.507047248873539e-06, + "loss": 0.686, + "step": 10400 + }, + { + "epoch": 5.933737085856786, + "grad_norm": 2.112697139828902, + "learning_rate": 8.487372125383757e-06, + "loss": 0.6824, + "step": 
10410 + }, + { + "epoch": 5.939437121482009, + "grad_norm": 2.1620065424570045, + "learning_rate": 8.467702992730992e-06, + "loss": 0.6952, + "step": 10420 + }, + { + "epoch": 5.945137157107232, + "grad_norm": 2.1255308293368116, + "learning_rate": 8.448039928815804e-06, + "loss": 0.691, + "step": 10430 + }, + { + "epoch": 5.950837192732455, + "grad_norm": 2.028997680326995, + "learning_rate": 8.42838301151473e-06, + "loss": 0.6969, + "step": 10440 + }, + { + "epoch": 5.956537228357677, + "grad_norm": 2.1235085422204536, + "learning_rate": 8.408732318679953e-06, + "loss": 0.711, + "step": 10450 + }, + { + "epoch": 5.9622372639828995, + "grad_norm": 2.0842748601067176, + "learning_rate": 8.389087928139008e-06, + "loss": 0.6955, + "step": 10460 + }, + { + "epoch": 5.967937299608122, + "grad_norm": 2.2061370472464668, + "learning_rate": 8.369449917694466e-06, + "loss": 0.6943, + "step": 10470 + }, + { + "epoch": 5.973637335233345, + "grad_norm": 2.154886985406157, + "learning_rate": 8.34981836512364e-06, + "loss": 0.6905, + "step": 10480 + }, + { + "epoch": 5.979337370858568, + "grad_norm": 2.1100063096741875, + "learning_rate": 8.330193348178254e-06, + "loss": 0.703, + "step": 10490 + }, + { + "epoch": 5.985037406483791, + "grad_norm": 2.260276094006089, + "learning_rate": 8.310574944584151e-06, + "loss": 0.6957, + "step": 10500 + }, + { + "epoch": 5.990737442109014, + "grad_norm": 2.1602419464603755, + "learning_rate": 8.290963232040984e-06, + "loss": 0.6918, + "step": 10510 + }, + { + "epoch": 5.9964374777342355, + "grad_norm": 2.2948853765089563, + "learning_rate": 8.271358288221897e-06, + "loss": 0.6945, + "step": 10520 + }, + { + "epoch": 6.002137513359458, + "grad_norm": 2.170085520328391, + "learning_rate": 8.251760190773243e-06, + "loss": 0.6872, + "step": 10530 + }, + { + "epoch": 6.007837548984681, + "grad_norm": 2.279696142110909, + "learning_rate": 8.232169017314247e-06, + "loss": 0.6774, + "step": 10540 + }, + { + "epoch": 6.013537584609904, + "grad_norm": 2.322165475925097, + "learning_rate": 8.212584845436713e-06, + "loss": 0.6681, + "step": 10550 + }, + { + "epoch": 6.019237620235127, + "grad_norm": 2.240989851797247, + "learning_rate": 8.193007752704714e-06, + "loss": 0.6697, + "step": 10560 + }, + { + "epoch": 6.024937655860349, + "grad_norm": 2.246153691155315, + "learning_rate": 8.173437816654292e-06, + "loss": 0.6725, + "step": 10570 + }, + { + "epoch": 6.0306376914855715, + "grad_norm": 2.1966286414290868, + "learning_rate": 8.153875114793137e-06, + "loss": 0.6894, + "step": 10580 + }, + { + "epoch": 6.036337727110794, + "grad_norm": 2.1294095450667108, + "learning_rate": 8.13431972460029e-06, + "loss": 0.6775, + "step": 10590 + }, + { + "epoch": 6.042037762736017, + "grad_norm": 2.2155021741663647, + "learning_rate": 8.11477172352584e-06, + "loss": 0.6709, + "step": 10600 + }, + { + "epoch": 6.04773779836124, + "grad_norm": 2.259295223173431, + "learning_rate": 8.095231188990597e-06, + "loss": 0.6823, + "step": 10610 + }, + { + "epoch": 6.053437833986463, + "grad_norm": 2.2718220988832867, + "learning_rate": 8.075698198385817e-06, + "loss": 0.681, + "step": 10620 + }, + { + "epoch": 6.059137869611685, + "grad_norm": 2.1822599854506817, + "learning_rate": 8.056172829072863e-06, + "loss": 0.6738, + "step": 10630 + }, + { + "epoch": 6.0648379052369075, + "grad_norm": 2.2108465993733857, + "learning_rate": 8.036655158382922e-06, + "loss": 0.6846, + "step": 10640 + }, + { + "epoch": 6.07053794086213, + "grad_norm": 2.3040922125703323, + "learning_rate": 
8.017145263616683e-06, + "loss": 0.674, + "step": 10650 + }, + { + "epoch": 6.076237976487353, + "grad_norm": 2.106527392828737, + "learning_rate": 7.997643222044051e-06, + "loss": 0.6852, + "step": 10660 + }, + { + "epoch": 6.081938012112576, + "grad_norm": 2.1551369757776464, + "learning_rate": 7.978149110903816e-06, + "loss": 0.6688, + "step": 10670 + }, + { + "epoch": 6.087638047737799, + "grad_norm": 2.2730916010608686, + "learning_rate": 7.958663007403362e-06, + "loss": 0.6797, + "step": 10680 + }, + { + "epoch": 6.093338083363021, + "grad_norm": 2.292642755654638, + "learning_rate": 7.939184988718359e-06, + "loss": 0.6658, + "step": 10690 + }, + { + "epoch": 6.099038118988243, + "grad_norm": 2.192515472600447, + "learning_rate": 7.919715131992459e-06, + "loss": 0.676, + "step": 10700 + }, + { + "epoch": 6.104738154613466, + "grad_norm": 2.1376221361926935, + "learning_rate": 7.900253514336985e-06, + "loss": 0.6753, + "step": 10710 + }, + { + "epoch": 6.110438190238689, + "grad_norm": 2.1942538782779097, + "learning_rate": 7.88080021283063e-06, + "loss": 0.667, + "step": 10720 + }, + { + "epoch": 6.116138225863912, + "grad_norm": 2.246819786705824, + "learning_rate": 7.86135530451915e-06, + "loss": 0.6666, + "step": 10730 + }, + { + "epoch": 6.121838261489135, + "grad_norm": 2.202275918725809, + "learning_rate": 7.84191886641506e-06, + "loss": 0.6774, + "step": 10740 + }, + { + "epoch": 6.127538297114357, + "grad_norm": 2.1771264629986016, + "learning_rate": 7.822490975497326e-06, + "loss": 0.6766, + "step": 10750 + }, + { + "epoch": 6.133238332739579, + "grad_norm": 2.391203193647443, + "learning_rate": 7.80307170871107e-06, + "loss": 0.6845, + "step": 10760 + }, + { + "epoch": 6.138938368364802, + "grad_norm": 2.2744480554444513, + "learning_rate": 7.783661142967247e-06, + "loss": 0.682, + "step": 10770 + }, + { + "epoch": 6.144638403990025, + "grad_norm": 2.234849216259114, + "learning_rate": 7.764259355142354e-06, + "loss": 0.6716, + "step": 10780 + }, + { + "epoch": 6.150338439615248, + "grad_norm": 2.202045737877405, + "learning_rate": 7.744866422078133e-06, + "loss": 0.6846, + "step": 10790 + }, + { + "epoch": 6.156038475240471, + "grad_norm": 2.1917326270336144, + "learning_rate": 7.725482420581245e-06, + "loss": 0.6793, + "step": 10800 + }, + { + "epoch": 6.161738510865693, + "grad_norm": 2.2201548403178544, + "learning_rate": 7.70610742742298e-06, + "loss": 0.6787, + "step": 10810 + }, + { + "epoch": 6.167438546490915, + "grad_norm": 2.1974557147442306, + "learning_rate": 7.686741519338949e-06, + "loss": 0.6801, + "step": 10820 + }, + { + "epoch": 6.173138582116138, + "grad_norm": 2.262594067441599, + "learning_rate": 7.667384773028778e-06, + "loss": 0.6813, + "step": 10830 + }, + { + "epoch": 6.178838617741361, + "grad_norm": 2.14103947815712, + "learning_rate": 7.64803726515582e-06, + "loss": 0.6784, + "step": 10840 + }, + { + "epoch": 6.184538653366584, + "grad_norm": 2.1449864947544, + "learning_rate": 7.62869907234683e-06, + "loss": 0.6873, + "step": 10850 + }, + { + "epoch": 6.190238688991807, + "grad_norm": 2.202782593471053, + "learning_rate": 7.609370271191667e-06, + "loss": 0.6816, + "step": 10860 + }, + { + "epoch": 6.1959387246170285, + "grad_norm": 2.2955199701611906, + "learning_rate": 7.590050938242997e-06, + "loss": 0.6745, + "step": 10870 + }, + { + "epoch": 6.201638760242251, + "grad_norm": 2.3085316499548534, + "learning_rate": 7.57074115001599e-06, + "loss": 0.6805, + "step": 10880 + }, + { + "epoch": 6.207338795867474, + "grad_norm": 2.287598178157803, 
+ "learning_rate": 7.551440982988011e-06, + "loss": 0.6695, + "step": 10890 + }, + { + "epoch": 6.213038831492697, + "grad_norm": 2.3348856472179125, + "learning_rate": 7.532150513598318e-06, + "loss": 0.671, + "step": 10900 + }, + { + "epoch": 6.21873886711792, + "grad_norm": 2.2779474366048786, + "learning_rate": 7.512869818247763e-06, + "loss": 0.6755, + "step": 10910 + }, + { + "epoch": 6.224438902743142, + "grad_norm": 2.2397875686297515, + "learning_rate": 7.493598973298485e-06, + "loss": 0.6838, + "step": 10920 + }, + { + "epoch": 6.2301389383683645, + "grad_norm": 2.2689325500785857, + "learning_rate": 7.47433805507362e-06, + "loss": 0.6861, + "step": 10930 + }, + { + "epoch": 6.235838973993587, + "grad_norm": 2.321132598662188, + "learning_rate": 7.4550871398569755e-06, + "loss": 0.6751, + "step": 10940 + }, + { + "epoch": 6.24153900961881, + "grad_norm": 2.146161666540449, + "learning_rate": 7.4358463038927464e-06, + "loss": 0.6846, + "step": 10950 + }, + { + "epoch": 6.247239045244033, + "grad_norm": 2.191499499785015, + "learning_rate": 7.416615623385205e-06, + "loss": 0.6689, + "step": 10960 + }, + { + "epoch": 6.252939080869256, + "grad_norm": 2.2642018013263905, + "learning_rate": 7.397395174498416e-06, + "loss": 0.6758, + "step": 10970 + }, + { + "epoch": 6.258639116494478, + "grad_norm": 2.259935424672811, + "learning_rate": 7.3781850333559065e-06, + "loss": 0.6769, + "step": 10980 + }, + { + "epoch": 6.2643391521197005, + "grad_norm": 2.308436223900447, + "learning_rate": 7.3589852760403845e-06, + "loss": 0.672, + "step": 10990 + }, + { + "epoch": 6.270039187744923, + "grad_norm": 2.2759428945966302, + "learning_rate": 7.3397959785934305e-06, + "loss": 0.6762, + "step": 11000 + }, + { + "epoch": 6.275739223370146, + "grad_norm": 2.2795748590311167, + "learning_rate": 7.3206172170152025e-06, + "loss": 0.6711, + "step": 11010 + }, + { + "epoch": 6.281439258995369, + "grad_norm": 2.21392152862467, + "learning_rate": 7.301449067264128e-06, + "loss": 0.6748, + "step": 11020 + }, + { + "epoch": 6.287139294620592, + "grad_norm": 2.1429505866001684, + "learning_rate": 7.282291605256604e-06, + "loss": 0.673, + "step": 11030 + }, + { + "epoch": 6.292839330245814, + "grad_norm": 2.239080481574686, + "learning_rate": 7.263144906866701e-06, + "loss": 0.6695, + "step": 11040 + }, + { + "epoch": 6.2985393658710365, + "grad_norm": 2.1917221329279086, + "learning_rate": 7.244009047925858e-06, + "loss": 0.6782, + "step": 11050 + }, + { + "epoch": 6.304239401496259, + "grad_norm": 2.2307933648937475, + "learning_rate": 7.224884104222585e-06, + "loss": 0.6746, + "step": 11060 + }, + { + "epoch": 6.309939437121482, + "grad_norm": 2.1416509854746377, + "learning_rate": 7.205770151502163e-06, + "loss": 0.6846, + "step": 11070 + }, + { + "epoch": 6.315639472746705, + "grad_norm": 2.173159302277719, + "learning_rate": 7.186667265466337e-06, + "loss": 0.668, + "step": 11080 + }, + { + "epoch": 6.321339508371928, + "grad_norm": 2.261191207971506, + "learning_rate": 7.1675755217730245e-06, + "loss": 0.6764, + "step": 11090 + }, + { + "epoch": 6.32703954399715, + "grad_norm": 2.3224880630444007, + "learning_rate": 7.148494996036022e-06, + "loss": 0.6776, + "step": 11100 + }, + { + "epoch": 6.332739579622372, + "grad_norm": 2.0716743785582055, + "learning_rate": 7.129425763824683e-06, + "loss": 0.6749, + "step": 11110 + }, + { + "epoch": 6.338439615247595, + "grad_norm": 2.261127297203104, + "learning_rate": 7.110367900663642e-06, + "loss": 0.6732, + "step": 11120 + }, + { + "epoch": 
6.344139650872818, + "grad_norm": 2.2896807270604116, + "learning_rate": 7.091321482032501e-06, + "loss": 0.6766, + "step": 11130 + }, + { + "epoch": 6.349839686498041, + "grad_norm": 2.162182939499591, + "learning_rate": 7.072286583365533e-06, + "loss": 0.6753, + "step": 11140 + }, + { + "epoch": 6.355539722123264, + "grad_norm": 2.17833933141201, + "learning_rate": 7.053263280051394e-06, + "loss": 0.6821, + "step": 11150 + }, + { + "epoch": 6.361239757748486, + "grad_norm": 2.209348413867267, + "learning_rate": 7.034251647432811e-06, + "loss": 0.6628, + "step": 11160 + }, + { + "epoch": 6.366939793373708, + "grad_norm": 2.249600616326594, + "learning_rate": 7.01525176080629e-06, + "loss": 0.6708, + "step": 11170 + }, + { + "epoch": 6.372639828998931, + "grad_norm": 2.1575927428642876, + "learning_rate": 6.99626369542181e-06, + "loss": 0.6746, + "step": 11180 + }, + { + "epoch": 6.378339864624154, + "grad_norm": 2.2669648805317344, + "learning_rate": 6.977287526482541e-06, + "loss": 0.6877, + "step": 11190 + }, + { + "epoch": 6.384039900249377, + "grad_norm": 2.2249493885686378, + "learning_rate": 6.958323329144534e-06, + "loss": 0.6732, + "step": 11200 + }, + { + "epoch": 6.3897399358746, + "grad_norm": 2.3167895861205148, + "learning_rate": 6.939371178516423e-06, + "loss": 0.677, + "step": 11210 + }, + { + "epoch": 6.395439971499822, + "grad_norm": 2.202354910854256, + "learning_rate": 6.920431149659128e-06, + "loss": 0.6673, + "step": 11220 + }, + { + "epoch": 6.401140007125044, + "grad_norm": 2.289065268830389, + "learning_rate": 6.901503317585565e-06, + "loss": 0.6773, + "step": 11230 + }, + { + "epoch": 6.406840042750267, + "grad_norm": 2.3316974329710667, + "learning_rate": 6.882587757260349e-06, + "loss": 0.6767, + "step": 11240 + }, + { + "epoch": 6.41254007837549, + "grad_norm": 2.187793708950635, + "learning_rate": 6.86368454359948e-06, + "loss": 0.6848, + "step": 11250 + }, + { + "epoch": 6.418240114000713, + "grad_norm": 2.2413921903856924, + "learning_rate": 6.844793751470069e-06, + "loss": 0.6827, + "step": 11260 + }, + { + "epoch": 6.423940149625935, + "grad_norm": 2.1505171696058576, + "learning_rate": 6.825915455690015e-06, + "loss": 0.6712, + "step": 11270 + }, + { + "epoch": 6.4296401852511575, + "grad_norm": 2.335901213288781, + "learning_rate": 6.807049731027751e-06, + "loss": 0.6856, + "step": 11280 + }, + { + "epoch": 6.43534022087638, + "grad_norm": 2.3470142982475677, + "learning_rate": 6.788196652201899e-06, + "loss": 0.6775, + "step": 11290 + }, + { + "epoch": 6.441040256501603, + "grad_norm": 2.2525507766531327, + "learning_rate": 6.769356293881005e-06, + "loss": 0.6784, + "step": 11300 + }, + { + "epoch": 6.446740292126826, + "grad_norm": 2.260718619048699, + "learning_rate": 6.750528730683231e-06, + "loss": 0.6781, + "step": 11310 + }, + { + "epoch": 6.452440327752049, + "grad_norm": 2.0555539548278055, + "learning_rate": 6.731714037176071e-06, + "loss": 0.6872, + "step": 11320 + }, + { + "epoch": 6.458140363377271, + "grad_norm": 2.2640903532916643, + "learning_rate": 6.712912287876041e-06, + "loss": 0.6746, + "step": 11330 + }, + { + "epoch": 6.4638403990024935, + "grad_norm": 2.295093150078742, + "learning_rate": 6.6941235572483905e-06, + "loss": 0.6717, + "step": 11340 + }, + { + "epoch": 6.469540434627716, + "grad_norm": 2.1669622783413276, + "learning_rate": 6.6753479197068136e-06, + "loss": 0.6888, + "step": 11350 + }, + { + "epoch": 6.475240470252939, + "grad_norm": 2.239696811909756, + "learning_rate": 6.65658544961314e-06, + "loss": 0.6704, + 
"step": 11360 + }, + { + "epoch": 6.480940505878162, + "grad_norm": 2.325134420293425, + "learning_rate": 6.637836221277063e-06, + "loss": 0.681, + "step": 11370 + }, + { + "epoch": 6.486640541503385, + "grad_norm": 2.297069745780302, + "learning_rate": 6.619100308955817e-06, + "loss": 0.68, + "step": 11380 + }, + { + "epoch": 6.492340577128607, + "grad_norm": 2.2432527420421815, + "learning_rate": 6.600377786853903e-06, + "loss": 0.6787, + "step": 11390 + }, + { + "epoch": 6.4980406127538295, + "grad_norm": 2.2220378243498797, + "learning_rate": 6.581668729122788e-06, + "loss": 0.6819, + "step": 11400 + }, + { + "epoch": 6.503740648379052, + "grad_norm": 2.3093011410269684, + "learning_rate": 6.562973209860619e-06, + "loss": 0.6799, + "step": 11410 + }, + { + "epoch": 6.509440684004275, + "grad_norm": 2.2678503311756026, + "learning_rate": 6.544291303111918e-06, + "loss": 0.6789, + "step": 11420 + }, + { + "epoch": 6.515140719629498, + "grad_norm": 2.2357396617872434, + "learning_rate": 6.525623082867292e-06, + "loss": 0.6779, + "step": 11430 + }, + { + "epoch": 6.520840755254721, + "grad_norm": 2.2611435548190144, + "learning_rate": 6.506968623063145e-06, + "loss": 0.6828, + "step": 11440 + }, + { + "epoch": 6.526540790879943, + "grad_norm": 2.243445764732507, + "learning_rate": 6.488327997581383e-06, + "loss": 0.6665, + "step": 11450 + }, + { + "epoch": 6.5322408265051655, + "grad_norm": 2.347032248498623, + "learning_rate": 6.469701280249118e-06, + "loss": 0.675, + "step": 11460 + }, + { + "epoch": 6.537940862130388, + "grad_norm": 2.348855883099169, + "learning_rate": 6.4510885448383796e-06, + "loss": 0.681, + "step": 11470 + }, + { + "epoch": 6.543640897755611, + "grad_norm": 2.310343148383439, + "learning_rate": 6.432489865065821e-06, + "loss": 0.6743, + "step": 11480 + }, + { + "epoch": 6.549340933380834, + "grad_norm": 2.1760150064972605, + "learning_rate": 6.4139053145924234e-06, + "loss": 0.6714, + "step": 11490 + }, + { + "epoch": 6.555040969006056, + "grad_norm": 2.242725273011376, + "learning_rate": 6.395334967023219e-06, + "loss": 0.6763, + "step": 11500 + }, + { + "epoch": 6.560741004631279, + "grad_norm": 2.2683025520996885, + "learning_rate": 6.3767788959069765e-06, + "loss": 0.6829, + "step": 11510 + }, + { + "epoch": 6.566441040256501, + "grad_norm": 2.2405754012097443, + "learning_rate": 6.358237174735931e-06, + "loss": 0.6764, + "step": 11520 + }, + { + "epoch": 6.572141075881724, + "grad_norm": 2.132139690504168, + "learning_rate": 6.339709876945475e-06, + "loss": 0.6869, + "step": 11530 + }, + { + "epoch": 6.577841111506947, + "grad_norm": 2.1531564491426662, + "learning_rate": 6.321197075913883e-06, + "loss": 0.6776, + "step": 11540 + }, + { + "epoch": 6.58354114713217, + "grad_norm": 2.240267761454674, + "learning_rate": 6.302698844962019e-06, + "loss": 0.6768, + "step": 11550 + }, + { + "epoch": 6.589241182757393, + "grad_norm": 2.251819890328819, + "learning_rate": 6.2842152573530294e-06, + "loss": 0.6839, + "step": 11560 + }, + { + "epoch": 6.594941218382615, + "grad_norm": 2.287793168135825, + "learning_rate": 6.265746386292073e-06, + "loss": 0.6822, + "step": 11570 + }, + { + "epoch": 6.600641254007837, + "grad_norm": 2.207158562564074, + "learning_rate": 6.24729230492602e-06, + "loss": 0.6687, + "step": 11580 + }, + { + "epoch": 6.60634128963306, + "grad_norm": 2.210442842730923, + "learning_rate": 6.228853086343169e-06, + "loss": 0.6845, + "step": 11590 + }, + { + "epoch": 6.612041325258283, + "grad_norm": 2.3938104834418343, + "learning_rate": 
6.210428803572949e-06, + "loss": 0.675, + "step": 11600 + }, + { + "epoch": 6.617741360883506, + "grad_norm": 2.267869603038391, + "learning_rate": 6.192019529585638e-06, + "loss": 0.6877, + "step": 11610 + }, + { + "epoch": 6.623441396508728, + "grad_norm": 2.177035041489817, + "learning_rate": 6.173625337292068e-06, + "loss": 0.6703, + "step": 11620 + }, + { + "epoch": 6.629141432133951, + "grad_norm": 2.095897873005044, + "learning_rate": 6.155246299543342e-06, + "loss": 0.6641, + "step": 11630 + }, + { + "epoch": 6.634841467759173, + "grad_norm": 2.278131320767482, + "learning_rate": 6.136882489130545e-06, + "loss": 0.6715, + "step": 11640 + }, + { + "epoch": 6.640541503384396, + "grad_norm": 2.2722217340570294, + "learning_rate": 6.1185339787844475e-06, + "loss": 0.6773, + "step": 11650 + }, + { + "epoch": 6.646241539009619, + "grad_norm": 2.1265165938539847, + "learning_rate": 6.100200841175228e-06, + "loss": 0.67, + "step": 11660 + }, + { + "epoch": 6.651941574634842, + "grad_norm": 2.249161145163528, + "learning_rate": 6.081883148912174e-06, + "loss": 0.6761, + "step": 11670 + }, + { + "epoch": 6.657641610260065, + "grad_norm": 2.3663245069496117, + "learning_rate": 6.06358097454341e-06, + "loss": 0.6764, + "step": 11680 + }, + { + "epoch": 6.6633416458852865, + "grad_norm": 2.241759785395175, + "learning_rate": 6.045294390555598e-06, + "loss": 0.6778, + "step": 11690 + }, + { + "epoch": 6.669041681510509, + "grad_norm": 2.1832541172871047, + "learning_rate": 6.027023469373654e-06, + "loss": 0.6683, + "step": 11700 + }, + { + "epoch": 6.674741717135732, + "grad_norm": 2.165879211709872, + "learning_rate": 6.0087682833604475e-06, + "loss": 0.6883, + "step": 11710 + }, + { + "epoch": 6.680441752760955, + "grad_norm": 2.158951360462029, + "learning_rate": 5.990528904816553e-06, + "loss": 0.6803, + "step": 11720 + }, + { + "epoch": 6.686141788386178, + "grad_norm": 2.2647654446771055, + "learning_rate": 5.972305405979919e-06, + "loss": 0.6711, + "step": 11730 + }, + { + "epoch": 6.6918418240114, + "grad_norm": 2.3092540339887795, + "learning_rate": 5.954097859025609e-06, + "loss": 0.6813, + "step": 11740 + }, + { + "epoch": 6.6975418596366225, + "grad_norm": 2.189512090931114, + "learning_rate": 5.9359063360655065e-06, + "loss": 0.6802, + "step": 11750 + }, + { + "epoch": 6.703241895261845, + "grad_norm": 2.2124034866664357, + "learning_rate": 5.9177309091480295e-06, + "loss": 0.6762, + "step": 11760 + }, + { + "epoch": 6.708941930887068, + "grad_norm": 2.2058246047742798, + "learning_rate": 5.899571650257856e-06, + "loss": 0.674, + "step": 11770 + }, + { + "epoch": 6.714641966512291, + "grad_norm": 2.2319022200132204, + "learning_rate": 5.88142863131562e-06, + "loss": 0.6759, + "step": 11780 + }, + { + "epoch": 6.720342002137514, + "grad_norm": 2.2625836265140147, + "learning_rate": 5.863301924177638e-06, + "loss": 0.6806, + "step": 11790 + }, + { + "epoch": 6.726042037762736, + "grad_norm": 2.217514359432777, + "learning_rate": 5.84519160063562e-06, + "loss": 0.6889, + "step": 11800 + }, + { + "epoch": 6.7317420733879585, + "grad_norm": 2.1712652173946685, + "learning_rate": 5.827097732416404e-06, + "loss": 0.6768, + "step": 11810 + }, + { + "epoch": 6.737442109013181, + "grad_norm": 2.1940538960439104, + "learning_rate": 5.809020391181635e-06, + "loss": 0.67, + "step": 11820 + }, + { + "epoch": 6.743142144638404, + "grad_norm": 2.3137907789358176, + "learning_rate": 5.790959648527513e-06, + "loss": 0.6825, + "step": 11830 + }, + { + "epoch": 6.748842180263627, + "grad_norm": 
2.2737669042973683, + "learning_rate": 5.772915575984497e-06, + "loss": 0.6653, + "step": 11840 + }, + { + "epoch": 6.754542215888849, + "grad_norm": 2.1182723284435094, + "learning_rate": 5.754888245017019e-06, + "loss": 0.6715, + "step": 11850 + }, + { + "epoch": 6.760242251514072, + "grad_norm": 2.11533837427729, + "learning_rate": 5.736877727023217e-06, + "loss": 0.6687, + "step": 11860 + }, + { + "epoch": 6.7659422871392945, + "grad_norm": 2.20018065816376, + "learning_rate": 5.7188840933346265e-06, + "loss": 0.676, + "step": 11870 + }, + { + "epoch": 6.771642322764517, + "grad_norm": 2.1443107954137477, + "learning_rate": 5.700907415215922e-06, + "loss": 0.6783, + "step": 11880 + }, + { + "epoch": 6.77734235838974, + "grad_norm": 2.2913214558881303, + "learning_rate": 5.682947763864612e-06, + "loss": 0.6734, + "step": 11890 + }, + { + "epoch": 6.783042394014963, + "grad_norm": 2.287316672737734, + "learning_rate": 5.665005210410788e-06, + "loss": 0.6719, + "step": 11900 + }, + { + "epoch": 6.788742429640186, + "grad_norm": 2.2262316652328, + "learning_rate": 5.64707982591681e-06, + "loss": 0.6756, + "step": 11910 + }, + { + "epoch": 6.794442465265408, + "grad_norm": 2.2678002183176638, + "learning_rate": 5.629171681377049e-06, + "loss": 0.6682, + "step": 11920 + }, + { + "epoch": 6.80014250089063, + "grad_norm": 2.2387850423103735, + "learning_rate": 5.611280847717581e-06, + "loss": 0.6729, + "step": 11930 + }, + { + "epoch": 6.805842536515853, + "grad_norm": 2.226404135799429, + "learning_rate": 5.593407395795936e-06, + "loss": 0.6746, + "step": 11940 + }, + { + "epoch": 6.811542572141076, + "grad_norm": 2.17331328615478, + "learning_rate": 5.575551396400802e-06, + "loss": 0.6637, + "step": 11950 + }, + { + "epoch": 6.817242607766299, + "grad_norm": 2.2195107960727682, + "learning_rate": 5.557712920251741e-06, + "loss": 0.6861, + "step": 11960 + }, + { + "epoch": 6.822942643391521, + "grad_norm": 2.211970729931292, + "learning_rate": 5.539892037998911e-06, + "loss": 0.6742, + "step": 11970 + }, + { + "epoch": 6.828642679016744, + "grad_norm": 2.114732092115801, + "learning_rate": 5.5220888202227906e-06, + "loss": 0.6707, + "step": 11980 + }, + { + "epoch": 6.834342714641966, + "grad_norm": 2.2228110319573164, + "learning_rate": 5.504303337433905e-06, + "loss": 0.6833, + "step": 11990 + }, + { + "epoch": 6.840042750267189, + "grad_norm": 2.272654536271849, + "learning_rate": 5.48653566007253e-06, + "loss": 0.682, + "step": 12000 + }, + { + "epoch": 6.845742785892412, + "grad_norm": 2.3142083790744534, + "learning_rate": 5.468785858508423e-06, + "loss": 0.6661, + "step": 12010 + }, + { + "epoch": 6.851442821517635, + "grad_norm": 2.306620120881866, + "learning_rate": 5.451054003040541e-06, + "loss": 0.6825, + "step": 12020 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 2.0840208651157, + "learning_rate": 5.4333401638967794e-06, + "loss": 0.6817, + "step": 12030 + }, + { + "epoch": 6.86284289276808, + "grad_norm": 2.211367100073465, + "learning_rate": 5.415644411233667e-06, + "loss": 0.6756, + "step": 12040 + }, + { + "epoch": 6.868542928393302, + "grad_norm": 2.2686426180873513, + "learning_rate": 5.3979668151360905e-06, + "loss": 0.6769, + "step": 12050 + }, + { + "epoch": 6.874242964018525, + "grad_norm": 2.226550828270745, + "learning_rate": 5.380307445617048e-06, + "loss": 0.6759, + "step": 12060 + }, + { + "epoch": 6.879942999643748, + "grad_norm": 2.3277448858046332, + "learning_rate": 5.362666372617331e-06, + "loss": 0.6758, + "step": 12070 + }, + { + "epoch": 
6.885643035268971, + "grad_norm": 2.278894744962253, + "learning_rate": 5.345043666005287e-06, + "loss": 0.6658, + "step": 12080 + }, + { + "epoch": 6.891343070894193, + "grad_norm": 2.181303637950725, + "learning_rate": 5.327439395576503e-06, + "loss": 0.6705, + "step": 12090 + }, + { + "epoch": 6.8970431065194155, + "grad_norm": 2.2149757242569086, + "learning_rate": 5.309853631053563e-06, + "loss": 0.6748, + "step": 12100 + }, + { + "epoch": 6.902743142144638, + "grad_norm": 2.192223874753431, + "learning_rate": 5.2922864420857425e-06, + "loss": 0.6728, + "step": 12110 + }, + { + "epoch": 6.908443177769861, + "grad_norm": 2.2881232299842993, + "learning_rate": 5.274737898248767e-06, + "loss": 0.6847, + "step": 12120 + }, + { + "epoch": 6.914143213395084, + "grad_norm": 2.187704076050992, + "learning_rate": 5.257208069044501e-06, + "loss": 0.6755, + "step": 12130 + }, + { + "epoch": 6.919843249020307, + "grad_norm": 2.2275431904161938, + "learning_rate": 5.239697023900696e-06, + "loss": 0.6694, + "step": 12140 + }, + { + "epoch": 6.925543284645529, + "grad_norm": 2.2332210515648416, + "learning_rate": 5.222204832170705e-06, + "loss": 0.6787, + "step": 12150 + }, + { + "epoch": 6.9312433202707515, + "grad_norm": 2.180968213454905, + "learning_rate": 5.204731563133214e-06, + "loss": 0.6771, + "step": 12160 + }, + { + "epoch": 6.936943355895974, + "grad_norm": 2.2106229537355526, + "learning_rate": 5.187277285991963e-06, + "loss": 0.6743, + "step": 12170 + }, + { + "epoch": 6.942643391521197, + "grad_norm": 2.233420363116453, + "learning_rate": 5.169842069875474e-06, + "loss": 0.6817, + "step": 12180 + }, + { + "epoch": 6.94834342714642, + "grad_norm": 2.2651013060170366, + "learning_rate": 5.152425983836777e-06, + "loss": 0.6808, + "step": 12190 + }, + { + "epoch": 6.954043462771642, + "grad_norm": 2.172658855315481, + "learning_rate": 5.135029096853132e-06, + "loss": 0.6699, + "step": 12200 + }, + { + "epoch": 6.959743498396865, + "grad_norm": 2.2475701558030265, + "learning_rate": 5.117651477825776e-06, + "loss": 0.6749, + "step": 12210 + }, + { + "epoch": 6.9654435340220875, + "grad_norm": 2.1616444155760446, + "learning_rate": 5.100293195579613e-06, + "loss": 0.6681, + "step": 12220 + }, + { + "epoch": 6.97114356964731, + "grad_norm": 2.2189909840543285, + "learning_rate": 5.082954318862978e-06, + "loss": 0.6765, + "step": 12230 + }, + { + "epoch": 6.976843605272533, + "grad_norm": 2.2374869184686443, + "learning_rate": 5.0656349163473405e-06, + "loss": 0.6758, + "step": 12240 + }, + { + "epoch": 6.982543640897756, + "grad_norm": 2.24523811915557, + "learning_rate": 5.048335056627043e-06, + "loss": 0.6793, + "step": 12250 + }, + { + "epoch": 6.988243676522979, + "grad_norm": 2.1336155734190387, + "learning_rate": 5.031054808219038e-06, + "loss": 0.6733, + "step": 12260 + }, + { + "epoch": 6.993943712148201, + "grad_norm": 2.0935622253465223, + "learning_rate": 5.013794239562593e-06, + "loss": 0.6736, + "step": 12270 + }, + { + "epoch": 6.9996437477734235, + "grad_norm": 2.226381254274828, + "learning_rate": 4.996553419019039e-06, + "loss": 0.6801, + "step": 12280 + }, + { + "epoch": 7.005343783398646, + "grad_norm": 2.2742882385031793, + "learning_rate": 4.9793324148714935e-06, + "loss": 0.66, + "step": 12290 + }, + { + "epoch": 7.011043819023869, + "grad_norm": 2.4163699536407717, + "learning_rate": 4.962131295324588e-06, + "loss": 0.6675, + "step": 12300 + }, + { + "epoch": 7.016743854649092, + "grad_norm": 2.2283661151132548, + "learning_rate": 4.944950128504202e-06, + "loss": 
0.6717, + "step": 12310 + }, + { + "epoch": 7.022443890274314, + "grad_norm": 2.2757432948656873, + "learning_rate": 4.9277889824571925e-06, + "loss": 0.6669, + "step": 12320 + }, + { + "epoch": 7.028143925899537, + "grad_norm": 2.200927526322527, + "learning_rate": 4.910647925151115e-06, + "loss": 0.6622, + "step": 12330 + }, + { + "epoch": 7.033843961524759, + "grad_norm": 2.2724813454996355, + "learning_rate": 4.893527024473979e-06, + "loss": 0.6572, + "step": 12340 + }, + { + "epoch": 7.039543997149982, + "grad_norm": 2.186235554279113, + "learning_rate": 4.876426348233948e-06, + "loss": 0.6626, + "step": 12350 + }, + { + "epoch": 7.045244032775205, + "grad_norm": 2.309704746767001, + "learning_rate": 4.85934596415909e-06, + "loss": 0.6636, + "step": 12360 + }, + { + "epoch": 7.050944068400428, + "grad_norm": 2.269715193602036, + "learning_rate": 4.842285939897107e-06, + "loss": 0.6621, + "step": 12370 + }, + { + "epoch": 7.05664410402565, + "grad_norm": 2.3094500491239187, + "learning_rate": 4.825246343015056e-06, + "loss": 0.657, + "step": 12380 + }, + { + "epoch": 7.062344139650873, + "grad_norm": 2.2173637465487848, + "learning_rate": 4.808227240999109e-06, + "loss": 0.6591, + "step": 12390 + }, + { + "epoch": 7.068044175276095, + "grad_norm": 2.3536529122634366, + "learning_rate": 4.791228701254251e-06, + "loss": 0.6561, + "step": 12400 + }, + { + "epoch": 7.073744210901318, + "grad_norm": 2.2288027952210956, + "learning_rate": 4.774250791104033e-06, + "loss": 0.6647, + "step": 12410 + }, + { + "epoch": 7.079444246526541, + "grad_norm": 2.1423835925047, + "learning_rate": 4.757293577790302e-06, + "loss": 0.6634, + "step": 12420 + }, + { + "epoch": 7.085144282151764, + "grad_norm": 2.2470079391106665, + "learning_rate": 4.740357128472936e-06, + "loss": 0.6785, + "step": 12430 + }, + { + "epoch": 7.090844317776986, + "grad_norm": 2.2574430392774785, + "learning_rate": 4.723441510229572e-06, + "loss": 0.6785, + "step": 12440 + }, + { + "epoch": 7.096544353402209, + "grad_norm": 2.3043198643832166, + "learning_rate": 4.70654679005535e-06, + "loss": 0.6596, + "step": 12450 + }, + { + "epoch": 7.102244389027431, + "grad_norm": 2.295680833918298, + "learning_rate": 4.689673034862637e-06, + "loss": 0.6594, + "step": 12460 + }, + { + "epoch": 7.107944424652654, + "grad_norm": 2.1605206188100206, + "learning_rate": 4.672820311480768e-06, + "loss": 0.6627, + "step": 12470 + }, + { + "epoch": 7.113644460277877, + "grad_norm": 2.1919151628756586, + "learning_rate": 4.655988686655787e-06, + "loss": 0.6667, + "step": 12480 + }, + { + "epoch": 7.1193444959031, + "grad_norm": 2.267528809716308, + "learning_rate": 4.639178227050169e-06, + "loss": 0.6582, + "step": 12490 + }, + { + "epoch": 7.125044531528322, + "grad_norm": 2.3241456885445535, + "learning_rate": 4.622388999242564e-06, + "loss": 0.6558, + "step": 12500 + }, + { + "epoch": 7.1307445671535445, + "grad_norm": 2.2495978263074763, + "learning_rate": 4.6056210697275315e-06, + "loss": 0.666, + "step": 12510 + }, + { + "epoch": 7.136444602778767, + "grad_norm": 2.3115077705620957, + "learning_rate": 4.588874504915287e-06, + "loss": 0.6659, + "step": 12520 + }, + { + "epoch": 7.14214463840399, + "grad_norm": 2.2516114253186887, + "learning_rate": 4.572149371131419e-06, + "loss": 0.6569, + "step": 12530 + }, + { + "epoch": 7.147844674029213, + "grad_norm": 2.331648891990796, + "learning_rate": 4.555445734616641e-06, + "loss": 0.6671, + "step": 12540 + }, + { + "epoch": 7.153544709654435, + "grad_norm": 2.3183931570473444, + "learning_rate": 
4.538763661526527e-06, + "loss": 0.6711, + "step": 12550 + }, + { + "epoch": 7.159244745279658, + "grad_norm": 2.208243417910211, + "learning_rate": 4.522103217931247e-06, + "loss": 0.6503, + "step": 12560 + }, + { + "epoch": 7.1649447809048805, + "grad_norm": 2.155675760594788, + "learning_rate": 4.505464469815307e-06, + "loss": 0.6661, + "step": 12570 + }, + { + "epoch": 7.170644816530103, + "grad_norm": 2.2664894262413497, + "learning_rate": 4.488847483077285e-06, + "loss": 0.666, + "step": 12580 + }, + { + "epoch": 7.176344852155326, + "grad_norm": 2.2795499909428423, + "learning_rate": 4.472252323529575e-06, + "loss": 0.6674, + "step": 12590 + }, + { + "epoch": 7.182044887780549, + "grad_norm": 2.223281501378646, + "learning_rate": 4.455679056898116e-06, + "loss": 0.6597, + "step": 12600 + }, + { + "epoch": 7.187744923405771, + "grad_norm": 2.277136474302064, + "learning_rate": 4.439127748822153e-06, + "loss": 0.6648, + "step": 12610 + }, + { + "epoch": 7.193444959030994, + "grad_norm": 2.2711057573337072, + "learning_rate": 4.4225984648539525e-06, + "loss": 0.677, + "step": 12620 + }, + { + "epoch": 7.1991449946562165, + "grad_norm": 2.2561112509358727, + "learning_rate": 4.406091270458553e-06, + "loss": 0.6709, + "step": 12630 + }, + { + "epoch": 7.204845030281439, + "grad_norm": 2.2208121144419524, + "learning_rate": 4.389606231013512e-06, + "loss": 0.6582, + "step": 12640 + }, + { + "epoch": 7.210545065906662, + "grad_norm": 2.2549802019435083, + "learning_rate": 4.3731434118086324e-06, + "loss": 0.6633, + "step": 12650 + }, + { + "epoch": 7.216245101531885, + "grad_norm": 2.252019210437217, + "learning_rate": 4.356702878045728e-06, + "loss": 0.6662, + "step": 12660 + }, + { + "epoch": 7.221945137157107, + "grad_norm": 2.2006478078981693, + "learning_rate": 4.3402846948383334e-06, + "loss": 0.6622, + "step": 12670 + }, + { + "epoch": 7.22764517278233, + "grad_norm": 2.3860767372289704, + "learning_rate": 4.323888927211472e-06, + "loss": 0.6759, + "step": 12680 + }, + { + "epoch": 7.2333452084075525, + "grad_norm": 2.384532210236136, + "learning_rate": 4.307515640101387e-06, + "loss": 0.6648, + "step": 12690 + }, + { + "epoch": 7.239045244032775, + "grad_norm": 2.2186370735450662, + "learning_rate": 4.291164898355286e-06, + "loss": 0.6614, + "step": 12700 + }, + { + "epoch": 7.244745279657998, + "grad_norm": 2.275761327122912, + "learning_rate": 4.274836766731087e-06, + "loss": 0.6583, + "step": 12710 + }, + { + "epoch": 7.250445315283221, + "grad_norm": 2.3723166854023945, + "learning_rate": 4.2585313098971535e-06, + "loss": 0.6676, + "step": 12720 + }, + { + "epoch": 7.256145350908443, + "grad_norm": 2.1797372895464084, + "learning_rate": 4.242248592432048e-06, + "loss": 0.6629, + "step": 12730 + }, + { + "epoch": 7.261845386533666, + "grad_norm": 2.223664652891473, + "learning_rate": 4.225988678824279e-06, + "loss": 0.6704, + "step": 12740 + }, + { + "epoch": 7.267545422158888, + "grad_norm": 2.23232293309273, + "learning_rate": 4.209751633472029e-06, + "loss": 0.676, + "step": 12750 + }, + { + "epoch": 7.273245457784111, + "grad_norm": 2.245225811111464, + "learning_rate": 4.1935375206829156e-06, + "loss": 0.6634, + "step": 12760 + }, + { + "epoch": 7.278945493409334, + "grad_norm": 2.3450273180018466, + "learning_rate": 4.1773464046737275e-06, + "loss": 0.6745, + "step": 12770 + }, + { + "epoch": 7.284645529034557, + "grad_norm": 2.297072523505608, + "learning_rate": 4.161178349570173e-06, + "loss": 0.6689, + "step": 12780 + }, + { + "epoch": 7.290345564659779, + "grad_norm": 
2.2269748343650537, + "learning_rate": 4.145033419406635e-06, + "loss": 0.6625, + "step": 12790 + }, + { + "epoch": 7.296045600285002, + "grad_norm": 2.3128792584540534, + "learning_rate": 4.128911678125902e-06, + "loss": 0.6645, + "step": 12800 + }, + { + "epoch": 7.301745635910224, + "grad_norm": 2.232593518867681, + "learning_rate": 4.112813189578917e-06, + "loss": 0.6658, + "step": 12810 + }, + { + "epoch": 7.307445671535447, + "grad_norm": 2.1477231869993267, + "learning_rate": 4.096738017524533e-06, + "loss": 0.6608, + "step": 12820 + }, + { + "epoch": 7.31314570716067, + "grad_norm": 2.3814575079547184, + "learning_rate": 4.080686225629267e-06, + "loss": 0.6643, + "step": 12830 + }, + { + "epoch": 7.318845742785893, + "grad_norm": 2.2391714897073522, + "learning_rate": 4.064657877467025e-06, + "loss": 0.6463, + "step": 12840 + }, + { + "epoch": 7.324545778411115, + "grad_norm": 2.2795231271731833, + "learning_rate": 4.048653036518869e-06, + "loss": 0.6593, + "step": 12850 + }, + { + "epoch": 7.330245814036338, + "grad_norm": 2.2963500847619196, + "learning_rate": 4.032671766172756e-06, + "loss": 0.6589, + "step": 12860 + }, + { + "epoch": 7.33594584966156, + "grad_norm": 2.186846566909248, + "learning_rate": 4.016714129723291e-06, + "loss": 0.6707, + "step": 12870 + }, + { + "epoch": 7.341645885286783, + "grad_norm": 2.3184883940263243, + "learning_rate": 4.00078019037148e-06, + "loss": 0.6701, + "step": 12880 + }, + { + "epoch": 7.347345920912006, + "grad_norm": 2.2939704186094976, + "learning_rate": 3.984870011224474e-06, + "loss": 0.6648, + "step": 12890 + }, + { + "epoch": 7.353045956537228, + "grad_norm": 2.1953034187690497, + "learning_rate": 3.968983655295317e-06, + "loss": 0.6736, + "step": 12900 + }, + { + "epoch": 7.358745992162451, + "grad_norm": 2.233747449114086, + "learning_rate": 3.953121185502699e-06, + "loss": 0.6745, + "step": 12910 + }, + { + "epoch": 7.3644460277876735, + "grad_norm": 2.348797145656744, + "learning_rate": 3.9372826646707215e-06, + "loss": 0.6576, + "step": 12920 + }, + { + "epoch": 7.370146063412896, + "grad_norm": 2.279768759531536, + "learning_rate": 3.921468155528614e-06, + "loss": 0.6681, + "step": 12930 + }, + { + "epoch": 7.375846099038119, + "grad_norm": 2.332254197841433, + "learning_rate": 3.90567772071052e-06, + "loss": 0.658, + "step": 12940 + }, + { + "epoch": 7.381546134663342, + "grad_norm": 2.2512646676554167, + "learning_rate": 3.8899114227552315e-06, + "loss": 0.6643, + "step": 12950 + }, + { + "epoch": 7.387246170288564, + "grad_norm": 2.229958740556661, + "learning_rate": 3.874169324105945e-06, + "loss": 0.6682, + "step": 12960 + }, + { + "epoch": 7.392946205913787, + "grad_norm": 2.2359413189154433, + "learning_rate": 3.8584514871100206e-06, + "loss": 0.6643, + "step": 12970 + }, + { + "epoch": 7.3986462415390095, + "grad_norm": 2.2773932967294597, + "learning_rate": 3.842757974018721e-06, + "loss": 0.6674, + "step": 12980 + }, + { + "epoch": 7.404346277164232, + "grad_norm": 2.26805369430861, + "learning_rate": 3.827088846986977e-06, + "loss": 0.6578, + "step": 12990 + }, + { + "epoch": 7.410046312789455, + "grad_norm": 2.2090255220562858, + "learning_rate": 3.8114441680731317e-06, + "loss": 0.6627, + "step": 13000 + }, + { + "epoch": 7.415746348414678, + "grad_norm": 2.419885181729568, + "learning_rate": 3.7958239992387113e-06, + "loss": 0.6672, + "step": 13010 + }, + { + "epoch": 7.4214463840399, + "grad_norm": 2.35136144391521, + "learning_rate": 3.7802284023481582e-06, + "loss": 0.6591, + "step": 13020 + }, + { + "epoch": 
7.427146419665123, + "grad_norm": 2.214639339938116, + "learning_rate": 3.7646574391686007e-06, + "loss": 0.6593, + "step": 13030 + }, + { + "epoch": 7.4328464552903455, + "grad_norm": 2.255892281186678, + "learning_rate": 3.7491111713696026e-06, + "loss": 0.6728, + "step": 13040 + }, + { + "epoch": 7.438546490915568, + "grad_norm": 2.20115600566101, + "learning_rate": 3.733589660522923e-06, + "loss": 0.6656, + "step": 13050 + }, + { + "epoch": 7.444246526540791, + "grad_norm": 2.18933378089345, + "learning_rate": 3.718092968102267e-06, + "loss": 0.6544, + "step": 13060 + }, + { + "epoch": 7.449946562166014, + "grad_norm": 2.21831722919064, + "learning_rate": 3.702621155483046e-06, + "loss": 0.6719, + "step": 13070 + }, + { + "epoch": 7.455646597791236, + "grad_norm": 2.3224124638500188, + "learning_rate": 3.6871742839421344e-06, + "loss": 0.663, + "step": 13080 + }, + { + "epoch": 7.461346633416459, + "grad_norm": 2.285975538502232, + "learning_rate": 3.6717524146576234e-06, + "loss": 0.6627, + "step": 13090 + }, + { + "epoch": 7.4670466690416815, + "grad_norm": 2.2816850224350693, + "learning_rate": 3.6563556087085894e-06, + "loss": 0.6607, + "step": 13100 + }, + { + "epoch": 7.472746704666904, + "grad_norm": 2.223032277100294, + "learning_rate": 3.640983927074836e-06, + "loss": 0.6718, + "step": 13110 + }, + { + "epoch": 7.478446740292127, + "grad_norm": 2.3196995262580264, + "learning_rate": 3.6256374306366635e-06, + "loss": 0.6601, + "step": 13120 + }, + { + "epoch": 7.48414677591735, + "grad_norm": 2.179693956322987, + "learning_rate": 3.6103161801746224e-06, + "loss": 0.6634, + "step": 13130 + }, + { + "epoch": 7.489846811542572, + "grad_norm": 2.3196268076269093, + "learning_rate": 3.5950202363692822e-06, + "loss": 0.6626, + "step": 13140 + }, + { + "epoch": 7.495546847167795, + "grad_norm": 2.1820384798845858, + "learning_rate": 3.5797496598009794e-06, + "loss": 0.6632, + "step": 13150 + }, + { + "epoch": 7.501246882793017, + "grad_norm": 2.2375359061986417, + "learning_rate": 3.564504510949581e-06, + "loss": 0.6693, + "step": 13160 + }, + { + "epoch": 7.50694691841824, + "grad_norm": 2.2623386175960385, + "learning_rate": 3.54928485019425e-06, + "loss": 0.6685, + "step": 13170 + }, + { + "epoch": 7.512646954043463, + "grad_norm": 2.3114477674940273, + "learning_rate": 3.534090737813198e-06, + "loss": 0.669, + "step": 13180 + }, + { + "epoch": 7.518346989668686, + "grad_norm": 2.2773591124406862, + "learning_rate": 3.518922233983455e-06, + "loss": 0.6648, + "step": 13190 + }, + { + "epoch": 7.524047025293908, + "grad_norm": 2.2249861523523027, + "learning_rate": 3.503779398780626e-06, + "loss": 0.6674, + "step": 13200 + }, + { + "epoch": 7.529747060919131, + "grad_norm": 2.370282321028903, + "learning_rate": 3.4886622921786517e-06, + "loss": 0.6679, + "step": 13210 + }, + { + "epoch": 7.535447096544353, + "grad_norm": 2.2532698509194544, + "learning_rate": 3.4735709740495748e-06, + "loss": 0.6731, + "step": 13220 + }, + { + "epoch": 7.541147132169576, + "grad_norm": 2.30291311854667, + "learning_rate": 3.4585055041633076e-06, + "loss": 0.6609, + "step": 13230 + }, + { + "epoch": 7.546847167794799, + "grad_norm": 2.1904288998748833, + "learning_rate": 3.4434659421873807e-06, + "loss": 0.6629, + "step": 13240 + }, + { + "epoch": 7.552547203420021, + "grad_norm": 2.2214360575027823, + "learning_rate": 3.428452347686717e-06, + "loss": 0.6712, + "step": 13250 + }, + { + "epoch": 7.558247239045244, + "grad_norm": 2.294107190530825, + "learning_rate": 3.4134647801233976e-06, + "loss": 
0.6684, + "step": 13260 + }, + { + "epoch": 7.563947274670467, + "grad_norm": 2.23145548647493, + "learning_rate": 3.3985032988564147e-06, + "loss": 0.6651, + "step": 13270 + }, + { + "epoch": 7.569647310295689, + "grad_norm": 2.3301816146667638, + "learning_rate": 3.3835679631414588e-06, + "loss": 0.6653, + "step": 13280 + }, + { + "epoch": 7.575347345920912, + "grad_norm": 2.4013703507360225, + "learning_rate": 3.3686588321306546e-06, + "loss": 0.6603, + "step": 13290 + }, + { + "epoch": 7.581047381546135, + "grad_norm": 2.3636437939957324, + "learning_rate": 3.35377596487235e-06, + "loss": 0.6591, + "step": 13300 + }, + { + "epoch": 7.586747417171358, + "grad_norm": 2.4085206590723485, + "learning_rate": 3.338919420310871e-06, + "loss": 0.6793, + "step": 13310 + }, + { + "epoch": 7.59244745279658, + "grad_norm": 2.234619347891406, + "learning_rate": 3.3240892572862924e-06, + "loss": 0.6711, + "step": 13320 + }, + { + "epoch": 7.5981474884218025, + "grad_norm": 2.231117043894081, + "learning_rate": 3.3092855345342047e-06, + "loss": 0.6655, + "step": 13330 + }, + { + "epoch": 7.603847524047025, + "grad_norm": 2.268181240569829, + "learning_rate": 3.294508310685478e-06, + "loss": 0.661, + "step": 13340 + }, + { + "epoch": 7.609547559672248, + "grad_norm": 2.33558451140121, + "learning_rate": 3.2797576442660293e-06, + "loss": 0.6635, + "step": 13350 + }, + { + "epoch": 7.615247595297471, + "grad_norm": 2.301128072633845, + "learning_rate": 3.265033593696606e-06, + "loss": 0.6627, + "step": 13360 + }, + { + "epoch": 7.620947630922693, + "grad_norm": 2.1001706479882993, + "learning_rate": 3.25033621729253e-06, + "loss": 0.6675, + "step": 13370 + }, + { + "epoch": 7.626647666547916, + "grad_norm": 2.2899545362349767, + "learning_rate": 3.2356655732634825e-06, + "loss": 0.667, + "step": 13380 + }, + { + "epoch": 7.6323477021731385, + "grad_norm": 2.236065324768278, + "learning_rate": 3.2210217197132685e-06, + "loss": 0.6647, + "step": 13390 + }, + { + "epoch": 7.638047737798361, + "grad_norm": 2.423950515329974, + "learning_rate": 3.2064047146395894e-06, + "loss": 0.6582, + "step": 13400 + }, + { + "epoch": 7.643747773423584, + "grad_norm": 2.113006486576792, + "learning_rate": 3.191814615933816e-06, + "loss": 0.6694, + "step": 13410 + }, + { + "epoch": 7.649447809048807, + "grad_norm": 2.272493144954798, + "learning_rate": 3.1772514813807474e-06, + "loss": 0.664, + "step": 13420 + }, + { + "epoch": 7.655147844674029, + "grad_norm": 2.2477799449220157, + "learning_rate": 3.1627153686583954e-06, + "loss": 0.6665, + "step": 13430 + }, + { + "epoch": 7.660847880299252, + "grad_norm": 2.359330416347929, + "learning_rate": 3.1482063353377468e-06, + "loss": 0.6608, + "step": 13440 + }, + { + "epoch": 7.6665479159244745, + "grad_norm": 2.2768436025456107, + "learning_rate": 3.1337244388825428e-06, + "loss": 0.6662, + "step": 13450 + }, + { + "epoch": 7.672247951549697, + "grad_norm": 2.222221818207048, + "learning_rate": 3.1192697366490443e-06, + "loss": 0.6691, + "step": 13460 + }, + { + "epoch": 7.67794798717492, + "grad_norm": 2.318255375011537, + "learning_rate": 3.104842285885811e-06, + "loss": 0.6597, + "step": 13470 + }, + { + "epoch": 7.683648022800142, + "grad_norm": 2.2622741956334798, + "learning_rate": 3.0904421437334685e-06, + "loss": 0.647, + "step": 13480 + }, + { + "epoch": 7.689348058425365, + "grad_norm": 2.3412395862257624, + "learning_rate": 3.0760693672244858e-06, + "loss": 0.6608, + "step": 13490 + }, + { + "epoch": 7.695048094050588, + "grad_norm": 2.2700240775817644, + 
"learning_rate": 3.061724013282956e-06, + "loss": 0.6575, + "step": 13500 + }, + { + "epoch": 7.7007481296758105, + "grad_norm": 2.2489411090100275, + "learning_rate": 3.047406138724355e-06, + "loss": 0.6635, + "step": 13510 + }, + { + "epoch": 7.706448165301033, + "grad_norm": 2.40198054648677, + "learning_rate": 3.0331158002553296e-06, + "loss": 0.6638, + "step": 13520 + }, + { + "epoch": 7.712148200926256, + "grad_norm": 2.276918761843776, + "learning_rate": 3.018853054473463e-06, + "loss": 0.665, + "step": 13530 + }, + { + "epoch": 7.717848236551479, + "grad_norm": 2.2169633358638414, + "learning_rate": 3.0046179578670664e-06, + "loss": 0.6632, + "step": 13540 + }, + { + "epoch": 7.723548272176701, + "grad_norm": 2.183177198102162, + "learning_rate": 2.9904105668149374e-06, + "loss": 0.668, + "step": 13550 + }, + { + "epoch": 7.729248307801924, + "grad_norm": 2.3235798898652975, + "learning_rate": 2.9762309375861462e-06, + "loss": 0.6633, + "step": 13560 + }, + { + "epoch": 7.734948343427146, + "grad_norm": 2.325494714539688, + "learning_rate": 2.9620791263398142e-06, + "loss": 0.6619, + "step": 13570 + }, + { + "epoch": 7.740648379052369, + "grad_norm": 2.2715717713625696, + "learning_rate": 2.9479551891248746e-06, + "loss": 0.6591, + "step": 13580 + }, + { + "epoch": 7.746348414677592, + "grad_norm": 2.32880260951295, + "learning_rate": 2.9338591818798856e-06, + "loss": 0.6642, + "step": 13590 + }, + { + "epoch": 7.752048450302814, + "grad_norm": 2.1873728881485874, + "learning_rate": 2.919791160432772e-06, + "loss": 0.6644, + "step": 13600 + }, + { + "epoch": 7.757748485928037, + "grad_norm": 2.2332074048709165, + "learning_rate": 2.9057511805006246e-06, + "loss": 0.6637, + "step": 13610 + }, + { + "epoch": 7.76344852155326, + "grad_norm": 2.2648757192147073, + "learning_rate": 2.89173929768947e-06, + "loss": 0.6672, + "step": 13620 + }, + { + "epoch": 7.769148557178482, + "grad_norm": 2.258292024109965, + "learning_rate": 2.877755567494066e-06, + "loss": 0.6566, + "step": 13630 + }, + { + "epoch": 7.774848592803705, + "grad_norm": 2.2498733750901283, + "learning_rate": 2.863800045297659e-06, + "loss": 0.6713, + "step": 13640 + }, + { + "epoch": 7.780548628428928, + "grad_norm": 2.257203818446692, + "learning_rate": 2.8498727863717803e-06, + "loss": 0.6689, + "step": 13650 + }, + { + "epoch": 7.786248664054151, + "grad_norm": 2.2968049679993947, + "learning_rate": 2.835973845876022e-06, + "loss": 0.6716, + "step": 13660 + }, + { + "epoch": 7.791948699679373, + "grad_norm": 2.3196500348038955, + "learning_rate": 2.8221032788578206e-06, + "loss": 0.6732, + "step": 13670 + }, + { + "epoch": 7.797648735304596, + "grad_norm": 2.216899403748187, + "learning_rate": 2.808261140252242e-06, + "loss": 0.6718, + "step": 13680 + }, + { + "epoch": 7.803348770929818, + "grad_norm": 2.254643395065768, + "learning_rate": 2.7944474848817572e-06, + "loss": 0.6619, + "step": 13690 + }, + { + "epoch": 7.809048806555041, + "grad_norm": 2.2820083783430105, + "learning_rate": 2.780662367456021e-06, + "loss": 0.6655, + "step": 13700 + }, + { + "epoch": 7.814748842180264, + "grad_norm": 2.2433132656609933, + "learning_rate": 2.7669058425716676e-06, + "loss": 0.6602, + "step": 13710 + }, + { + "epoch": 7.820448877805486, + "grad_norm": 2.335507982400234, + "learning_rate": 2.753177964712096e-06, + "loss": 0.6596, + "step": 13720 + }, + { + "epoch": 7.826148913430709, + "grad_norm": 2.2275175940366965, + "learning_rate": 2.7394787882472374e-06, + "loss": 0.6644, + "step": 13730 + }, + { + "epoch": 
7.8318489490559315, + "grad_norm": 2.295329033852509, + "learning_rate": 2.7258083674333545e-06, + "loss": 0.6616, + "step": 13740 + }, + { + "epoch": 7.837548984681154, + "grad_norm": 2.192580449354095, + "learning_rate": 2.7121667564128173e-06, + "loss": 0.6535, + "step": 13750 + }, + { + "epoch": 7.843249020306377, + "grad_norm": 2.342528376405033, + "learning_rate": 2.698554009213902e-06, + "loss": 0.6774, + "step": 13760 + }, + { + "epoch": 7.8489490559316, + "grad_norm": 2.2950904697540024, + "learning_rate": 2.684970179750559e-06, + "loss": 0.6659, + "step": 13770 + }, + { + "epoch": 7.854649091556822, + "grad_norm": 2.328359619364796, + "learning_rate": 2.6714153218222127e-06, + "loss": 0.6699, + "step": 13780 + }, + { + "epoch": 7.860349127182045, + "grad_norm": 2.215674623073645, + "learning_rate": 2.6578894891135455e-06, + "loss": 0.6633, + "step": 13790 + }, + { + "epoch": 7.8660491628072675, + "grad_norm": 2.3862089040460206, + "learning_rate": 2.6443927351942798e-06, + "loss": 0.6624, + "step": 13800 + }, + { + "epoch": 7.87174919843249, + "grad_norm": 2.243717235079729, + "learning_rate": 2.630925113518974e-06, + "loss": 0.6643, + "step": 13810 + }, + { + "epoch": 7.877449234057713, + "grad_norm": 2.2927993968105946, + "learning_rate": 2.617486677426806e-06, + "loss": 0.6627, + "step": 13820 + }, + { + "epoch": 7.883149269682935, + "grad_norm": 2.322818052755111, + "learning_rate": 2.6040774801413616e-06, + "loss": 0.6582, + "step": 13830 + }, + { + "epoch": 7.888849305308158, + "grad_norm": 2.2736195651698066, + "learning_rate": 2.590697574770421e-06, + "loss": 0.6633, + "step": 13840 + }, + { + "epoch": 7.894549340933381, + "grad_norm": 2.171993807344973, + "learning_rate": 2.5773470143057657e-06, + "loss": 0.6605, + "step": 13850 + }, + { + "epoch": 7.9002493765586035, + "grad_norm": 2.2587649223942456, + "learning_rate": 2.564025851622941e-06, + "loss": 0.6654, + "step": 13860 + }, + { + "epoch": 7.905949412183826, + "grad_norm": 2.333016160998036, + "learning_rate": 2.550734139481067e-06, + "loss": 0.6674, + "step": 13870 + }, + { + "epoch": 7.911649447809049, + "grad_norm": 2.369469386300691, + "learning_rate": 2.5374719305226226e-06, + "loss": 0.6709, + "step": 13880 + }, + { + "epoch": 7.917349483434272, + "grad_norm": 2.3460895427142434, + "learning_rate": 2.524239277273235e-06, + "loss": 0.671, + "step": 13890 + }, + { + "epoch": 7.923049519059494, + "grad_norm": 2.3133515698467852, + "learning_rate": 2.511036232141484e-06, + "loss": 0.6658, + "step": 13900 + }, + { + "epoch": 7.928749554684717, + "grad_norm": 2.1978929439065147, + "learning_rate": 2.497862847418674e-06, + "loss": 0.6588, + "step": 13910 + }, + { + "epoch": 7.9344495903099395, + "grad_norm": 2.339957351608019, + "learning_rate": 2.4847191752786437e-06, + "loss": 0.6611, + "step": 13920 + }, + { + "epoch": 7.940149625935162, + "grad_norm": 2.2748253461664456, + "learning_rate": 2.4716052677775524e-06, + "loss": 0.6594, + "step": 13930 + }, + { + "epoch": 7.945849661560385, + "grad_norm": 2.2765085031372365, + "learning_rate": 2.4585211768536754e-06, + "loss": 0.6682, + "step": 13940 + }, + { + "epoch": 7.951549697185607, + "grad_norm": 2.143922983187977, + "learning_rate": 2.445466954327196e-06, + "loss": 0.6636, + "step": 13950 + }, + { + "epoch": 7.95724973281083, + "grad_norm": 2.3081123161562926, + "learning_rate": 2.4324426519000056e-06, + "loss": 0.6693, + "step": 13960 + }, + { + "epoch": 7.962949768436053, + "grad_norm": 2.3648144828229434, + "learning_rate": 2.419448321155493e-06, + "loss": 
0.6661, + "step": 13970 + }, + { + "epoch": 7.968649804061275, + "grad_norm": 2.319378028427634, + "learning_rate": 2.4064840135583413e-06, + "loss": 0.6511, + "step": 13980 + }, + { + "epoch": 7.974349839686498, + "grad_norm": 2.3294256788629664, + "learning_rate": 2.3935497804543317e-06, + "loss": 0.66, + "step": 13990 + }, + { + "epoch": 7.980049875311721, + "grad_norm": 2.3273735482235773, + "learning_rate": 2.380645673070129e-06, + "loss": 0.6638, + "step": 14000 + }, + { + "epoch": 7.985749910936943, + "grad_norm": 2.3047174470726413, + "learning_rate": 2.3677717425130832e-06, + "loss": 0.6664, + "step": 14010 + }, + { + "epoch": 7.991449946562166, + "grad_norm": 2.2591163992818006, + "learning_rate": 2.3549280397710273e-06, + "loss": 0.6536, + "step": 14020 + }, + { + "epoch": 7.997149982187389, + "grad_norm": 2.2848907363838156, + "learning_rate": 2.3421146157120813e-06, + "loss": 0.674, + "step": 14030 + }, + { + "epoch": 8.00285001781261, + "grad_norm": 2.395804129166372, + "learning_rate": 2.329331521084439e-06, + "loss": 0.663, + "step": 14040 + }, + { + "epoch": 8.008550053437833, + "grad_norm": 2.349509693484751, + "learning_rate": 2.3165788065161742e-06, + "loss": 0.6537, + "step": 14050 + }, + { + "epoch": 8.014250089063056, + "grad_norm": 2.317112213637775, + "learning_rate": 2.303856522515039e-06, + "loss": 0.6553, + "step": 14060 + }, + { + "epoch": 8.019950124688279, + "grad_norm": 2.2344274209565165, + "learning_rate": 2.291164719468265e-06, + "loss": 0.646, + "step": 14070 + }, + { + "epoch": 8.025650160313502, + "grad_norm": 2.2402337776074766, + "learning_rate": 2.2785034476423608e-06, + "loss": 0.6645, + "step": 14080 + }, + { + "epoch": 8.031350195938725, + "grad_norm": 2.2434977798959985, + "learning_rate": 2.2658727571829176e-06, + "loss": 0.6647, + "step": 14090 + }, + { + "epoch": 8.037050231563947, + "grad_norm": 2.2912602059287193, + "learning_rate": 2.2532726981144028e-06, + "loss": 0.6608, + "step": 14100 + }, + { + "epoch": 8.04275026718917, + "grad_norm": 2.2971956830820885, + "learning_rate": 2.2407033203399687e-06, + "loss": 0.6525, + "step": 14110 + }, + { + "epoch": 8.048450302814393, + "grad_norm": 2.5095251658904014, + "learning_rate": 2.2281646736412575e-06, + "loss": 0.6555, + "step": 14120 + }, + { + "epoch": 8.054150338439616, + "grad_norm": 2.346002360483435, + "learning_rate": 2.215656807678194e-06, + "loss": 0.6531, + "step": 14130 + }, + { + "epoch": 8.059850374064839, + "grad_norm": 2.3550136302279174, + "learning_rate": 2.203179771988796e-06, + "loss": 0.652, + "step": 14140 + }, + { + "epoch": 8.065550409690061, + "grad_norm": 2.298658799397616, + "learning_rate": 2.1907336159889712e-06, + "loss": 0.6641, + "step": 14150 + }, + { + "epoch": 8.071250445315282, + "grad_norm": 2.274958209439512, + "learning_rate": 2.1783183889723415e-06, + "loss": 0.6556, + "step": 14160 + }, + { + "epoch": 8.076950480940505, + "grad_norm": 2.354922282381701, + "learning_rate": 2.1659341401100165e-06, + "loss": 0.6625, + "step": 14170 + }, + { + "epoch": 8.082650516565728, + "grad_norm": 2.2464704977690815, + "learning_rate": 2.1535809184504255e-06, + "loss": 0.6495, + "step": 14180 + }, + { + "epoch": 8.088350552190951, + "grad_norm": 2.427423409659305, + "learning_rate": 2.141258772919108e-06, + "loss": 0.6603, + "step": 14190 + }, + { + "epoch": 8.094050587816174, + "grad_norm": 2.3827561888368, + "learning_rate": 2.128967752318527e-06, + "loss": 0.6551, + "step": 14200 + }, + { + "epoch": 8.099750623441397, + "grad_norm": 2.4054911753771884, + 
"learning_rate": 2.116707905327874e-06, + "loss": 0.6511, + "step": 14210 + }, + { + "epoch": 8.10545065906662, + "grad_norm": 2.2407461145856686, + "learning_rate": 2.1044792805028756e-06, + "loss": 0.6607, + "step": 14220 + }, + { + "epoch": 8.111150694691842, + "grad_norm": 2.378679457250059, + "learning_rate": 2.0922819262756e-06, + "loss": 0.6453, + "step": 14230 + }, + { + "epoch": 8.116850730317065, + "grad_norm": 2.3368426825434816, + "learning_rate": 2.080115890954266e-06, + "loss": 0.6517, + "step": 14240 + }, + { + "epoch": 8.122550765942288, + "grad_norm": 2.2661181727697532, + "learning_rate": 2.06798122272306e-06, + "loss": 0.6485, + "step": 14250 + }, + { + "epoch": 8.12825080156751, + "grad_norm": 2.2638492639891625, + "learning_rate": 2.0558779696419274e-06, + "loss": 0.6633, + "step": 14260 + }, + { + "epoch": 8.133950837192732, + "grad_norm": 2.370400165547832, + "learning_rate": 2.043806179646399e-06, + "loss": 0.665, + "step": 14270 + }, + { + "epoch": 8.139650872817954, + "grad_norm": 2.2408509531522856, + "learning_rate": 2.03176590054739e-06, + "loss": 0.6639, + "step": 14280 + }, + { + "epoch": 8.145350908443177, + "grad_norm": 2.1835237645071515, + "learning_rate": 2.019757180031017e-06, + "loss": 0.6618, + "step": 14290 + }, + { + "epoch": 8.1510509440684, + "grad_norm": 2.379295837422735, + "learning_rate": 2.0077800656584102e-06, + "loss": 0.6589, + "step": 14300 + }, + { + "epoch": 8.156750979693623, + "grad_norm": 2.3318576043677277, + "learning_rate": 1.9958346048655188e-06, + "loss": 0.6497, + "step": 14310 + }, + { + "epoch": 8.162451015318846, + "grad_norm": 2.3660969704194783, + "learning_rate": 1.9839208449629265e-06, + "loss": 0.6612, + "step": 14320 + }, + { + "epoch": 8.168151050944068, + "grad_norm": 2.2599900871895224, + "learning_rate": 1.9720388331356643e-06, + "loss": 0.6544, + "step": 14330 + }, + { + "epoch": 8.173851086569291, + "grad_norm": 2.2625667227411386, + "learning_rate": 1.960188616443025e-06, + "loss": 0.6577, + "step": 14340 + }, + { + "epoch": 8.179551122194514, + "grad_norm": 2.1949842609024497, + "learning_rate": 1.9483702418183725e-06, + "loss": 0.64, + "step": 14350 + }, + { + "epoch": 8.185251157819737, + "grad_norm": 2.2246089702453533, + "learning_rate": 1.9365837560689626e-06, + "loss": 0.6573, + "step": 14360 + }, + { + "epoch": 8.19095119344496, + "grad_norm": 2.355732389959134, + "learning_rate": 1.924829205875746e-06, + "loss": 0.6627, + "step": 14370 + }, + { + "epoch": 8.196651229070183, + "grad_norm": 2.290300180364329, + "learning_rate": 1.9131066377932017e-06, + "loss": 0.661, + "step": 14380 + }, + { + "epoch": 8.202351264695404, + "grad_norm": 2.3069453870221093, + "learning_rate": 1.901416098249136e-06, + "loss": 0.6467, + "step": 14390 + }, + { + "epoch": 8.208051300320626, + "grad_norm": 2.287176978146281, + "learning_rate": 1.8897576335445023e-06, + "loss": 0.6597, + "step": 14400 + }, + { + "epoch": 8.21375133594585, + "grad_norm": 2.3455594812602687, + "learning_rate": 1.8781312898532256e-06, + "loss": 0.6534, + "step": 14410 + }, + { + "epoch": 8.219451371571072, + "grad_norm": 2.2756337406814207, + "learning_rate": 1.8665371132220068e-06, + "loss": 0.6431, + "step": 14420 + }, + { + "epoch": 8.225151407196295, + "grad_norm": 2.249136593496119, + "learning_rate": 1.8549751495701584e-06, + "loss": 0.6552, + "step": 14430 + }, + { + "epoch": 8.230851442821518, + "grad_norm": 2.279678512876365, + "learning_rate": 1.8434454446894023e-06, + "loss": 0.6562, + "step": 14440 + }, + { + "epoch": 8.23655147844674, 
+ "grad_norm": 2.449260711946442, + "learning_rate": 1.8319480442437043e-06, + "loss": 0.6656, + "step": 14450 + }, + { + "epoch": 8.242251514071963, + "grad_norm": 2.2927685740460997, + "learning_rate": 1.8204829937690748e-06, + "loss": 0.647, + "step": 14460 + }, + { + "epoch": 8.247951549697186, + "grad_norm": 2.2528090490901493, + "learning_rate": 1.8090503386734181e-06, + "loss": 0.6562, + "step": 14470 + }, + { + "epoch": 8.253651585322409, + "grad_norm": 2.338568674068967, + "learning_rate": 1.7976501242363242e-06, + "loss": 0.6632, + "step": 14480 + }, + { + "epoch": 8.259351620947632, + "grad_norm": 2.291192489078667, + "learning_rate": 1.7862823956089014e-06, + "loss": 0.6603, + "step": 14490 + }, + { + "epoch": 8.265051656572854, + "grad_norm": 2.2914805164289413, + "learning_rate": 1.774947197813598e-06, + "loss": 0.653, + "step": 14500 + }, + { + "epoch": 8.270751692198075, + "grad_norm": 2.286679795683027, + "learning_rate": 1.763644575744019e-06, + "loss": 0.6568, + "step": 14510 + }, + { + "epoch": 8.276451727823298, + "grad_norm": 2.2870797249748516, + "learning_rate": 1.7523745741647602e-06, + "loss": 0.6526, + "step": 14520 + }, + { + "epoch": 8.282151763448521, + "grad_norm": 2.3049667918026002, + "learning_rate": 1.7411372377112146e-06, + "loss": 0.6552, + "step": 14530 + }, + { + "epoch": 8.287851799073744, + "grad_norm": 2.360017011300003, + "learning_rate": 1.7299326108894033e-06, + "loss": 0.6571, + "step": 14540 + }, + { + "epoch": 8.293551834698967, + "grad_norm": 2.2514456956710065, + "learning_rate": 1.7187607380757998e-06, + "loss": 0.6634, + "step": 14550 + }, + { + "epoch": 8.29925187032419, + "grad_norm": 2.287113507409648, + "learning_rate": 1.7076216635171594e-06, + "loss": 0.6518, + "step": 14560 + }, + { + "epoch": 8.304951905949412, + "grad_norm": 2.2783805652974376, + "learning_rate": 1.6965154313303367e-06, + "loss": 0.6652, + "step": 14570 + }, + { + "epoch": 8.310651941574635, + "grad_norm": 2.222279328728929, + "learning_rate": 1.6854420855021026e-06, + "loss": 0.6661, + "step": 14580 + }, + { + "epoch": 8.316351977199858, + "grad_norm": 2.252278786396242, + "learning_rate": 1.6744016698889897e-06, + "loss": 0.6517, + "step": 14590 + }, + { + "epoch": 8.32205201282508, + "grad_norm": 2.345685753280762, + "learning_rate": 1.6633942282171056e-06, + "loss": 0.6551, + "step": 14600 + }, + { + "epoch": 8.327752048450304, + "grad_norm": 2.338706727686837, + "learning_rate": 1.6524198040819683e-06, + "loss": 0.6543, + "step": 14610 + }, + { + "epoch": 8.333452084075525, + "grad_norm": 2.384430342462321, + "learning_rate": 1.6414784409483197e-06, + "loss": 0.6569, + "step": 14620 + }, + { + "epoch": 8.339152119700747, + "grad_norm": 2.3701656962457673, + "learning_rate": 1.6305701821499686e-06, + "loss": 0.6535, + "step": 14630 + }, + { + "epoch": 8.34485215532597, + "grad_norm": 2.290965795306861, + "learning_rate": 1.6196950708896053e-06, + "loss": 0.6643, + "step": 14640 + }, + { + "epoch": 8.350552190951193, + "grad_norm": 2.3311296972099442, + "learning_rate": 1.6088531502386484e-06, + "loss": 0.6509, + "step": 14650 + }, + { + "epoch": 8.356252226576416, + "grad_norm": 2.335399650649135, + "learning_rate": 1.598044463137054e-06, + "loss": 0.6601, + "step": 14660 + }, + { + "epoch": 8.361952262201639, + "grad_norm": 2.455475427043487, + "learning_rate": 1.58726905239316e-06, + "loss": 0.657, + "step": 14670 + }, + { + "epoch": 8.367652297826861, + "grad_norm": 2.3282079850959754, + "learning_rate": 1.5765269606835054e-06, + "loss": 0.6679, + "step": 
14680 + }, + { + "epoch": 8.373352333452084, + "grad_norm": 2.311135221618032, + "learning_rate": 1.5658182305526815e-06, + "loss": 0.6565, + "step": 14690 + }, + { + "epoch": 8.379052369077307, + "grad_norm": 2.2488393528174426, + "learning_rate": 1.5551429044131305e-06, + "loss": 0.6595, + "step": 14700 + }, + { + "epoch": 8.38475240470253, + "grad_norm": 2.267703234475785, + "learning_rate": 1.544501024545011e-06, + "loss": 0.6591, + "step": 14710 + }, + { + "epoch": 8.390452440327753, + "grad_norm": 2.2332246831041456, + "learning_rate": 1.5338926330960102e-06, + "loss": 0.6619, + "step": 14720 + }, + { + "epoch": 8.396152475952976, + "grad_norm": 2.365659533928728, + "learning_rate": 1.5233177720811798e-06, + "loss": 0.6543, + "step": 14730 + }, + { + "epoch": 8.401852511578197, + "grad_norm": 2.3371976515238586, + "learning_rate": 1.512776483382783e-06, + "loss": 0.6609, + "step": 14740 + }, + { + "epoch": 8.40755254720342, + "grad_norm": 2.374043422057661, + "learning_rate": 1.5022688087501092e-06, + "loss": 0.6643, + "step": 14750 + }, + { + "epoch": 8.413252582828642, + "grad_norm": 2.4076504196305817, + "learning_rate": 1.491794789799319e-06, + "loss": 0.6641, + "step": 14760 + }, + { + "epoch": 8.418952618453865, + "grad_norm": 2.2023957766134474, + "learning_rate": 1.4813544680132763e-06, + "loss": 0.6536, + "step": 14770 + }, + { + "epoch": 8.424652654079088, + "grad_norm": 2.202310327507326, + "learning_rate": 1.4709478847413948e-06, + "loss": 0.6467, + "step": 14780 + }, + { + "epoch": 8.43035268970431, + "grad_norm": 2.2051414307499106, + "learning_rate": 1.4605750811994557e-06, + "loss": 0.662, + "step": 14790 + }, + { + "epoch": 8.436052725329533, + "grad_norm": 2.434564818817124, + "learning_rate": 1.4502360984694563e-06, + "loss": 0.6532, + "step": 14800 + }, + { + "epoch": 8.441752760954756, + "grad_norm": 2.2946949223375737, + "learning_rate": 1.4399309774994475e-06, + "loss": 0.6584, + "step": 14810 + }, + { + "epoch": 8.447452796579979, + "grad_norm": 2.228248217670367, + "learning_rate": 1.4296597591033656e-06, + "loss": 0.6614, + "step": 14820 + }, + { + "epoch": 8.453152832205202, + "grad_norm": 2.2820771284655637, + "learning_rate": 1.4194224839608761e-06, + "loss": 0.6451, + "step": 14830 + }, + { + "epoch": 8.458852867830425, + "grad_norm": 2.393253761036466, + "learning_rate": 1.4092191926172106e-06, + "loss": 0.6543, + "step": 14840 + }, + { + "epoch": 8.464552903455647, + "grad_norm": 2.282212085947841, + "learning_rate": 1.3990499254830047e-06, + "loss": 0.6595, + "step": 14850 + }, + { + "epoch": 8.470252939080869, + "grad_norm": 2.365145023219437, + "learning_rate": 1.3889147228341394e-06, + "loss": 0.664, + "step": 14860 + }, + { + "epoch": 8.475952974706091, + "grad_norm": 2.189934931833651, + "learning_rate": 1.3788136248115869e-06, + "loss": 0.6629, + "step": 14870 + }, + { + "epoch": 8.481653010331314, + "grad_norm": 2.4011038278551173, + "learning_rate": 1.3687466714212393e-06, + "loss": 0.6577, + "step": 14880 + }, + { + "epoch": 8.487353045956537, + "grad_norm": 2.4106598851131475, + "learning_rate": 1.3587139025337615e-06, + "loss": 0.658, + "step": 14890 + }, + { + "epoch": 8.49305308158176, + "grad_norm": 2.3592819788417523, + "learning_rate": 1.348715357884427e-06, + "loss": 0.6579, + "step": 14900 + }, + { + "epoch": 8.498753117206983, + "grad_norm": 2.3305573077262505, + "learning_rate": 1.3387510770729595e-06, + "loss": 0.665, + "step": 14910 + }, + { + "epoch": 8.504453152832205, + "grad_norm": 2.3672792720086564, + "learning_rate": 
1.3288210995633888e-06, + "loss": 0.6547, + "step": 14920 + }, + { + "epoch": 8.510153188457428, + "grad_norm": 2.280075759106149, + "learning_rate": 1.3189254646838766e-06, + "loss": 0.6652, + "step": 14930 + }, + { + "epoch": 8.515853224082651, + "grad_norm": 2.3369049806095297, + "learning_rate": 1.3090642116265695e-06, + "loss": 0.6568, + "step": 14940 + }, + { + "epoch": 8.521553259707874, + "grad_norm": 2.316623374340676, + "learning_rate": 1.2992373794474466e-06, + "loss": 0.6551, + "step": 14950 + }, + { + "epoch": 8.527253295333097, + "grad_norm": 2.303245948854903, + "learning_rate": 1.289445007066158e-06, + "loss": 0.6494, + "step": 14960 + }, + { + "epoch": 8.532953330958318, + "grad_norm": 2.359287289036829, + "learning_rate": 1.2796871332658756e-06, + "loss": 0.6558, + "step": 14970 + }, + { + "epoch": 8.53865336658354, + "grad_norm": 2.25661886393714, + "learning_rate": 1.26996379669314e-06, + "loss": 0.6602, + "step": 14980 + }, + { + "epoch": 8.544353402208763, + "grad_norm": 2.309228332149793, + "learning_rate": 1.260275035857701e-06, + "loss": 0.6609, + "step": 14990 + }, + { + "epoch": 8.550053437833986, + "grad_norm": 2.174364791553763, + "learning_rate": 1.2506208891323711e-06, + "loss": 0.6567, + "step": 15000 + }, + { + "epoch": 8.555753473459209, + "grad_norm": 2.43153962366673, + "learning_rate": 1.2410013947528766e-06, + "loss": 0.6589, + "step": 15010 + }, + { + "epoch": 8.561453509084432, + "grad_norm": 2.272819737875179, + "learning_rate": 1.2314165908176956e-06, + "loss": 0.6559, + "step": 15020 + }, + { + "epoch": 8.567153544709655, + "grad_norm": 2.179261588787464, + "learning_rate": 1.221866515287915e-06, + "loss": 0.6575, + "step": 15030 + }, + { + "epoch": 8.572853580334877, + "grad_norm": 2.221719946180341, + "learning_rate": 1.2123512059870756e-06, + "loss": 0.6535, + "step": 15040 + }, + { + "epoch": 8.5785536159601, + "grad_norm": 2.3048878158808206, + "learning_rate": 1.202870700601032e-06, + "loss": 0.6605, + "step": 15050 + }, + { + "epoch": 8.584253651585323, + "grad_norm": 2.365355709443227, + "learning_rate": 1.1934250366777899e-06, + "loss": 0.6649, + "step": 15060 + }, + { + "epoch": 8.589953687210546, + "grad_norm": 2.269976756998104, + "learning_rate": 1.1840142516273644e-06, + "loss": 0.6587, + "step": 15070 + }, + { + "epoch": 8.595653722835767, + "grad_norm": 2.4599719284501274, + "learning_rate": 1.1746383827216334e-06, + "loss": 0.6598, + "step": 15080 + }, + { + "epoch": 8.60135375846099, + "grad_norm": 2.270776618032317, + "learning_rate": 1.165297467094184e-06, + "loss": 0.6604, + "step": 15090 + }, + { + "epoch": 8.607053794086212, + "grad_norm": 2.5066430755058726, + "learning_rate": 1.1559915417401746e-06, + "loss": 0.6616, + "step": 15100 + }, + { + "epoch": 8.612753829711435, + "grad_norm": 2.2437923308781786, + "learning_rate": 1.146720643516177e-06, + "loss": 0.6556, + "step": 15110 + }, + { + "epoch": 8.618453865336658, + "grad_norm": 2.1955644450862613, + "learning_rate": 1.1374848091400403e-06, + "loss": 0.656, + "step": 15120 + }, + { + "epoch": 8.62415390096188, + "grad_norm": 2.2820502927208577, + "learning_rate": 1.1282840751907387e-06, + "loss": 0.6586, + "step": 15130 + }, + { + "epoch": 8.629853936587104, + "grad_norm": 2.2244689899988437, + "learning_rate": 1.1191184781082342e-06, + "loss": 0.6555, + "step": 15140 + }, + { + "epoch": 8.635553972212326, + "grad_norm": 2.2241919830845953, + "learning_rate": 1.1099880541933228e-06, + "loss": 0.6593, + "step": 15150 + }, + { + "epoch": 8.64125400783755, + "grad_norm": 
2.2162843709409437, + "learning_rate": 1.100892839607497e-06, + "loss": 0.6502, + "step": 15160 + }, + { + "epoch": 8.646954043462772, + "grad_norm": 2.3805084624468624, + "learning_rate": 1.0918328703727998e-06, + "loss": 0.6616, + "step": 15170 + }, + { + "epoch": 8.652654079087995, + "grad_norm": 2.33050488774571, + "learning_rate": 1.0828081823716862e-06, + "loss": 0.6489, + "step": 15180 + }, + { + "epoch": 8.658354114713218, + "grad_norm": 2.396488801272341, + "learning_rate": 1.0738188113468762e-06, + "loss": 0.6563, + "step": 15190 + }, + { + "epoch": 8.66405415033844, + "grad_norm": 2.1970815029045685, + "learning_rate": 1.0648647929012157e-06, + "loss": 0.6626, + "step": 15200 + }, + { + "epoch": 8.669754185963662, + "grad_norm": 2.356974951580113, + "learning_rate": 1.0559461624975343e-06, + "loss": 0.6599, + "step": 15210 + }, + { + "epoch": 8.675454221588884, + "grad_norm": 2.231841861511598, + "learning_rate": 1.0470629554585043e-06, + "loss": 0.656, + "step": 15220 + }, + { + "epoch": 8.681154257214107, + "grad_norm": 2.3405687246859013, + "learning_rate": 1.0382152069665063e-06, + "loss": 0.6534, + "step": 15230 + }, + { + "epoch": 8.68685429283933, + "grad_norm": 2.3057634631991344, + "learning_rate": 1.0294029520634806e-06, + "loss": 0.6569, + "step": 15240 + }, + { + "epoch": 8.692554328464553, + "grad_norm": 2.3591092566045826, + "learning_rate": 1.020626225650797e-06, + "loss": 0.6516, + "step": 15250 + }, + { + "epoch": 8.698254364089776, + "grad_norm": 2.2774547172739794, + "learning_rate": 1.0118850624891097e-06, + "loss": 0.6611, + "step": 15260 + }, + { + "epoch": 8.703954399714998, + "grad_norm": 2.3044521216927687, + "learning_rate": 1.0031794971982278e-06, + "loss": 0.657, + "step": 15270 + }, + { + "epoch": 8.709654435340221, + "grad_norm": 2.328406536786459, + "learning_rate": 9.945095642569692e-07, + "loss": 0.6549, + "step": 15280 + }, + { + "epoch": 8.715354470965444, + "grad_norm": 2.258894872917727, + "learning_rate": 9.858752980030295e-07, + "loss": 0.658, + "step": 15290 + }, + { + "epoch": 8.721054506590667, + "grad_norm": 2.221407186169279, + "learning_rate": 9.772767326328435e-07, + "loss": 0.6627, + "step": 15300 + }, + { + "epoch": 8.72675454221589, + "grad_norm": 2.371525100568372, + "learning_rate": 9.687139022014502e-07, + "loss": 0.6614, + "step": 15310 + }, + { + "epoch": 8.73245457784111, + "grad_norm": 2.3645791912994274, + "learning_rate": 9.601868406223647e-07, + "loss": 0.6589, + "step": 15320 + }, + { + "epoch": 8.738154613466333, + "grad_norm": 2.253478405026816, + "learning_rate": 9.516955816674311e-07, + "loss": 0.6512, + "step": 15330 + }, + { + "epoch": 8.743854649091556, + "grad_norm": 2.253371950211027, + "learning_rate": 9.432401589666984e-07, + "loss": 0.6547, + "step": 15340 + }, + { + "epoch": 8.749554684716779, + "grad_norm": 2.2348557907031155, + "learning_rate": 9.348206060082799e-07, + "loss": 0.6517, + "step": 15350 + }, + { + "epoch": 8.755254720342002, + "grad_norm": 2.2444613696453306, + "learning_rate": 9.264369561382336e-07, + "loss": 0.6548, + "step": 15360 + }, + { + "epoch": 8.760954755967225, + "grad_norm": 2.337184508845707, + "learning_rate": 9.180892425604149e-07, + "loss": 0.6598, + "step": 15370 + }, + { + "epoch": 8.766654791592448, + "grad_norm": 2.280658542347082, + "learning_rate": 9.097774983363527e-07, + "loss": 0.6623, + "step": 15380 + }, + { + "epoch": 8.77235482721767, + "grad_norm": 2.3828437996657983, + "learning_rate": 9.01501756385117e-07, + "loss": 0.6563, + "step": 15390 + }, + { + "epoch": 
8.778054862842893, + "grad_norm": 2.349125904984566, + "learning_rate": 8.932620494831945e-07, + "loss": 0.6652, + "step": 15400 + }, + { + "epoch": 8.783754898468116, + "grad_norm": 2.347074312107561, + "learning_rate": 8.850584102643478e-07, + "loss": 0.6536, + "step": 15410 + }, + { + "epoch": 8.789454934093339, + "grad_norm": 2.4067679774208464, + "learning_rate": 8.768908712194913e-07, + "loss": 0.6548, + "step": 15420 + }, + { + "epoch": 8.79515496971856, + "grad_norm": 2.3226632236683464, + "learning_rate": 8.687594646965669e-07, + "loss": 0.6535, + "step": 15430 + }, + { + "epoch": 8.800855005343783, + "grad_norm": 2.315292189470015, + "learning_rate": 8.606642229004059e-07, + "loss": 0.6576, + "step": 15440 + }, + { + "epoch": 8.806555040969005, + "grad_norm": 2.3198254368211724, + "learning_rate": 8.526051778926181e-07, + "loss": 0.6542, + "step": 15450 + }, + { + "epoch": 8.812255076594228, + "grad_norm": 2.2794027212990184, + "learning_rate": 8.445823615914405e-07, + "loss": 0.6521, + "step": 15460 + }, + { + "epoch": 8.817955112219451, + "grad_norm": 2.3108872183589018, + "learning_rate": 8.365958057716339e-07, + "loss": 0.6491, + "step": 15470 + }, + { + "epoch": 8.823655147844674, + "grad_norm": 2.3739240284864165, + "learning_rate": 8.286455420643424e-07, + "loss": 0.6709, + "step": 15480 + }, + { + "epoch": 8.829355183469897, + "grad_norm": 2.2853027691895034, + "learning_rate": 8.207316019569811e-07, + "loss": 0.6671, + "step": 15490 + }, + { + "epoch": 8.83505521909512, + "grad_norm": 2.255831984475305, + "learning_rate": 8.128540167930942e-07, + "loss": 0.6555, + "step": 15500 + }, + { + "epoch": 8.840755254720342, + "grad_norm": 2.3936588178116907, + "learning_rate": 8.050128177722482e-07, + "loss": 0.6479, + "step": 15510 + }, + { + "epoch": 8.846455290345565, + "grad_norm": 2.4135299551433977, + "learning_rate": 7.972080359498946e-07, + "loss": 0.6602, + "step": 15520 + }, + { + "epoch": 8.852155325970788, + "grad_norm": 2.358391743128982, + "learning_rate": 7.894397022372535e-07, + "loss": 0.6522, + "step": 15530 + }, + { + "epoch": 8.85785536159601, + "grad_norm": 2.363252827528486, + "learning_rate": 7.817078474011974e-07, + "loss": 0.6524, + "step": 15540 + }, + { + "epoch": 8.863555397221234, + "grad_norm": 2.4081698427293876, + "learning_rate": 7.740125020641143e-07, + "loss": 0.6525, + "step": 15550 + }, + { + "epoch": 8.869255432846455, + "grad_norm": 2.3027191036174477, + "learning_rate": 7.663536967037977e-07, + "loss": 0.6563, + "step": 15560 + }, + { + "epoch": 8.874955468471677, + "grad_norm": 2.2800033610965613, + "learning_rate": 7.587314616533226e-07, + "loss": 0.6604, + "step": 15570 + }, + { + "epoch": 8.8806555040969, + "grad_norm": 2.2376325505940806, + "learning_rate": 7.511458271009254e-07, + "loss": 0.6509, + "step": 15580 + }, + { + "epoch": 8.886355539722123, + "grad_norm": 2.341968038078447, + "learning_rate": 7.435968230898838e-07, + "loss": 0.65, + "step": 15590 + }, + { + "epoch": 8.892055575347346, + "grad_norm": 2.2842053769053385, + "learning_rate": 7.360844795184007e-07, + "loss": 0.6594, + "step": 15600 + }, + { + "epoch": 8.897755610972569, + "grad_norm": 2.340094367909099, + "learning_rate": 7.286088261394797e-07, + "loss": 0.6592, + "step": 15610 + }, + { + "epoch": 8.903455646597791, + "grad_norm": 2.2678395556771136, + "learning_rate": 7.211698925608134e-07, + "loss": 0.6699, + "step": 15620 + }, + { + "epoch": 8.909155682223014, + "grad_norm": 2.3621110355578336, + "learning_rate": 7.137677082446659e-07, + "loss": 0.6596, + 
"step": 15630 + }, + { + "epoch": 8.914855717848237, + "grad_norm": 2.272216215850836, + "learning_rate": 7.064023025077516e-07, + "loss": 0.6435, + "step": 15640 + }, + { + "epoch": 8.92055575347346, + "grad_norm": 2.4488683011179617, + "learning_rate": 6.990737045211204e-07, + "loss": 0.6607, + "step": 15650 + }, + { + "epoch": 8.926255789098683, + "grad_norm": 2.308404823149232, + "learning_rate": 6.917819433100436e-07, + "loss": 0.6544, + "step": 15660 + }, + { + "epoch": 8.931955824723904, + "grad_norm": 2.421245275018334, + "learning_rate": 6.845270477539034e-07, + "loss": 0.6585, + "step": 15670 + }, + { + "epoch": 8.937655860349127, + "grad_norm": 2.3489968102572534, + "learning_rate": 6.773090465860677e-07, + "loss": 0.654, + "step": 15680 + }, + { + "epoch": 8.94335589597435, + "grad_norm": 2.2797356549942593, + "learning_rate": 6.701279683937844e-07, + "loss": 0.6533, + "step": 15690 + }, + { + "epoch": 8.949055931599572, + "grad_norm": 2.3453879162469633, + "learning_rate": 6.629838416180679e-07, + "loss": 0.6567, + "step": 15700 + }, + { + "epoch": 8.954755967224795, + "grad_norm": 2.3713385483483496, + "learning_rate": 6.558766945535822e-07, + "loss": 0.6597, + "step": 15710 + }, + { + "epoch": 8.960456002850018, + "grad_norm": 2.2925429331052034, + "learning_rate": 6.488065553485334e-07, + "loss": 0.6563, + "step": 15720 + }, + { + "epoch": 8.96615603847524, + "grad_norm": 2.2600327088475236, + "learning_rate": 6.417734520045537e-07, + "loss": 0.6586, + "step": 15730 + }, + { + "epoch": 8.971856074100463, + "grad_norm": 2.300420354697253, + "learning_rate": 6.34777412376596e-07, + "loss": 0.6543, + "step": 15740 + }, + { + "epoch": 8.977556109725686, + "grad_norm": 2.30692835763649, + "learning_rate": 6.278184641728169e-07, + "loss": 0.6444, + "step": 15750 + }, + { + "epoch": 8.983256145350909, + "grad_norm": 2.359181343602721, + "learning_rate": 6.208966349544754e-07, + "loss": 0.6622, + "step": 15760 + }, + { + "epoch": 8.988956180976132, + "grad_norm": 2.3613948749177043, + "learning_rate": 6.140119521358146e-07, + "loss": 0.646, + "step": 15770 + }, + { + "epoch": 8.994656216601353, + "grad_norm": 2.407625610224588, + "learning_rate": 6.071644429839585e-07, + "loss": 0.6581, + "step": 15780 + }, + { + "epoch": 9.000356252226576, + "grad_norm": 2.393181765255249, + "learning_rate": 6.003541346188036e-07, + "loss": 0.6535, + "step": 15790 + }, + { + "epoch": 9.006056287851798, + "grad_norm": 2.283188570163856, + "learning_rate": 5.935810540129128e-07, + "loss": 0.6461, + "step": 15800 + }, + { + "epoch": 9.011756323477021, + "grad_norm": 2.264655867396922, + "learning_rate": 5.868452279914039e-07, + "loss": 0.6532, + "step": 15810 + }, + { + "epoch": 9.017456359102244, + "grad_norm": 2.3281868349695345, + "learning_rate": 5.801466832318458e-07, + "loss": 0.6554, + "step": 15820 + }, + { + "epoch": 9.023156394727467, + "grad_norm": 2.375263134251282, + "learning_rate": 5.734854462641548e-07, + "loss": 0.649, + "step": 15830 + }, + { + "epoch": 9.02885643035269, + "grad_norm": 2.290766353741812, + "learning_rate": 5.66861543470486e-07, + "loss": 0.652, + "step": 15840 + }, + { + "epoch": 9.034556465977913, + "grad_norm": 2.3498529360472316, + "learning_rate": 5.602750010851332e-07, + "loss": 0.6498, + "step": 15850 + }, + { + "epoch": 9.040256501603135, + "grad_norm": 2.2284167118477938, + "learning_rate": 5.537258451944206e-07, + "loss": 0.6462, + "step": 15860 + }, + { + "epoch": 9.045956537228358, + "grad_norm": 2.2655672578940695, + "learning_rate": 
5.472141017366029e-07, + "loss": 0.6529, + "step": 15870 + }, + { + "epoch": 9.051656572853581, + "grad_norm": 2.249704353631801, + "learning_rate": 5.407397965017569e-07, + "loss": 0.6579, + "step": 15880 + }, + { + "epoch": 9.057356608478804, + "grad_norm": 2.299461894717784, + "learning_rate": 5.343029551316892e-07, + "loss": 0.6475, + "step": 15890 + }, + { + "epoch": 9.063056644104025, + "grad_norm": 2.2344281001437616, + "learning_rate": 5.27903603119827e-07, + "loss": 0.6593, + "step": 15900 + }, + { + "epoch": 9.068756679729248, + "grad_norm": 2.326794202315961, + "learning_rate": 5.215417658111166e-07, + "loss": 0.6513, + "step": 15910 + }, + { + "epoch": 9.07445671535447, + "grad_norm": 2.2784333346869, + "learning_rate": 5.152174684019285e-07, + "loss": 0.6504, + "step": 15920 + }, + { + "epoch": 9.080156750979693, + "grad_norm": 2.3541870558342093, + "learning_rate": 5.089307359399498e-07, + "loss": 0.6496, + "step": 15930 + }, + { + "epoch": 9.085856786604916, + "grad_norm": 2.3054372491712023, + "learning_rate": 5.02681593324098e-07, + "loss": 0.6581, + "step": 15940 + }, + { + "epoch": 9.091556822230139, + "grad_norm": 2.3799751500353894, + "learning_rate": 4.964700653044086e-07, + "loss": 0.65, + "step": 15950 + }, + { + "epoch": 9.097256857855362, + "grad_norm": 2.215324072504103, + "learning_rate": 4.902961764819414e-07, + "loss": 0.651, + "step": 15960 + }, + { + "epoch": 9.102956893480584, + "grad_norm": 2.2845100469483928, + "learning_rate": 4.84159951308687e-07, + "loss": 0.6549, + "step": 15970 + }, + { + "epoch": 9.108656929105807, + "grad_norm": 2.3153851871079962, + "learning_rate": 4.780614140874685e-07, + "loss": 0.6602, + "step": 15980 + }, + { + "epoch": 9.11435696473103, + "grad_norm": 2.3349945480874483, + "learning_rate": 4.720005889718393e-07, + "loss": 0.654, + "step": 15990 + }, + { + "epoch": 9.120057000356253, + "grad_norm": 2.2495628939704653, + "learning_rate": 4.6597749996599716e-07, + "loss": 0.6563, + "step": 16000 + }, + { + "epoch": 9.125757035981476, + "grad_norm": 2.2943503815780217, + "learning_rate": 4.5999217092468127e-07, + "loss": 0.6552, + "step": 16010 + }, + { + "epoch": 9.131457071606697, + "grad_norm": 2.3123395747706774, + "learning_rate": 4.540446255530806e-07, + "loss": 0.6536, + "step": 16020 + }, + { + "epoch": 9.13715710723192, + "grad_norm": 2.356164507527656, + "learning_rate": 4.481348874067426e-07, + "loss": 0.6496, + "step": 16030 + }, + { + "epoch": 9.142857142857142, + "grad_norm": 2.315756183759805, + "learning_rate": 4.422629798914757e-07, + "loss": 0.6557, + "step": 16040 + }, + { + "epoch": 9.148557178482365, + "grad_norm": 2.412495445910888, + "learning_rate": 4.3642892626325595e-07, + "loss": 0.6613, + "step": 16050 + }, + { + "epoch": 9.154257214107588, + "grad_norm": 2.277413770697624, + "learning_rate": 4.3063274962813926e-07, + "loss": 0.6552, + "step": 16060 + }, + { + "epoch": 9.15995724973281, + "grad_norm": 2.3240365339718316, + "learning_rate": 4.2487447294217056e-07, + "loss": 0.6644, + "step": 16070 + }, + { + "epoch": 9.165657285358034, + "grad_norm": 2.1543175867284448, + "learning_rate": 4.1915411901128577e-07, + "loss": 0.6597, + "step": 16080 + }, + { + "epoch": 9.171357320983256, + "grad_norm": 2.3289956989255236, + "learning_rate": 4.1347171049122894e-07, + "loss": 0.6537, + "step": 16090 + }, + { + "epoch": 9.17705735660848, + "grad_norm": 2.441818257544798, + "learning_rate": 4.0782726988745634e-07, + "loss": 0.6544, + "step": 16100 + }, + { + "epoch": 9.182757392233702, + "grad_norm": 
2.1992411470753814, + "learning_rate": 4.0222081955505367e-07, + "loss": 0.6612, + "step": 16110 + }, + { + "epoch": 9.188457427858925, + "grad_norm": 2.260053918486078, + "learning_rate": 3.966523816986434e-07, + "loss": 0.6628, + "step": 16120 + }, + { + "epoch": 9.194157463484148, + "grad_norm": 2.3191018874769624, + "learning_rate": 3.911219783722953e-07, + "loss": 0.6578, + "step": 16130 + }, + { + "epoch": 9.199857499109369, + "grad_norm": 2.190947251083796, + "learning_rate": 3.85629631479445e-07, + "loss": 0.6494, + "step": 16140 + }, + { + "epoch": 9.205557534734591, + "grad_norm": 2.3462795173662854, + "learning_rate": 3.801753627728011e-07, + "loss": 0.6538, + "step": 16150 + }, + { + "epoch": 9.211257570359814, + "grad_norm": 2.3600995397447497, + "learning_rate": 3.7475919385426384e-07, + "loss": 0.6484, + "step": 16160 + }, + { + "epoch": 9.216957605985037, + "grad_norm": 2.319158550292185, + "learning_rate": 3.6938114617483646e-07, + "loss": 0.655, + "step": 16170 + }, + { + "epoch": 9.22265764161026, + "grad_norm": 2.3069140811983413, + "learning_rate": 3.6404124103453954e-07, + "loss": 0.6551, + "step": 16180 + }, + { + "epoch": 9.228357677235483, + "grad_norm": 2.335764090264895, + "learning_rate": 3.587394995823301e-07, + "loss": 0.6588, + "step": 16190 + }, + { + "epoch": 9.234057712860706, + "grad_norm": 2.3001727472033227, + "learning_rate": 3.5347594281601837e-07, + "loss": 0.6569, + "step": 16200 + }, + { + "epoch": 9.239757748485928, + "grad_norm": 2.3662929871693272, + "learning_rate": 3.482505915821766e-07, + "loss": 0.6579, + "step": 16210 + }, + { + "epoch": 9.245457784111151, + "grad_norm": 2.2676447187387345, + "learning_rate": 3.430634665760668e-07, + "loss": 0.65, + "step": 16220 + }, + { + "epoch": 9.251157819736374, + "grad_norm": 2.2594944840854008, + "learning_rate": 3.379145883415502e-07, + "loss": 0.6534, + "step": 16230 + }, + { + "epoch": 9.256857855361597, + "grad_norm": 2.2559662121895627, + "learning_rate": 3.328039772710123e-07, + "loss": 0.6572, + "step": 16240 + }, + { + "epoch": 9.262557890986818, + "grad_norm": 2.2928810036817895, + "learning_rate": 3.277316536052821e-07, + "loss": 0.6572, + "step": 16250 + }, + { + "epoch": 9.26825792661204, + "grad_norm": 2.298962085856491, + "learning_rate": 3.2269763743354445e-07, + "loss": 0.6466, + "step": 16260 + }, + { + "epoch": 9.273957962237263, + "grad_norm": 2.3393731734236938, + "learning_rate": 3.1770194869326864e-07, + "loss": 0.6632, + "step": 16270 + }, + { + "epoch": 9.279657997862486, + "grad_norm": 2.389534898394802, + "learning_rate": 3.1274460717012346e-07, + "loss": 0.6521, + "step": 16280 + }, + { + "epoch": 9.285358033487709, + "grad_norm": 2.319294326677638, + "learning_rate": 3.0782563249790567e-07, + "loss": 0.6517, + "step": 16290 + }, + { + "epoch": 9.291058069112932, + "grad_norm": 2.4153981864437184, + "learning_rate": 3.0294504415845585e-07, + "loss": 0.6584, + "step": 16300 + }, + { + "epoch": 9.296758104738155, + "grad_norm": 2.3825627392396673, + "learning_rate": 2.98102861481584e-07, + "loss": 0.6456, + "step": 16310 + }, + { + "epoch": 9.302458140363377, + "grad_norm": 2.3879172528078825, + "learning_rate": 2.932991036449917e-07, + "loss": 0.6613, + "step": 16320 + }, + { + "epoch": 9.3081581759886, + "grad_norm": 2.346463251210415, + "learning_rate": 2.8853378967419686e-07, + "loss": 0.655, + "step": 16330 + }, + { + "epoch": 9.313858211613823, + "grad_norm": 2.2927763648573016, + "learning_rate": 2.8380693844246355e-07, + "loss": 0.6502, + "step": 16340 + }, + { + 
"epoch": 9.319558247239046, + "grad_norm": 2.3550355480798304, + "learning_rate": 2.7911856867071427e-07, + "loss": 0.6409, + "step": 16350 + }, + { + "epoch": 9.325258282864269, + "grad_norm": 2.234142791432648, + "learning_rate": 2.744686989274692e-07, + "loss": 0.6592, + "step": 16360 + }, + { + "epoch": 9.33095831848949, + "grad_norm": 2.2655908560680627, + "learning_rate": 2.698573476287658e-07, + "loss": 0.6581, + "step": 16370 + }, + { + "epoch": 9.336658354114713, + "grad_norm": 2.304446959403882, + "learning_rate": 2.652845330380882e-07, + "loss": 0.6515, + "step": 16380 + }, + { + "epoch": 9.342358389739935, + "grad_norm": 2.2890950540257777, + "learning_rate": 2.6075027326629253e-07, + "loss": 0.639, + "step": 16390 + }, + { + "epoch": 9.348058425365158, + "grad_norm": 2.2796528945861656, + "learning_rate": 2.562545862715382e-07, + "loss": 0.6417, + "step": 16400 + }, + { + "epoch": 9.353758460990381, + "grad_norm": 2.456419871618796, + "learning_rate": 2.517974898592124e-07, + "loss": 0.6574, + "step": 16410 + }, + { + "epoch": 9.359458496615604, + "grad_norm": 2.297770693420203, + "learning_rate": 2.4737900168186667e-07, + "loss": 0.6549, + "step": 16420 + }, + { + "epoch": 9.365158532240827, + "grad_norm": 2.363808835548694, + "learning_rate": 2.429991392391395e-07, + "loss": 0.6415, + "step": 16430 + }, + { + "epoch": 9.37085856786605, + "grad_norm": 2.2713206751436865, + "learning_rate": 2.386579198776917e-07, + "loss": 0.652, + "step": 16440 + }, + { + "epoch": 9.376558603491272, + "grad_norm": 2.303657933588446, + "learning_rate": 2.343553607911353e-07, + "loss": 0.6512, + "step": 16450 + }, + { + "epoch": 9.382258639116495, + "grad_norm": 2.2112371877259402, + "learning_rate": 2.300914790199682e-07, + "loss": 0.6601, + "step": 16460 + }, + { + "epoch": 9.387958674741718, + "grad_norm": 2.3239289332415716, + "learning_rate": 2.2586629145150195e-07, + "loss": 0.6557, + "step": 16470 + }, + { + "epoch": 9.393658710366939, + "grad_norm": 2.262292683671461, + "learning_rate": 2.2167981481980073e-07, + "loss": 0.6476, + "step": 16480 + }, + { + "epoch": 9.399358745992162, + "grad_norm": 2.347838181209718, + "learning_rate": 2.1753206570561015e-07, + "loss": 0.6503, + "step": 16490 + }, + { + "epoch": 9.405058781617385, + "grad_norm": 2.285652751528127, + "learning_rate": 2.1342306053629414e-07, + "loss": 0.6379, + "step": 16500 + }, + { + "epoch": 9.410758817242607, + "grad_norm": 2.232250790375402, + "learning_rate": 2.0935281558577048e-07, + "loss": 0.659, + "step": 16510 + }, + { + "epoch": 9.41645885286783, + "grad_norm": 2.3688051039592795, + "learning_rate": 2.0532134697444417e-07, + "loss": 0.6543, + "step": 16520 + }, + { + "epoch": 9.422158888493053, + "grad_norm": 2.2868901408436613, + "learning_rate": 2.0132867066914418e-07, + "loss": 0.6632, + "step": 16530 + }, + { + "epoch": 9.427858924118276, + "grad_norm": 2.3010337804794125, + "learning_rate": 1.9737480248306128e-07, + "loss": 0.657, + "step": 16540 + }, + { + "epoch": 9.433558959743499, + "grad_norm": 2.268005007344826, + "learning_rate": 1.9345975807568473e-07, + "loss": 0.6472, + "step": 16550 + }, + { + "epoch": 9.439258995368721, + "grad_norm": 2.259506030712857, + "learning_rate": 1.8958355295274012e-07, + "loss": 0.6545, + "step": 16560 + }, + { + "epoch": 9.444959030993944, + "grad_norm": 2.3732707468529233, + "learning_rate": 1.857462024661294e-07, + "loss": 0.6336, + "step": 16570 + }, + { + "epoch": 9.450659066619167, + "grad_norm": 2.2545076985099515, + "learning_rate": 1.8194772181386655e-07, + 
"loss": 0.6443, + "step": 16580 + }, + { + "epoch": 9.45635910224439, + "grad_norm": 2.2871428515450685, + "learning_rate": 1.781881260400209e-07, + "loss": 0.6504, + "step": 16590 + }, + { + "epoch": 9.46205913786961, + "grad_norm": 2.2933586803868686, + "learning_rate": 1.7446743003465606e-07, + "loss": 0.6561, + "step": 16600 + }, + { + "epoch": 9.467759173494834, + "grad_norm": 2.3486885562113704, + "learning_rate": 1.707856485337722e-07, + "loss": 0.6542, + "step": 16610 + }, + { + "epoch": 9.473459209120056, + "grad_norm": 2.3141845660170546, + "learning_rate": 1.6714279611924512e-07, + "loss": 0.6548, + "step": 16620 + }, + { + "epoch": 9.47915924474528, + "grad_norm": 2.3378979544356175, + "learning_rate": 1.6353888721877154e-07, + "loss": 0.6549, + "step": 16630 + }, + { + "epoch": 9.484859280370502, + "grad_norm": 2.3555369385409595, + "learning_rate": 1.5997393610580837e-07, + "loss": 0.6508, + "step": 16640 + }, + { + "epoch": 9.490559315995725, + "grad_norm": 2.3050119556793534, + "learning_rate": 1.564479568995203e-07, + "loss": 0.6548, + "step": 16650 + }, + { + "epoch": 9.496259351620948, + "grad_norm": 2.2004369929766594, + "learning_rate": 1.5296096356472223e-07, + "loss": 0.6529, + "step": 16660 + }, + { + "epoch": 9.50195938724617, + "grad_norm": 2.337622517571759, + "learning_rate": 1.495129699118214e-07, + "loss": 0.6486, + "step": 16670 + }, + { + "epoch": 9.507659422871393, + "grad_norm": 2.2515692465257438, + "learning_rate": 1.461039895967653e-07, + "loss": 0.6591, + "step": 16680 + }, + { + "epoch": 9.513359458496616, + "grad_norm": 2.257958729425762, + "learning_rate": 1.4273403612099057e-07, + "loss": 0.6589, + "step": 16690 + }, + { + "epoch": 9.519059494121839, + "grad_norm": 2.2551941842725474, + "learning_rate": 1.3940312283136192e-07, + "loss": 0.6573, + "step": 16700 + }, + { + "epoch": 9.524759529747062, + "grad_norm": 2.2839034425055753, + "learning_rate": 1.3611126292012444e-07, + "loss": 0.6528, + "step": 16710 + }, + { + "epoch": 9.530459565372283, + "grad_norm": 2.2513087425795963, + "learning_rate": 1.3285846942485247e-07, + "loss": 0.642, + "step": 16720 + }, + { + "epoch": 9.536159600997506, + "grad_norm": 2.373914409653791, + "learning_rate": 1.2964475522839304e-07, + "loss": 0.6513, + "step": 16730 + }, + { + "epoch": 9.541859636622728, + "grad_norm": 2.3653513175384746, + "learning_rate": 1.2647013305882138e-07, + "loss": 0.6521, + "step": 16740 + }, + { + "epoch": 9.547559672247951, + "grad_norm": 2.346784431083733, + "learning_rate": 1.2333461548938109e-07, + "loss": 0.6556, + "step": 16750 + }, + { + "epoch": 9.553259707873174, + "grad_norm": 2.2471608192504795, + "learning_rate": 1.2023821493844623e-07, + "loss": 0.6442, + "step": 16760 + }, + { + "epoch": 9.558959743498397, + "grad_norm": 2.41300169259645, + "learning_rate": 1.1718094366946264e-07, + "loss": 0.6393, + "step": 16770 + }, + { + "epoch": 9.56465977912362, + "grad_norm": 2.228473749196598, + "learning_rate": 1.1416281379090343e-07, + "loss": 0.65, + "step": 16780 + }, + { + "epoch": 9.570359814748842, + "grad_norm": 2.2588046626210154, + "learning_rate": 1.1118383725622018e-07, + "loss": 0.6435, + "step": 16790 + }, + { + "epoch": 9.576059850374065, + "grad_norm": 2.272713399884446, + "learning_rate": 1.0824402586379512e-07, + "loss": 0.6551, + "step": 16800 + }, + { + "epoch": 9.581759885999288, + "grad_norm": 2.3364321112703874, + "learning_rate": 1.0534339125689686e-07, + "loss": 0.6567, + "step": 16810 + }, + { + "epoch": 9.58745992162451, + "grad_norm": 2.40037730077152, 
+ "learning_rate": 1.0248194492363028e-07, + "loss": 0.6511, + "step": 16820 + }, + { + "epoch": 9.593159957249732, + "grad_norm": 2.4025249286013217, + "learning_rate": 9.965969819689558e-08, + "loss": 0.656, + "step": 16830 + }, + { + "epoch": 9.598859992874955, + "grad_norm": 2.247494854846253, + "learning_rate": 9.687666225433823e-08, + "loss": 0.6529, + "step": 16840 + }, + { + "epoch": 9.604560028500178, + "grad_norm": 2.255657436136958, + "learning_rate": 9.413284811830903e-08, + "loss": 0.652, + "step": 16850 + }, + { + "epoch": 9.6102600641254, + "grad_norm": 2.264893980259995, + "learning_rate": 9.142826665581972e-08, + "loss": 0.6439, + "step": 16860 + }, + { + "epoch": 9.615960099750623, + "grad_norm": 2.266295402923885, + "learning_rate": 8.876292857849633e-08, + "loss": 0.6484, + "step": 16870 + }, + { + "epoch": 9.621660135375846, + "grad_norm": 2.243963802741696, + "learning_rate": 8.613684444254256e-08, + "loss": 0.6562, + "step": 16880 + }, + { + "epoch": 9.627360171001069, + "grad_norm": 2.2887376758264057, + "learning_rate": 8.35500246486931e-08, + "loss": 0.6523, + "step": 16890 + }, + { + "epoch": 9.633060206626292, + "grad_norm": 2.3370474229044733, + "learning_rate": 8.100247944217488e-08, + "loss": 0.6578, + "step": 16900 + }, + { + "epoch": 9.638760242251514, + "grad_norm": 2.328501011399279, + "learning_rate": 7.849421891266585e-08, + "loss": 0.6588, + "step": 16910 + }, + { + "epoch": 9.644460277876737, + "grad_norm": 2.3212148370181094, + "learning_rate": 7.602525299425623e-08, + "loss": 0.6483, + "step": 16920 + }, + { + "epoch": 9.65016031350196, + "grad_norm": 2.33295833370495, + "learning_rate": 7.359559146540518e-08, + "loss": 0.6587, + "step": 16930 + }, + { + "epoch": 9.655860349127183, + "grad_norm": 2.4077865683158968, + "learning_rate": 7.120524394890748e-08, + "loss": 0.6544, + "step": 16940 + }, + { + "epoch": 9.661560384752406, + "grad_norm": 2.356327360124305, + "learning_rate": 6.885421991185027e-08, + "loss": 0.6523, + "step": 16950 + }, + { + "epoch": 9.667260420377627, + "grad_norm": 2.407074508590822, + "learning_rate": 6.654252866558186e-08, + "loss": 0.6407, + "step": 16960 + }, + { + "epoch": 9.67296045600285, + "grad_norm": 2.3457387427111227, + "learning_rate": 6.427017936566859e-08, + "loss": 0.6548, + "step": 16970 + }, + { + "epoch": 9.678660491628072, + "grad_norm": 2.265353341929187, + "learning_rate": 6.203718101186141e-08, + "loss": 0.6543, + "step": 16980 + }, + { + "epoch": 9.684360527253295, + "grad_norm": 2.3296743025735216, + "learning_rate": 5.984354244805924e-08, + "loss": 0.658, + "step": 16990 + }, + { + "epoch": 9.690060562878518, + "grad_norm": 2.299627411976298, + "learning_rate": 5.768927236227684e-08, + "loss": 0.6614, + "step": 17000 + }, + { + "epoch": 9.69576059850374, + "grad_norm": 2.3220359703166666, + "learning_rate": 5.5574379286604805e-08, + "loss": 0.6502, + "step": 17010 + }, + { + "epoch": 9.701460634128964, + "grad_norm": 2.2422049453185373, + "learning_rate": 5.349887159718181e-08, + "loss": 0.6548, + "step": 17020 + }, + { + "epoch": 9.707160669754186, + "grad_norm": 2.364441388396548, + "learning_rate": 5.146275751415908e-08, + "loss": 0.6551, + "step": 17030 + }, + { + "epoch": 9.71286070537941, + "grad_norm": 2.3231425171152202, + "learning_rate": 4.9466045101664864e-08, + "loss": 0.6532, + "step": 17040 + }, + { + "epoch": 9.718560741004632, + "grad_norm": 2.457726448339882, + "learning_rate": 4.750874226777891e-08, + "loss": 0.6569, + "step": 17050 + }, + { + "epoch": 9.724260776629855, + 
"grad_norm": 2.352927243366894, + "learning_rate": 4.5590856764492486e-08, + "loss": 0.6481, + "step": 17060 + }, + { + "epoch": 9.729960812255076, + "grad_norm": 2.3064337676695805, + "learning_rate": 4.37123961876873e-08, + "loss": 0.6484, + "step": 17070 + }, + { + "epoch": 9.735660847880299, + "grad_norm": 2.2681358596665904, + "learning_rate": 4.187336797709884e-08, + "loss": 0.6547, + "step": 17080 + }, + { + "epoch": 9.741360883505521, + "grad_norm": 2.3506977886549754, + "learning_rate": 4.007377941628754e-08, + "loss": 0.6528, + "step": 17090 + }, + { + "epoch": 9.747060919130744, + "grad_norm": 2.3589241159878775, + "learning_rate": 3.8313637632613196e-08, + "loss": 0.6545, + "step": 17100 + }, + { + "epoch": 9.752760954755967, + "grad_norm": 2.3427788634759934, + "learning_rate": 3.659294959720283e-08, + "loss": 0.6599, + "step": 17110 + }, + { + "epoch": 9.75846099038119, + "grad_norm": 2.3522287518440455, + "learning_rate": 3.491172212492733e-08, + "loss": 0.6463, + "step": 17120 + }, + { + "epoch": 9.764161026006413, + "grad_norm": 2.351901096283413, + "learning_rate": 3.326996187436926e-08, + "loss": 0.6538, + "step": 17130 + }, + { + "epoch": 9.769861061631635, + "grad_norm": 2.2918549555739154, + "learning_rate": 3.1667675347801795e-08, + "loss": 0.6571, + "step": 17140 + }, + { + "epoch": 9.775561097256858, + "grad_norm": 2.377678545336172, + "learning_rate": 3.0104868891159825e-08, + "loss": 0.6555, + "step": 17150 + }, + { + "epoch": 9.781261132882081, + "grad_norm": 2.2424384418651893, + "learning_rate": 2.8581548694013304e-08, + "loss": 0.6623, + "step": 17160 + }, + { + "epoch": 9.786961168507304, + "grad_norm": 2.2658459936780235, + "learning_rate": 2.709772078954842e-08, + "loss": 0.6466, + "step": 17170 + }, + { + "epoch": 9.792661204132525, + "grad_norm": 2.2725019855816937, + "learning_rate": 2.565339105453757e-08, + "loss": 0.653, + "step": 17180 + }, + { + "epoch": 9.798361239757748, + "grad_norm": 2.3682684509359877, + "learning_rate": 2.4248565209320507e-08, + "loss": 0.651, + "step": 17190 + }, + { + "epoch": 9.80406127538297, + "grad_norm": 2.2230547848834714, + "learning_rate": 2.2883248817777703e-08, + "loss": 0.6567, + "step": 17200 + }, + { + "epoch": 9.809761311008193, + "grad_norm": 2.4480084011558034, + "learning_rate": 2.1557447287312572e-08, + "loss": 0.6516, + "step": 17210 + }, + { + "epoch": 9.815461346633416, + "grad_norm": 2.3857177484205594, + "learning_rate": 2.0271165868828157e-08, + "loss": 0.6609, + "step": 17220 + }, + { + "epoch": 9.821161382258639, + "grad_norm": 2.294663835248534, + "learning_rate": 1.9024409656703824e-08, + "loss": 0.6652, + "step": 17230 + }, + { + "epoch": 9.826861417883862, + "grad_norm": 2.252195410951251, + "learning_rate": 1.7817183588778596e-08, + "loss": 0.6531, + "step": 17240 + }, + { + "epoch": 9.832561453509085, + "grad_norm": 2.3618394206239466, + "learning_rate": 1.6649492446332292e-08, + "loss": 0.6479, + "step": 17250 + }, + { + "epoch": 9.838261489134307, + "grad_norm": 2.2660526958358624, + "learning_rate": 1.5521340854061097e-08, + "loss": 0.654, + "step": 17260 + }, + { + "epoch": 9.84396152475953, + "grad_norm": 2.5207873951096644, + "learning_rate": 1.4432733280065336e-08, + "loss": 0.6469, + "step": 17270 + }, + { + "epoch": 9.849661560384753, + "grad_norm": 2.383762606625419, + "learning_rate": 1.338367403583063e-08, + "loss": 0.6663, + "step": 17280 + }, + { + "epoch": 9.855361596009976, + "grad_norm": 2.374411011614862, + "learning_rate": 1.2374167276205663e-08, + "loss": 0.6425, + "step": 
17290 + }, + { + "epoch": 9.861061631635197, + "grad_norm": 2.2493191496711678, + "learning_rate": 1.1404216999391093e-08, + "loss": 0.6497, + "step": 17300 + }, + { + "epoch": 9.86676166726042, + "grad_norm": 2.3796088769072177, + "learning_rate": 1.0473827046925122e-08, + "loss": 0.6605, + "step": 17310 + }, + { + "epoch": 9.872461702885643, + "grad_norm": 2.2180769836324776, + "learning_rate": 9.583001103661283e-09, + "loss": 0.6522, + "step": 17320 + }, + { + "epoch": 9.878161738510865, + "grad_norm": 2.3072964999263403, + "learning_rate": 8.731742697758449e-09, + "loss": 0.6512, + "step": 17330 + }, + { + "epoch": 9.883861774136088, + "grad_norm": 2.283779982842626, + "learning_rate": 7.92005520066974e-09, + "loss": 0.6636, + "step": 17340 + }, + { + "epoch": 9.889561809761311, + "grad_norm": 2.297326593288268, + "learning_rate": 7.147941827121419e-09, + "loss": 0.6404, + "step": 17350 + }, + { + "epoch": 9.895261845386534, + "grad_norm": 2.3893193554640257, + "learning_rate": 6.415405635107341e-09, + "loss": 0.6488, + "step": 17360 + }, + { + "epoch": 9.900961881011757, + "grad_norm": 2.3494192289938107, + "learning_rate": 5.722449525873419e-09, + "loss": 0.6595, + "step": 17370 + }, + { + "epoch": 9.90666191663698, + "grad_norm": 2.2589001422532387, + "learning_rate": 5.069076243905402e-09, + "loss": 0.6578, + "step": 17380 + }, + { + "epoch": 9.912361952262202, + "grad_norm": 2.2649210783000293, + "learning_rate": 4.455288376921108e-09, + "loss": 0.6498, + "step": 17390 + }, + { + "epoch": 9.918061987887425, + "grad_norm": 2.4144073651514524, + "learning_rate": 3.881088355855989e-09, + "loss": 0.6547, + "step": 17400 + }, + { + "epoch": 9.923762023512648, + "grad_norm": 2.278865809364427, + "learning_rate": 3.346478454859803e-09, + "loss": 0.651, + "step": 17410 + }, + { + "epoch": 9.929462059137869, + "grad_norm": 2.3417238771811055, + "learning_rate": 2.8514607912799586e-09, + "loss": 0.6503, + "step": 17420 + }, + { + "epoch": 9.935162094763092, + "grad_norm": 2.331772674843219, + "learning_rate": 2.3960373256581846e-09, + "loss": 0.6525, + "step": 17430 + }, + { + "epoch": 9.940862130388314, + "grad_norm": 2.2924527253475575, + "learning_rate": 1.9802098617216494e-09, + "loss": 0.6562, + "step": 17440 + }, + { + "epoch": 9.946562166013537, + "grad_norm": 2.320131850003266, + "learning_rate": 1.6039800463762966e-09, + "loss": 0.6554, + "step": 17450 + }, + { + "epoch": 9.95226220163876, + "grad_norm": 2.258058768319489, + "learning_rate": 1.2673493696979677e-09, + "loss": 0.6471, + "step": 17460 + }, + { + "epoch": 9.957962237263983, + "grad_norm": 2.376801772568866, + "learning_rate": 9.703191649279574e-10, + "loss": 0.6512, + "step": 17470 + }, + { + "epoch": 9.963662272889206, + "grad_norm": 2.321621141337843, + "learning_rate": 7.128906084707954e-10, + "loss": 0.6526, + "step": 17480 + }, + { + "epoch": 9.969362308514429, + "grad_norm": 2.354903966389974, + "learning_rate": 4.950647198842529e-10, + "loss": 0.6455, + "step": 17490 + }, + { + "epoch": 9.975062344139651, + "grad_norm": 2.261998747744868, + "learning_rate": 3.1684236187823345e-10, + "loss": 0.645, + "step": 17500 + }, + { + "epoch": 9.980762379764874, + "grad_norm": 2.2877903683587695, + "learning_rate": 1.7822424031144203e-10, + "loss": 0.6454, + "step": 17510 + }, + { + "epoch": 9.986462415390097, + "grad_norm": 2.3510418373932804, + "learning_rate": 7.921090418805399e-11, + "loss": 0.6525, + "step": 17520 + }, + { + "epoch": 9.992162451015318, + "grad_norm": 2.4334827508597705, + "learning_rate": 
1.980274565438478e-11, + "loss": 0.6616, + "step": 17530 + }, + { + "epoch": 9.99786248664054, + "grad_norm": 2.416970594856183, + "learning_rate": 0.0, + "loss": 0.6481, + "step": 17540 + }, + { + "epoch": 9.99786248664054, + "step": 17540, + "total_flos": 2.4261309001996698e+17, + "train_loss": 0.7485582114081563, + "train_runtime": 249180.9208, + "train_samples_per_second": 27.035, + "train_steps_per_second": 0.07 + } + ], + "logging_steps": 10, + "max_steps": 17540, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4261309001996698e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}