{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22326645805206993, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011163322902603496, "grad_norm": 0.12030283361673355, "learning_rate": 4.9975e-05, "loss": 2.5157, "step": 1 }, { "epoch": 0.00022326645805206992, "grad_norm": 0.10777310281991959, "learning_rate": 4.995e-05, "loss": 2.5461, "step": 2 }, { "epoch": 0.0003348996870781049, "grad_norm": 0.1150362566113472, "learning_rate": 4.992500000000001e-05, "loss": 2.463, "step": 3 }, { "epoch": 0.00044653291610413984, "grad_norm": 0.19681179523468018, "learning_rate": 4.99e-05, "loss": 2.6149, "step": 4 }, { "epoch": 0.0005581661451301748, "grad_norm": 0.13096819818019867, "learning_rate": 4.9875000000000006e-05, "loss": 2.4514, "step": 5 }, { "epoch": 0.0006697993741562098, "grad_norm": 0.13657382130622864, "learning_rate": 4.9850000000000006e-05, "loss": 2.6006, "step": 6 }, { "epoch": 0.0007814326031822447, "grad_norm": 0.12205895781517029, "learning_rate": 4.9825000000000005e-05, "loss": 2.4963, "step": 7 }, { "epoch": 0.0008930658322082797, "grad_norm": 0.12875059247016907, "learning_rate": 4.9800000000000004e-05, "loss": 2.4755, "step": 8 }, { "epoch": 0.0010046990612343147, "grad_norm": 0.13755826652050018, "learning_rate": 4.9775000000000004e-05, "loss": 2.4421, "step": 9 }, { "epoch": 0.0011163322902603497, "grad_norm": 0.14838671684265137, "learning_rate": 4.975e-05, "loss": 2.5355, "step": 10 }, { "epoch": 0.0012279655192863846, "grad_norm": 0.20818187296390533, "learning_rate": 4.9725e-05, "loss": 2.4286, "step": 11 }, { "epoch": 0.0013395987483124196, "grad_norm": 0.16847440600395203, "learning_rate": 4.97e-05, "loss": 2.4903, "step": 12 }, { "epoch": 0.0014512319773384544, "grad_norm": 0.16514980792999268, "learning_rate": 4.967500000000001e-05, "loss": 2.3927, "step": 13 }, { "epoch": 0.0015628652063644894, "grad_norm": 0.17463476955890656, "learning_rate": 4.965e-05, "loss": 2.4566, "step": 14 }, { "epoch": 0.0016744984353905244, "grad_norm": 0.1806865930557251, "learning_rate": 4.962500000000001e-05, "loss": 2.3876, "step": 15 }, { "epoch": 0.0017861316644165594, "grad_norm": 0.4340119957923889, "learning_rate": 4.96e-05, "loss": 2.5106, "step": 16 }, { "epoch": 0.0018977648934425943, "grad_norm": 0.18326279520988464, "learning_rate": 4.9575000000000006e-05, "loss": 2.4918, "step": 17 }, { "epoch": 0.0020093981224686293, "grad_norm": 0.1809333860874176, "learning_rate": 4.9550000000000005e-05, "loss": 2.478, "step": 18 }, { "epoch": 0.0021210313514946643, "grad_norm": 0.1895465850830078, "learning_rate": 4.9525000000000004e-05, "loss": 2.5024, "step": 19 }, { "epoch": 0.0022326645805206993, "grad_norm": 0.20786350965499878, "learning_rate": 4.9500000000000004e-05, "loss": 2.5115, "step": 20 }, { "epoch": 0.0023442978095467343, "grad_norm": 0.1992110311985016, "learning_rate": 4.9475e-05, "loss": 2.4044, "step": 21 }, { "epoch": 0.0024559310385727693, "grad_norm": 0.20190922915935516, "learning_rate": 4.945e-05, "loss": 2.4079, "step": 22 }, { "epoch": 0.0025675642675988043, "grad_norm": 0.22950319945812225, "learning_rate": 4.9425e-05, "loss": 2.5367, "step": 23 }, { "epoch": 0.0026791974966248393, "grad_norm": 0.2105780690908432, "learning_rate": 4.94e-05, "loss": 2.3135, "step": 24 }, { "epoch": 0.0027908307256508742, "grad_norm": 0.21351750195026398, "learning_rate": 4.937500000000001e-05, "loss": 2.4224, "step": 25 }, { "epoch": 0.002902463954676909, "grad_norm": 0.21121960878372192, "learning_rate": 4.935e-05, "loss": 2.4935, "step": 26 }, { "epoch": 0.003014097183702944, "grad_norm": 0.21738024055957794, "learning_rate": 4.9325000000000006e-05, "loss": 2.4383, "step": 27 }, { "epoch": 0.0031257304127289788, "grad_norm": 0.22073255479335785, "learning_rate": 4.93e-05, "loss": 2.4861, "step": 28 }, { "epoch": 0.0032373636417550138, "grad_norm": 0.2213854044675827, "learning_rate": 4.9275000000000005e-05, "loss": 2.4449, "step": 29 }, { "epoch": 0.0033489968707810487, "grad_norm": 0.22176030278205872, "learning_rate": 4.9250000000000004e-05, "loss": 2.3341, "step": 30 }, { "epoch": 0.0034606300998070837, "grad_norm": 0.22657762467861176, "learning_rate": 4.9225000000000004e-05, "loss": 2.4111, "step": 31 }, { "epoch": 0.0035722633288331187, "grad_norm": 0.23043128848075867, "learning_rate": 4.92e-05, "loss": 2.4449, "step": 32 }, { "epoch": 0.0036838965578591537, "grad_norm": 0.22782647609710693, "learning_rate": 4.9175e-05, "loss": 2.496, "step": 33 }, { "epoch": 0.0037955297868851887, "grad_norm": 0.2296725958585739, "learning_rate": 4.915e-05, "loss": 2.4957, "step": 34 }, { "epoch": 0.003907163015911224, "grad_norm": 0.23677076399326324, "learning_rate": 4.9125e-05, "loss": 2.4031, "step": 35 }, { "epoch": 0.004018796244937259, "grad_norm": 0.2547452747821808, "learning_rate": 4.91e-05, "loss": 2.4666, "step": 36 }, { "epoch": 0.004130429473963294, "grad_norm": 0.24586506187915802, "learning_rate": 4.907500000000001e-05, "loss": 2.4721, "step": 37 }, { "epoch": 0.004242062702989329, "grad_norm": 0.24286368489265442, "learning_rate": 4.905e-05, "loss": 2.4584, "step": 38 }, { "epoch": 0.004353695932015364, "grad_norm": 0.2877066135406494, "learning_rate": 4.9025000000000006e-05, "loss": 2.3817, "step": 39 }, { "epoch": 0.004465329161041399, "grad_norm": 0.36161181330680847, "learning_rate": 4.9e-05, "loss": 2.4246, "step": 40 }, { "epoch": 0.004576962390067434, "grad_norm": 0.23599842190742493, "learning_rate": 4.8975000000000005e-05, "loss": 2.4416, "step": 41 }, { "epoch": 0.004688595619093469, "grad_norm": 3.929527521133423, "learning_rate": 4.8950000000000004e-05, "loss": 2.536, "step": 42 }, { "epoch": 0.004800228848119504, "grad_norm": 0.25463879108428955, "learning_rate": 4.8925e-05, "loss": 2.3839, "step": 43 }, { "epoch": 0.004911862077145539, "grad_norm": 0.2564179599285126, "learning_rate": 4.89e-05, "loss": 2.5175, "step": 44 }, { "epoch": 0.0050234953061715736, "grad_norm": 0.2552028298377991, "learning_rate": 4.8875e-05, "loss": 2.4055, "step": 45 }, { "epoch": 0.0051351285351976085, "grad_norm": 0.27434229850769043, "learning_rate": 4.885e-05, "loss": 2.4139, "step": 46 }, { "epoch": 0.0052467617642236435, "grad_norm": 0.2661423981189728, "learning_rate": 4.8825e-05, "loss": 2.4711, "step": 47 }, { "epoch": 0.0053583949932496785, "grad_norm": 0.24605660140514374, "learning_rate": 4.88e-05, "loss": 2.3726, "step": 48 }, { "epoch": 0.0054700282222757135, "grad_norm": 0.27371543645858765, "learning_rate": 4.8775000000000007e-05, "loss": 2.4012, "step": 49 }, { "epoch": 0.0055816614513017485, "grad_norm": 0.26587924361228943, "learning_rate": 4.875e-05, "loss": 2.4409, "step": 50 }, { "epoch": 0.0056932946803277835, "grad_norm": 0.28210633993148804, "learning_rate": 4.8725000000000005e-05, "loss": 2.5431, "step": 51 }, { "epoch": 0.005804927909353818, "grad_norm": 0.2553481459617615, "learning_rate": 4.87e-05, "loss": 2.5122, "step": 52 }, { "epoch": 0.005916561138379853, "grad_norm": 0.2604880928993225, "learning_rate": 4.8675000000000004e-05, "loss": 2.4656, "step": 53 }, { "epoch": 0.006028194367405888, "grad_norm": 0.266725093126297, "learning_rate": 4.8650000000000003e-05, "loss": 2.307, "step": 54 }, { "epoch": 0.0061398275964319226, "grad_norm": 5.911880016326904, "learning_rate": 4.8625e-05, "loss": 2.2753, "step": 55 }, { "epoch": 0.0062514608254579575, "grad_norm": 0.2631521224975586, "learning_rate": 4.86e-05, "loss": 2.4475, "step": 56 }, { "epoch": 0.0063630940544839925, "grad_norm": 0.2781185507774353, "learning_rate": 4.8575e-05, "loss": 2.2942, "step": 57 }, { "epoch": 0.0064747272835100275, "grad_norm": 0.27916616201400757, "learning_rate": 4.855e-05, "loss": 2.4817, "step": 58 }, { "epoch": 0.0065863605125360625, "grad_norm": 0.25466758012771606, "learning_rate": 4.8525e-05, "loss": 2.3986, "step": 59 }, { "epoch": 0.0066979937415620975, "grad_norm": 0.3041671812534332, "learning_rate": 4.85e-05, "loss": 2.3411, "step": 60 }, { "epoch": 0.0068096269705881325, "grad_norm": 0.26597726345062256, "learning_rate": 4.8475000000000006e-05, "loss": 2.3992, "step": 61 }, { "epoch": 0.0069212601996141675, "grad_norm": 0.2860955595970154, "learning_rate": 4.845e-05, "loss": 2.535, "step": 62 }, { "epoch": 0.0070328934286402025, "grad_norm": 0.2573089897632599, "learning_rate": 4.8425000000000005e-05, "loss": 2.4366, "step": 63 }, { "epoch": 0.0071445266576662374, "grad_norm": 0.3139612376689911, "learning_rate": 4.8400000000000004e-05, "loss": 2.4112, "step": 64 }, { "epoch": 0.007256159886692272, "grad_norm": 0.27238690853118896, "learning_rate": 4.8375000000000004e-05, "loss": 2.5764, "step": 65 }, { "epoch": 0.007367793115718307, "grad_norm": 0.2645399272441864, "learning_rate": 4.835e-05, "loss": 2.3662, "step": 66 }, { "epoch": 0.007479426344744342, "grad_norm": 0.2746032178401947, "learning_rate": 4.8325e-05, "loss": 2.3946, "step": 67 }, { "epoch": 0.007591059573770377, "grad_norm": 0.2907489836215973, "learning_rate": 4.83e-05, "loss": 2.4238, "step": 68 }, { "epoch": 0.007702692802796412, "grad_norm": 0.2683127522468567, "learning_rate": 4.8275e-05, "loss": 2.3503, "step": 69 }, { "epoch": 0.007814326031822447, "grad_norm": 0.29885104298591614, "learning_rate": 4.825e-05, "loss": 2.3667, "step": 70 }, { "epoch": 0.007925959260848481, "grad_norm": 0.35321930050849915, "learning_rate": 4.822500000000001e-05, "loss": 2.2999, "step": 71 }, { "epoch": 0.008037592489874517, "grad_norm": 0.28377628326416016, "learning_rate": 4.82e-05, "loss": 2.3928, "step": 72 }, { "epoch": 0.008149225718900551, "grad_norm": 0.28445249795913696, "learning_rate": 4.8175000000000005e-05, "loss": 2.4402, "step": 73 }, { "epoch": 0.008260858947926587, "grad_norm": 0.26443931460380554, "learning_rate": 4.815e-05, "loss": 2.3292, "step": 74 }, { "epoch": 0.008372492176952621, "grad_norm": 0.6532557010650635, "learning_rate": 4.8125000000000004e-05, "loss": 2.404, "step": 75 }, { "epoch": 0.008484125405978657, "grad_norm": 0.27610066533088684, "learning_rate": 4.8100000000000004e-05, "loss": 2.3759, "step": 76 }, { "epoch": 0.008595758635004691, "grad_norm": 0.6011192798614502, "learning_rate": 4.8075e-05, "loss": 2.4855, "step": 77 }, { "epoch": 0.008707391864030727, "grad_norm": 0.3119165599346161, "learning_rate": 4.805e-05, "loss": 2.4206, "step": 78 }, { "epoch": 0.008819025093056761, "grad_norm": 0.2999507486820221, "learning_rate": 4.8025e-05, "loss": 2.3795, "step": 79 }, { "epoch": 0.008930658322082797, "grad_norm": 0.32682228088378906, "learning_rate": 4.8e-05, "loss": 2.4536, "step": 80 }, { "epoch": 0.009042291551108831, "grad_norm": 0.2823121249675751, "learning_rate": 4.7975e-05, "loss": 2.3822, "step": 81 }, { "epoch": 0.009153924780134867, "grad_norm": 0.29254743456840515, "learning_rate": 4.795e-05, "loss": 2.4421, "step": 82 }, { "epoch": 0.009265558009160901, "grad_norm": 0.40489596128463745, "learning_rate": 4.7925000000000006e-05, "loss": 2.3872, "step": 83 }, { "epoch": 0.009377191238186937, "grad_norm": 0.28919968008995056, "learning_rate": 4.79e-05, "loss": 2.3616, "step": 84 }, { "epoch": 0.009488824467212971, "grad_norm": 0.2884703576564789, "learning_rate": 4.7875000000000005e-05, "loss": 2.4244, "step": 85 }, { "epoch": 0.009600457696239007, "grad_norm": 0.27483540773391724, "learning_rate": 4.785e-05, "loss": 2.3592, "step": 86 }, { "epoch": 0.009712090925265041, "grad_norm": 0.2799672484397888, "learning_rate": 4.7825000000000004e-05, "loss": 2.3994, "step": 87 }, { "epoch": 0.009823724154291077, "grad_norm": 0.2779146134853363, "learning_rate": 4.78e-05, "loss": 2.5073, "step": 88 }, { "epoch": 0.009935357383317111, "grad_norm": 0.283578097820282, "learning_rate": 4.7775e-05, "loss": 2.3933, "step": 89 }, { "epoch": 0.010046990612343147, "grad_norm": 0.28255191445350647, "learning_rate": 4.775e-05, "loss": 2.5065, "step": 90 }, { "epoch": 0.010158623841369181, "grad_norm": 0.9130760431289673, "learning_rate": 4.7725e-05, "loss": 2.353, "step": 91 }, { "epoch": 0.010270257070395217, "grad_norm": 0.28134602308273315, "learning_rate": 4.77e-05, "loss": 2.465, "step": 92 }, { "epoch": 0.010381890299421251, "grad_norm": 0.2763191759586334, "learning_rate": 4.7675e-05, "loss": 2.317, "step": 93 }, { "epoch": 0.010493523528447287, "grad_norm": 0.2876272201538086, "learning_rate": 4.765e-05, "loss": 2.3916, "step": 94 }, { "epoch": 0.010605156757473321, "grad_norm": 0.28046631813049316, "learning_rate": 4.7625000000000006e-05, "loss": 2.4663, "step": 95 }, { "epoch": 0.010716789986499357, "grad_norm": 0.2862105667591095, "learning_rate": 4.76e-05, "loss": 2.4214, "step": 96 }, { "epoch": 0.010828423215525391, "grad_norm": 0.27971214056015015, "learning_rate": 4.7575000000000004e-05, "loss": 2.4528, "step": 97 }, { "epoch": 0.010940056444551427, "grad_norm": 0.29032212495803833, "learning_rate": 4.755e-05, "loss": 2.3312, "step": 98 }, { "epoch": 0.011051689673577461, "grad_norm": 0.293649286031723, "learning_rate": 4.7525e-05, "loss": 2.3592, "step": 99 }, { "epoch": 0.011163322902603497, "grad_norm": 0.277589350938797, "learning_rate": 4.75e-05, "loss": 2.392, "step": 100 }, { "epoch": 0.011274956131629531, "grad_norm": 0.28249549865722656, "learning_rate": 4.7475e-05, "loss": 2.3154, "step": 101 }, { "epoch": 0.011386589360655567, "grad_norm": 0.30689579248428345, "learning_rate": 4.745e-05, "loss": 2.3322, "step": 102 }, { "epoch": 0.011498222589681601, "grad_norm": 0.2909144163131714, "learning_rate": 4.7425e-05, "loss": 2.4028, "step": 103 }, { "epoch": 0.011609855818707635, "grad_norm": 0.2826705574989319, "learning_rate": 4.74e-05, "loss": 2.3969, "step": 104 }, { "epoch": 0.011721489047733671, "grad_norm": 0.30071696639060974, "learning_rate": 4.7375e-05, "loss": 2.435, "step": 105 }, { "epoch": 0.011833122276759705, "grad_norm": 0.29308071732521057, "learning_rate": 4.735e-05, "loss": 2.3299, "step": 106 }, { "epoch": 0.011944755505785741, "grad_norm": 0.28309884667396545, "learning_rate": 4.7325000000000005e-05, "loss": 2.511, "step": 107 }, { "epoch": 0.012056388734811775, "grad_norm": 0.3001827597618103, "learning_rate": 4.73e-05, "loss": 2.3804, "step": 108 }, { "epoch": 0.012168021963837811, "grad_norm": 0.3125348687171936, "learning_rate": 4.7275000000000004e-05, "loss": 2.3186, "step": 109 }, { "epoch": 0.012279655192863845, "grad_norm": 0.29325881600379944, "learning_rate": 4.7249999999999997e-05, "loss": 2.4714, "step": 110 }, { "epoch": 0.012391288421889881, "grad_norm": 0.28102368116378784, "learning_rate": 4.7225e-05, "loss": 2.4592, "step": 111 }, { "epoch": 0.012502921650915915, "grad_norm": 0.2798093259334564, "learning_rate": 4.72e-05, "loss": 2.398, "step": 112 }, { "epoch": 0.012614554879941951, "grad_norm": 0.29487597942352295, "learning_rate": 4.7175e-05, "loss": 2.3144, "step": 113 }, { "epoch": 0.012726188108967985, "grad_norm": 0.28528064489364624, "learning_rate": 4.715e-05, "loss": 2.4359, "step": 114 }, { "epoch": 0.012837821337994021, "grad_norm": 0.29618656635284424, "learning_rate": 4.7125e-05, "loss": 2.3421, "step": 115 }, { "epoch": 0.012949454567020055, "grad_norm": 0.27769914269447327, "learning_rate": 4.71e-05, "loss": 2.4087, "step": 116 }, { "epoch": 0.013061087796046091, "grad_norm": 0.2721666991710663, "learning_rate": 4.7075e-05, "loss": 2.4001, "step": 117 }, { "epoch": 0.013172721025072125, "grad_norm": 0.6449373960494995, "learning_rate": 4.705e-05, "loss": 2.4581, "step": 118 }, { "epoch": 0.013284354254098161, "grad_norm": 0.28057020902633667, "learning_rate": 4.7025000000000005e-05, "loss": 2.3935, "step": 119 }, { "epoch": 0.013395987483124195, "grad_norm": 0.2757243812084198, "learning_rate": 4.7e-05, "loss": 2.465, "step": 120 }, { "epoch": 0.01350762071215023, "grad_norm": 0.2977396249771118, "learning_rate": 4.6975000000000003e-05, "loss": 2.4985, "step": 121 }, { "epoch": 0.013619253941176265, "grad_norm": 0.27909162640571594, "learning_rate": 4.695e-05, "loss": 2.4484, "step": 122 }, { "epoch": 0.0137308871702023, "grad_norm": 0.28472158312797546, "learning_rate": 4.6925e-05, "loss": 2.499, "step": 123 }, { "epoch": 0.013842520399228335, "grad_norm": 0.2772194445133209, "learning_rate": 4.69e-05, "loss": 2.4161, "step": 124 }, { "epoch": 0.01395415362825437, "grad_norm": 0.28007185459136963, "learning_rate": 4.6875e-05, "loss": 2.4538, "step": 125 }, { "epoch": 0.014065786857280405, "grad_norm": 0.2890627086162567, "learning_rate": 4.685000000000001e-05, "loss": 2.3803, "step": 126 }, { "epoch": 0.01417742008630644, "grad_norm": 0.28412866592407227, "learning_rate": 4.6825e-05, "loss": 2.4739, "step": 127 }, { "epoch": 0.014289053315332475, "grad_norm": 0.28246861696243286, "learning_rate": 4.6800000000000006e-05, "loss": 2.4031, "step": 128 }, { "epoch": 0.01440068654435851, "grad_norm": 0.27640506625175476, "learning_rate": 4.6775000000000005e-05, "loss": 2.3415, "step": 129 }, { "epoch": 0.014512319773384545, "grad_norm": 0.27817410230636597, "learning_rate": 4.6750000000000005e-05, "loss": 2.5304, "step": 130 }, { "epoch": 0.01462395300241058, "grad_norm": 0.27021604776382446, "learning_rate": 4.6725000000000004e-05, "loss": 2.34, "step": 131 }, { "epoch": 0.014735586231436615, "grad_norm": 0.2793290913105011, "learning_rate": 4.6700000000000003e-05, "loss": 2.4935, "step": 132 }, { "epoch": 0.01484721946046265, "grad_norm": 0.2610988914966583, "learning_rate": 4.6675e-05, "loss": 2.1694, "step": 133 }, { "epoch": 0.014958852689488685, "grad_norm": 0.3299348056316376, "learning_rate": 4.665e-05, "loss": 2.3272, "step": 134 }, { "epoch": 0.01507048591851472, "grad_norm": 0.27494046092033386, "learning_rate": 4.6625e-05, "loss": 2.4961, "step": 135 }, { "epoch": 0.015182119147540755, "grad_norm": 0.8337180018424988, "learning_rate": 4.660000000000001e-05, "loss": 2.4057, "step": 136 }, { "epoch": 0.01529375237656679, "grad_norm": 0.27909937500953674, "learning_rate": 4.6575e-05, "loss": 2.279, "step": 137 }, { "epoch": 0.015405385605592825, "grad_norm": 0.3143457770347595, "learning_rate": 4.655000000000001e-05, "loss": 2.4057, "step": 138 }, { "epoch": 0.015517018834618859, "grad_norm": 0.27124184370040894, "learning_rate": 4.6525e-05, "loss": 2.4525, "step": 139 }, { "epoch": 0.015628652063644895, "grad_norm": 0.27924227714538574, "learning_rate": 4.6500000000000005e-05, "loss": 2.4891, "step": 140 }, { "epoch": 0.01574028529267093, "grad_norm": 0.2761871814727783, "learning_rate": 4.6475000000000005e-05, "loss": 2.4127, "step": 141 }, { "epoch": 0.015851918521696963, "grad_norm": 0.30836501717567444, "learning_rate": 4.6450000000000004e-05, "loss": 2.3762, "step": 142 }, { "epoch": 0.015963551750723, "grad_norm": 0.2716349959373474, "learning_rate": 4.6425000000000004e-05, "loss": 2.3893, "step": 143 }, { "epoch": 0.016075184979749035, "grad_norm": 0.27172204852104187, "learning_rate": 4.64e-05, "loss": 2.3451, "step": 144 }, { "epoch": 0.01618681820877507, "grad_norm": 0.27586179971694946, "learning_rate": 4.6375e-05, "loss": 2.3197, "step": 145 }, { "epoch": 0.016298451437801103, "grad_norm": 0.27401286363601685, "learning_rate": 4.635e-05, "loss": 2.3191, "step": 146 }, { "epoch": 0.01641008466682714, "grad_norm": 0.27832385897636414, "learning_rate": 4.6325e-05, "loss": 2.4407, "step": 147 }, { "epoch": 0.016521717895853175, "grad_norm": 0.29265516996383667, "learning_rate": 4.630000000000001e-05, "loss": 2.3436, "step": 148 }, { "epoch": 0.01663335112487921, "grad_norm": 0.27826353907585144, "learning_rate": 4.6275e-05, "loss": 2.4081, "step": 149 }, { "epoch": 0.016744984353905243, "grad_norm": 0.26623812317848206, "learning_rate": 4.6250000000000006e-05, "loss": 2.4566, "step": 150 }, { "epoch": 0.01685661758293128, "grad_norm": 0.2699335217475891, "learning_rate": 4.6225e-05, "loss": 2.3, "step": 151 }, { "epoch": 0.016968250811957315, "grad_norm": 0.27325987815856934, "learning_rate": 4.6200000000000005e-05, "loss": 2.3798, "step": 152 }, { "epoch": 0.01707988404098335, "grad_norm": 0.29398098587989807, "learning_rate": 4.6175000000000004e-05, "loss": 2.2952, "step": 153 }, { "epoch": 0.017191517270009383, "grad_norm": 0.264258474111557, "learning_rate": 4.6150000000000004e-05, "loss": 2.4543, "step": 154 }, { "epoch": 0.01730315049903542, "grad_norm": 0.4211874008178711, "learning_rate": 4.6125e-05, "loss": 2.3015, "step": 155 }, { "epoch": 0.017414783728061455, "grad_norm": 0.28733232617378235, "learning_rate": 4.61e-05, "loss": 2.4002, "step": 156 }, { "epoch": 0.01752641695708749, "grad_norm": 0.2647246718406677, "learning_rate": 4.6075e-05, "loss": 2.2928, "step": 157 }, { "epoch": 0.017638050186113523, "grad_norm": 0.2679901719093323, "learning_rate": 4.605e-05, "loss": 2.4328, "step": 158 }, { "epoch": 0.01774968341513956, "grad_norm": 0.26848575472831726, "learning_rate": 4.6025e-05, "loss": 2.3402, "step": 159 }, { "epoch": 0.017861316644165594, "grad_norm": 0.282953679561615, "learning_rate": 4.600000000000001e-05, "loss": 2.3454, "step": 160 }, { "epoch": 0.01797294987319163, "grad_norm": 0.2788335084915161, "learning_rate": 4.5975e-05, "loss": 2.3779, "step": 161 }, { "epoch": 0.018084583102217663, "grad_norm": 0.2903019189834595, "learning_rate": 4.5950000000000006e-05, "loss": 2.4031, "step": 162 }, { "epoch": 0.0181962163312437, "grad_norm": 0.2800057828426361, "learning_rate": 4.5925e-05, "loss": 2.4273, "step": 163 }, { "epoch": 0.018307849560269734, "grad_norm": 0.28730225563049316, "learning_rate": 4.5900000000000004e-05, "loss": 2.3713, "step": 164 }, { "epoch": 0.01841948278929577, "grad_norm": 0.2722271978855133, "learning_rate": 4.5875000000000004e-05, "loss": 2.5114, "step": 165 }, { "epoch": 0.018531116018321803, "grad_norm": 0.27777567505836487, "learning_rate": 4.585e-05, "loss": 2.3485, "step": 166 }, { "epoch": 0.01864274924734784, "grad_norm": 0.2774522006511688, "learning_rate": 4.5825e-05, "loss": 2.4186, "step": 167 }, { "epoch": 0.018754382476373874, "grad_norm": 0.2742158770561218, "learning_rate": 4.58e-05, "loss": 2.3706, "step": 168 }, { "epoch": 0.01886601570539991, "grad_norm": 0.27442091703414917, "learning_rate": 4.5775e-05, "loss": 2.4425, "step": 169 }, { "epoch": 0.018977648934425943, "grad_norm": 0.2682335376739502, "learning_rate": 4.575e-05, "loss": 2.3597, "step": 170 }, { "epoch": 0.01908928216345198, "grad_norm": 0.27973178029060364, "learning_rate": 4.5725e-05, "loss": 2.2992, "step": 171 }, { "epoch": 0.019200915392478014, "grad_norm": 0.3113536536693573, "learning_rate": 4.5700000000000006e-05, "loss": 2.4343, "step": 172 }, { "epoch": 0.01931254862150405, "grad_norm": 0.27050501108169556, "learning_rate": 4.5675e-05, "loss": 2.4023, "step": 173 }, { "epoch": 0.019424181850530083, "grad_norm": 0.5937790870666504, "learning_rate": 4.5650000000000005e-05, "loss": 2.4726, "step": 174 }, { "epoch": 0.019535815079556117, "grad_norm": 0.27320945262908936, "learning_rate": 4.5625e-05, "loss": 2.4108, "step": 175 }, { "epoch": 0.019647448308582154, "grad_norm": 0.2724778354167938, "learning_rate": 4.5600000000000004e-05, "loss": 2.2204, "step": 176 }, { "epoch": 0.01975908153760819, "grad_norm": 0.2760343849658966, "learning_rate": 4.5575e-05, "loss": 2.4108, "step": 177 }, { "epoch": 0.019870714766634223, "grad_norm": 0.27173370122909546, "learning_rate": 4.555e-05, "loss": 2.4278, "step": 178 }, { "epoch": 0.019982347995660257, "grad_norm": 0.258478045463562, "learning_rate": 4.5525e-05, "loss": 2.3129, "step": 179 }, { "epoch": 0.020093981224686294, "grad_norm": 0.2680318355560303, "learning_rate": 4.55e-05, "loss": 2.1764, "step": 180 }, { "epoch": 0.02020561445371233, "grad_norm": 0.32532161474227905, "learning_rate": 4.5475e-05, "loss": 2.3988, "step": 181 }, { "epoch": 0.020317247682738362, "grad_norm": 0.27205905318260193, "learning_rate": 4.545000000000001e-05, "loss": 2.3432, "step": 182 }, { "epoch": 0.020428880911764397, "grad_norm": 0.3422660827636719, "learning_rate": 4.5425e-05, "loss": 2.4591, "step": 183 }, { "epoch": 0.020540514140790434, "grad_norm": 0.2941705882549286, "learning_rate": 4.5400000000000006e-05, "loss": 2.4285, "step": 184 }, { "epoch": 0.02065214736981647, "grad_norm": 0.27218639850616455, "learning_rate": 4.5375e-05, "loss": 2.3349, "step": 185 }, { "epoch": 0.020763780598842502, "grad_norm": 0.26361867785453796, "learning_rate": 4.5350000000000005e-05, "loss": 2.3049, "step": 186 }, { "epoch": 0.020875413827868537, "grad_norm": 0.47230780124664307, "learning_rate": 4.5325000000000004e-05, "loss": 2.3365, "step": 187 }, { "epoch": 0.020987047056894574, "grad_norm": 0.26752769947052, "learning_rate": 4.53e-05, "loss": 2.3887, "step": 188 }, { "epoch": 0.021098680285920608, "grad_norm": 0.27140894532203674, "learning_rate": 4.5275e-05, "loss": 2.4444, "step": 189 }, { "epoch": 0.021210313514946642, "grad_norm": 0.4308728873729706, "learning_rate": 4.525e-05, "loss": 2.397, "step": 190 }, { "epoch": 0.021321946743972676, "grad_norm": 0.6893981099128723, "learning_rate": 4.5225e-05, "loss": 2.3797, "step": 191 }, { "epoch": 0.021433579972998714, "grad_norm": 0.27642473578453064, "learning_rate": 4.52e-05, "loss": 2.3374, "step": 192 }, { "epoch": 0.021545213202024748, "grad_norm": 0.2684485912322998, "learning_rate": 4.5175e-05, "loss": 2.3368, "step": 193 }, { "epoch": 0.021656846431050782, "grad_norm": 0.2804638743400574, "learning_rate": 4.5150000000000006e-05, "loss": 2.3378, "step": 194 }, { "epoch": 0.021768479660076816, "grad_norm": 0.41617271304130554, "learning_rate": 4.5125e-05, "loss": 2.3908, "step": 195 }, { "epoch": 0.021880112889102854, "grad_norm": 0.26013967394828796, "learning_rate": 4.5100000000000005e-05, "loss": 2.3378, "step": 196 }, { "epoch": 0.021991746118128888, "grad_norm": 0.2883068919181824, "learning_rate": 4.5075e-05, "loss": 2.296, "step": 197 }, { "epoch": 0.022103379347154922, "grad_norm": 0.27083417773246765, "learning_rate": 4.5050000000000004e-05, "loss": 2.3917, "step": 198 }, { "epoch": 0.022215012576180956, "grad_norm": 0.26112979650497437, "learning_rate": 4.5025000000000003e-05, "loss": 2.4321, "step": 199 }, { "epoch": 0.022326645805206994, "grad_norm": 0.2797684669494629, "learning_rate": 4.5e-05, "loss": 2.4221, "step": 200 }, { "epoch": 0.022438279034233028, "grad_norm": 0.28574231266975403, "learning_rate": 4.4975e-05, "loss": 2.4143, "step": 201 }, { "epoch": 0.022549912263259062, "grad_norm": 0.3054039180278778, "learning_rate": 4.495e-05, "loss": 2.3744, "step": 202 }, { "epoch": 0.022661545492285096, "grad_norm": 0.2859933376312256, "learning_rate": 4.4925e-05, "loss": 2.3158, "step": 203 }, { "epoch": 0.022773178721311134, "grad_norm": 0.30749940872192383, "learning_rate": 4.49e-05, "loss": 2.3274, "step": 204 }, { "epoch": 0.022884811950337168, "grad_norm": 0.34303340315818787, "learning_rate": 4.4875e-05, "loss": 2.3822, "step": 205 }, { "epoch": 0.022996445179363202, "grad_norm": 0.6377202868461609, "learning_rate": 4.4850000000000006e-05, "loss": 2.3162, "step": 206 }, { "epoch": 0.023108078408389236, "grad_norm": 0.27729642391204834, "learning_rate": 4.4825e-05, "loss": 2.3117, "step": 207 }, { "epoch": 0.02321971163741527, "grad_norm": 0.2766030728816986, "learning_rate": 4.4800000000000005e-05, "loss": 2.4142, "step": 208 }, { "epoch": 0.023331344866441308, "grad_norm": 0.5488070249557495, "learning_rate": 4.4775e-05, "loss": 2.3999, "step": 209 }, { "epoch": 0.023442978095467342, "grad_norm": 0.27591627836227417, "learning_rate": 4.4750000000000004e-05, "loss": 2.4271, "step": 210 }, { "epoch": 0.023554611324493376, "grad_norm": 0.3173430562019348, "learning_rate": 4.4725e-05, "loss": 2.4497, "step": 211 }, { "epoch": 0.02366624455351941, "grad_norm": 0.31145554780960083, "learning_rate": 4.47e-05, "loss": 2.4128, "step": 212 }, { "epoch": 0.023777877782545448, "grad_norm": 0.27966129779815674, "learning_rate": 4.4675e-05, "loss": 2.4699, "step": 213 }, { "epoch": 0.023889511011571482, "grad_norm": 0.31131359934806824, "learning_rate": 4.465e-05, "loss": 2.3269, "step": 214 }, { "epoch": 0.024001144240597516, "grad_norm": 0.26797181367874146, "learning_rate": 4.4625e-05, "loss": 2.4169, "step": 215 }, { "epoch": 0.02411277746962355, "grad_norm": 0.28204289078712463, "learning_rate": 4.46e-05, "loss": 2.2361, "step": 216 }, { "epoch": 0.024224410698649588, "grad_norm": 0.2600002586841583, "learning_rate": 4.4575e-05, "loss": 2.3357, "step": 217 }, { "epoch": 0.024336043927675622, "grad_norm": 0.2576424777507782, "learning_rate": 4.4550000000000005e-05, "loss": 2.3424, "step": 218 }, { "epoch": 0.024447677156701656, "grad_norm": 0.3016074299812317, "learning_rate": 4.4525e-05, "loss": 2.3781, "step": 219 }, { "epoch": 0.02455931038572769, "grad_norm": 0.2684342563152313, "learning_rate": 4.4500000000000004e-05, "loss": 2.3167, "step": 220 }, { "epoch": 0.024670943614753728, "grad_norm": 0.2625711262226105, "learning_rate": 4.4475e-05, "loss": 2.4207, "step": 221 }, { "epoch": 0.024782576843779762, "grad_norm": 0.29141953587532043, "learning_rate": 4.445e-05, "loss": 2.4231, "step": 222 }, { "epoch": 0.024894210072805796, "grad_norm": 0.2682022452354431, "learning_rate": 4.4425e-05, "loss": 2.4098, "step": 223 }, { "epoch": 0.02500584330183183, "grad_norm": 0.2591974437236786, "learning_rate": 4.44e-05, "loss": 2.4387, "step": 224 }, { "epoch": 0.025117476530857868, "grad_norm": 0.2656046152114868, "learning_rate": 4.4375e-05, "loss": 2.415, "step": 225 }, { "epoch": 0.025229109759883902, "grad_norm": 0.2568715214729309, "learning_rate": 4.435e-05, "loss": 2.3837, "step": 226 }, { "epoch": 0.025340742988909936, "grad_norm": 0.3120313286781311, "learning_rate": 4.4325e-05, "loss": 2.4028, "step": 227 }, { "epoch": 0.02545237621793597, "grad_norm": 0.36895328760147095, "learning_rate": 4.43e-05, "loss": 2.4162, "step": 228 }, { "epoch": 0.025564009446962008, "grad_norm": 0.2681656777858734, "learning_rate": 4.4275e-05, "loss": 2.4289, "step": 229 }, { "epoch": 0.025675642675988042, "grad_norm": 0.2715415358543396, "learning_rate": 4.4250000000000005e-05, "loss": 2.3207, "step": 230 }, { "epoch": 0.025787275905014076, "grad_norm": 0.2677493989467621, "learning_rate": 4.4225e-05, "loss": 2.3856, "step": 231 }, { "epoch": 0.02589890913404011, "grad_norm": 0.25962767004966736, "learning_rate": 4.4200000000000004e-05, "loss": 2.3136, "step": 232 }, { "epoch": 0.026010542363066148, "grad_norm": 0.3225052058696747, "learning_rate": 4.4174999999999996e-05, "loss": 2.2765, "step": 233 }, { "epoch": 0.026122175592092182, "grad_norm": 0.3049544095993042, "learning_rate": 4.415e-05, "loss": 2.2127, "step": 234 }, { "epoch": 0.026233808821118216, "grad_norm": 0.25900280475616455, "learning_rate": 4.4125e-05, "loss": 2.3794, "step": 235 }, { "epoch": 0.02634544205014425, "grad_norm": 0.26174089312553406, "learning_rate": 4.41e-05, "loss": 2.4024, "step": 236 }, { "epoch": 0.026457075279170288, "grad_norm": 0.26936131715774536, "learning_rate": 4.4075e-05, "loss": 2.5543, "step": 237 }, { "epoch": 0.026568708508196322, "grad_norm": 0.2539876103401184, "learning_rate": 4.405e-05, "loss": 2.4051, "step": 238 }, { "epoch": 0.026680341737222356, "grad_norm": 0.26477983593940735, "learning_rate": 4.4025e-05, "loss": 2.3512, "step": 239 }, { "epoch": 0.02679197496624839, "grad_norm": 0.2632873058319092, "learning_rate": 4.4000000000000006e-05, "loss": 2.3378, "step": 240 }, { "epoch": 0.026903608195274428, "grad_norm": 0.6845733523368835, "learning_rate": 4.3975e-05, "loss": 2.3815, "step": 241 }, { "epoch": 0.02701524142430046, "grad_norm": 0.25974923372268677, "learning_rate": 4.3950000000000004e-05, "loss": 2.3816, "step": 242 }, { "epoch": 0.027126874653326496, "grad_norm": 0.2636438012123108, "learning_rate": 4.3925e-05, "loss": 2.3288, "step": 243 }, { "epoch": 0.02723850788235253, "grad_norm": 0.2576185464859009, "learning_rate": 4.39e-05, "loss": 2.4314, "step": 244 }, { "epoch": 0.027350141111378564, "grad_norm": 0.2600337862968445, "learning_rate": 4.3875e-05, "loss": 2.4147, "step": 245 }, { "epoch": 0.0274617743404046, "grad_norm": 0.2605160176753998, "learning_rate": 4.385e-05, "loss": 2.327, "step": 246 }, { "epoch": 0.027573407569430636, "grad_norm": 0.25381141901016235, "learning_rate": 4.3825e-05, "loss": 2.309, "step": 247 }, { "epoch": 0.02768504079845667, "grad_norm": 0.253326416015625, "learning_rate": 4.38e-05, "loss": 2.2849, "step": 248 }, { "epoch": 0.027796674027482704, "grad_norm": 0.3015645444393158, "learning_rate": 4.3775e-05, "loss": 2.3882, "step": 249 }, { "epoch": 0.02790830725650874, "grad_norm": 0.2591153383255005, "learning_rate": 4.375e-05, "loss": 2.3108, "step": 250 }, { "epoch": 0.028019940485534776, "grad_norm": 0.255209743976593, "learning_rate": 4.3725000000000006e-05, "loss": 2.3026, "step": 251 }, { "epoch": 0.02813157371456081, "grad_norm": 0.2543400228023529, "learning_rate": 4.3700000000000005e-05, "loss": 2.3949, "step": 252 }, { "epoch": 0.028243206943586844, "grad_norm": 0.2584831118583679, "learning_rate": 4.3675000000000005e-05, "loss": 2.4274, "step": 253 }, { "epoch": 0.02835484017261288, "grad_norm": 0.24946770071983337, "learning_rate": 4.3650000000000004e-05, "loss": 2.378, "step": 254 }, { "epoch": 0.028466473401638916, "grad_norm": 0.2595466673374176, "learning_rate": 4.3625e-05, "loss": 2.4601, "step": 255 }, { "epoch": 0.02857810663066495, "grad_norm": 0.25328534841537476, "learning_rate": 4.36e-05, "loss": 2.2783, "step": 256 }, { "epoch": 0.028689739859690984, "grad_norm": 0.2810356616973877, "learning_rate": 4.3575e-05, "loss": 2.282, "step": 257 }, { "epoch": 0.02880137308871702, "grad_norm": 0.2603547275066376, "learning_rate": 4.355e-05, "loss": 2.3768, "step": 258 }, { "epoch": 0.028913006317743056, "grad_norm": 0.25737640261650085, "learning_rate": 4.352500000000001e-05, "loss": 2.4577, "step": 259 }, { "epoch": 0.02902463954676909, "grad_norm": 0.25266796350479126, "learning_rate": 4.35e-05, "loss": 2.3293, "step": 260 }, { "epoch": 0.029136272775795124, "grad_norm": 0.35959863662719727, "learning_rate": 4.3475000000000006e-05, "loss": 2.2827, "step": 261 }, { "epoch": 0.02924790600482116, "grad_norm": 0.2564973831176758, "learning_rate": 4.345e-05, "loss": 2.3925, "step": 262 }, { "epoch": 0.029359539233847196, "grad_norm": 0.2848140597343445, "learning_rate": 4.3425000000000005e-05, "loss": 2.3275, "step": 263 }, { "epoch": 0.02947117246287323, "grad_norm": 0.26803499460220337, "learning_rate": 4.3400000000000005e-05, "loss": 2.4576, "step": 264 }, { "epoch": 0.029582805691899264, "grad_norm": 0.24815724790096283, "learning_rate": 4.3375000000000004e-05, "loss": 2.611, "step": 265 }, { "epoch": 0.0296944389209253, "grad_norm": 0.26036569476127625, "learning_rate": 4.335e-05, "loss": 2.3574, "step": 266 }, { "epoch": 0.029806072149951335, "grad_norm": 0.26427459716796875, "learning_rate": 4.3325e-05, "loss": 2.3884, "step": 267 }, { "epoch": 0.02991770537897737, "grad_norm": 0.25177526473999023, "learning_rate": 4.33e-05, "loss": 2.3812, "step": 268 }, { "epoch": 0.030029338608003404, "grad_norm": 0.2582986354827881, "learning_rate": 4.3275e-05, "loss": 2.3621, "step": 269 }, { "epoch": 0.03014097183702944, "grad_norm": 0.26251325011253357, "learning_rate": 4.325e-05, "loss": 2.4779, "step": 270 }, { "epoch": 0.030252605066055475, "grad_norm": 0.2560170590877533, "learning_rate": 4.322500000000001e-05, "loss": 2.4519, "step": 271 }, { "epoch": 0.03036423829508151, "grad_norm": 0.25769442319869995, "learning_rate": 4.32e-05, "loss": 2.255, "step": 272 }, { "epoch": 0.030475871524107544, "grad_norm": 0.2584100067615509, "learning_rate": 4.3175000000000006e-05, "loss": 2.4195, "step": 273 }, { "epoch": 0.03058750475313358, "grad_norm": 0.26976278424263, "learning_rate": 4.315e-05, "loss": 2.3927, "step": 274 }, { "epoch": 0.030699137982159615, "grad_norm": 0.2528376877307892, "learning_rate": 4.3125000000000005e-05, "loss": 2.3596, "step": 275 }, { "epoch": 0.03081077121118565, "grad_norm": 0.25755786895751953, "learning_rate": 4.3100000000000004e-05, "loss": 2.4775, "step": 276 }, { "epoch": 0.030922404440211684, "grad_norm": 0.25737857818603516, "learning_rate": 4.3075000000000003e-05, "loss": 2.3443, "step": 277 }, { "epoch": 0.031034037669237718, "grad_norm": 0.2632676362991333, "learning_rate": 4.305e-05, "loss": 2.3128, "step": 278 }, { "epoch": 0.031145670898263755, "grad_norm": 0.28831636905670166, "learning_rate": 4.3025e-05, "loss": 2.3624, "step": 279 }, { "epoch": 0.03125730412728979, "grad_norm": 0.2593206763267517, "learning_rate": 4.3e-05, "loss": 2.3546, "step": 280 }, { "epoch": 0.031368937356315824, "grad_norm": 0.25221961736679077, "learning_rate": 4.2975e-05, "loss": 2.3056, "step": 281 }, { "epoch": 0.03148057058534186, "grad_norm": 0.26562732458114624, "learning_rate": 4.295e-05, "loss": 2.2979, "step": 282 }, { "epoch": 0.03159220381436789, "grad_norm": 0.2667911648750305, "learning_rate": 4.2925000000000007e-05, "loss": 2.5051, "step": 283 }, { "epoch": 0.031703837043393926, "grad_norm": 0.2637230455875397, "learning_rate": 4.29e-05, "loss": 2.2956, "step": 284 }, { "epoch": 0.03181547027241997, "grad_norm": 0.2591506540775299, "learning_rate": 4.2875000000000005e-05, "loss": 2.4331, "step": 285 }, { "epoch": 0.031927103501446, "grad_norm": 0.25945866107940674, "learning_rate": 4.285e-05, "loss": 2.3397, "step": 286 }, { "epoch": 0.032038736730472035, "grad_norm": 0.28612184524536133, "learning_rate": 4.2825000000000004e-05, "loss": 2.4108, "step": 287 }, { "epoch": 0.03215036995949807, "grad_norm": 0.26425275206565857, "learning_rate": 4.2800000000000004e-05, "loss": 2.4308, "step": 288 }, { "epoch": 0.032262003188524103, "grad_norm": 0.2575188875198364, "learning_rate": 4.2775e-05, "loss": 2.3774, "step": 289 }, { "epoch": 0.03237363641755014, "grad_norm": 0.25762301683425903, "learning_rate": 4.275e-05, "loss": 2.4266, "step": 290 }, { "epoch": 0.03248526964657617, "grad_norm": 0.2451835572719574, "learning_rate": 4.2725e-05, "loss": 2.2542, "step": 291 }, { "epoch": 0.032596902875602206, "grad_norm": 0.25459688901901245, "learning_rate": 4.27e-05, "loss": 2.3772, "step": 292 }, { "epoch": 0.03270853610462825, "grad_norm": 0.3179572820663452, "learning_rate": 4.2675e-05, "loss": 2.4139, "step": 293 }, { "epoch": 0.03282016933365428, "grad_norm": 0.25751742720603943, "learning_rate": 4.265e-05, "loss": 2.2372, "step": 294 }, { "epoch": 0.032931802562680315, "grad_norm": 0.26437416672706604, "learning_rate": 4.2625000000000006e-05, "loss": 2.4512, "step": 295 }, { "epoch": 0.03304343579170635, "grad_norm": 0.26216599345207214, "learning_rate": 4.26e-05, "loss": 2.3041, "step": 296 }, { "epoch": 0.03315506902073238, "grad_norm": 0.2558748126029968, "learning_rate": 4.2575000000000005e-05, "loss": 2.3464, "step": 297 }, { "epoch": 0.03326670224975842, "grad_norm": 0.25759217143058777, "learning_rate": 4.2550000000000004e-05, "loss": 2.38, "step": 298 }, { "epoch": 0.03337833547878445, "grad_norm": 0.2515701353549957, "learning_rate": 4.2525000000000004e-05, "loss": 2.347, "step": 299 }, { "epoch": 0.033489968707810486, "grad_norm": 0.25506308674812317, "learning_rate": 4.25e-05, "loss": 2.4563, "step": 300 }, { "epoch": 0.03360160193683653, "grad_norm": 0.24955067038536072, "learning_rate": 4.2475e-05, "loss": 2.3467, "step": 301 }, { "epoch": 0.03371323516586256, "grad_norm": 0.26849493384361267, "learning_rate": 4.245e-05, "loss": 2.318, "step": 302 }, { "epoch": 0.033824868394888595, "grad_norm": 0.2560291886329651, "learning_rate": 4.2425e-05, "loss": 2.3567, "step": 303 }, { "epoch": 0.03393650162391463, "grad_norm": 0.2685459554195404, "learning_rate": 4.24e-05, "loss": 2.3929, "step": 304 }, { "epoch": 0.03404813485294066, "grad_norm": 0.2724890410900116, "learning_rate": 4.237500000000001e-05, "loss": 2.321, "step": 305 }, { "epoch": 0.0341597680819667, "grad_norm": 0.3363018333911896, "learning_rate": 4.235e-05, "loss": 2.2429, "step": 306 }, { "epoch": 0.03427140131099273, "grad_norm": 0.2732946276664734, "learning_rate": 4.2325000000000006e-05, "loss": 2.4269, "step": 307 }, { "epoch": 0.034383034540018766, "grad_norm": 0.25203391909599304, "learning_rate": 4.23e-05, "loss": 2.3134, "step": 308 }, { "epoch": 0.03449466776904481, "grad_norm": 0.27153274416923523, "learning_rate": 4.2275000000000004e-05, "loss": 2.3181, "step": 309 }, { "epoch": 0.03460630099807084, "grad_norm": 0.25666430592536926, "learning_rate": 4.2250000000000004e-05, "loss": 2.437, "step": 310 }, { "epoch": 0.034717934227096875, "grad_norm": 0.255957692861557, "learning_rate": 4.2225e-05, "loss": 2.3226, "step": 311 }, { "epoch": 0.03482956745612291, "grad_norm": 0.2525959014892578, "learning_rate": 4.22e-05, "loss": 2.4727, "step": 312 }, { "epoch": 0.03494120068514894, "grad_norm": 0.34017762541770935, "learning_rate": 4.2175e-05, "loss": 2.2816, "step": 313 }, { "epoch": 0.03505283391417498, "grad_norm": 0.25117596983909607, "learning_rate": 4.215e-05, "loss": 2.3801, "step": 314 }, { "epoch": 0.03516446714320101, "grad_norm": 0.2531397044658661, "learning_rate": 4.2125e-05, "loss": 2.4461, "step": 315 }, { "epoch": 0.035276100372227046, "grad_norm": 0.2584179639816284, "learning_rate": 4.21e-05, "loss": 2.2489, "step": 316 }, { "epoch": 0.03538773360125308, "grad_norm": 0.25839826464653015, "learning_rate": 4.2075000000000006e-05, "loss": 2.3748, "step": 317 }, { "epoch": 0.03549936683027912, "grad_norm": 0.2568003833293915, "learning_rate": 4.205e-05, "loss": 2.3484, "step": 318 }, { "epoch": 0.035611000059305155, "grad_norm": 0.2569085657596588, "learning_rate": 4.2025000000000005e-05, "loss": 2.4397, "step": 319 }, { "epoch": 0.03572263328833119, "grad_norm": 0.24986624717712402, "learning_rate": 4.2e-05, "loss": 2.3008, "step": 320 }, { "epoch": 0.03583426651735722, "grad_norm": 0.26116469502449036, "learning_rate": 4.1975000000000004e-05, "loss": 2.2436, "step": 321 }, { "epoch": 0.03594589974638326, "grad_norm": 0.3085286021232605, "learning_rate": 4.195e-05, "loss": 2.2419, "step": 322 }, { "epoch": 0.03605753297540929, "grad_norm": 1.1896109580993652, "learning_rate": 4.1925e-05, "loss": 2.3528, "step": 323 }, { "epoch": 0.036169166204435325, "grad_norm": 0.26796790957450867, "learning_rate": 4.19e-05, "loss": 2.314, "step": 324 }, { "epoch": 0.03628079943346136, "grad_norm": 0.27274981141090393, "learning_rate": 4.1875e-05, "loss": 2.3397, "step": 325 }, { "epoch": 0.0363924326624874, "grad_norm": 0.2676389813423157, "learning_rate": 4.185e-05, "loss": 2.4168, "step": 326 }, { "epoch": 0.036504065891513435, "grad_norm": 0.4810822904109955, "learning_rate": 4.1825e-05, "loss": 2.3375, "step": 327 }, { "epoch": 0.03661569912053947, "grad_norm": 0.28411370515823364, "learning_rate": 4.18e-05, "loss": 2.2704, "step": 328 }, { "epoch": 0.0367273323495655, "grad_norm": 0.27649131417274475, "learning_rate": 4.1775000000000006e-05, "loss": 2.3377, "step": 329 }, { "epoch": 0.03683896557859154, "grad_norm": 0.25533196330070496, "learning_rate": 4.175e-05, "loss": 2.2652, "step": 330 }, { "epoch": 0.03695059880761757, "grad_norm": 0.2640891969203949, "learning_rate": 4.1725000000000005e-05, "loss": 2.3504, "step": 331 }, { "epoch": 0.037062232036643605, "grad_norm": 0.2534253001213074, "learning_rate": 4.17e-05, "loss": 2.4219, "step": 332 }, { "epoch": 0.03717386526566964, "grad_norm": 0.2603084146976471, "learning_rate": 4.1675e-05, "loss": 2.3582, "step": 333 }, { "epoch": 0.03728549849469568, "grad_norm": 0.2546936273574829, "learning_rate": 4.165e-05, "loss": 2.3509, "step": 334 }, { "epoch": 0.037397131723721715, "grad_norm": 0.2537340819835663, "learning_rate": 4.1625e-05, "loss": 2.3683, "step": 335 }, { "epoch": 0.03750876495274775, "grad_norm": 0.26713791489601135, "learning_rate": 4.16e-05, "loss": 2.3843, "step": 336 }, { "epoch": 0.03762039818177378, "grad_norm": 0.2577325105667114, "learning_rate": 4.1575e-05, "loss": 2.4786, "step": 337 }, { "epoch": 0.03773203141079982, "grad_norm": 0.24003548920154572, "learning_rate": 4.155e-05, "loss": 2.4079, "step": 338 }, { "epoch": 0.03784366463982585, "grad_norm": 0.26992398500442505, "learning_rate": 4.1525e-05, "loss": 2.382, "step": 339 }, { "epoch": 0.037955297868851885, "grad_norm": 0.2569391131401062, "learning_rate": 4.15e-05, "loss": 2.2762, "step": 340 }, { "epoch": 0.03806693109787792, "grad_norm": 0.2576649785041809, "learning_rate": 4.1475000000000005e-05, "loss": 2.3492, "step": 341 }, { "epoch": 0.03817856432690396, "grad_norm": 0.2680610716342926, "learning_rate": 4.145e-05, "loss": 2.3769, "step": 342 }, { "epoch": 0.038290197555929995, "grad_norm": 0.2511936128139496, "learning_rate": 4.1425000000000004e-05, "loss": 2.2633, "step": 343 }, { "epoch": 0.03840183078495603, "grad_norm": 0.2549611032009125, "learning_rate": 4.14e-05, "loss": 2.4439, "step": 344 }, { "epoch": 0.03851346401398206, "grad_norm": 0.24939557909965515, "learning_rate": 4.1375e-05, "loss": 2.3102, "step": 345 }, { "epoch": 0.0386250972430081, "grad_norm": 0.2684389650821686, "learning_rate": 4.135e-05, "loss": 2.3636, "step": 346 }, { "epoch": 0.03873673047203413, "grad_norm": 0.24813228845596313, "learning_rate": 4.1325e-05, "loss": 2.3439, "step": 347 }, { "epoch": 0.038848363701060165, "grad_norm": 0.24416686594486237, "learning_rate": 4.13e-05, "loss": 2.256, "step": 348 }, { "epoch": 0.0389599969300862, "grad_norm": 0.24512575566768646, "learning_rate": 4.1275e-05, "loss": 2.3966, "step": 349 }, { "epoch": 0.03907163015911223, "grad_norm": 0.25613388419151306, "learning_rate": 4.125e-05, "loss": 2.3726, "step": 350 }, { "epoch": 0.039183263388138274, "grad_norm": 0.2553405463695526, "learning_rate": 4.1225e-05, "loss": 2.4718, "step": 351 }, { "epoch": 0.03929489661716431, "grad_norm": 0.2818881869316101, "learning_rate": 4.12e-05, "loss": 2.2828, "step": 352 }, { "epoch": 0.03940652984619034, "grad_norm": 0.2522308826446533, "learning_rate": 4.1175000000000005e-05, "loss": 2.3791, "step": 353 }, { "epoch": 0.03951816307521638, "grad_norm": 0.2561878561973572, "learning_rate": 4.115e-05, "loss": 2.4064, "step": 354 }, { "epoch": 0.03962979630424241, "grad_norm": 0.2658417522907257, "learning_rate": 4.1125000000000004e-05, "loss": 2.4249, "step": 355 }, { "epoch": 0.039741429533268445, "grad_norm": 0.3040316700935364, "learning_rate": 4.11e-05, "loss": 2.3588, "step": 356 }, { "epoch": 0.03985306276229448, "grad_norm": 0.24854056537151337, "learning_rate": 4.1075e-05, "loss": 2.3466, "step": 357 }, { "epoch": 0.03996469599132051, "grad_norm": 0.2535928785800934, "learning_rate": 4.105e-05, "loss": 2.2637, "step": 358 }, { "epoch": 0.040076329220346554, "grad_norm": 0.2553252577781677, "learning_rate": 4.1025e-05, "loss": 2.3413, "step": 359 }, { "epoch": 0.04018796244937259, "grad_norm": 0.25706836581230164, "learning_rate": 4.1e-05, "loss": 2.3426, "step": 360 }, { "epoch": 0.04029959567839862, "grad_norm": 0.2549241781234741, "learning_rate": 4.0975e-05, "loss": 2.4028, "step": 361 }, { "epoch": 0.04041122890742466, "grad_norm": 0.25191277265548706, "learning_rate": 4.095e-05, "loss": 2.3353, "step": 362 }, { "epoch": 0.04052286213645069, "grad_norm": 0.2870761454105377, "learning_rate": 4.0925000000000005e-05, "loss": 2.2961, "step": 363 }, { "epoch": 0.040634495365476725, "grad_norm": 0.26251882314682007, "learning_rate": 4.09e-05, "loss": 2.3897, "step": 364 }, { "epoch": 0.04074612859450276, "grad_norm": 0.24886254966259003, "learning_rate": 4.0875000000000004e-05, "loss": 2.3752, "step": 365 }, { "epoch": 0.04085776182352879, "grad_norm": 0.24439434707164764, "learning_rate": 4.085e-05, "loss": 2.3435, "step": 366 }, { "epoch": 0.040969395052554834, "grad_norm": 0.41298142075538635, "learning_rate": 4.0825e-05, "loss": 2.4568, "step": 367 }, { "epoch": 0.04108102828158087, "grad_norm": 0.2539288401603699, "learning_rate": 4.08e-05, "loss": 2.3788, "step": 368 }, { "epoch": 0.0411926615106069, "grad_norm": 0.2483907788991928, "learning_rate": 4.0775e-05, "loss": 2.2744, "step": 369 }, { "epoch": 0.04130429473963294, "grad_norm": 0.2610059678554535, "learning_rate": 4.075e-05, "loss": 2.4475, "step": 370 }, { "epoch": 0.04141592796865897, "grad_norm": 0.2610619068145752, "learning_rate": 4.0725e-05, "loss": 2.4049, "step": 371 }, { "epoch": 0.041527561197685005, "grad_norm": 0.26267480850219727, "learning_rate": 4.07e-05, "loss": 2.3503, "step": 372 }, { "epoch": 0.04163919442671104, "grad_norm": 0.2515001893043518, "learning_rate": 4.0675e-05, "loss": 2.3344, "step": 373 }, { "epoch": 0.04175082765573707, "grad_norm": 0.27069345116615295, "learning_rate": 4.065e-05, "loss": 2.3766, "step": 374 }, { "epoch": 0.041862460884763114, "grad_norm": 0.2560361921787262, "learning_rate": 4.0625000000000005e-05, "loss": 2.3775, "step": 375 }, { "epoch": 0.04197409411378915, "grad_norm": 0.2536779046058655, "learning_rate": 4.0600000000000004e-05, "loss": 2.3652, "step": 376 }, { "epoch": 0.04208572734281518, "grad_norm": 0.25629979372024536, "learning_rate": 4.0575000000000004e-05, "loss": 2.3127, "step": 377 }, { "epoch": 0.042197360571841216, "grad_norm": 0.25751662254333496, "learning_rate": 4.055e-05, "loss": 2.3268, "step": 378 }, { "epoch": 0.04230899380086725, "grad_norm": 0.25083836913108826, "learning_rate": 4.0525e-05, "loss": 2.3929, "step": 379 }, { "epoch": 0.042420627029893285, "grad_norm": 0.2616216242313385, "learning_rate": 4.05e-05, "loss": 2.2566, "step": 380 }, { "epoch": 0.04253226025891932, "grad_norm": 0.2545231580734253, "learning_rate": 4.0475e-05, "loss": 2.3916, "step": 381 }, { "epoch": 0.04264389348794535, "grad_norm": 0.24984431266784668, "learning_rate": 4.045000000000001e-05, "loss": 2.3001, "step": 382 }, { "epoch": 0.04275552671697139, "grad_norm": 0.2487059086561203, "learning_rate": 4.0425e-05, "loss": 2.4398, "step": 383 }, { "epoch": 0.04286715994599743, "grad_norm": 0.2508711516857147, "learning_rate": 4.0400000000000006e-05, "loss": 2.3868, "step": 384 }, { "epoch": 0.04297879317502346, "grad_norm": 0.250318706035614, "learning_rate": 4.0375e-05, "loss": 2.3627, "step": 385 }, { "epoch": 0.043090426404049496, "grad_norm": 0.27126437425613403, "learning_rate": 4.0350000000000005e-05, "loss": 2.3592, "step": 386 }, { "epoch": 0.04320205963307553, "grad_norm": 0.2641567289829254, "learning_rate": 4.0325000000000004e-05, "loss": 2.3391, "step": 387 }, { "epoch": 0.043313692862101565, "grad_norm": 0.24686801433563232, "learning_rate": 4.0300000000000004e-05, "loss": 2.4642, "step": 388 }, { "epoch": 0.0434253260911276, "grad_norm": 0.260929137468338, "learning_rate": 4.0275e-05, "loss": 2.2533, "step": 389 }, { "epoch": 0.04353695932015363, "grad_norm": 0.2568647265434265, "learning_rate": 4.025e-05, "loss": 2.3352, "step": 390 }, { "epoch": 0.04364859254917967, "grad_norm": 0.24345262348651886, "learning_rate": 4.0225e-05, "loss": 2.3717, "step": 391 }, { "epoch": 0.04376022577820571, "grad_norm": 0.24727144837379456, "learning_rate": 4.02e-05, "loss": 2.3759, "step": 392 }, { "epoch": 0.04387185900723174, "grad_norm": 0.24666303396224976, "learning_rate": 4.0175e-05, "loss": 2.3249, "step": 393 }, { "epoch": 0.043983492236257776, "grad_norm": 0.26533442735671997, "learning_rate": 4.015000000000001e-05, "loss": 2.4041, "step": 394 }, { "epoch": 0.04409512546528381, "grad_norm": 0.25406232476234436, "learning_rate": 4.0125e-05, "loss": 2.3457, "step": 395 }, { "epoch": 0.044206758694309845, "grad_norm": 0.250957190990448, "learning_rate": 4.0100000000000006e-05, "loss": 2.3591, "step": 396 }, { "epoch": 0.04431839192333588, "grad_norm": 0.25459203124046326, "learning_rate": 4.0075e-05, "loss": 2.4105, "step": 397 }, { "epoch": 0.04443002515236191, "grad_norm": 0.25286054611206055, "learning_rate": 4.0050000000000004e-05, "loss": 2.3757, "step": 398 }, { "epoch": 0.04454165838138795, "grad_norm": 0.2448861449956894, "learning_rate": 4.0025000000000004e-05, "loss": 2.4299, "step": 399 }, { "epoch": 0.04465329161041399, "grad_norm": 0.24877607822418213, "learning_rate": 4e-05, "loss": 2.4102, "step": 400 }, { "epoch": 0.04476492483944002, "grad_norm": 0.2525503635406494, "learning_rate": 3.9975e-05, "loss": 2.4452, "step": 401 }, { "epoch": 0.044876558068466056, "grad_norm": 0.24728810787200928, "learning_rate": 3.995e-05, "loss": 2.354, "step": 402 }, { "epoch": 0.04498819129749209, "grad_norm": 0.26556339859962463, "learning_rate": 3.9925e-05, "loss": 2.3333, "step": 403 }, { "epoch": 0.045099824526518124, "grad_norm": 0.25951698422431946, "learning_rate": 3.99e-05, "loss": 2.2839, "step": 404 }, { "epoch": 0.04521145775554416, "grad_norm": 0.25703710317611694, "learning_rate": 3.9875e-05, "loss": 2.3381, "step": 405 }, { "epoch": 0.04532309098457019, "grad_norm": 0.2780836820602417, "learning_rate": 3.9850000000000006e-05, "loss": 2.3457, "step": 406 }, { "epoch": 0.04543472421359623, "grad_norm": 0.252136766910553, "learning_rate": 3.9825e-05, "loss": 2.3855, "step": 407 }, { "epoch": 0.04554635744262227, "grad_norm": 0.2834818661212921, "learning_rate": 3.9800000000000005e-05, "loss": 2.4514, "step": 408 }, { "epoch": 0.0456579906716483, "grad_norm": 0.2503218948841095, "learning_rate": 3.9775e-05, "loss": 2.3824, "step": 409 }, { "epoch": 0.045769623900674336, "grad_norm": 0.2580338418483734, "learning_rate": 3.9750000000000004e-05, "loss": 2.3869, "step": 410 }, { "epoch": 0.04588125712970037, "grad_norm": 0.2537950277328491, "learning_rate": 3.9725e-05, "loss": 2.3022, "step": 411 }, { "epoch": 0.045992890358726404, "grad_norm": 0.2560393810272217, "learning_rate": 3.97e-05, "loss": 2.3521, "step": 412 }, { "epoch": 0.04610452358775244, "grad_norm": 0.2616626024246216, "learning_rate": 3.9675e-05, "loss": 2.3337, "step": 413 }, { "epoch": 0.04621615681677847, "grad_norm": 0.3034763038158417, "learning_rate": 3.965e-05, "loss": 2.2924, "step": 414 }, { "epoch": 0.04632779004580451, "grad_norm": 0.2462148219347, "learning_rate": 3.9625e-05, "loss": 2.4056, "step": 415 }, { "epoch": 0.04643942327483054, "grad_norm": 0.2612819969654083, "learning_rate": 3.960000000000001e-05, "loss": 2.2965, "step": 416 }, { "epoch": 0.04655105650385658, "grad_norm": 0.2580772638320923, "learning_rate": 3.9575e-05, "loss": 2.3585, "step": 417 }, { "epoch": 0.046662689732882616, "grad_norm": 0.25281044840812683, "learning_rate": 3.9550000000000006e-05, "loss": 2.4224, "step": 418 }, { "epoch": 0.04677432296190865, "grad_norm": 0.2507364749908447, "learning_rate": 3.9525e-05, "loss": 2.3973, "step": 419 }, { "epoch": 0.046885956190934684, "grad_norm": 0.25551095604896545, "learning_rate": 3.9500000000000005e-05, "loss": 2.3345, "step": 420 }, { "epoch": 0.04699758941996072, "grad_norm": 0.276262104511261, "learning_rate": 3.9475000000000004e-05, "loss": 2.3373, "step": 421 }, { "epoch": 0.04710922264898675, "grad_norm": 0.30092892050743103, "learning_rate": 3.9450000000000003e-05, "loss": 2.3881, "step": 422 }, { "epoch": 0.04722085587801279, "grad_norm": 0.25903210043907166, "learning_rate": 3.9425e-05, "loss": 2.3793, "step": 423 }, { "epoch": 0.04733248910703882, "grad_norm": 0.2587510347366333, "learning_rate": 3.94e-05, "loss": 2.2823, "step": 424 }, { "epoch": 0.04744412233606486, "grad_norm": 0.2502042055130005, "learning_rate": 3.9375e-05, "loss": 2.3279, "step": 425 }, { "epoch": 0.047555755565090896, "grad_norm": 0.43736714124679565, "learning_rate": 3.935e-05, "loss": 2.3215, "step": 426 }, { "epoch": 0.04766738879411693, "grad_norm": 0.2547898590564728, "learning_rate": 3.9325e-05, "loss": 2.2728, "step": 427 }, { "epoch": 0.047779022023142964, "grad_norm": 0.24751374125480652, "learning_rate": 3.9300000000000007e-05, "loss": 2.3808, "step": 428 }, { "epoch": 0.047890655252169, "grad_norm": 0.2624378800392151, "learning_rate": 3.9275e-05, "loss": 2.3256, "step": 429 }, { "epoch": 0.04800228848119503, "grad_norm": 0.4183436632156372, "learning_rate": 3.9250000000000005e-05, "loss": 2.3553, "step": 430 }, { "epoch": 0.048113921710221066, "grad_norm": 0.2535308599472046, "learning_rate": 3.9225e-05, "loss": 2.3025, "step": 431 }, { "epoch": 0.0482255549392471, "grad_norm": 0.3438050448894501, "learning_rate": 3.9200000000000004e-05, "loss": 2.322, "step": 432 }, { "epoch": 0.04833718816827314, "grad_norm": 0.2605397403240204, "learning_rate": 3.9175000000000004e-05, "loss": 2.3356, "step": 433 }, { "epoch": 0.048448821397299176, "grad_norm": 0.24819597601890564, "learning_rate": 3.915e-05, "loss": 2.4338, "step": 434 }, { "epoch": 0.04856045462632521, "grad_norm": 0.24737314879894257, "learning_rate": 3.9125e-05, "loss": 2.3618, "step": 435 }, { "epoch": 0.048672087855351244, "grad_norm": 0.25205180048942566, "learning_rate": 3.91e-05, "loss": 2.3555, "step": 436 }, { "epoch": 0.04878372108437728, "grad_norm": 0.24479413032531738, "learning_rate": 3.9075e-05, "loss": 2.3788, "step": 437 }, { "epoch": 0.04889535431340331, "grad_norm": 0.26318883895874023, "learning_rate": 3.905e-05, "loss": 2.4028, "step": 438 }, { "epoch": 0.049006987542429346, "grad_norm": 0.24493621289730072, "learning_rate": 3.9025e-05, "loss": 2.3734, "step": 439 }, { "epoch": 0.04911862077145538, "grad_norm": 0.2888612151145935, "learning_rate": 3.9000000000000006e-05, "loss": 2.3424, "step": 440 }, { "epoch": 0.04923025400048142, "grad_norm": 0.2574605941772461, "learning_rate": 3.8975e-05, "loss": 2.4289, "step": 441 }, { "epoch": 0.049341887229507456, "grad_norm": 0.329041987657547, "learning_rate": 3.8950000000000005e-05, "loss": 2.2217, "step": 442 }, { "epoch": 0.04945352045853349, "grad_norm": 0.29988110065460205, "learning_rate": 3.8925e-05, "loss": 2.4389, "step": 443 }, { "epoch": 0.049565153687559524, "grad_norm": 0.2377348095178604, "learning_rate": 3.8900000000000004e-05, "loss": 2.4175, "step": 444 }, { "epoch": 0.04967678691658556, "grad_norm": 0.28037315607070923, "learning_rate": 3.8875e-05, "loss": 2.241, "step": 445 }, { "epoch": 0.04978842014561159, "grad_norm": 0.2584727704524994, "learning_rate": 3.885e-05, "loss": 2.3034, "step": 446 }, { "epoch": 0.049900053374637626, "grad_norm": 0.24890342354774475, "learning_rate": 3.8825e-05, "loss": 2.3393, "step": 447 }, { "epoch": 0.05001168660366366, "grad_norm": 0.24033311009407043, "learning_rate": 3.88e-05, "loss": 2.339, "step": 448 }, { "epoch": 0.0501233198326897, "grad_norm": 0.2352409064769745, "learning_rate": 3.8775e-05, "loss": 2.337, "step": 449 }, { "epoch": 0.050234953061715736, "grad_norm": 0.8847767114639282, "learning_rate": 3.875e-05, "loss": 2.2669, "step": 450 }, { "epoch": 0.05034658629074177, "grad_norm": 0.2545109987258911, "learning_rate": 3.8725e-05, "loss": 2.3277, "step": 451 }, { "epoch": 0.050458219519767804, "grad_norm": 0.2856300175189972, "learning_rate": 3.8700000000000006e-05, "loss": 2.3928, "step": 452 }, { "epoch": 0.05056985274879384, "grad_norm": 0.24699796736240387, "learning_rate": 3.8675e-05, "loss": 2.4304, "step": 453 }, { "epoch": 0.05068148597781987, "grad_norm": 0.2530304193496704, "learning_rate": 3.8650000000000004e-05, "loss": 2.3304, "step": 454 }, { "epoch": 0.050793119206845906, "grad_norm": 0.25681325793266296, "learning_rate": 3.8625e-05, "loss": 2.4092, "step": 455 }, { "epoch": 0.05090475243587194, "grad_norm": 0.2615368962287903, "learning_rate": 3.86e-05, "loss": 2.3721, "step": 456 }, { "epoch": 0.051016385664897974, "grad_norm": 0.2456715703010559, "learning_rate": 3.8575e-05, "loss": 2.3512, "step": 457 }, { "epoch": 0.051128018893924015, "grad_norm": 0.26264017820358276, "learning_rate": 3.855e-05, "loss": 2.419, "step": 458 }, { "epoch": 0.05123965212295005, "grad_norm": 0.2601991295814514, "learning_rate": 3.8525e-05, "loss": 2.3389, "step": 459 }, { "epoch": 0.051351285351976084, "grad_norm": 0.24692246317863464, "learning_rate": 3.85e-05, "loss": 2.1237, "step": 460 }, { "epoch": 0.05146291858100212, "grad_norm": 0.25850188732147217, "learning_rate": 3.8475e-05, "loss": 2.3413, "step": 461 }, { "epoch": 0.05157455181002815, "grad_norm": 0.2406020611524582, "learning_rate": 3.845e-05, "loss": 2.36, "step": 462 }, { "epoch": 0.051686185039054186, "grad_norm": 0.25938504934310913, "learning_rate": 3.8425e-05, "loss": 2.3512, "step": 463 }, { "epoch": 0.05179781826808022, "grad_norm": 0.2700086236000061, "learning_rate": 3.8400000000000005e-05, "loss": 2.3883, "step": 464 }, { "epoch": 0.051909451497106254, "grad_norm": 0.24675601720809937, "learning_rate": 3.8375e-05, "loss": 2.3614, "step": 465 }, { "epoch": 0.052021084726132295, "grad_norm": 0.3309410512447357, "learning_rate": 3.8350000000000004e-05, "loss": 2.4193, "step": 466 }, { "epoch": 0.05213271795515833, "grad_norm": 0.24233734607696533, "learning_rate": 3.8324999999999996e-05, "loss": 2.3038, "step": 467 }, { "epoch": 0.052244351184184364, "grad_norm": 0.24930906295776367, "learning_rate": 3.83e-05, "loss": 2.3087, "step": 468 }, { "epoch": 0.0523559844132104, "grad_norm": 0.2506902813911438, "learning_rate": 3.8275e-05, "loss": 2.4125, "step": 469 }, { "epoch": 0.05246761764223643, "grad_norm": 0.4915805757045746, "learning_rate": 3.825e-05, "loss": 2.4222, "step": 470 }, { "epoch": 0.052579250871262466, "grad_norm": 0.24434614181518555, "learning_rate": 3.8225e-05, "loss": 2.4653, "step": 471 }, { "epoch": 0.0526908841002885, "grad_norm": 0.24511779844760895, "learning_rate": 3.82e-05, "loss": 2.3369, "step": 472 }, { "epoch": 0.052802517329314534, "grad_norm": 0.24757201969623566, "learning_rate": 3.8175e-05, "loss": 2.2323, "step": 473 }, { "epoch": 0.052914150558340575, "grad_norm": 0.2527662515640259, "learning_rate": 3.8150000000000006e-05, "loss": 2.4045, "step": 474 }, { "epoch": 0.05302578378736661, "grad_norm": 0.2481798529624939, "learning_rate": 3.8125e-05, "loss": 2.3615, "step": 475 }, { "epoch": 0.053137417016392643, "grad_norm": 0.25394219160079956, "learning_rate": 3.8100000000000005e-05, "loss": 2.3536, "step": 476 }, { "epoch": 0.05324905024541868, "grad_norm": 0.2457951009273529, "learning_rate": 3.8075e-05, "loss": 2.2919, "step": 477 }, { "epoch": 0.05336068347444471, "grad_norm": 0.43115633726119995, "learning_rate": 3.805e-05, "loss": 2.4222, "step": 478 }, { "epoch": 0.053472316703470746, "grad_norm": 0.257254958152771, "learning_rate": 3.8025e-05, "loss": 2.3744, "step": 479 }, { "epoch": 0.05358394993249678, "grad_norm": 0.2672825753688812, "learning_rate": 3.8e-05, "loss": 2.3315, "step": 480 }, { "epoch": 0.053695583161522814, "grad_norm": 0.24192893505096436, "learning_rate": 3.7975e-05, "loss": 2.4601, "step": 481 }, { "epoch": 0.053807216390548855, "grad_norm": 0.24756518006324768, "learning_rate": 3.795e-05, "loss": 2.2872, "step": 482 }, { "epoch": 0.05391884961957489, "grad_norm": 0.25323203206062317, "learning_rate": 3.7925e-05, "loss": 2.4142, "step": 483 }, { "epoch": 0.05403048284860092, "grad_norm": 0.25254499912261963, "learning_rate": 3.79e-05, "loss": 2.3528, "step": 484 }, { "epoch": 0.05414211607762696, "grad_norm": 0.25069230794906616, "learning_rate": 3.7875e-05, "loss": 2.2906, "step": 485 }, { "epoch": 0.05425374930665299, "grad_norm": 0.26814520359039307, "learning_rate": 3.7850000000000005e-05, "loss": 2.3368, "step": 486 }, { "epoch": 0.054365382535679026, "grad_norm": 0.24452055990695953, "learning_rate": 3.7825e-05, "loss": 2.3667, "step": 487 }, { "epoch": 0.05447701576470506, "grad_norm": 0.26742151379585266, "learning_rate": 3.7800000000000004e-05, "loss": 2.323, "step": 488 }, { "epoch": 0.054588648993731094, "grad_norm": 0.25766825675964355, "learning_rate": 3.7775e-05, "loss": 2.3761, "step": 489 }, { "epoch": 0.05470028222275713, "grad_norm": 0.29029610753059387, "learning_rate": 3.775e-05, "loss": 2.3216, "step": 490 }, { "epoch": 0.05481191545178317, "grad_norm": 0.25023818016052246, "learning_rate": 3.7725e-05, "loss": 2.3441, "step": 491 }, { "epoch": 0.0549235486808092, "grad_norm": 0.3373621702194214, "learning_rate": 3.77e-05, "loss": 2.3166, "step": 492 }, { "epoch": 0.05503518190983524, "grad_norm": 0.23989304900169373, "learning_rate": 3.7675e-05, "loss": 2.2795, "step": 493 }, { "epoch": 0.05514681513886127, "grad_norm": 0.3192415237426758, "learning_rate": 3.765e-05, "loss": 2.3273, "step": 494 }, { "epoch": 0.055258448367887306, "grad_norm": 0.24007610976696014, "learning_rate": 3.7625e-05, "loss": 2.413, "step": 495 }, { "epoch": 0.05537008159691334, "grad_norm": 0.36170458793640137, "learning_rate": 3.76e-05, "loss": 2.2216, "step": 496 }, { "epoch": 0.055481714825939374, "grad_norm": 0.24714718759059906, "learning_rate": 3.7575e-05, "loss": 2.3394, "step": 497 }, { "epoch": 0.05559334805496541, "grad_norm": 0.238433837890625, "learning_rate": 3.7550000000000005e-05, "loss": 2.3685, "step": 498 }, { "epoch": 0.05570498128399145, "grad_norm": 0.24975870549678802, "learning_rate": 3.7525e-05, "loss": 2.241, "step": 499 }, { "epoch": 0.05581661451301748, "grad_norm": 0.24853730201721191, "learning_rate": 3.7500000000000003e-05, "loss": 2.4173, "step": 500 }, { "epoch": 0.05592824774204352, "grad_norm": 0.24904385209083557, "learning_rate": 3.7475e-05, "loss": 2.4291, "step": 501 }, { "epoch": 0.05603988097106955, "grad_norm": 0.23549965023994446, "learning_rate": 3.745e-05, "loss": 2.3054, "step": 502 }, { "epoch": 0.056151514200095586, "grad_norm": 0.2464476376771927, "learning_rate": 3.7425e-05, "loss": 2.4886, "step": 503 }, { "epoch": 0.05626314742912162, "grad_norm": 0.25297120213508606, "learning_rate": 3.74e-05, "loss": 2.3384, "step": 504 }, { "epoch": 0.056374780658147654, "grad_norm": 0.2817951440811157, "learning_rate": 3.737500000000001e-05, "loss": 2.3319, "step": 505 }, { "epoch": 0.05648641388717369, "grad_norm": 0.24703608453273773, "learning_rate": 3.735e-05, "loss": 2.4513, "step": 506 }, { "epoch": 0.05659804711619973, "grad_norm": 0.25609731674194336, "learning_rate": 3.7325000000000006e-05, "loss": 2.4268, "step": 507 }, { "epoch": 0.05670968034522576, "grad_norm": 0.2358425110578537, "learning_rate": 3.73e-05, "loss": 2.3388, "step": 508 }, { "epoch": 0.0568213135742518, "grad_norm": 0.2538928687572479, "learning_rate": 3.7275000000000005e-05, "loss": 2.4332, "step": 509 }, { "epoch": 0.05693294680327783, "grad_norm": 0.2499266117811203, "learning_rate": 3.7250000000000004e-05, "loss": 2.3393, "step": 510 }, { "epoch": 0.057044580032303865, "grad_norm": 0.24814729392528534, "learning_rate": 3.7225000000000004e-05, "loss": 2.3558, "step": 511 }, { "epoch": 0.0571562132613299, "grad_norm": 0.24115900695323944, "learning_rate": 3.72e-05, "loss": 2.33, "step": 512 }, { "epoch": 0.057267846490355934, "grad_norm": 0.24013325572013855, "learning_rate": 3.7175e-05, "loss": 2.4107, "step": 513 }, { "epoch": 0.05737947971938197, "grad_norm": 0.24541234970092773, "learning_rate": 3.715e-05, "loss": 2.3866, "step": 514 }, { "epoch": 0.05749111294840801, "grad_norm": 0.23506753146648407, "learning_rate": 3.7125e-05, "loss": 2.3321, "step": 515 }, { "epoch": 0.05760274617743404, "grad_norm": 0.24425718188285828, "learning_rate": 3.71e-05, "loss": 2.4, "step": 516 }, { "epoch": 0.05771437940646008, "grad_norm": 0.2441939115524292, "learning_rate": 3.707500000000001e-05, "loss": 2.3855, "step": 517 }, { "epoch": 0.05782601263548611, "grad_norm": 0.23702003061771393, "learning_rate": 3.705e-05, "loss": 2.3171, "step": 518 }, { "epoch": 0.057937645864512145, "grad_norm": 0.2465049922466278, "learning_rate": 3.7025000000000005e-05, "loss": 2.431, "step": 519 }, { "epoch": 0.05804927909353818, "grad_norm": 0.22516661882400513, "learning_rate": 3.7e-05, "loss": 2.3307, "step": 520 }, { "epoch": 0.058160912322564214, "grad_norm": 0.2372903823852539, "learning_rate": 3.6975000000000004e-05, "loss": 2.2791, "step": 521 }, { "epoch": 0.05827254555159025, "grad_norm": 0.2584116458892822, "learning_rate": 3.6950000000000004e-05, "loss": 2.4296, "step": 522 }, { "epoch": 0.05838417878061628, "grad_norm": 0.24023661017417908, "learning_rate": 3.6925e-05, "loss": 2.4579, "step": 523 }, { "epoch": 0.05849581200964232, "grad_norm": 0.26791271567344666, "learning_rate": 3.69e-05, "loss": 2.3662, "step": 524 }, { "epoch": 0.05860744523866836, "grad_norm": 0.23485226929187775, "learning_rate": 3.6875e-05, "loss": 2.1954, "step": 525 }, { "epoch": 0.05871907846769439, "grad_norm": 0.24312959611415863, "learning_rate": 3.685e-05, "loss": 2.4423, "step": 526 }, { "epoch": 0.058830711696720425, "grad_norm": 0.2558239996433258, "learning_rate": 3.6825e-05, "loss": 2.3661, "step": 527 }, { "epoch": 0.05894234492574646, "grad_norm": 0.26516368985176086, "learning_rate": 3.68e-05, "loss": 2.3181, "step": 528 }, { "epoch": 0.05905397815477249, "grad_norm": 0.24487736821174622, "learning_rate": 3.6775000000000006e-05, "loss": 2.3667, "step": 529 }, { "epoch": 0.05916561138379853, "grad_norm": 0.23648685216903687, "learning_rate": 3.675e-05, "loss": 2.3438, "step": 530 }, { "epoch": 0.05927724461282456, "grad_norm": 0.2509547173976898, "learning_rate": 3.6725000000000005e-05, "loss": 2.3128, "step": 531 }, { "epoch": 0.0593888778418506, "grad_norm": 0.24765925109386444, "learning_rate": 3.6700000000000004e-05, "loss": 2.3556, "step": 532 }, { "epoch": 0.05950051107087664, "grad_norm": 0.23786590993404388, "learning_rate": 3.6675000000000004e-05, "loss": 2.4546, "step": 533 }, { "epoch": 0.05961214429990267, "grad_norm": 0.3275027573108673, "learning_rate": 3.665e-05, "loss": 2.3161, "step": 534 }, { "epoch": 0.059723777528928705, "grad_norm": 0.24953784048557281, "learning_rate": 3.6625e-05, "loss": 2.3206, "step": 535 }, { "epoch": 0.05983541075795474, "grad_norm": 0.24241773784160614, "learning_rate": 3.66e-05, "loss": 2.2838, "step": 536 }, { "epoch": 0.05994704398698077, "grad_norm": 0.23624862730503082, "learning_rate": 3.6575e-05, "loss": 2.3082, "step": 537 }, { "epoch": 0.06005867721600681, "grad_norm": 0.24654389917850494, "learning_rate": 3.655e-05, "loss": 2.3327, "step": 538 }, { "epoch": 0.06017031044503284, "grad_norm": 0.24037936329841614, "learning_rate": 3.652500000000001e-05, "loss": 2.336, "step": 539 }, { "epoch": 0.06028194367405888, "grad_norm": 0.25897306203842163, "learning_rate": 3.65e-05, "loss": 2.3064, "step": 540 }, { "epoch": 0.06039357690308492, "grad_norm": 0.24670930206775665, "learning_rate": 3.6475000000000006e-05, "loss": 2.2893, "step": 541 }, { "epoch": 0.06050521013211095, "grad_norm": 0.23652324080467224, "learning_rate": 3.645e-05, "loss": 2.3516, "step": 542 }, { "epoch": 0.060616843361136985, "grad_norm": 0.23609410226345062, "learning_rate": 3.6425000000000004e-05, "loss": 2.3463, "step": 543 }, { "epoch": 0.06072847659016302, "grad_norm": 0.24355870485305786, "learning_rate": 3.6400000000000004e-05, "loss": 2.4344, "step": 544 }, { "epoch": 0.06084010981918905, "grad_norm": 0.24142801761627197, "learning_rate": 3.6375e-05, "loss": 2.3728, "step": 545 }, { "epoch": 0.06095174304821509, "grad_norm": 0.24296842515468597, "learning_rate": 3.635e-05, "loss": 2.3984, "step": 546 }, { "epoch": 0.06106337627724112, "grad_norm": 0.24469731748104095, "learning_rate": 3.6325e-05, "loss": 2.3478, "step": 547 }, { "epoch": 0.06117500950626716, "grad_norm": 0.25056391954421997, "learning_rate": 3.63e-05, "loss": 2.2224, "step": 548 }, { "epoch": 0.0612866427352932, "grad_norm": 0.23732852935791016, "learning_rate": 3.6275e-05, "loss": 2.4875, "step": 549 }, { "epoch": 0.06139827596431923, "grad_norm": 0.2487863302230835, "learning_rate": 3.625e-05, "loss": 2.27, "step": 550 }, { "epoch": 0.061509909193345265, "grad_norm": 0.3302936851978302, "learning_rate": 3.6225000000000006e-05, "loss": 2.2132, "step": 551 }, { "epoch": 0.0616215424223713, "grad_norm": 0.24820421636104584, "learning_rate": 3.62e-05, "loss": 2.3219, "step": 552 }, { "epoch": 0.06173317565139733, "grad_norm": 0.24405966699123383, "learning_rate": 3.6175000000000005e-05, "loss": 2.3802, "step": 553 }, { "epoch": 0.06184480888042337, "grad_norm": 0.24143864214420319, "learning_rate": 3.615e-05, "loss": 2.3897, "step": 554 }, { "epoch": 0.0619564421094494, "grad_norm": 0.25099068880081177, "learning_rate": 3.6125000000000004e-05, "loss": 2.3671, "step": 555 }, { "epoch": 0.062068075338475436, "grad_norm": 0.2503535747528076, "learning_rate": 3.61e-05, "loss": 2.4688, "step": 556 }, { "epoch": 0.06217970856750148, "grad_norm": 0.2651500999927521, "learning_rate": 3.6075e-05, "loss": 2.2332, "step": 557 }, { "epoch": 0.06229134179652751, "grad_norm": 0.2445705533027649, "learning_rate": 3.605e-05, "loss": 2.4157, "step": 558 }, { "epoch": 0.062402975025553545, "grad_norm": 0.3407208025455475, "learning_rate": 3.6025e-05, "loss": 2.2068, "step": 559 }, { "epoch": 0.06251460825457958, "grad_norm": 0.23573793470859528, "learning_rate": 3.6e-05, "loss": 2.3832, "step": 560 }, { "epoch": 0.06262624148360561, "grad_norm": 0.23974989354610443, "learning_rate": 3.5975e-05, "loss": 2.4782, "step": 561 }, { "epoch": 0.06273787471263165, "grad_norm": 0.24216674268245697, "learning_rate": 3.595e-05, "loss": 2.3656, "step": 562 }, { "epoch": 0.06284950794165768, "grad_norm": 0.23898907005786896, "learning_rate": 3.5925000000000006e-05, "loss": 2.4015, "step": 563 }, { "epoch": 0.06296114117068372, "grad_norm": 0.229814350605011, "learning_rate": 3.59e-05, "loss": 2.2476, "step": 564 }, { "epoch": 0.06307277439970975, "grad_norm": 0.37582337856292725, "learning_rate": 3.5875000000000005e-05, "loss": 2.2959, "step": 565 }, { "epoch": 0.06318440762873578, "grad_norm": 0.23359467089176178, "learning_rate": 3.585e-05, "loss": 2.3982, "step": 566 }, { "epoch": 0.06329604085776182, "grad_norm": 0.23306161165237427, "learning_rate": 3.5825000000000003e-05, "loss": 2.3396, "step": 567 }, { "epoch": 0.06340767408678785, "grad_norm": 0.2388441115617752, "learning_rate": 3.58e-05, "loss": 2.3826, "step": 568 }, { "epoch": 0.0635193073158139, "grad_norm": 0.24095419049263, "learning_rate": 3.5775e-05, "loss": 2.3553, "step": 569 }, { "epoch": 0.06363094054483993, "grad_norm": 0.2408561408519745, "learning_rate": 3.575e-05, "loss": 2.417, "step": 570 }, { "epoch": 0.06374257377386597, "grad_norm": 0.23157310485839844, "learning_rate": 3.5725e-05, "loss": 2.3161, "step": 571 }, { "epoch": 0.063854207002892, "grad_norm": 0.23947376012802124, "learning_rate": 3.57e-05, "loss": 2.3105, "step": 572 }, { "epoch": 0.06396584023191804, "grad_norm": 0.24618233740329742, "learning_rate": 3.5675e-05, "loss": 2.4877, "step": 573 }, { "epoch": 0.06407747346094407, "grad_norm": 0.24552257359027863, "learning_rate": 3.565e-05, "loss": 2.3758, "step": 574 }, { "epoch": 0.0641891066899701, "grad_norm": 0.24329356849193573, "learning_rate": 3.5625000000000005e-05, "loss": 2.4084, "step": 575 }, { "epoch": 0.06430073991899614, "grad_norm": 0.24198521673679352, "learning_rate": 3.56e-05, "loss": 2.4133, "step": 576 }, { "epoch": 0.06441237314802217, "grad_norm": 0.23678706586360931, "learning_rate": 3.5575000000000004e-05, "loss": 2.378, "step": 577 }, { "epoch": 0.06452400637704821, "grad_norm": 0.248667374253273, "learning_rate": 3.555e-05, "loss": 2.384, "step": 578 }, { "epoch": 0.06463563960607424, "grad_norm": 0.24898695945739746, "learning_rate": 3.5525e-05, "loss": 2.3269, "step": 579 }, { "epoch": 0.06474727283510028, "grad_norm": 0.240862175822258, "learning_rate": 3.55e-05, "loss": 2.381, "step": 580 }, { "epoch": 0.06485890606412631, "grad_norm": 0.23366478085517883, "learning_rate": 3.5475e-05, "loss": 2.3628, "step": 581 }, { "epoch": 0.06497053929315234, "grad_norm": 0.24220991134643555, "learning_rate": 3.545e-05, "loss": 2.2952, "step": 582 }, { "epoch": 0.06508217252217838, "grad_norm": 0.24667523801326752, "learning_rate": 3.5425e-05, "loss": 2.4119, "step": 583 }, { "epoch": 0.06519380575120441, "grad_norm": 0.24743525683879852, "learning_rate": 3.54e-05, "loss": 2.3073, "step": 584 }, { "epoch": 0.06530543898023046, "grad_norm": 0.908301591873169, "learning_rate": 3.5375e-05, "loss": 2.3342, "step": 585 }, { "epoch": 0.0654170722092565, "grad_norm": 0.2384248673915863, "learning_rate": 3.535e-05, "loss": 2.387, "step": 586 }, { "epoch": 0.06552870543828253, "grad_norm": 0.29484090209007263, "learning_rate": 3.5325000000000005e-05, "loss": 2.4215, "step": 587 }, { "epoch": 0.06564033866730856, "grad_norm": 0.2425820678472519, "learning_rate": 3.53e-05, "loss": 2.4056, "step": 588 }, { "epoch": 0.0657519718963346, "grad_norm": 0.256686270236969, "learning_rate": 3.5275000000000004e-05, "loss": 2.3973, "step": 589 }, { "epoch": 0.06586360512536063, "grad_norm": 0.23785726726055145, "learning_rate": 3.525e-05, "loss": 2.378, "step": 590 }, { "epoch": 0.06597523835438666, "grad_norm": 0.2742824852466583, "learning_rate": 3.5225e-05, "loss": 2.3745, "step": 591 }, { "epoch": 0.0660868715834127, "grad_norm": 0.2250916063785553, "learning_rate": 3.52e-05, "loss": 2.321, "step": 592 }, { "epoch": 0.06619850481243873, "grad_norm": 0.24400900304317474, "learning_rate": 3.5175e-05, "loss": 2.4296, "step": 593 }, { "epoch": 0.06631013804146477, "grad_norm": 0.24581992626190186, "learning_rate": 3.515e-05, "loss": 2.3375, "step": 594 }, { "epoch": 0.0664217712704908, "grad_norm": 0.2356722503900528, "learning_rate": 3.5125e-05, "loss": 2.4229, "step": 595 }, { "epoch": 0.06653340449951683, "grad_norm": 0.24524928629398346, "learning_rate": 3.51e-05, "loss": 2.3744, "step": 596 }, { "epoch": 0.06664503772854287, "grad_norm": 0.24158966541290283, "learning_rate": 3.5075000000000006e-05, "loss": 2.3752, "step": 597 }, { "epoch": 0.0667566709575689, "grad_norm": 0.2346276342868805, "learning_rate": 3.505e-05, "loss": 2.346, "step": 598 }, { "epoch": 0.06686830418659494, "grad_norm": 0.24407444894313812, "learning_rate": 3.5025000000000004e-05, "loss": 2.3559, "step": 599 }, { "epoch": 0.06697993741562097, "grad_norm": 0.24137824773788452, "learning_rate": 3.5e-05, "loss": 2.3965, "step": 600 }, { "epoch": 0.067091570644647, "grad_norm": 0.24912434816360474, "learning_rate": 3.4975e-05, "loss": 2.3882, "step": 601 }, { "epoch": 0.06720320387367305, "grad_norm": 0.24506230652332306, "learning_rate": 3.495e-05, "loss": 2.2875, "step": 602 }, { "epoch": 0.06731483710269909, "grad_norm": 0.2423059046268463, "learning_rate": 3.4925e-05, "loss": 2.2828, "step": 603 }, { "epoch": 0.06742647033172512, "grad_norm": 0.2598559856414795, "learning_rate": 3.49e-05, "loss": 2.288, "step": 604 }, { "epoch": 0.06753810356075116, "grad_norm": 0.2390177696943283, "learning_rate": 3.4875e-05, "loss": 2.4369, "step": 605 }, { "epoch": 0.06764973678977719, "grad_norm": 0.24242550134658813, "learning_rate": 3.485e-05, "loss": 2.2285, "step": 606 }, { "epoch": 0.06776137001880322, "grad_norm": 0.23738925158977509, "learning_rate": 3.4825e-05, "loss": 2.4707, "step": 607 }, { "epoch": 0.06787300324782926, "grad_norm": 0.23820523917675018, "learning_rate": 3.48e-05, "loss": 2.4274, "step": 608 }, { "epoch": 0.06798463647685529, "grad_norm": 0.2343960702419281, "learning_rate": 3.4775000000000005e-05, "loss": 2.2894, "step": 609 }, { "epoch": 0.06809626970588133, "grad_norm": 0.241934671998024, "learning_rate": 3.475e-05, "loss": 2.4438, "step": 610 }, { "epoch": 0.06820790293490736, "grad_norm": 0.24266451597213745, "learning_rate": 3.4725000000000004e-05, "loss": 2.4135, "step": 611 }, { "epoch": 0.0683195361639334, "grad_norm": 0.2379053235054016, "learning_rate": 3.4699999999999996e-05, "loss": 2.2596, "step": 612 }, { "epoch": 0.06843116939295943, "grad_norm": 0.2375876009464264, "learning_rate": 3.4675e-05, "loss": 2.3748, "step": 613 }, { "epoch": 0.06854280262198546, "grad_norm": 0.23114004731178284, "learning_rate": 3.465e-05, "loss": 2.4376, "step": 614 }, { "epoch": 0.0686544358510115, "grad_norm": 0.38774147629737854, "learning_rate": 3.4625e-05, "loss": 2.381, "step": 615 }, { "epoch": 0.06876606908003753, "grad_norm": 0.6831827163696289, "learning_rate": 3.46e-05, "loss": 2.3815, "step": 616 }, { "epoch": 0.06887770230906357, "grad_norm": 0.25311988592147827, "learning_rate": 3.4575e-05, "loss": 2.3525, "step": 617 }, { "epoch": 0.06898933553808961, "grad_norm": 0.2384188175201416, "learning_rate": 3.455e-05, "loss": 2.4084, "step": 618 }, { "epoch": 0.06910096876711565, "grad_norm": 0.2516483962535858, "learning_rate": 3.4525e-05, "loss": 2.3143, "step": 619 }, { "epoch": 0.06921260199614168, "grad_norm": 0.2384801059961319, "learning_rate": 3.45e-05, "loss": 2.3075, "step": 620 }, { "epoch": 0.06932423522516772, "grad_norm": 0.23414179682731628, "learning_rate": 3.4475000000000005e-05, "loss": 2.3035, "step": 621 }, { "epoch": 0.06943586845419375, "grad_norm": 0.25081461668014526, "learning_rate": 3.445e-05, "loss": 2.3292, "step": 622 }, { "epoch": 0.06954750168321978, "grad_norm": 0.22738482058048248, "learning_rate": 3.4425e-05, "loss": 2.3412, "step": 623 }, { "epoch": 0.06965913491224582, "grad_norm": 0.2414780855178833, "learning_rate": 3.4399999999999996e-05, "loss": 2.3674, "step": 624 }, { "epoch": 0.06977076814127185, "grad_norm": 0.25145867466926575, "learning_rate": 3.4375e-05, "loss": 2.3414, "step": 625 }, { "epoch": 0.06988240137029789, "grad_norm": 0.23038849234580994, "learning_rate": 3.435e-05, "loss": 2.3156, "step": 626 }, { "epoch": 0.06999403459932392, "grad_norm": 0.23264938592910767, "learning_rate": 3.4325e-05, "loss": 2.2536, "step": 627 }, { "epoch": 0.07010566782834995, "grad_norm": 0.24372529983520508, "learning_rate": 3.430000000000001e-05, "loss": 2.3152, "step": 628 }, { "epoch": 0.07021730105737599, "grad_norm": 0.23900210857391357, "learning_rate": 3.4275e-05, "loss": 2.343, "step": 629 }, { "epoch": 0.07032893428640202, "grad_norm": 0.236933633685112, "learning_rate": 3.4250000000000006e-05, "loss": 2.4373, "step": 630 }, { "epoch": 0.07044056751542806, "grad_norm": 0.23982471227645874, "learning_rate": 3.4225e-05, "loss": 2.3498, "step": 631 }, { "epoch": 0.07055220074445409, "grad_norm": 0.2617717385292053, "learning_rate": 3.4200000000000005e-05, "loss": 2.2457, "step": 632 }, { "epoch": 0.07066383397348013, "grad_norm": 0.24056993424892426, "learning_rate": 3.4175000000000004e-05, "loss": 2.3543, "step": 633 }, { "epoch": 0.07077546720250616, "grad_norm": 0.2435135543346405, "learning_rate": 3.415e-05, "loss": 2.415, "step": 634 }, { "epoch": 0.07088710043153221, "grad_norm": 0.2404133677482605, "learning_rate": 3.4125e-05, "loss": 2.3479, "step": 635 }, { "epoch": 0.07099873366055824, "grad_norm": 0.22524316608905792, "learning_rate": 3.41e-05, "loss": 2.3252, "step": 636 }, { "epoch": 0.07111036688958428, "grad_norm": 0.23215116560459137, "learning_rate": 3.4075e-05, "loss": 2.3494, "step": 637 }, { "epoch": 0.07122200011861031, "grad_norm": 0.24920037388801575, "learning_rate": 3.405e-05, "loss": 2.3705, "step": 638 }, { "epoch": 0.07133363334763634, "grad_norm": 0.2321121245622635, "learning_rate": 3.4025e-05, "loss": 2.2816, "step": 639 }, { "epoch": 0.07144526657666238, "grad_norm": 0.2596079111099243, "learning_rate": 3.4000000000000007e-05, "loss": 2.3672, "step": 640 }, { "epoch": 0.07155689980568841, "grad_norm": 0.24075333774089813, "learning_rate": 3.3975e-05, "loss": 2.1981, "step": 641 }, { "epoch": 0.07166853303471445, "grad_norm": 0.24256588518619537, "learning_rate": 3.3950000000000005e-05, "loss": 2.4488, "step": 642 }, { "epoch": 0.07178016626374048, "grad_norm": 0.24132193624973297, "learning_rate": 3.3925e-05, "loss": 2.394, "step": 643 }, { "epoch": 0.07189179949276651, "grad_norm": 0.23571936786174774, "learning_rate": 3.3900000000000004e-05, "loss": 2.3527, "step": 644 }, { "epoch": 0.07200343272179255, "grad_norm": 0.24922168254852295, "learning_rate": 3.3875000000000003e-05, "loss": 2.504, "step": 645 }, { "epoch": 0.07211506595081858, "grad_norm": 0.2349790334701538, "learning_rate": 3.385e-05, "loss": 2.375, "step": 646 }, { "epoch": 0.07222669917984462, "grad_norm": 0.23674091696739197, "learning_rate": 3.3825e-05, "loss": 2.453, "step": 647 }, { "epoch": 0.07233833240887065, "grad_norm": 0.24707044661045074, "learning_rate": 3.38e-05, "loss": 2.3144, "step": 648 }, { "epoch": 0.07244996563789668, "grad_norm": 0.24542857706546783, "learning_rate": 3.3775e-05, "loss": 2.4941, "step": 649 }, { "epoch": 0.07256159886692272, "grad_norm": 0.233075812458992, "learning_rate": 3.375000000000001e-05, "loss": 2.3719, "step": 650 }, { "epoch": 0.07267323209594877, "grad_norm": 0.23478782176971436, "learning_rate": 3.3725e-05, "loss": 2.4255, "step": 651 }, { "epoch": 0.0727848653249748, "grad_norm": 0.2310231477022171, "learning_rate": 3.3700000000000006e-05, "loss": 2.2017, "step": 652 }, { "epoch": 0.07289649855400084, "grad_norm": 0.23209823668003082, "learning_rate": 3.3675e-05, "loss": 2.2443, "step": 653 }, { "epoch": 0.07300813178302687, "grad_norm": 0.2376200407743454, "learning_rate": 3.3650000000000005e-05, "loss": 2.446, "step": 654 }, { "epoch": 0.0731197650120529, "grad_norm": 0.23651504516601562, "learning_rate": 3.3625000000000004e-05, "loss": 2.4459, "step": 655 }, { "epoch": 0.07323139824107894, "grad_norm": 0.25088030099868774, "learning_rate": 3.3600000000000004e-05, "loss": 2.4193, "step": 656 }, { "epoch": 0.07334303147010497, "grad_norm": 0.24074560403823853, "learning_rate": 3.3575e-05, "loss": 2.2942, "step": 657 }, { "epoch": 0.073454664699131, "grad_norm": 0.24596726894378662, "learning_rate": 3.355e-05, "loss": 2.3474, "step": 658 }, { "epoch": 0.07356629792815704, "grad_norm": 0.23518167436122894, "learning_rate": 3.3525e-05, "loss": 2.4345, "step": 659 }, { "epoch": 0.07367793115718307, "grad_norm": 0.23677384853363037, "learning_rate": 3.35e-05, "loss": 2.178, "step": 660 }, { "epoch": 0.07378956438620911, "grad_norm": 0.24126943945884705, "learning_rate": 3.3475e-05, "loss": 2.3258, "step": 661 }, { "epoch": 0.07390119761523514, "grad_norm": 0.24277062714099884, "learning_rate": 3.345000000000001e-05, "loss": 2.3129, "step": 662 }, { "epoch": 0.07401283084426118, "grad_norm": 0.2481059432029724, "learning_rate": 3.3425e-05, "loss": 2.3619, "step": 663 }, { "epoch": 0.07412446407328721, "grad_norm": 0.23601798713207245, "learning_rate": 3.3400000000000005e-05, "loss": 2.346, "step": 664 }, { "epoch": 0.07423609730231324, "grad_norm": 0.23907043039798737, "learning_rate": 3.3375e-05, "loss": 2.333, "step": 665 }, { "epoch": 0.07434773053133928, "grad_norm": 0.2404210865497589, "learning_rate": 3.3350000000000004e-05, "loss": 2.3695, "step": 666 }, { "epoch": 0.07445936376036531, "grad_norm": 0.23311296105384827, "learning_rate": 3.3325000000000004e-05, "loss": 2.2827, "step": 667 }, { "epoch": 0.07457099698939136, "grad_norm": 0.24998392164707184, "learning_rate": 3.33e-05, "loss": 2.2346, "step": 668 }, { "epoch": 0.0746826302184174, "grad_norm": 0.23921620845794678, "learning_rate": 3.3275e-05, "loss": 2.4188, "step": 669 }, { "epoch": 0.07479426344744343, "grad_norm": 0.23314505815505981, "learning_rate": 3.325e-05, "loss": 2.3997, "step": 670 }, { "epoch": 0.07490589667646946, "grad_norm": 0.2334446907043457, "learning_rate": 3.3225e-05, "loss": 2.397, "step": 671 }, { "epoch": 0.0750175299054955, "grad_norm": 0.23645921051502228, "learning_rate": 3.32e-05, "loss": 2.4039, "step": 672 }, { "epoch": 0.07512916313452153, "grad_norm": 0.2596314251422882, "learning_rate": 3.3175e-05, "loss": 2.4392, "step": 673 }, { "epoch": 0.07524079636354757, "grad_norm": 0.24032291769981384, "learning_rate": 3.3150000000000006e-05, "loss": 2.3858, "step": 674 }, { "epoch": 0.0753524295925736, "grad_norm": 0.2412300407886505, "learning_rate": 3.3125e-05, "loss": 2.278, "step": 675 }, { "epoch": 0.07546406282159963, "grad_norm": 0.225362628698349, "learning_rate": 3.3100000000000005e-05, "loss": 2.4296, "step": 676 }, { "epoch": 0.07557569605062567, "grad_norm": 0.24731768667697906, "learning_rate": 3.3075e-05, "loss": 2.5028, "step": 677 }, { "epoch": 0.0756873292796517, "grad_norm": 0.24850989878177643, "learning_rate": 3.3050000000000004e-05, "loss": 2.4462, "step": 678 }, { "epoch": 0.07579896250867774, "grad_norm": 0.23657028377056122, "learning_rate": 3.3025e-05, "loss": 2.3677, "step": 679 }, { "epoch": 0.07591059573770377, "grad_norm": 0.23830410838127136, "learning_rate": 3.3e-05, "loss": 2.3859, "step": 680 }, { "epoch": 0.0760222289667298, "grad_norm": 0.23412565886974335, "learning_rate": 3.2975e-05, "loss": 2.3559, "step": 681 }, { "epoch": 0.07613386219575584, "grad_norm": 0.2358182668685913, "learning_rate": 3.295e-05, "loss": 2.3619, "step": 682 }, { "epoch": 0.07624549542478187, "grad_norm": 0.24088707566261292, "learning_rate": 3.2925e-05, "loss": 2.2953, "step": 683 }, { "epoch": 0.07635712865380792, "grad_norm": 0.23123978078365326, "learning_rate": 3.29e-05, "loss": 2.33, "step": 684 }, { "epoch": 0.07646876188283395, "grad_norm": 0.24349573254585266, "learning_rate": 3.2875e-05, "loss": 2.3789, "step": 685 }, { "epoch": 0.07658039511185999, "grad_norm": 0.23393647372722626, "learning_rate": 3.2850000000000006e-05, "loss": 2.4458, "step": 686 }, { "epoch": 0.07669202834088602, "grad_norm": 0.2344076782464981, "learning_rate": 3.2825e-05, "loss": 2.2232, "step": 687 }, { "epoch": 0.07680366156991206, "grad_norm": 0.23426808416843414, "learning_rate": 3.2800000000000004e-05, "loss": 2.55, "step": 688 }, { "epoch": 0.07691529479893809, "grad_norm": 0.24055257439613342, "learning_rate": 3.2775e-05, "loss": 2.4083, "step": 689 }, { "epoch": 0.07702692802796413, "grad_norm": 0.23331885039806366, "learning_rate": 3.275e-05, "loss": 2.3025, "step": 690 }, { "epoch": 0.07713856125699016, "grad_norm": 0.24212725460529327, "learning_rate": 3.2725e-05, "loss": 2.3205, "step": 691 }, { "epoch": 0.0772501944860162, "grad_norm": 0.32642829418182373, "learning_rate": 3.27e-05, "loss": 2.2472, "step": 692 }, { "epoch": 0.07736182771504223, "grad_norm": 0.30700212717056274, "learning_rate": 3.2675e-05, "loss": 2.3868, "step": 693 }, { "epoch": 0.07747346094406826, "grad_norm": 0.23343181610107422, "learning_rate": 3.265e-05, "loss": 2.345, "step": 694 }, { "epoch": 0.0775850941730943, "grad_norm": 0.23056460916996002, "learning_rate": 3.2625e-05, "loss": 2.3514, "step": 695 }, { "epoch": 0.07769672740212033, "grad_norm": 0.24498891830444336, "learning_rate": 3.26e-05, "loss": 2.4623, "step": 696 }, { "epoch": 0.07780836063114636, "grad_norm": 0.2408551424741745, "learning_rate": 3.2575e-05, "loss": 2.3371, "step": 697 }, { "epoch": 0.0779199938601724, "grad_norm": 0.24386224150657654, "learning_rate": 3.2550000000000005e-05, "loss": 2.4246, "step": 698 }, { "epoch": 0.07803162708919843, "grad_norm": 0.23228052258491516, "learning_rate": 3.2525e-05, "loss": 2.2636, "step": 699 }, { "epoch": 0.07814326031822447, "grad_norm": 0.24233461916446686, "learning_rate": 3.2500000000000004e-05, "loss": 2.4194, "step": 700 }, { "epoch": 0.07825489354725051, "grad_norm": 0.23138944804668427, "learning_rate": 3.2474999999999997e-05, "loss": 2.3944, "step": 701 }, { "epoch": 0.07836652677627655, "grad_norm": 0.23382620513439178, "learning_rate": 3.245e-05, "loss": 2.2998, "step": 702 }, { "epoch": 0.07847816000530258, "grad_norm": 0.2373252511024475, "learning_rate": 3.2425e-05, "loss": 2.3178, "step": 703 }, { "epoch": 0.07858979323432862, "grad_norm": 0.22690844535827637, "learning_rate": 3.24e-05, "loss": 2.4387, "step": 704 }, { "epoch": 0.07870142646335465, "grad_norm": 0.22850820422172546, "learning_rate": 3.2375e-05, "loss": 2.3459, "step": 705 }, { "epoch": 0.07881305969238069, "grad_norm": 0.23736946284770966, "learning_rate": 3.235e-05, "loss": 2.2847, "step": 706 }, { "epoch": 0.07892469292140672, "grad_norm": 0.241333469748497, "learning_rate": 3.2325e-05, "loss": 2.3021, "step": 707 }, { "epoch": 0.07903632615043275, "grad_norm": 0.23721295595169067, "learning_rate": 3.2300000000000006e-05, "loss": 2.3129, "step": 708 }, { "epoch": 0.07914795937945879, "grad_norm": 0.2383483350276947, "learning_rate": 3.2275e-05, "loss": 2.3561, "step": 709 }, { "epoch": 0.07925959260848482, "grad_norm": 0.24383005499839783, "learning_rate": 3.2250000000000005e-05, "loss": 2.4278, "step": 710 }, { "epoch": 0.07937122583751086, "grad_norm": 0.24906353652477264, "learning_rate": 3.2225e-05, "loss": 2.3474, "step": 711 }, { "epoch": 0.07948285906653689, "grad_norm": 0.2293633371591568, "learning_rate": 3.2200000000000003e-05, "loss": 2.3338, "step": 712 }, { "epoch": 0.07959449229556292, "grad_norm": 0.23942622542381287, "learning_rate": 3.2175e-05, "loss": 2.3455, "step": 713 }, { "epoch": 0.07970612552458896, "grad_norm": 0.23788371682167053, "learning_rate": 3.215e-05, "loss": 2.3026, "step": 714 }, { "epoch": 0.07981775875361499, "grad_norm": 0.2387734055519104, "learning_rate": 3.2125e-05, "loss": 2.316, "step": 715 }, { "epoch": 0.07992939198264103, "grad_norm": 0.242325097322464, "learning_rate": 3.21e-05, "loss": 2.2869, "step": 716 }, { "epoch": 0.08004102521166707, "grad_norm": 0.23410271108150482, "learning_rate": 3.2075e-05, "loss": 2.3509, "step": 717 }, { "epoch": 0.08015265844069311, "grad_norm": 0.23277893662452698, "learning_rate": 3.205e-05, "loss": 2.3282, "step": 718 }, { "epoch": 0.08026429166971914, "grad_norm": 1.3853453397750854, "learning_rate": 3.2025e-05, "loss": 2.3211, "step": 719 }, { "epoch": 0.08037592489874518, "grad_norm": 0.23764613270759583, "learning_rate": 3.2000000000000005e-05, "loss": 2.4642, "step": 720 }, { "epoch": 0.08048755812777121, "grad_norm": 0.2404954433441162, "learning_rate": 3.1975e-05, "loss": 2.3829, "step": 721 }, { "epoch": 0.08059919135679725, "grad_norm": 0.2323405146598816, "learning_rate": 3.1950000000000004e-05, "loss": 2.4254, "step": 722 }, { "epoch": 0.08071082458582328, "grad_norm": 0.24208155274391174, "learning_rate": 3.1925e-05, "loss": 2.3304, "step": 723 }, { "epoch": 0.08082245781484931, "grad_norm": 0.2440410554409027, "learning_rate": 3.19e-05, "loss": 2.4517, "step": 724 }, { "epoch": 0.08093409104387535, "grad_norm": 0.23571601510047913, "learning_rate": 3.1875e-05, "loss": 2.3367, "step": 725 }, { "epoch": 0.08104572427290138, "grad_norm": 0.23201195895671844, "learning_rate": 3.185e-05, "loss": 2.3973, "step": 726 }, { "epoch": 0.08115735750192742, "grad_norm": 0.23855586349964142, "learning_rate": 3.1825e-05, "loss": 2.3875, "step": 727 }, { "epoch": 0.08126899073095345, "grad_norm": 0.23818083107471466, "learning_rate": 3.18e-05, "loss": 2.3538, "step": 728 }, { "epoch": 0.08138062395997948, "grad_norm": 0.2586081326007843, "learning_rate": 3.1775e-05, "loss": 2.3232, "step": 729 }, { "epoch": 0.08149225718900552, "grad_norm": 0.23521441221237183, "learning_rate": 3.175e-05, "loss": 2.2694, "step": 730 }, { "epoch": 0.08160389041803155, "grad_norm": 0.4255395829677582, "learning_rate": 3.1725e-05, "loss": 2.3881, "step": 731 }, { "epoch": 0.08171552364705759, "grad_norm": 0.22463633120059967, "learning_rate": 3.1700000000000005e-05, "loss": 2.3078, "step": 732 }, { "epoch": 0.08182715687608362, "grad_norm": 0.2440212070941925, "learning_rate": 3.1675e-05, "loss": 2.3463, "step": 733 }, { "epoch": 0.08193879010510967, "grad_norm": 0.22708596289157867, "learning_rate": 3.1650000000000004e-05, "loss": 2.3567, "step": 734 }, { "epoch": 0.0820504233341357, "grad_norm": 0.26711180806159973, "learning_rate": 3.1624999999999996e-05, "loss": 2.4591, "step": 735 }, { "epoch": 0.08216205656316174, "grad_norm": 0.23102666437625885, "learning_rate": 3.16e-05, "loss": 2.3309, "step": 736 }, { "epoch": 0.08227368979218777, "grad_norm": 0.24265794456005096, "learning_rate": 3.1575e-05, "loss": 2.3203, "step": 737 }, { "epoch": 0.0823853230212138, "grad_norm": 0.23661485314369202, "learning_rate": 3.155e-05, "loss": 2.3478, "step": 738 }, { "epoch": 0.08249695625023984, "grad_norm": 0.23216097056865692, "learning_rate": 3.1525e-05, "loss": 2.4765, "step": 739 }, { "epoch": 0.08260858947926587, "grad_norm": 0.2339819222688675, "learning_rate": 3.15e-05, "loss": 2.2802, "step": 740 }, { "epoch": 0.08272022270829191, "grad_norm": 0.25498029589653015, "learning_rate": 3.1475e-05, "loss": 2.4064, "step": 741 }, { "epoch": 0.08283185593731794, "grad_norm": 0.23578105866909027, "learning_rate": 3.145e-05, "loss": 2.3542, "step": 742 }, { "epoch": 0.08294348916634398, "grad_norm": 0.23970484733581543, "learning_rate": 3.1425e-05, "loss": 2.3676, "step": 743 }, { "epoch": 0.08305512239537001, "grad_norm": 0.2514541447162628, "learning_rate": 3.1400000000000004e-05, "loss": 2.4229, "step": 744 }, { "epoch": 0.08316675562439604, "grad_norm": 0.23850007355213165, "learning_rate": 3.1375e-05, "loss": 2.3794, "step": 745 }, { "epoch": 0.08327838885342208, "grad_norm": 0.30746811628341675, "learning_rate": 3.135e-05, "loss": 2.383, "step": 746 }, { "epoch": 0.08339002208244811, "grad_norm": 0.23188377916812897, "learning_rate": 3.1324999999999996e-05, "loss": 2.336, "step": 747 }, { "epoch": 0.08350165531147415, "grad_norm": 0.23534120619297028, "learning_rate": 3.13e-05, "loss": 2.398, "step": 748 }, { "epoch": 0.08361328854050018, "grad_norm": 0.2442607581615448, "learning_rate": 3.1275e-05, "loss": 2.4357, "step": 749 }, { "epoch": 0.08372492176952623, "grad_norm": 0.2328692376613617, "learning_rate": 3.125e-05, "loss": 2.3239, "step": 750 }, { "epoch": 0.08383655499855226, "grad_norm": 0.29631370306015015, "learning_rate": 3.122500000000001e-05, "loss": 2.3165, "step": 751 }, { "epoch": 0.0839481882275783, "grad_norm": 0.2432224601507187, "learning_rate": 3.12e-05, "loss": 2.372, "step": 752 }, { "epoch": 0.08405982145660433, "grad_norm": 0.2435504049062729, "learning_rate": 3.1175000000000006e-05, "loss": 2.4804, "step": 753 }, { "epoch": 0.08417145468563036, "grad_norm": 0.2799118161201477, "learning_rate": 3.115e-05, "loss": 2.3138, "step": 754 }, { "epoch": 0.0842830879146564, "grad_norm": 0.23647980391979218, "learning_rate": 3.1125000000000004e-05, "loss": 2.4502, "step": 755 }, { "epoch": 0.08439472114368243, "grad_norm": 0.23339681327342987, "learning_rate": 3.1100000000000004e-05, "loss": 2.3515, "step": 756 }, { "epoch": 0.08450635437270847, "grad_norm": 0.2358621060848236, "learning_rate": 3.1075e-05, "loss": 2.3384, "step": 757 }, { "epoch": 0.0846179876017345, "grad_norm": 0.226596862077713, "learning_rate": 3.105e-05, "loss": 2.4022, "step": 758 }, { "epoch": 0.08472962083076054, "grad_norm": 0.24496255815029144, "learning_rate": 3.1025e-05, "loss": 2.4273, "step": 759 }, { "epoch": 0.08484125405978657, "grad_norm": 0.24479149281978607, "learning_rate": 3.1e-05, "loss": 2.3464, "step": 760 }, { "epoch": 0.0849528872888126, "grad_norm": 0.23078912496566772, "learning_rate": 3.0975e-05, "loss": 2.3267, "step": 761 }, { "epoch": 0.08506452051783864, "grad_norm": 0.23886540532112122, "learning_rate": 3.095e-05, "loss": 2.243, "step": 762 }, { "epoch": 0.08517615374686467, "grad_norm": 0.2510230839252472, "learning_rate": 3.0925000000000006e-05, "loss": 2.3675, "step": 763 }, { "epoch": 0.0852877869758907, "grad_norm": 0.23274515569210052, "learning_rate": 3.09e-05, "loss": 2.3207, "step": 764 }, { "epoch": 0.08539942020491674, "grad_norm": 0.26892465353012085, "learning_rate": 3.0875000000000005e-05, "loss": 2.4453, "step": 765 }, { "epoch": 0.08551105343394277, "grad_norm": 0.24123644828796387, "learning_rate": 3.0850000000000004e-05, "loss": 2.287, "step": 766 }, { "epoch": 0.08562268666296882, "grad_norm": 0.2416788935661316, "learning_rate": 3.0825000000000004e-05, "loss": 2.301, "step": 767 }, { "epoch": 0.08573431989199486, "grad_norm": 0.23173028230667114, "learning_rate": 3.08e-05, "loss": 2.2896, "step": 768 }, { "epoch": 0.08584595312102089, "grad_norm": 0.2286095917224884, "learning_rate": 3.0775e-05, "loss": 2.341, "step": 769 }, { "epoch": 0.08595758635004692, "grad_norm": 0.24451370537281036, "learning_rate": 3.075e-05, "loss": 2.196, "step": 770 }, { "epoch": 0.08606921957907296, "grad_norm": 0.26589682698249817, "learning_rate": 3.0725e-05, "loss": 2.2902, "step": 771 }, { "epoch": 0.08618085280809899, "grad_norm": 0.26289981603622437, "learning_rate": 3.07e-05, "loss": 2.505, "step": 772 }, { "epoch": 0.08629248603712503, "grad_norm": 0.2402532994747162, "learning_rate": 3.067500000000001e-05, "loss": 2.3008, "step": 773 }, { "epoch": 0.08640411926615106, "grad_norm": 0.23948536813259125, "learning_rate": 3.065e-05, "loss": 2.4653, "step": 774 }, { "epoch": 0.0865157524951771, "grad_norm": 0.2507634162902832, "learning_rate": 3.0625000000000006e-05, "loss": 2.2703, "step": 775 }, { "epoch": 0.08662738572420313, "grad_norm": 0.24213804304599762, "learning_rate": 3.06e-05, "loss": 2.4244, "step": 776 }, { "epoch": 0.08673901895322916, "grad_norm": 0.24074698984622955, "learning_rate": 3.0575000000000005e-05, "loss": 2.3714, "step": 777 }, { "epoch": 0.0868506521822552, "grad_norm": 0.2556532323360443, "learning_rate": 3.0550000000000004e-05, "loss": 2.2702, "step": 778 }, { "epoch": 0.08696228541128123, "grad_norm": 0.23637044429779053, "learning_rate": 3.0525e-05, "loss": 2.3382, "step": 779 }, { "epoch": 0.08707391864030727, "grad_norm": 0.2363051176071167, "learning_rate": 3.05e-05, "loss": 2.2893, "step": 780 }, { "epoch": 0.0871855518693333, "grad_norm": 0.23347924649715424, "learning_rate": 3.0475000000000002e-05, "loss": 2.4117, "step": 781 }, { "epoch": 0.08729718509835933, "grad_norm": 0.37233465909957886, "learning_rate": 3.045e-05, "loss": 2.3556, "step": 782 }, { "epoch": 0.08740881832738538, "grad_norm": 0.25742578506469727, "learning_rate": 3.0425000000000004e-05, "loss": 2.4087, "step": 783 }, { "epoch": 0.08752045155641142, "grad_norm": 0.24065518379211426, "learning_rate": 3.04e-05, "loss": 2.2688, "step": 784 }, { "epoch": 0.08763208478543745, "grad_norm": 0.2395828366279602, "learning_rate": 3.0375000000000003e-05, "loss": 2.3376, "step": 785 }, { "epoch": 0.08774371801446348, "grad_norm": 0.30660635232925415, "learning_rate": 3.035e-05, "loss": 2.3353, "step": 786 }, { "epoch": 0.08785535124348952, "grad_norm": 0.23347756266593933, "learning_rate": 3.0325000000000002e-05, "loss": 2.2991, "step": 787 }, { "epoch": 0.08796698447251555, "grad_norm": 0.22873973846435547, "learning_rate": 3.03e-05, "loss": 2.3047, "step": 788 }, { "epoch": 0.08807861770154159, "grad_norm": 0.23164629936218262, "learning_rate": 3.0275000000000004e-05, "loss": 2.3264, "step": 789 }, { "epoch": 0.08819025093056762, "grad_norm": 0.2329261600971222, "learning_rate": 3.025e-05, "loss": 2.2868, "step": 790 }, { "epoch": 0.08830188415959365, "grad_norm": 0.22929538786411285, "learning_rate": 3.0225000000000003e-05, "loss": 2.3671, "step": 791 }, { "epoch": 0.08841351738861969, "grad_norm": 0.24005673825740814, "learning_rate": 3.02e-05, "loss": 2.4117, "step": 792 }, { "epoch": 0.08852515061764572, "grad_norm": 0.22921884059906006, "learning_rate": 3.0175e-05, "loss": 2.4082, "step": 793 }, { "epoch": 0.08863678384667176, "grad_norm": 0.2490616887807846, "learning_rate": 3.015e-05, "loss": 2.3136, "step": 794 }, { "epoch": 0.08874841707569779, "grad_norm": 0.24170029163360596, "learning_rate": 3.0125000000000004e-05, "loss": 2.3903, "step": 795 }, { "epoch": 0.08886005030472383, "grad_norm": 0.24534232914447784, "learning_rate": 3.01e-05, "loss": 2.3231, "step": 796 }, { "epoch": 0.08897168353374986, "grad_norm": 0.23491844534873962, "learning_rate": 3.0075000000000003e-05, "loss": 2.3657, "step": 797 }, { "epoch": 0.0890833167627759, "grad_norm": 0.23918871581554413, "learning_rate": 3.0050000000000002e-05, "loss": 2.3655, "step": 798 }, { "epoch": 0.08919494999180193, "grad_norm": 0.23670734465122223, "learning_rate": 3.0025000000000005e-05, "loss": 2.3264, "step": 799 }, { "epoch": 0.08930658322082798, "grad_norm": 0.2383037805557251, "learning_rate": 3e-05, "loss": 2.3465, "step": 800 }, { "epoch": 0.08941821644985401, "grad_norm": 0.38153019547462463, "learning_rate": 2.9975000000000004e-05, "loss": 2.4639, "step": 801 }, { "epoch": 0.08952984967888004, "grad_norm": 0.25272589921951294, "learning_rate": 2.995e-05, "loss": 2.253, "step": 802 }, { "epoch": 0.08964148290790608, "grad_norm": 0.24903567135334015, "learning_rate": 2.9925000000000002e-05, "loss": 2.3757, "step": 803 }, { "epoch": 0.08975311613693211, "grad_norm": 0.23128022253513336, "learning_rate": 2.9900000000000002e-05, "loss": 2.287, "step": 804 }, { "epoch": 0.08986474936595815, "grad_norm": 0.23053546249866486, "learning_rate": 2.9875000000000004e-05, "loss": 2.4614, "step": 805 }, { "epoch": 0.08997638259498418, "grad_norm": 0.3717436194419861, "learning_rate": 2.985e-05, "loss": 2.268, "step": 806 }, { "epoch": 0.09008801582401021, "grad_norm": 0.4570637047290802, "learning_rate": 2.9825000000000003e-05, "loss": 2.3794, "step": 807 }, { "epoch": 0.09019964905303625, "grad_norm": 0.2312982678413391, "learning_rate": 2.98e-05, "loss": 2.4141, "step": 808 }, { "epoch": 0.09031128228206228, "grad_norm": 0.2467602640390396, "learning_rate": 2.9775000000000002e-05, "loss": 2.3287, "step": 809 }, { "epoch": 0.09042291551108832, "grad_norm": 0.2299824357032776, "learning_rate": 2.975e-05, "loss": 2.4114, "step": 810 }, { "epoch": 0.09053454874011435, "grad_norm": 3.2160747051239014, "learning_rate": 2.9725000000000004e-05, "loss": 2.4177, "step": 811 }, { "epoch": 0.09064618196914039, "grad_norm": 0.2425236701965332, "learning_rate": 2.97e-05, "loss": 2.2538, "step": 812 }, { "epoch": 0.09075781519816642, "grad_norm": 0.2397298812866211, "learning_rate": 2.9675000000000003e-05, "loss": 2.4145, "step": 813 }, { "epoch": 0.09086944842719245, "grad_norm": 0.23437006771564484, "learning_rate": 2.965e-05, "loss": 2.2899, "step": 814 }, { "epoch": 0.09098108165621849, "grad_norm": 0.2314252257347107, "learning_rate": 2.9625000000000002e-05, "loss": 2.3063, "step": 815 }, { "epoch": 0.09109271488524454, "grad_norm": 0.24189533293247223, "learning_rate": 2.96e-05, "loss": 2.3931, "step": 816 }, { "epoch": 0.09120434811427057, "grad_norm": 0.2523435950279236, "learning_rate": 2.9575000000000004e-05, "loss": 2.337, "step": 817 }, { "epoch": 0.0913159813432966, "grad_norm": 0.26922857761383057, "learning_rate": 2.955e-05, "loss": 2.3536, "step": 818 }, { "epoch": 0.09142761457232264, "grad_norm": 0.24634380638599396, "learning_rate": 2.9525000000000003e-05, "loss": 2.4549, "step": 819 }, { "epoch": 0.09153924780134867, "grad_norm": 0.25171083211898804, "learning_rate": 2.95e-05, "loss": 2.2972, "step": 820 }, { "epoch": 0.0916508810303747, "grad_norm": 0.6297338604927063, "learning_rate": 2.9475e-05, "loss": 2.3931, "step": 821 }, { "epoch": 0.09176251425940074, "grad_norm": 0.2340405136346817, "learning_rate": 2.945e-05, "loss": 2.3981, "step": 822 }, { "epoch": 0.09187414748842677, "grad_norm": 0.2873988449573517, "learning_rate": 2.9425000000000004e-05, "loss": 2.3315, "step": 823 }, { "epoch": 0.09198578071745281, "grad_norm": 0.25433048605918884, "learning_rate": 2.94e-05, "loss": 2.2073, "step": 824 }, { "epoch": 0.09209741394647884, "grad_norm": 0.23423077166080475, "learning_rate": 2.9375000000000003e-05, "loss": 2.3346, "step": 825 }, { "epoch": 0.09220904717550488, "grad_norm": 0.2264171838760376, "learning_rate": 2.935e-05, "loss": 2.4019, "step": 826 }, { "epoch": 0.09232068040453091, "grad_norm": 0.27230480313301086, "learning_rate": 2.9325e-05, "loss": 2.3569, "step": 827 }, { "epoch": 0.09243231363355695, "grad_norm": 0.23226654529571533, "learning_rate": 2.93e-05, "loss": 2.4407, "step": 828 }, { "epoch": 0.09254394686258298, "grad_norm": 0.23659475147724152, "learning_rate": 2.9275000000000003e-05, "loss": 2.3602, "step": 829 }, { "epoch": 0.09265558009160901, "grad_norm": 0.2345886379480362, "learning_rate": 2.925e-05, "loss": 2.3828, "step": 830 }, { "epoch": 0.09276721332063505, "grad_norm": 0.23473899066448212, "learning_rate": 2.9225000000000002e-05, "loss": 2.2976, "step": 831 }, { "epoch": 0.09287884654966108, "grad_norm": 0.23485642671585083, "learning_rate": 2.9199999999999998e-05, "loss": 2.3747, "step": 832 }, { "epoch": 0.09299047977868713, "grad_norm": 0.24248026311397552, "learning_rate": 2.9175e-05, "loss": 2.3549, "step": 833 }, { "epoch": 0.09310211300771316, "grad_norm": 0.23264911770820618, "learning_rate": 2.915e-05, "loss": 2.4257, "step": 834 }, { "epoch": 0.0932137462367392, "grad_norm": 0.25659480690956116, "learning_rate": 2.9125000000000003e-05, "loss": 2.229, "step": 835 }, { "epoch": 0.09332537946576523, "grad_norm": 0.23583489656448364, "learning_rate": 2.91e-05, "loss": 2.3548, "step": 836 }, { "epoch": 0.09343701269479127, "grad_norm": 0.23372192680835724, "learning_rate": 2.9075000000000002e-05, "loss": 2.2546, "step": 837 }, { "epoch": 0.0935486459238173, "grad_norm": 0.2368670254945755, "learning_rate": 2.9049999999999998e-05, "loss": 2.2415, "step": 838 }, { "epoch": 0.09366027915284333, "grad_norm": 0.24544133245944977, "learning_rate": 2.9025e-05, "loss": 2.3511, "step": 839 }, { "epoch": 0.09377191238186937, "grad_norm": 0.2858797013759613, "learning_rate": 2.9e-05, "loss": 2.4491, "step": 840 }, { "epoch": 0.0938835456108954, "grad_norm": 0.24954313039779663, "learning_rate": 2.8975000000000003e-05, "loss": 2.5859, "step": 841 }, { "epoch": 0.09399517883992144, "grad_norm": 0.23716183006763458, "learning_rate": 2.895e-05, "loss": 2.2792, "step": 842 }, { "epoch": 0.09410681206894747, "grad_norm": 0.23809632658958435, "learning_rate": 2.8925000000000002e-05, "loss": 2.2958, "step": 843 }, { "epoch": 0.0942184452979735, "grad_norm": 0.23545247316360474, "learning_rate": 2.8899999999999998e-05, "loss": 2.3113, "step": 844 }, { "epoch": 0.09433007852699954, "grad_norm": 0.2312520146369934, "learning_rate": 2.8875e-05, "loss": 2.2242, "step": 845 }, { "epoch": 0.09444171175602557, "grad_norm": 0.23694823682308197, "learning_rate": 2.885e-05, "loss": 2.3629, "step": 846 }, { "epoch": 0.09455334498505161, "grad_norm": 0.22986574470996857, "learning_rate": 2.8825000000000003e-05, "loss": 2.3428, "step": 847 }, { "epoch": 0.09466497821407764, "grad_norm": 0.2305915206670761, "learning_rate": 2.88e-05, "loss": 2.291, "step": 848 }, { "epoch": 0.09477661144310369, "grad_norm": 0.23731227219104767, "learning_rate": 2.8775e-05, "loss": 2.3141, "step": 849 }, { "epoch": 0.09488824467212972, "grad_norm": 0.24682384729385376, "learning_rate": 2.8749999999999997e-05, "loss": 2.2654, "step": 850 }, { "epoch": 0.09499987790115576, "grad_norm": 0.232358917593956, "learning_rate": 2.8725e-05, "loss": 2.3816, "step": 851 }, { "epoch": 0.09511151113018179, "grad_norm": 0.23460093140602112, "learning_rate": 2.87e-05, "loss": 2.4469, "step": 852 }, { "epoch": 0.09522314435920783, "grad_norm": 0.28891122341156006, "learning_rate": 2.8675000000000002e-05, "loss": 2.3192, "step": 853 }, { "epoch": 0.09533477758823386, "grad_norm": 0.22858241200447083, "learning_rate": 2.865e-05, "loss": 2.3903, "step": 854 }, { "epoch": 0.0954464108172599, "grad_norm": 0.2979806065559387, "learning_rate": 2.8625e-05, "loss": 2.3428, "step": 855 }, { "epoch": 0.09555804404628593, "grad_norm": 0.22742506861686707, "learning_rate": 2.86e-05, "loss": 2.3086, "step": 856 }, { "epoch": 0.09566967727531196, "grad_norm": 0.2474931925535202, "learning_rate": 2.8575000000000003e-05, "loss": 2.2446, "step": 857 }, { "epoch": 0.095781310504338, "grad_norm": 0.23535043001174927, "learning_rate": 2.855e-05, "loss": 2.3, "step": 858 }, { "epoch": 0.09589294373336403, "grad_norm": 0.2337024062871933, "learning_rate": 2.8525000000000002e-05, "loss": 2.3222, "step": 859 }, { "epoch": 0.09600457696239006, "grad_norm": 0.23335273563861847, "learning_rate": 2.8499999999999998e-05, "loss": 2.3336, "step": 860 }, { "epoch": 0.0961162101914161, "grad_norm": 0.23024383187294006, "learning_rate": 2.8475e-05, "loss": 2.3664, "step": 861 }, { "epoch": 0.09622784342044213, "grad_norm": 0.27251824736595154, "learning_rate": 2.845e-05, "loss": 2.25, "step": 862 }, { "epoch": 0.09633947664946817, "grad_norm": 0.23650218546390533, "learning_rate": 2.8425000000000003e-05, "loss": 2.3429, "step": 863 }, { "epoch": 0.0964511098784942, "grad_norm": 0.23651504516601562, "learning_rate": 2.84e-05, "loss": 2.4, "step": 864 }, { "epoch": 0.09656274310752025, "grad_norm": 0.2359198033809662, "learning_rate": 2.8375000000000002e-05, "loss": 2.3672, "step": 865 }, { "epoch": 0.09667437633654628, "grad_norm": 0.2200855314731598, "learning_rate": 2.8349999999999998e-05, "loss": 2.3336, "step": 866 }, { "epoch": 0.09678600956557232, "grad_norm": 0.23432110249996185, "learning_rate": 2.8325e-05, "loss": 2.3277, "step": 867 }, { "epoch": 0.09689764279459835, "grad_norm": 0.2440565526485443, "learning_rate": 2.83e-05, "loss": 2.3388, "step": 868 }, { "epoch": 0.09700927602362439, "grad_norm": 0.23634879291057587, "learning_rate": 2.8275000000000003e-05, "loss": 2.3551, "step": 869 }, { "epoch": 0.09712090925265042, "grad_norm": 0.23254720866680145, "learning_rate": 2.825e-05, "loss": 2.4071, "step": 870 }, { "epoch": 0.09723254248167645, "grad_norm": 0.25411373376846313, "learning_rate": 2.8225e-05, "loss": 2.3591, "step": 871 }, { "epoch": 0.09734417571070249, "grad_norm": 0.23345574736595154, "learning_rate": 2.8199999999999998e-05, "loss": 2.2857, "step": 872 }, { "epoch": 0.09745580893972852, "grad_norm": 0.23449824750423431, "learning_rate": 2.8175e-05, "loss": 2.297, "step": 873 }, { "epoch": 0.09756744216875456, "grad_norm": 0.23664528131484985, "learning_rate": 2.815e-05, "loss": 2.4227, "step": 874 }, { "epoch": 0.09767907539778059, "grad_norm": 0.22787226736545563, "learning_rate": 2.8125000000000003e-05, "loss": 2.4532, "step": 875 }, { "epoch": 0.09779070862680662, "grad_norm": 0.22756721079349518, "learning_rate": 2.8100000000000005e-05, "loss": 2.2942, "step": 876 }, { "epoch": 0.09790234185583266, "grad_norm": 0.23187048733234406, "learning_rate": 2.8075e-05, "loss": 2.4004, "step": 877 }, { "epoch": 0.09801397508485869, "grad_norm": 0.23409034311771393, "learning_rate": 2.8050000000000004e-05, "loss": 2.3854, "step": 878 }, { "epoch": 0.09812560831388473, "grad_norm": 0.23317299783229828, "learning_rate": 2.8025e-05, "loss": 2.365, "step": 879 }, { "epoch": 0.09823724154291076, "grad_norm": 0.4648565948009491, "learning_rate": 2.8000000000000003e-05, "loss": 2.3482, "step": 880 }, { "epoch": 0.0983488747719368, "grad_norm": 0.23336485028266907, "learning_rate": 2.7975000000000002e-05, "loss": 2.454, "step": 881 }, { "epoch": 0.09846050800096284, "grad_norm": 0.22770002484321594, "learning_rate": 2.7950000000000005e-05, "loss": 2.2243, "step": 882 }, { "epoch": 0.09857214122998888, "grad_norm": 0.240431547164917, "learning_rate": 2.7925e-05, "loss": 2.3568, "step": 883 }, { "epoch": 0.09868377445901491, "grad_norm": 0.23338884115219116, "learning_rate": 2.7900000000000004e-05, "loss": 2.3459, "step": 884 }, { "epoch": 0.09879540768804095, "grad_norm": 0.23423053324222565, "learning_rate": 2.7875e-05, "loss": 2.4424, "step": 885 }, { "epoch": 0.09890704091706698, "grad_norm": 0.2247275859117508, "learning_rate": 2.7850000000000003e-05, "loss": 2.4401, "step": 886 }, { "epoch": 0.09901867414609301, "grad_norm": 0.23924924433231354, "learning_rate": 2.7825000000000002e-05, "loss": 2.3376, "step": 887 }, { "epoch": 0.09913030737511905, "grad_norm": 0.23396386206150055, "learning_rate": 2.7800000000000005e-05, "loss": 2.2607, "step": 888 }, { "epoch": 0.09924194060414508, "grad_norm": 0.23403829336166382, "learning_rate": 2.7775e-05, "loss": 2.2647, "step": 889 }, { "epoch": 0.09935357383317112, "grad_norm": 0.250621497631073, "learning_rate": 2.7750000000000004e-05, "loss": 2.3554, "step": 890 }, { "epoch": 0.09946520706219715, "grad_norm": 0.24255424737930298, "learning_rate": 2.7725e-05, "loss": 2.3057, "step": 891 }, { "epoch": 0.09957684029122318, "grad_norm": 0.23455750942230225, "learning_rate": 2.7700000000000002e-05, "loss": 2.405, "step": 892 }, { "epoch": 0.09968847352024922, "grad_norm": 0.24191993474960327, "learning_rate": 2.7675000000000002e-05, "loss": 2.3389, "step": 893 }, { "epoch": 0.09980010674927525, "grad_norm": 0.23159699141979218, "learning_rate": 2.7650000000000005e-05, "loss": 2.4599, "step": 894 }, { "epoch": 0.09991173997830129, "grad_norm": 0.2873140871524811, "learning_rate": 2.7625e-05, "loss": 2.2902, "step": 895 }, { "epoch": 0.10002337320732732, "grad_norm": 0.2346268594264984, "learning_rate": 2.7600000000000003e-05, "loss": 2.3954, "step": 896 }, { "epoch": 0.10013500643635335, "grad_norm": 0.2276250720024109, "learning_rate": 2.7575e-05, "loss": 2.3288, "step": 897 }, { "epoch": 0.1002466396653794, "grad_norm": 0.22809089720249176, "learning_rate": 2.7550000000000002e-05, "loss": 2.3797, "step": 898 }, { "epoch": 0.10035827289440544, "grad_norm": 0.2589645981788635, "learning_rate": 2.7525e-05, "loss": 2.235, "step": 899 }, { "epoch": 0.10046990612343147, "grad_norm": 0.26668986678123474, "learning_rate": 2.7500000000000004e-05, "loss": 2.4543, "step": 900 }, { "epoch": 0.1005815393524575, "grad_norm": 0.231545552611351, "learning_rate": 2.7475e-05, "loss": 2.2753, "step": 901 }, { "epoch": 0.10069317258148354, "grad_norm": 0.23608693480491638, "learning_rate": 2.7450000000000003e-05, "loss": 2.4207, "step": 902 }, { "epoch": 0.10080480581050957, "grad_norm": 0.22971975803375244, "learning_rate": 2.7425e-05, "loss": 2.4783, "step": 903 }, { "epoch": 0.10091643903953561, "grad_norm": 0.24497389793395996, "learning_rate": 2.7400000000000002e-05, "loss": 2.3443, "step": 904 }, { "epoch": 0.10102807226856164, "grad_norm": 0.22968074679374695, "learning_rate": 2.7375e-05, "loss": 2.3163, "step": 905 }, { "epoch": 0.10113970549758768, "grad_norm": 0.23204439878463745, "learning_rate": 2.7350000000000004e-05, "loss": 2.3681, "step": 906 }, { "epoch": 0.10125133872661371, "grad_norm": 0.24272161722183228, "learning_rate": 2.7325e-05, "loss": 2.3472, "step": 907 }, { "epoch": 0.10136297195563974, "grad_norm": 0.22961987555027008, "learning_rate": 2.7300000000000003e-05, "loss": 2.3187, "step": 908 }, { "epoch": 0.10147460518466578, "grad_norm": 0.23540601134300232, "learning_rate": 2.7275e-05, "loss": 2.289, "step": 909 }, { "epoch": 0.10158623841369181, "grad_norm": 0.260650634765625, "learning_rate": 2.725e-05, "loss": 2.3203, "step": 910 }, { "epoch": 0.10169787164271785, "grad_norm": 0.24257007241249084, "learning_rate": 2.7225e-05, "loss": 2.3648, "step": 911 }, { "epoch": 0.10180950487174388, "grad_norm": 0.2477046102285385, "learning_rate": 2.7200000000000004e-05, "loss": 2.273, "step": 912 }, { "epoch": 0.10192113810076991, "grad_norm": 0.23077093064785004, "learning_rate": 2.7175e-05, "loss": 2.3581, "step": 913 }, { "epoch": 0.10203277132979595, "grad_norm": 0.229270800948143, "learning_rate": 2.7150000000000003e-05, "loss": 2.3241, "step": 914 }, { "epoch": 0.102144404558822, "grad_norm": 0.2295754998922348, "learning_rate": 2.7125000000000002e-05, "loss": 2.3711, "step": 915 }, { "epoch": 0.10225603778784803, "grad_norm": 0.23900440335273743, "learning_rate": 2.7100000000000005e-05, "loss": 2.3576, "step": 916 }, { "epoch": 0.10236767101687407, "grad_norm": 0.234444722533226, "learning_rate": 2.7075e-05, "loss": 2.3537, "step": 917 }, { "epoch": 0.1024793042459001, "grad_norm": 0.2370821088552475, "learning_rate": 2.7050000000000004e-05, "loss": 2.3126, "step": 918 }, { "epoch": 0.10259093747492613, "grad_norm": 0.24210244417190552, "learning_rate": 2.7025e-05, "loss": 2.2511, "step": 919 }, { "epoch": 0.10270257070395217, "grad_norm": 0.23555943369865417, "learning_rate": 2.7000000000000002e-05, "loss": 2.3868, "step": 920 }, { "epoch": 0.1028142039329782, "grad_norm": 0.2252027690410614, "learning_rate": 2.6975000000000002e-05, "loss": 2.3263, "step": 921 }, { "epoch": 0.10292583716200424, "grad_norm": 0.22942887246608734, "learning_rate": 2.6950000000000005e-05, "loss": 2.3358, "step": 922 }, { "epoch": 0.10303747039103027, "grad_norm": 0.23425108194351196, "learning_rate": 2.6925e-05, "loss": 2.279, "step": 923 }, { "epoch": 0.1031491036200563, "grad_norm": 0.23959554731845856, "learning_rate": 2.6900000000000003e-05, "loss": 2.3456, "step": 924 }, { "epoch": 0.10326073684908234, "grad_norm": 0.22143711149692535, "learning_rate": 2.6875e-05, "loss": 2.3346, "step": 925 }, { "epoch": 0.10337237007810837, "grad_norm": 0.2322838306427002, "learning_rate": 2.6850000000000002e-05, "loss": 2.3597, "step": 926 }, { "epoch": 0.1034840033071344, "grad_norm": 0.22317776083946228, "learning_rate": 2.6825e-05, "loss": 2.4395, "step": 927 }, { "epoch": 0.10359563653616044, "grad_norm": 0.2381390780210495, "learning_rate": 2.6800000000000004e-05, "loss": 2.335, "step": 928 }, { "epoch": 0.10370726976518647, "grad_norm": 0.2249373197555542, "learning_rate": 2.6775e-05, "loss": 2.3763, "step": 929 }, { "epoch": 0.10381890299421251, "grad_norm": 0.23083436489105225, "learning_rate": 2.6750000000000003e-05, "loss": 2.2986, "step": 930 }, { "epoch": 0.10393053622323856, "grad_norm": 0.23313601315021515, "learning_rate": 2.6725e-05, "loss": 2.4076, "step": 931 }, { "epoch": 0.10404216945226459, "grad_norm": 0.22721858322620392, "learning_rate": 2.6700000000000002e-05, "loss": 2.2987, "step": 932 }, { "epoch": 0.10415380268129062, "grad_norm": 0.23775102198123932, "learning_rate": 2.6675e-05, "loss": 2.3398, "step": 933 }, { "epoch": 0.10426543591031666, "grad_norm": 0.22878248989582062, "learning_rate": 2.6650000000000004e-05, "loss": 2.3369, "step": 934 }, { "epoch": 0.10437706913934269, "grad_norm": 0.22213736176490784, "learning_rate": 2.6625e-05, "loss": 2.4302, "step": 935 }, { "epoch": 0.10448870236836873, "grad_norm": 0.23703357577323914, "learning_rate": 2.6600000000000003e-05, "loss": 2.2731, "step": 936 }, { "epoch": 0.10460033559739476, "grad_norm": 0.2916199266910553, "learning_rate": 2.6575e-05, "loss": 2.2714, "step": 937 }, { "epoch": 0.1047119688264208, "grad_norm": 0.22936727106571198, "learning_rate": 2.655e-05, "loss": 2.4308, "step": 938 }, { "epoch": 0.10482360205544683, "grad_norm": 0.2333354949951172, "learning_rate": 2.6525e-05, "loss": 2.3236, "step": 939 }, { "epoch": 0.10493523528447286, "grad_norm": 0.23450367152690887, "learning_rate": 2.6500000000000004e-05, "loss": 2.3854, "step": 940 }, { "epoch": 0.1050468685134989, "grad_norm": 0.2431698888540268, "learning_rate": 2.6475e-05, "loss": 2.2789, "step": 941 }, { "epoch": 0.10515850174252493, "grad_norm": 0.22219637036323547, "learning_rate": 2.6450000000000003e-05, "loss": 2.3675, "step": 942 }, { "epoch": 0.10527013497155097, "grad_norm": 0.233125239610672, "learning_rate": 2.6425e-05, "loss": 2.4779, "step": 943 }, { "epoch": 0.105381768200577, "grad_norm": 0.22824883460998535, "learning_rate": 2.64e-05, "loss": 2.3445, "step": 944 }, { "epoch": 0.10549340142960303, "grad_norm": 0.22547942399978638, "learning_rate": 2.6375e-05, "loss": 2.234, "step": 945 }, { "epoch": 0.10560503465862907, "grad_norm": 0.2325771003961563, "learning_rate": 2.6350000000000004e-05, "loss": 2.4215, "step": 946 }, { "epoch": 0.1057166678876551, "grad_norm": 0.233683779835701, "learning_rate": 2.6325e-05, "loss": 2.4137, "step": 947 }, { "epoch": 0.10582830111668115, "grad_norm": 0.24647918343544006, "learning_rate": 2.6300000000000002e-05, "loss": 2.3459, "step": 948 }, { "epoch": 0.10593993434570718, "grad_norm": 0.22863587737083435, "learning_rate": 2.6275e-05, "loss": 2.3867, "step": 949 }, { "epoch": 0.10605156757473322, "grad_norm": 0.23412172496318817, "learning_rate": 2.625e-05, "loss": 2.3621, "step": 950 }, { "epoch": 0.10616320080375925, "grad_norm": 0.22972947359085083, "learning_rate": 2.6225e-05, "loss": 2.4242, "step": 951 }, { "epoch": 0.10627483403278529, "grad_norm": 0.23886039853096008, "learning_rate": 2.6200000000000003e-05, "loss": 2.3375, "step": 952 }, { "epoch": 0.10638646726181132, "grad_norm": 0.24040424823760986, "learning_rate": 2.6175e-05, "loss": 2.3518, "step": 953 }, { "epoch": 0.10649810049083736, "grad_norm": 0.22699175775051117, "learning_rate": 2.6150000000000002e-05, "loss": 2.3435, "step": 954 }, { "epoch": 0.10660973371986339, "grad_norm": 0.23695823550224304, "learning_rate": 2.6124999999999998e-05, "loss": 2.4036, "step": 955 }, { "epoch": 0.10672136694888942, "grad_norm": 0.23027122020721436, "learning_rate": 2.61e-05, "loss": 2.286, "step": 956 }, { "epoch": 0.10683300017791546, "grad_norm": 0.24779526889324188, "learning_rate": 2.6075e-05, "loss": 2.3144, "step": 957 }, { "epoch": 0.10694463340694149, "grad_norm": 0.23636764287948608, "learning_rate": 2.6050000000000003e-05, "loss": 2.3114, "step": 958 }, { "epoch": 0.10705626663596753, "grad_norm": 0.22950230538845062, "learning_rate": 2.6025e-05, "loss": 2.3594, "step": 959 }, { "epoch": 0.10716789986499356, "grad_norm": 0.2355276644229889, "learning_rate": 2.6000000000000002e-05, "loss": 2.2385, "step": 960 }, { "epoch": 0.1072795330940196, "grad_norm": 0.25076955556869507, "learning_rate": 2.5974999999999998e-05, "loss": 2.2637, "step": 961 }, { "epoch": 0.10739116632304563, "grad_norm": 0.2247258871793747, "learning_rate": 2.595e-05, "loss": 2.2486, "step": 962 }, { "epoch": 0.10750279955207166, "grad_norm": 0.21740855276584625, "learning_rate": 2.5925e-05, "loss": 2.2609, "step": 963 }, { "epoch": 0.10761443278109771, "grad_norm": 0.2570677697658539, "learning_rate": 2.5900000000000003e-05, "loss": 2.442, "step": 964 }, { "epoch": 0.10772606601012374, "grad_norm": 0.2322109192609787, "learning_rate": 2.5875e-05, "loss": 2.3026, "step": 965 }, { "epoch": 0.10783769923914978, "grad_norm": 0.24020114541053772, "learning_rate": 2.585e-05, "loss": 2.351, "step": 966 }, { "epoch": 0.10794933246817581, "grad_norm": 0.2280672937631607, "learning_rate": 2.5824999999999998e-05, "loss": 2.4016, "step": 967 }, { "epoch": 0.10806096569720185, "grad_norm": 0.2309507429599762, "learning_rate": 2.58e-05, "loss": 2.3733, "step": 968 }, { "epoch": 0.10817259892622788, "grad_norm": 0.2293708324432373, "learning_rate": 2.5775e-05, "loss": 2.3583, "step": 969 }, { "epoch": 0.10828423215525391, "grad_norm": 0.23410175740718842, "learning_rate": 2.5750000000000002e-05, "loss": 2.352, "step": 970 }, { "epoch": 0.10839586538427995, "grad_norm": 0.22972838580608368, "learning_rate": 2.5725e-05, "loss": 2.3434, "step": 971 }, { "epoch": 0.10850749861330598, "grad_norm": 0.22537364065647125, "learning_rate": 2.57e-05, "loss": 2.3875, "step": 972 }, { "epoch": 0.10861913184233202, "grad_norm": 0.23455362021923065, "learning_rate": 2.5675e-05, "loss": 2.2759, "step": 973 }, { "epoch": 0.10873076507135805, "grad_norm": 0.23852132260799408, "learning_rate": 2.5650000000000003e-05, "loss": 2.3408, "step": 974 }, { "epoch": 0.10884239830038409, "grad_norm": 0.23479855060577393, "learning_rate": 2.5625e-05, "loss": 2.3161, "step": 975 }, { "epoch": 0.10895403152941012, "grad_norm": 0.2664550244808197, "learning_rate": 2.5600000000000002e-05, "loss": 2.387, "step": 976 }, { "epoch": 0.10906566475843615, "grad_norm": 0.22990471124649048, "learning_rate": 2.5574999999999998e-05, "loss": 2.3906, "step": 977 }, { "epoch": 0.10917729798746219, "grad_norm": 0.2335597425699234, "learning_rate": 2.555e-05, "loss": 2.3989, "step": 978 }, { "epoch": 0.10928893121648822, "grad_norm": 0.23832087218761444, "learning_rate": 2.5525e-05, "loss": 2.2881, "step": 979 }, { "epoch": 0.10940056444551426, "grad_norm": 0.23258844017982483, "learning_rate": 2.5500000000000003e-05, "loss": 2.4174, "step": 980 }, { "epoch": 0.1095121976745403, "grad_norm": 0.23663833737373352, "learning_rate": 2.5475e-05, "loss": 2.2888, "step": 981 }, { "epoch": 0.10962383090356634, "grad_norm": 0.22592462599277496, "learning_rate": 2.5450000000000002e-05, "loss": 2.3676, "step": 982 }, { "epoch": 0.10973546413259237, "grad_norm": 0.23445037007331848, "learning_rate": 2.5424999999999998e-05, "loss": 2.3627, "step": 983 }, { "epoch": 0.1098470973616184, "grad_norm": 0.2289026379585266, "learning_rate": 2.54e-05, "loss": 2.4259, "step": 984 }, { "epoch": 0.10995873059064444, "grad_norm": 0.2335384339094162, "learning_rate": 2.5375e-05, "loss": 2.2903, "step": 985 }, { "epoch": 0.11007036381967047, "grad_norm": 0.24791677296161652, "learning_rate": 2.5350000000000003e-05, "loss": 2.3497, "step": 986 }, { "epoch": 0.11018199704869651, "grad_norm": 0.21857081353664398, "learning_rate": 2.5325e-05, "loss": 2.3938, "step": 987 }, { "epoch": 0.11029363027772254, "grad_norm": 0.23194223642349243, "learning_rate": 2.5300000000000002e-05, "loss": 2.3752, "step": 988 }, { "epoch": 0.11040526350674858, "grad_norm": 0.22554685175418854, "learning_rate": 2.5274999999999998e-05, "loss": 2.3892, "step": 989 }, { "epoch": 0.11051689673577461, "grad_norm": 0.2444494664669037, "learning_rate": 2.525e-05, "loss": 2.3854, "step": 990 }, { "epoch": 0.11062852996480065, "grad_norm": 0.24324128031730652, "learning_rate": 2.5225e-05, "loss": 2.2129, "step": 991 }, { "epoch": 0.11074016319382668, "grad_norm": 0.23157966136932373, "learning_rate": 2.5200000000000003e-05, "loss": 2.3688, "step": 992 }, { "epoch": 0.11085179642285271, "grad_norm": 0.23496349155902863, "learning_rate": 2.5175e-05, "loss": 2.2166, "step": 993 }, { "epoch": 0.11096342965187875, "grad_norm": 0.22501815855503082, "learning_rate": 2.515e-05, "loss": 2.3563, "step": 994 }, { "epoch": 0.11107506288090478, "grad_norm": 0.3032657206058502, "learning_rate": 2.5124999999999997e-05, "loss": 2.2783, "step": 995 }, { "epoch": 0.11118669610993082, "grad_norm": 0.24155639111995697, "learning_rate": 2.51e-05, "loss": 2.3681, "step": 996 }, { "epoch": 0.11129832933895686, "grad_norm": 0.22777613997459412, "learning_rate": 2.5075e-05, "loss": 2.3981, "step": 997 }, { "epoch": 0.1114099625679829, "grad_norm": 0.24148933589458466, "learning_rate": 2.5050000000000002e-05, "loss": 2.2894, "step": 998 }, { "epoch": 0.11152159579700893, "grad_norm": 0.23705993592739105, "learning_rate": 2.5025e-05, "loss": 2.3486, "step": 999 }, { "epoch": 0.11163322902603497, "grad_norm": 0.23454095423221588, "learning_rate": 2.5e-05, "loss": 2.4232, "step": 1000 }, { "epoch": 0.111744862255061, "grad_norm": 0.23154820501804352, "learning_rate": 2.4975e-05, "loss": 2.2269, "step": 1001 }, { "epoch": 0.11185649548408703, "grad_norm": 0.22418555617332458, "learning_rate": 2.495e-05, "loss": 2.2112, "step": 1002 }, { "epoch": 0.11196812871311307, "grad_norm": 0.2538329064846039, "learning_rate": 2.4925000000000003e-05, "loss": 2.3927, "step": 1003 }, { "epoch": 0.1120797619421391, "grad_norm": 0.23294506967067719, "learning_rate": 2.4900000000000002e-05, "loss": 2.3024, "step": 1004 }, { "epoch": 0.11219139517116514, "grad_norm": 0.22844459116458893, "learning_rate": 2.4875e-05, "loss": 2.2385, "step": 1005 }, { "epoch": 0.11230302840019117, "grad_norm": 0.22828292846679688, "learning_rate": 2.485e-05, "loss": 2.2897, "step": 1006 }, { "epoch": 0.1124146616292172, "grad_norm": 0.2693067491054535, "learning_rate": 2.4825e-05, "loss": 2.3818, "step": 1007 }, { "epoch": 0.11252629485824324, "grad_norm": 0.22305937111377716, "learning_rate": 2.48e-05, "loss": 2.3505, "step": 1008 }, { "epoch": 0.11263792808726927, "grad_norm": 0.23566830158233643, "learning_rate": 2.4775000000000003e-05, "loss": 2.3571, "step": 1009 }, { "epoch": 0.11274956131629531, "grad_norm": 0.23289407789707184, "learning_rate": 2.4750000000000002e-05, "loss": 2.3021, "step": 1010 }, { "epoch": 0.11286119454532134, "grad_norm": 0.2305288016796112, "learning_rate": 2.4725e-05, "loss": 2.3082, "step": 1011 }, { "epoch": 0.11297282777434738, "grad_norm": 0.2262507677078247, "learning_rate": 2.47e-05, "loss": 2.3995, "step": 1012 }, { "epoch": 0.11308446100337341, "grad_norm": 0.2277233600616455, "learning_rate": 2.4675e-05, "loss": 2.3759, "step": 1013 }, { "epoch": 0.11319609423239946, "grad_norm": 0.24634131789207458, "learning_rate": 2.465e-05, "loss": 2.3487, "step": 1014 }, { "epoch": 0.11330772746142549, "grad_norm": 0.31584781408309937, "learning_rate": 2.4625000000000002e-05, "loss": 2.3787, "step": 1015 }, { "epoch": 0.11341936069045153, "grad_norm": 0.2360941767692566, "learning_rate": 2.46e-05, "loss": 2.2368, "step": 1016 }, { "epoch": 0.11353099391947756, "grad_norm": 0.24138571321964264, "learning_rate": 2.4575e-05, "loss": 2.2329, "step": 1017 }, { "epoch": 0.1136426271485036, "grad_norm": 0.24359650909900665, "learning_rate": 2.455e-05, "loss": 2.352, "step": 1018 }, { "epoch": 0.11375426037752963, "grad_norm": 0.22765910625457764, "learning_rate": 2.4525e-05, "loss": 2.4487, "step": 1019 }, { "epoch": 0.11386589360655566, "grad_norm": 0.22311876714229584, "learning_rate": 2.45e-05, "loss": 2.3413, "step": 1020 }, { "epoch": 0.1139775268355817, "grad_norm": 0.3245187997817993, "learning_rate": 2.4475000000000002e-05, "loss": 2.352, "step": 1021 }, { "epoch": 0.11408916006460773, "grad_norm": 0.267455130815506, "learning_rate": 2.445e-05, "loss": 2.4273, "step": 1022 }, { "epoch": 0.11420079329363376, "grad_norm": 0.23805475234985352, "learning_rate": 2.4425e-05, "loss": 2.3511, "step": 1023 }, { "epoch": 0.1143124265226598, "grad_norm": 0.23947173357009888, "learning_rate": 2.44e-05, "loss": 2.3554, "step": 1024 }, { "epoch": 0.11442405975168583, "grad_norm": 0.23551489412784576, "learning_rate": 2.4375e-05, "loss": 2.3869, "step": 1025 }, { "epoch": 0.11453569298071187, "grad_norm": 0.22201520204544067, "learning_rate": 2.435e-05, "loss": 2.351, "step": 1026 }, { "epoch": 0.1146473262097379, "grad_norm": 0.23246638476848602, "learning_rate": 2.4325000000000002e-05, "loss": 2.3709, "step": 1027 }, { "epoch": 0.11475895943876394, "grad_norm": 0.29901397228240967, "learning_rate": 2.43e-05, "loss": 2.33, "step": 1028 }, { "epoch": 0.11487059266778997, "grad_norm": 0.2317001223564148, "learning_rate": 2.4275e-05, "loss": 2.4711, "step": 1029 }, { "epoch": 0.11498222589681602, "grad_norm": 0.2264910489320755, "learning_rate": 2.425e-05, "loss": 2.4613, "step": 1030 }, { "epoch": 0.11509385912584205, "grad_norm": 0.23516049981117249, "learning_rate": 2.4225e-05, "loss": 2.3831, "step": 1031 }, { "epoch": 0.11520549235486809, "grad_norm": 0.23533384501934052, "learning_rate": 2.4200000000000002e-05, "loss": 2.4065, "step": 1032 }, { "epoch": 0.11531712558389412, "grad_norm": 0.22421786189079285, "learning_rate": 2.4175e-05, "loss": 2.4121, "step": 1033 }, { "epoch": 0.11542875881292015, "grad_norm": 0.23271812498569489, "learning_rate": 2.415e-05, "loss": 2.3377, "step": 1034 }, { "epoch": 0.11554039204194619, "grad_norm": 0.22628925740718842, "learning_rate": 2.4125e-05, "loss": 2.3516, "step": 1035 }, { "epoch": 0.11565202527097222, "grad_norm": 0.23225760459899902, "learning_rate": 2.41e-05, "loss": 2.4267, "step": 1036 }, { "epoch": 0.11576365849999826, "grad_norm": 0.24704919755458832, "learning_rate": 2.4075e-05, "loss": 2.348, "step": 1037 }, { "epoch": 0.11587529172902429, "grad_norm": 0.3677544593811035, "learning_rate": 2.4050000000000002e-05, "loss": 2.3355, "step": 1038 }, { "epoch": 0.11598692495805032, "grad_norm": 0.2303479164838791, "learning_rate": 2.4025e-05, "loss": 2.3829, "step": 1039 }, { "epoch": 0.11609855818707636, "grad_norm": 0.5053055882453918, "learning_rate": 2.4e-05, "loss": 2.2684, "step": 1040 }, { "epoch": 0.11621019141610239, "grad_norm": 0.23576150834560394, "learning_rate": 2.3975e-05, "loss": 2.3628, "step": 1041 }, { "epoch": 0.11632182464512843, "grad_norm": 0.22880171239376068, "learning_rate": 2.395e-05, "loss": 2.33, "step": 1042 }, { "epoch": 0.11643345787415446, "grad_norm": 0.225894957780838, "learning_rate": 2.3925e-05, "loss": 2.2474, "step": 1043 }, { "epoch": 0.1165450911031805, "grad_norm": 0.2277292162179947, "learning_rate": 2.39e-05, "loss": 2.32, "step": 1044 }, { "epoch": 0.11665672433220653, "grad_norm": 0.22571003437042236, "learning_rate": 2.3875e-05, "loss": 2.3058, "step": 1045 }, { "epoch": 0.11676835756123256, "grad_norm": 0.2490728795528412, "learning_rate": 2.385e-05, "loss": 2.38, "step": 1046 }, { "epoch": 0.11687999079025861, "grad_norm": 0.23154285550117493, "learning_rate": 2.3825e-05, "loss": 2.3528, "step": 1047 }, { "epoch": 0.11699162401928465, "grad_norm": 0.23180274665355682, "learning_rate": 2.38e-05, "loss": 2.349, "step": 1048 }, { "epoch": 0.11710325724831068, "grad_norm": 0.2314680814743042, "learning_rate": 2.3775e-05, "loss": 2.2209, "step": 1049 }, { "epoch": 0.11721489047733671, "grad_norm": 0.22533871233463287, "learning_rate": 2.375e-05, "loss": 2.2946, "step": 1050 }, { "epoch": 0.11732652370636275, "grad_norm": 0.23475436866283417, "learning_rate": 2.3725e-05, "loss": 2.3839, "step": 1051 }, { "epoch": 0.11743815693538878, "grad_norm": 0.22763217985630035, "learning_rate": 2.37e-05, "loss": 2.3039, "step": 1052 }, { "epoch": 0.11754979016441482, "grad_norm": 0.22953568398952484, "learning_rate": 2.3675e-05, "loss": 2.4163, "step": 1053 }, { "epoch": 0.11766142339344085, "grad_norm": 0.2377011775970459, "learning_rate": 2.365e-05, "loss": 2.3473, "step": 1054 }, { "epoch": 0.11777305662246688, "grad_norm": 0.23042835295200348, "learning_rate": 2.3624999999999998e-05, "loss": 2.3909, "step": 1055 }, { "epoch": 0.11788468985149292, "grad_norm": 0.23968364298343658, "learning_rate": 2.36e-05, "loss": 2.3023, "step": 1056 }, { "epoch": 0.11799632308051895, "grad_norm": 0.23102299869060516, "learning_rate": 2.3575e-05, "loss": 2.3963, "step": 1057 }, { "epoch": 0.11810795630954499, "grad_norm": 0.42533349990844727, "learning_rate": 2.355e-05, "loss": 2.396, "step": 1058 }, { "epoch": 0.11821958953857102, "grad_norm": 0.2384020835161209, "learning_rate": 2.3525e-05, "loss": 2.4379, "step": 1059 }, { "epoch": 0.11833122276759706, "grad_norm": 0.2910504639148712, "learning_rate": 2.35e-05, "loss": 2.3607, "step": 1060 }, { "epoch": 0.11844285599662309, "grad_norm": 0.23686139285564423, "learning_rate": 2.3475e-05, "loss": 2.3374, "step": 1061 }, { "epoch": 0.11855448922564912, "grad_norm": 0.27755534648895264, "learning_rate": 2.345e-05, "loss": 2.3719, "step": 1062 }, { "epoch": 0.11866612245467517, "grad_norm": 0.23504361510276794, "learning_rate": 2.3425000000000004e-05, "loss": 2.4556, "step": 1063 }, { "epoch": 0.1187777556837012, "grad_norm": 0.21698464453220367, "learning_rate": 2.3400000000000003e-05, "loss": 2.3708, "step": 1064 }, { "epoch": 0.11888938891272724, "grad_norm": 0.2256624847650528, "learning_rate": 2.3375000000000002e-05, "loss": 2.2417, "step": 1065 }, { "epoch": 0.11900102214175327, "grad_norm": 0.29147782921791077, "learning_rate": 2.3350000000000002e-05, "loss": 2.3808, "step": 1066 }, { "epoch": 0.11911265537077931, "grad_norm": 0.3733128607273102, "learning_rate": 2.3325e-05, "loss": 2.3823, "step": 1067 }, { "epoch": 0.11922428859980534, "grad_norm": 0.2350093573331833, "learning_rate": 2.3300000000000004e-05, "loss": 2.2919, "step": 1068 }, { "epoch": 0.11933592182883138, "grad_norm": 0.23306138813495636, "learning_rate": 2.3275000000000003e-05, "loss": 2.3852, "step": 1069 }, { "epoch": 0.11944755505785741, "grad_norm": 0.2352742701768875, "learning_rate": 2.3250000000000003e-05, "loss": 2.2804, "step": 1070 }, { "epoch": 0.11955918828688344, "grad_norm": 0.2168835997581482, "learning_rate": 2.3225000000000002e-05, "loss": 2.3675, "step": 1071 }, { "epoch": 0.11967082151590948, "grad_norm": 0.23424020409584045, "learning_rate": 2.32e-05, "loss": 2.4122, "step": 1072 }, { "epoch": 0.11978245474493551, "grad_norm": 0.21874938905239105, "learning_rate": 2.3175e-05, "loss": 2.3381, "step": 1073 }, { "epoch": 0.11989408797396155, "grad_norm": 0.2390536516904831, "learning_rate": 2.3150000000000004e-05, "loss": 2.2756, "step": 1074 }, { "epoch": 0.12000572120298758, "grad_norm": 0.2291589379310608, "learning_rate": 2.3125000000000003e-05, "loss": 2.3482, "step": 1075 }, { "epoch": 0.12011735443201361, "grad_norm": 0.2258395105600357, "learning_rate": 2.3100000000000002e-05, "loss": 2.3838, "step": 1076 }, { "epoch": 0.12022898766103965, "grad_norm": 0.25954297184944153, "learning_rate": 2.3075000000000002e-05, "loss": 2.2275, "step": 1077 }, { "epoch": 0.12034062089006568, "grad_norm": 0.23486246168613434, "learning_rate": 2.305e-05, "loss": 2.3866, "step": 1078 }, { "epoch": 0.12045225411909172, "grad_norm": 0.22818194329738617, "learning_rate": 2.3025e-05, "loss": 2.3244, "step": 1079 }, { "epoch": 0.12056388734811777, "grad_norm": 0.2317325472831726, "learning_rate": 2.3000000000000003e-05, "loss": 2.2603, "step": 1080 }, { "epoch": 0.1206755205771438, "grad_norm": 0.24170775711536407, "learning_rate": 2.2975000000000003e-05, "loss": 2.3409, "step": 1081 }, { "epoch": 0.12078715380616983, "grad_norm": 0.23717226088047028, "learning_rate": 2.2950000000000002e-05, "loss": 2.2956, "step": 1082 }, { "epoch": 0.12089878703519587, "grad_norm": 0.21983082592487335, "learning_rate": 2.2925e-05, "loss": 2.3578, "step": 1083 }, { "epoch": 0.1210104202642219, "grad_norm": 0.23536662757396698, "learning_rate": 2.29e-05, "loss": 2.3317, "step": 1084 }, { "epoch": 0.12112205349324794, "grad_norm": 0.2405475676059723, "learning_rate": 2.2875e-05, "loss": 2.3663, "step": 1085 }, { "epoch": 0.12123368672227397, "grad_norm": 0.23267629742622375, "learning_rate": 2.2850000000000003e-05, "loss": 2.3376, "step": 1086 }, { "epoch": 0.1213453199513, "grad_norm": 0.22715060412883759, "learning_rate": 2.2825000000000003e-05, "loss": 2.5256, "step": 1087 }, { "epoch": 0.12145695318032604, "grad_norm": 0.23004890978336334, "learning_rate": 2.2800000000000002e-05, "loss": 2.3706, "step": 1088 }, { "epoch": 0.12156858640935207, "grad_norm": 0.22569864988327026, "learning_rate": 2.2775e-05, "loss": 2.29, "step": 1089 }, { "epoch": 0.1216802196383781, "grad_norm": 0.22686640918254852, "learning_rate": 2.275e-05, "loss": 2.3978, "step": 1090 }, { "epoch": 0.12179185286740414, "grad_norm": 0.26110976934432983, "learning_rate": 2.2725000000000003e-05, "loss": 2.4019, "step": 1091 }, { "epoch": 0.12190348609643017, "grad_norm": 0.22089050710201263, "learning_rate": 2.2700000000000003e-05, "loss": 2.3078, "step": 1092 }, { "epoch": 0.12201511932545621, "grad_norm": 0.22513218224048615, "learning_rate": 2.2675000000000002e-05, "loss": 2.3088, "step": 1093 }, { "epoch": 0.12212675255448224, "grad_norm": 0.2333805114030838, "learning_rate": 2.265e-05, "loss": 2.2955, "step": 1094 }, { "epoch": 0.12223838578350828, "grad_norm": 0.22828614711761475, "learning_rate": 2.2625e-05, "loss": 2.2746, "step": 1095 }, { "epoch": 0.12235001901253433, "grad_norm": 0.23725035786628723, "learning_rate": 2.26e-05, "loss": 2.5234, "step": 1096 }, { "epoch": 0.12246165224156036, "grad_norm": 0.24143311381340027, "learning_rate": 2.2575000000000003e-05, "loss": 2.2837, "step": 1097 }, { "epoch": 0.1225732854705864, "grad_norm": 0.22875793278217316, "learning_rate": 2.2550000000000003e-05, "loss": 2.2778, "step": 1098 }, { "epoch": 0.12268491869961243, "grad_norm": 0.2337283492088318, "learning_rate": 2.2525000000000002e-05, "loss": 2.2373, "step": 1099 }, { "epoch": 0.12279655192863846, "grad_norm": 0.22070764005184174, "learning_rate": 2.25e-05, "loss": 2.2863, "step": 1100 }, { "epoch": 0.1229081851576645, "grad_norm": 0.23479975759983063, "learning_rate": 2.2475e-05, "loss": 2.3971, "step": 1101 }, { "epoch": 0.12301981838669053, "grad_norm": 0.22529123723506927, "learning_rate": 2.245e-05, "loss": 2.4699, "step": 1102 }, { "epoch": 0.12313145161571656, "grad_norm": 0.24018734693527222, "learning_rate": 2.2425000000000003e-05, "loss": 2.4539, "step": 1103 }, { "epoch": 0.1232430848447426, "grad_norm": 0.22801434993743896, "learning_rate": 2.2400000000000002e-05, "loss": 2.3764, "step": 1104 }, { "epoch": 0.12335471807376863, "grad_norm": 0.32481849193573, "learning_rate": 2.2375000000000002e-05, "loss": 2.4011, "step": 1105 }, { "epoch": 0.12346635130279467, "grad_norm": 0.22494614124298096, "learning_rate": 2.235e-05, "loss": 2.2996, "step": 1106 }, { "epoch": 0.1235779845318207, "grad_norm": 0.2304266095161438, "learning_rate": 2.2325e-05, "loss": 2.2708, "step": 1107 }, { "epoch": 0.12368961776084673, "grad_norm": 0.22332698106765747, "learning_rate": 2.23e-05, "loss": 2.2825, "step": 1108 }, { "epoch": 0.12380125098987277, "grad_norm": 0.2219666987657547, "learning_rate": 2.2275000000000003e-05, "loss": 2.3805, "step": 1109 }, { "epoch": 0.1239128842188988, "grad_norm": 0.2652471363544464, "learning_rate": 2.2250000000000002e-05, "loss": 2.2467, "step": 1110 }, { "epoch": 0.12402451744792484, "grad_norm": 0.2186206877231598, "learning_rate": 2.2225e-05, "loss": 2.2924, "step": 1111 }, { "epoch": 0.12413615067695087, "grad_norm": 0.23667913675308228, "learning_rate": 2.22e-05, "loss": 2.3545, "step": 1112 }, { "epoch": 0.12424778390597692, "grad_norm": 0.3729536831378937, "learning_rate": 2.2175e-05, "loss": 2.2641, "step": 1113 }, { "epoch": 0.12435941713500295, "grad_norm": 0.22672784328460693, "learning_rate": 2.215e-05, "loss": 2.2501, "step": 1114 }, { "epoch": 0.12447105036402899, "grad_norm": 0.2219839245080948, "learning_rate": 2.2125000000000002e-05, "loss": 2.2016, "step": 1115 }, { "epoch": 0.12458268359305502, "grad_norm": 0.24350149929523468, "learning_rate": 2.2100000000000002e-05, "loss": 2.4358, "step": 1116 }, { "epoch": 0.12469431682208106, "grad_norm": 0.2482176274061203, "learning_rate": 2.2075e-05, "loss": 2.2623, "step": 1117 }, { "epoch": 0.12480595005110709, "grad_norm": 0.23272433876991272, "learning_rate": 2.205e-05, "loss": 2.3408, "step": 1118 }, { "epoch": 0.12491758328013312, "grad_norm": 0.23357626795768738, "learning_rate": 2.2025e-05, "loss": 2.231, "step": 1119 }, { "epoch": 0.12502921650915916, "grad_norm": 0.22280560433864594, "learning_rate": 2.2000000000000003e-05, "loss": 2.3045, "step": 1120 }, { "epoch": 0.1251408497381852, "grad_norm": 0.22206202149391174, "learning_rate": 2.1975000000000002e-05, "loss": 2.3103, "step": 1121 }, { "epoch": 0.12525248296721123, "grad_norm": 0.31398919224739075, "learning_rate": 2.195e-05, "loss": 2.3665, "step": 1122 }, { "epoch": 0.12536411619623727, "grad_norm": 0.43255481123924255, "learning_rate": 2.1925e-05, "loss": 2.3153, "step": 1123 }, { "epoch": 0.1254757494252633, "grad_norm": 0.22663763165473938, "learning_rate": 2.19e-05, "loss": 2.3682, "step": 1124 }, { "epoch": 0.12558738265428934, "grad_norm": 0.2514352798461914, "learning_rate": 2.1875e-05, "loss": 2.3727, "step": 1125 }, { "epoch": 0.12569901588331536, "grad_norm": 0.22935128211975098, "learning_rate": 2.1850000000000003e-05, "loss": 2.2161, "step": 1126 }, { "epoch": 0.1258106491123414, "grad_norm": 0.2275882214307785, "learning_rate": 2.1825000000000002e-05, "loss": 2.3793, "step": 1127 }, { "epoch": 0.12592228234136743, "grad_norm": 0.2203671634197235, "learning_rate": 2.18e-05, "loss": 2.3371, "step": 1128 }, { "epoch": 0.12603391557039348, "grad_norm": 0.22310760617256165, "learning_rate": 2.1775e-05, "loss": 2.2956, "step": 1129 }, { "epoch": 0.1261455487994195, "grad_norm": 0.24273249506950378, "learning_rate": 2.175e-05, "loss": 2.3288, "step": 1130 }, { "epoch": 0.12625718202844555, "grad_norm": 0.22426392138004303, "learning_rate": 2.1725e-05, "loss": 2.387, "step": 1131 }, { "epoch": 0.12636881525747157, "grad_norm": 0.2271021008491516, "learning_rate": 2.1700000000000002e-05, "loss": 2.3739, "step": 1132 }, { "epoch": 0.12648044848649762, "grad_norm": 0.2271476835012436, "learning_rate": 2.1675e-05, "loss": 2.3942, "step": 1133 }, { "epoch": 0.12659208171552364, "grad_norm": 0.2212740033864975, "learning_rate": 2.165e-05, "loss": 2.3511, "step": 1134 }, { "epoch": 0.12670371494454968, "grad_norm": 0.23425845801830292, "learning_rate": 2.1625e-05, "loss": 2.3256, "step": 1135 }, { "epoch": 0.1268153481735757, "grad_norm": 0.22545567154884338, "learning_rate": 2.16e-05, "loss": 2.1847, "step": 1136 }, { "epoch": 0.12692698140260175, "grad_norm": 0.25626057386398315, "learning_rate": 2.1575e-05, "loss": 2.3058, "step": 1137 }, { "epoch": 0.1270386146316278, "grad_norm": 0.23703870177268982, "learning_rate": 2.1550000000000002e-05, "loss": 2.3709, "step": 1138 }, { "epoch": 0.12715024786065382, "grad_norm": 0.2455843836069107, "learning_rate": 2.1525e-05, "loss": 2.3468, "step": 1139 }, { "epoch": 0.12726188108967987, "grad_norm": 0.22671882808208466, "learning_rate": 2.15e-05, "loss": 2.3554, "step": 1140 }, { "epoch": 0.1273735143187059, "grad_norm": 0.22661983966827393, "learning_rate": 2.1475e-05, "loss": 2.2919, "step": 1141 }, { "epoch": 0.12748514754773194, "grad_norm": 0.23259443044662476, "learning_rate": 2.145e-05, "loss": 2.3463, "step": 1142 }, { "epoch": 0.12759678077675796, "grad_norm": 0.23297019302845, "learning_rate": 2.1425e-05, "loss": 2.3198, "step": 1143 }, { "epoch": 0.127708414005784, "grad_norm": 0.22478660941123962, "learning_rate": 2.1400000000000002e-05, "loss": 2.2642, "step": 1144 }, { "epoch": 0.12782004723481002, "grad_norm": 0.23042532801628113, "learning_rate": 2.1375e-05, "loss": 2.3056, "step": 1145 }, { "epoch": 0.12793168046383607, "grad_norm": 0.2341778576374054, "learning_rate": 2.135e-05, "loss": 2.3774, "step": 1146 }, { "epoch": 0.1280433136928621, "grad_norm": 0.22689583897590637, "learning_rate": 2.1325e-05, "loss": 2.3493, "step": 1147 }, { "epoch": 0.12815494692188814, "grad_norm": 0.2260814756155014, "learning_rate": 2.13e-05, "loss": 2.3548, "step": 1148 }, { "epoch": 0.12826658015091416, "grad_norm": 0.24112841486930847, "learning_rate": 2.1275000000000002e-05, "loss": 2.3739, "step": 1149 }, { "epoch": 0.1283782133799402, "grad_norm": 0.22553539276123047, "learning_rate": 2.125e-05, "loss": 2.3762, "step": 1150 }, { "epoch": 0.12848984660896623, "grad_norm": 0.2698231041431427, "learning_rate": 2.1225e-05, "loss": 2.3527, "step": 1151 }, { "epoch": 0.12860147983799228, "grad_norm": 0.2280593365430832, "learning_rate": 2.12e-05, "loss": 2.3621, "step": 1152 }, { "epoch": 0.1287131130670183, "grad_norm": 0.23123127222061157, "learning_rate": 2.1175e-05, "loss": 2.4369, "step": 1153 }, { "epoch": 0.12882474629604435, "grad_norm": 0.2183208465576172, "learning_rate": 2.115e-05, "loss": 2.3615, "step": 1154 }, { "epoch": 0.1289363795250704, "grad_norm": 0.3457687199115753, "learning_rate": 2.1125000000000002e-05, "loss": 2.3151, "step": 1155 }, { "epoch": 0.12904801275409641, "grad_norm": 0.24221017956733704, "learning_rate": 2.11e-05, "loss": 2.383, "step": 1156 }, { "epoch": 0.12915964598312246, "grad_norm": 0.31406766176223755, "learning_rate": 2.1075e-05, "loss": 2.3564, "step": 1157 }, { "epoch": 0.12927127921214848, "grad_norm": 0.2336696982383728, "learning_rate": 2.105e-05, "loss": 2.3295, "step": 1158 }, { "epoch": 0.12938291244117453, "grad_norm": 0.2292354553937912, "learning_rate": 2.1025e-05, "loss": 2.3124, "step": 1159 }, { "epoch": 0.12949454567020055, "grad_norm": 0.22350043058395386, "learning_rate": 2.1e-05, "loss": 2.3077, "step": 1160 }, { "epoch": 0.1296061788992266, "grad_norm": 0.22331391274929047, "learning_rate": 2.0975e-05, "loss": 2.2535, "step": 1161 }, { "epoch": 0.12971781212825262, "grad_norm": 0.23815131187438965, "learning_rate": 2.095e-05, "loss": 2.4475, "step": 1162 }, { "epoch": 0.12982944535727867, "grad_norm": 0.29236698150634766, "learning_rate": 2.0925e-05, "loss": 2.312, "step": 1163 }, { "epoch": 0.1299410785863047, "grad_norm": 0.23702973127365112, "learning_rate": 2.09e-05, "loss": 2.4403, "step": 1164 }, { "epoch": 0.13005271181533073, "grad_norm": 0.23101966083049774, "learning_rate": 2.0875e-05, "loss": 2.4587, "step": 1165 }, { "epoch": 0.13016434504435676, "grad_norm": 0.2322998195886612, "learning_rate": 2.085e-05, "loss": 2.421, "step": 1166 }, { "epoch": 0.1302759782733828, "grad_norm": 0.236577570438385, "learning_rate": 2.0825e-05, "loss": 2.3525, "step": 1167 }, { "epoch": 0.13038761150240882, "grad_norm": 0.2353632152080536, "learning_rate": 2.08e-05, "loss": 2.4235, "step": 1168 }, { "epoch": 0.13049924473143487, "grad_norm": 0.23410721123218536, "learning_rate": 2.0775e-05, "loss": 2.3678, "step": 1169 }, { "epoch": 0.13061087796046092, "grad_norm": 0.22162692248821259, "learning_rate": 2.075e-05, "loss": 2.4682, "step": 1170 }, { "epoch": 0.13072251118948694, "grad_norm": 0.2268594205379486, "learning_rate": 2.0725e-05, "loss": 2.3006, "step": 1171 }, { "epoch": 0.130834144418513, "grad_norm": 0.22843024134635925, "learning_rate": 2.07e-05, "loss": 2.3072, "step": 1172 }, { "epoch": 0.130945777647539, "grad_norm": 0.2605232298374176, "learning_rate": 2.0675e-05, "loss": 2.3408, "step": 1173 }, { "epoch": 0.13105741087656506, "grad_norm": 0.2354726940393448, "learning_rate": 2.065e-05, "loss": 2.3971, "step": 1174 }, { "epoch": 0.13116904410559108, "grad_norm": 0.225637286901474, "learning_rate": 2.0625e-05, "loss": 2.314, "step": 1175 }, { "epoch": 0.13128067733461712, "grad_norm": 0.22693420946598053, "learning_rate": 2.06e-05, "loss": 2.4122, "step": 1176 }, { "epoch": 0.13139231056364314, "grad_norm": 0.22487470507621765, "learning_rate": 2.0575e-05, "loss": 2.2592, "step": 1177 }, { "epoch": 0.1315039437926692, "grad_norm": 0.2332613468170166, "learning_rate": 2.055e-05, "loss": 2.3281, "step": 1178 }, { "epoch": 0.1316155770216952, "grad_norm": 0.3483419716358185, "learning_rate": 2.0525e-05, "loss": 2.2564, "step": 1179 }, { "epoch": 0.13172721025072126, "grad_norm": 0.23920704424381256, "learning_rate": 2.05e-05, "loss": 2.3591, "step": 1180 }, { "epoch": 0.13183884347974728, "grad_norm": 0.2232007533311844, "learning_rate": 2.0475e-05, "loss": 2.4216, "step": 1181 }, { "epoch": 0.13195047670877333, "grad_norm": 0.23171833157539368, "learning_rate": 2.045e-05, "loss": 2.2861, "step": 1182 }, { "epoch": 0.13206210993779935, "grad_norm": 0.22283758223056793, "learning_rate": 2.0425e-05, "loss": 2.274, "step": 1183 }, { "epoch": 0.1321737431668254, "grad_norm": 0.21574443578720093, "learning_rate": 2.04e-05, "loss": 2.4139, "step": 1184 }, { "epoch": 0.13228537639585142, "grad_norm": 0.23107174038887024, "learning_rate": 2.0375e-05, "loss": 2.2474, "step": 1185 }, { "epoch": 0.13239700962487747, "grad_norm": 0.2264859676361084, "learning_rate": 2.035e-05, "loss": 2.3655, "step": 1186 }, { "epoch": 0.1325086428539035, "grad_norm": 0.23183952271938324, "learning_rate": 2.0325e-05, "loss": 2.4179, "step": 1187 }, { "epoch": 0.13262027608292953, "grad_norm": 0.2878219187259674, "learning_rate": 2.0300000000000002e-05, "loss": 2.3113, "step": 1188 }, { "epoch": 0.13273190931195558, "grad_norm": 0.2385031133890152, "learning_rate": 2.0275e-05, "loss": 2.3088, "step": 1189 }, { "epoch": 0.1328435425409816, "grad_norm": 0.2221747487783432, "learning_rate": 2.025e-05, "loss": 2.3643, "step": 1190 }, { "epoch": 0.13295517577000765, "grad_norm": 0.23417770862579346, "learning_rate": 2.0225000000000004e-05, "loss": 2.293, "step": 1191 }, { "epoch": 0.13306680899903367, "grad_norm": 0.22443942725658417, "learning_rate": 2.0200000000000003e-05, "loss": 2.1929, "step": 1192 }, { "epoch": 0.13317844222805972, "grad_norm": 0.22454610466957092, "learning_rate": 2.0175000000000003e-05, "loss": 2.2178, "step": 1193 }, { "epoch": 0.13329007545708574, "grad_norm": 0.2309190183877945, "learning_rate": 2.0150000000000002e-05, "loss": 2.3352, "step": 1194 }, { "epoch": 0.1334017086861118, "grad_norm": 0.23848609626293182, "learning_rate": 2.0125e-05, "loss": 2.3348, "step": 1195 }, { "epoch": 0.1335133419151378, "grad_norm": 0.23410527408123016, "learning_rate": 2.01e-05, "loss": 2.1928, "step": 1196 }, { "epoch": 0.13362497514416385, "grad_norm": 0.24377335608005524, "learning_rate": 2.0075000000000003e-05, "loss": 2.2982, "step": 1197 }, { "epoch": 0.13373660837318987, "grad_norm": 0.22369886934757233, "learning_rate": 2.0050000000000003e-05, "loss": 2.2788, "step": 1198 }, { "epoch": 0.13384824160221592, "grad_norm": 0.22385212779045105, "learning_rate": 2.0025000000000002e-05, "loss": 2.3247, "step": 1199 }, { "epoch": 0.13395987483124194, "grad_norm": 0.22769370675086975, "learning_rate": 2e-05, "loss": 2.3075, "step": 1200 }, { "epoch": 0.134071508060268, "grad_norm": 0.2851308286190033, "learning_rate": 1.9975e-05, "loss": 2.317, "step": 1201 }, { "epoch": 0.134183141289294, "grad_norm": 0.24706991016864777, "learning_rate": 1.995e-05, "loss": 2.2672, "step": 1202 }, { "epoch": 0.13429477451832006, "grad_norm": 0.21826080977916718, "learning_rate": 1.9925000000000003e-05, "loss": 2.3373, "step": 1203 }, { "epoch": 0.1344064077473461, "grad_norm": 0.23195642232894897, "learning_rate": 1.9900000000000003e-05, "loss": 2.271, "step": 1204 }, { "epoch": 0.13451804097637213, "grad_norm": 0.24489794671535492, "learning_rate": 1.9875000000000002e-05, "loss": 2.3, "step": 1205 }, { "epoch": 0.13462967420539818, "grad_norm": 0.23644982278347015, "learning_rate": 1.985e-05, "loss": 2.2082, "step": 1206 }, { "epoch": 0.1347413074344242, "grad_norm": 0.22759714722633362, "learning_rate": 1.9825e-05, "loss": 2.3624, "step": 1207 }, { "epoch": 0.13485294066345024, "grad_norm": 0.23812660574913025, "learning_rate": 1.9800000000000004e-05, "loss": 2.3198, "step": 1208 }, { "epoch": 0.13496457389247626, "grad_norm": 0.22150012850761414, "learning_rate": 1.9775000000000003e-05, "loss": 2.2949, "step": 1209 }, { "epoch": 0.1350762071215023, "grad_norm": 0.23681719601154327, "learning_rate": 1.9750000000000002e-05, "loss": 2.4068, "step": 1210 }, { "epoch": 0.13518784035052833, "grad_norm": 0.22480838000774384, "learning_rate": 1.9725000000000002e-05, "loss": 2.4212, "step": 1211 }, { "epoch": 0.13529947357955438, "grad_norm": 0.2516225576400757, "learning_rate": 1.97e-05, "loss": 2.3919, "step": 1212 }, { "epoch": 0.1354111068085804, "grad_norm": 0.22985118627548218, "learning_rate": 1.9675e-05, "loss": 2.3349, "step": 1213 }, { "epoch": 0.13552274003760645, "grad_norm": 0.24210740625858307, "learning_rate": 1.9650000000000003e-05, "loss": 2.3892, "step": 1214 }, { "epoch": 0.13563437326663247, "grad_norm": 0.23913182318210602, "learning_rate": 1.9625000000000003e-05, "loss": 2.3997, "step": 1215 }, { "epoch": 0.13574600649565852, "grad_norm": 0.2336951494216919, "learning_rate": 1.9600000000000002e-05, "loss": 2.2605, "step": 1216 }, { "epoch": 0.13585763972468454, "grad_norm": 0.24042119085788727, "learning_rate": 1.9575e-05, "loss": 2.28, "step": 1217 }, { "epoch": 0.13596927295371058, "grad_norm": 0.24132166802883148, "learning_rate": 1.955e-05, "loss": 2.3416, "step": 1218 }, { "epoch": 0.1360809061827366, "grad_norm": 0.22675567865371704, "learning_rate": 1.9525e-05, "loss": 2.4432, "step": 1219 }, { "epoch": 0.13619253941176265, "grad_norm": 0.2387012392282486, "learning_rate": 1.9500000000000003e-05, "loss": 2.3219, "step": 1220 }, { "epoch": 0.1363041726407887, "grad_norm": 0.22851043939590454, "learning_rate": 1.9475000000000002e-05, "loss": 2.3047, "step": 1221 }, { "epoch": 0.13641580586981472, "grad_norm": 0.229843869805336, "learning_rate": 1.9450000000000002e-05, "loss": 2.3684, "step": 1222 }, { "epoch": 0.13652743909884077, "grad_norm": 0.22853203117847443, "learning_rate": 1.9425e-05, "loss": 2.3583, "step": 1223 }, { "epoch": 0.1366390723278668, "grad_norm": 0.22501179575920105, "learning_rate": 1.94e-05, "loss": 2.3816, "step": 1224 }, { "epoch": 0.13675070555689284, "grad_norm": 0.2398713231086731, "learning_rate": 1.9375e-05, "loss": 2.2891, "step": 1225 }, { "epoch": 0.13686233878591886, "grad_norm": 0.23787732422351837, "learning_rate": 1.9350000000000003e-05, "loss": 2.3867, "step": 1226 }, { "epoch": 0.1369739720149449, "grad_norm": 0.2299504280090332, "learning_rate": 1.9325000000000002e-05, "loss": 2.358, "step": 1227 }, { "epoch": 0.13708560524397093, "grad_norm": 0.23067769408226013, "learning_rate": 1.93e-05, "loss": 2.2818, "step": 1228 }, { "epoch": 0.13719723847299697, "grad_norm": 0.22578711807727814, "learning_rate": 1.9275e-05, "loss": 2.3077, "step": 1229 }, { "epoch": 0.137308871702023, "grad_norm": 0.2431231141090393, "learning_rate": 1.925e-05, "loss": 2.448, "step": 1230 }, { "epoch": 0.13742050493104904, "grad_norm": 0.22373032569885254, "learning_rate": 1.9225e-05, "loss": 2.3161, "step": 1231 }, { "epoch": 0.13753213816007506, "grad_norm": 0.22712518274784088, "learning_rate": 1.9200000000000003e-05, "loss": 2.327, "step": 1232 }, { "epoch": 0.1376437713891011, "grad_norm": 0.23568111658096313, "learning_rate": 1.9175000000000002e-05, "loss": 2.2981, "step": 1233 }, { "epoch": 0.13775540461812713, "grad_norm": 0.4218859076499939, "learning_rate": 1.915e-05, "loss": 2.3714, "step": 1234 }, { "epoch": 0.13786703784715318, "grad_norm": 0.25230151414871216, "learning_rate": 1.9125e-05, "loss": 2.3094, "step": 1235 }, { "epoch": 0.13797867107617923, "grad_norm": 0.2286907434463501, "learning_rate": 1.91e-05, "loss": 2.3385, "step": 1236 }, { "epoch": 0.13809030430520525, "grad_norm": 0.22605471312999725, "learning_rate": 1.9075000000000003e-05, "loss": 2.3362, "step": 1237 }, { "epoch": 0.1382019375342313, "grad_norm": 0.34180840849876404, "learning_rate": 1.9050000000000002e-05, "loss": 2.409, "step": 1238 }, { "epoch": 0.13831357076325732, "grad_norm": 0.23643071949481964, "learning_rate": 1.9025e-05, "loss": 2.3804, "step": 1239 }, { "epoch": 0.13842520399228336, "grad_norm": 0.23849943280220032, "learning_rate": 1.9e-05, "loss": 2.2734, "step": 1240 }, { "epoch": 0.13853683722130938, "grad_norm": 0.22949104011058807, "learning_rate": 1.8975e-05, "loss": 2.3875, "step": 1241 }, { "epoch": 0.13864847045033543, "grad_norm": 0.22790773212909698, "learning_rate": 1.895e-05, "loss": 2.2737, "step": 1242 }, { "epoch": 0.13876010367936145, "grad_norm": 0.23416166007518768, "learning_rate": 1.8925000000000003e-05, "loss": 2.2871, "step": 1243 }, { "epoch": 0.1388717369083875, "grad_norm": 0.22195836901664734, "learning_rate": 1.8900000000000002e-05, "loss": 2.281, "step": 1244 }, { "epoch": 0.13898337013741352, "grad_norm": 0.2336248904466629, "learning_rate": 1.8875e-05, "loss": 2.416, "step": 1245 }, { "epoch": 0.13909500336643957, "grad_norm": 0.23051492869853973, "learning_rate": 1.885e-05, "loss": 2.2719, "step": 1246 }, { "epoch": 0.1392066365954656, "grad_norm": 0.2252211719751358, "learning_rate": 1.8825e-05, "loss": 2.4248, "step": 1247 }, { "epoch": 0.13931826982449164, "grad_norm": 0.21714623272418976, "learning_rate": 1.88e-05, "loss": 2.3514, "step": 1248 }, { "epoch": 0.13942990305351766, "grad_norm": 0.23050609230995178, "learning_rate": 1.8775000000000002e-05, "loss": 2.3583, "step": 1249 }, { "epoch": 0.1395415362825437, "grad_norm": 0.2242250144481659, "learning_rate": 1.8750000000000002e-05, "loss": 2.2994, "step": 1250 }, { "epoch": 0.13965316951156972, "grad_norm": 0.23751282691955566, "learning_rate": 1.8725e-05, "loss": 2.2898, "step": 1251 }, { "epoch": 0.13976480274059577, "grad_norm": 0.240337535738945, "learning_rate": 1.87e-05, "loss": 2.3338, "step": 1252 }, { "epoch": 0.13987643596962182, "grad_norm": 0.2228308618068695, "learning_rate": 1.8675e-05, "loss": 2.3378, "step": 1253 }, { "epoch": 0.13998806919864784, "grad_norm": 0.22002586722373962, "learning_rate": 1.865e-05, "loss": 2.3768, "step": 1254 }, { "epoch": 0.1400997024276739, "grad_norm": 0.22754792869091034, "learning_rate": 1.8625000000000002e-05, "loss": 2.3875, "step": 1255 }, { "epoch": 0.1402113356566999, "grad_norm": 0.22698433697223663, "learning_rate": 1.86e-05, "loss": 2.3624, "step": 1256 }, { "epoch": 0.14032296888572596, "grad_norm": 0.2388381063938141, "learning_rate": 1.8575e-05, "loss": 2.3388, "step": 1257 }, { "epoch": 0.14043460211475198, "grad_norm": 0.2522680461406708, "learning_rate": 1.855e-05, "loss": 2.3738, "step": 1258 }, { "epoch": 0.14054623534377803, "grad_norm": 0.22983700037002563, "learning_rate": 1.8525e-05, "loss": 2.4546, "step": 1259 }, { "epoch": 0.14065786857280405, "grad_norm": 0.23299263417720795, "learning_rate": 1.85e-05, "loss": 2.3668, "step": 1260 }, { "epoch": 0.1407695018018301, "grad_norm": 0.29829832911491394, "learning_rate": 1.8475000000000002e-05, "loss": 2.3765, "step": 1261 }, { "epoch": 0.14088113503085611, "grad_norm": 0.2792015075683594, "learning_rate": 1.845e-05, "loss": 2.2744, "step": 1262 }, { "epoch": 0.14099276825988216, "grad_norm": 0.2371360808610916, "learning_rate": 1.8425e-05, "loss": 2.3681, "step": 1263 }, { "epoch": 0.14110440148890818, "grad_norm": 0.22529280185699463, "learning_rate": 1.84e-05, "loss": 2.2995, "step": 1264 }, { "epoch": 0.14121603471793423, "grad_norm": 0.23717930912971497, "learning_rate": 1.8375e-05, "loss": 2.3641, "step": 1265 }, { "epoch": 0.14132766794696025, "grad_norm": 0.23806947469711304, "learning_rate": 1.8350000000000002e-05, "loss": 2.2403, "step": 1266 }, { "epoch": 0.1414393011759863, "grad_norm": 0.2357507348060608, "learning_rate": 1.8325e-05, "loss": 2.3355, "step": 1267 }, { "epoch": 0.14155093440501232, "grad_norm": 0.2314206063747406, "learning_rate": 1.83e-05, "loss": 2.3525, "step": 1268 }, { "epoch": 0.14166256763403837, "grad_norm": 0.23410086333751678, "learning_rate": 1.8275e-05, "loss": 2.3385, "step": 1269 }, { "epoch": 0.14177420086306441, "grad_norm": 0.23491892218589783, "learning_rate": 1.825e-05, "loss": 2.443, "step": 1270 }, { "epoch": 0.14188583409209043, "grad_norm": 0.224997416138649, "learning_rate": 1.8225e-05, "loss": 2.3451, "step": 1271 }, { "epoch": 0.14199746732111648, "grad_norm": 0.2989085614681244, "learning_rate": 1.8200000000000002e-05, "loss": 2.3746, "step": 1272 }, { "epoch": 0.1421091005501425, "grad_norm": 0.2321501523256302, "learning_rate": 1.8175e-05, "loss": 2.1922, "step": 1273 }, { "epoch": 0.14222073377916855, "grad_norm": 0.2198849767446518, "learning_rate": 1.815e-05, "loss": 2.2675, "step": 1274 }, { "epoch": 0.14233236700819457, "grad_norm": 0.22828496992588043, "learning_rate": 1.8125e-05, "loss": 2.3126, "step": 1275 }, { "epoch": 0.14244400023722062, "grad_norm": 0.23503446578979492, "learning_rate": 1.81e-05, "loss": 2.325, "step": 1276 }, { "epoch": 0.14255563346624664, "grad_norm": 0.22161200642585754, "learning_rate": 1.8075e-05, "loss": 2.4076, "step": 1277 }, { "epoch": 0.1426672666952727, "grad_norm": 0.22467480599880219, "learning_rate": 1.805e-05, "loss": 2.4028, "step": 1278 }, { "epoch": 0.1427788999242987, "grad_norm": 0.23092903196811676, "learning_rate": 1.8025e-05, "loss": 2.3841, "step": 1279 }, { "epoch": 0.14289053315332476, "grad_norm": 0.23917998373508453, "learning_rate": 1.8e-05, "loss": 2.3114, "step": 1280 }, { "epoch": 0.14300216638235078, "grad_norm": 0.24001942574977875, "learning_rate": 1.7975e-05, "loss": 2.2419, "step": 1281 }, { "epoch": 0.14311379961137682, "grad_norm": 0.22311803698539734, "learning_rate": 1.795e-05, "loss": 2.35, "step": 1282 }, { "epoch": 0.14322543284040284, "grad_norm": 0.23798157274723053, "learning_rate": 1.7925e-05, "loss": 2.404, "step": 1283 }, { "epoch": 0.1433370660694289, "grad_norm": 0.22350966930389404, "learning_rate": 1.79e-05, "loss": 2.4122, "step": 1284 }, { "epoch": 0.1434486992984549, "grad_norm": 0.2272355854511261, "learning_rate": 1.7875e-05, "loss": 2.4368, "step": 1285 }, { "epoch": 0.14356033252748096, "grad_norm": 0.21722903847694397, "learning_rate": 1.785e-05, "loss": 2.3516, "step": 1286 }, { "epoch": 0.143671965756507, "grad_norm": 0.23155128955841064, "learning_rate": 1.7825e-05, "loss": 2.3527, "step": 1287 }, { "epoch": 0.14378359898553303, "grad_norm": 0.22640927135944366, "learning_rate": 1.78e-05, "loss": 2.4748, "step": 1288 }, { "epoch": 0.14389523221455908, "grad_norm": 0.23734673857688904, "learning_rate": 1.7775e-05, "loss": 2.2844, "step": 1289 }, { "epoch": 0.1440068654435851, "grad_norm": 0.2328357696533203, "learning_rate": 1.775e-05, "loss": 2.3389, "step": 1290 }, { "epoch": 0.14411849867261114, "grad_norm": 0.21968986093997955, "learning_rate": 1.7725e-05, "loss": 2.3395, "step": 1291 }, { "epoch": 0.14423013190163717, "grad_norm": 0.21870984137058258, "learning_rate": 1.77e-05, "loss": 2.3882, "step": 1292 }, { "epoch": 0.1443417651306632, "grad_norm": 0.23083637654781342, "learning_rate": 1.7675e-05, "loss": 2.3653, "step": 1293 }, { "epoch": 0.14445339835968923, "grad_norm": 0.23204779624938965, "learning_rate": 1.765e-05, "loss": 2.4653, "step": 1294 }, { "epoch": 0.14456503158871528, "grad_norm": 0.23749560117721558, "learning_rate": 1.7625e-05, "loss": 2.3105, "step": 1295 }, { "epoch": 0.1446766648177413, "grad_norm": 0.23485244810581207, "learning_rate": 1.76e-05, "loss": 2.3721, "step": 1296 }, { "epoch": 0.14478829804676735, "grad_norm": 0.24895891547203064, "learning_rate": 1.7575e-05, "loss": 2.3752, "step": 1297 }, { "epoch": 0.14489993127579337, "grad_norm": 0.24030578136444092, "learning_rate": 1.755e-05, "loss": 2.3025, "step": 1298 }, { "epoch": 0.14501156450481942, "grad_norm": 0.23427051305770874, "learning_rate": 1.7525e-05, "loss": 2.431, "step": 1299 }, { "epoch": 0.14512319773384544, "grad_norm": 0.29638248682022095, "learning_rate": 1.75e-05, "loss": 2.3806, "step": 1300 }, { "epoch": 0.1452348309628715, "grad_norm": 0.24296362698078156, "learning_rate": 1.7475e-05, "loss": 2.3058, "step": 1301 }, { "epoch": 0.14534646419189753, "grad_norm": 0.21712899208068848, "learning_rate": 1.745e-05, "loss": 2.2896, "step": 1302 }, { "epoch": 0.14545809742092355, "grad_norm": 0.22417020797729492, "learning_rate": 1.7425e-05, "loss": 2.3631, "step": 1303 }, { "epoch": 0.1455697306499496, "grad_norm": 0.22780650854110718, "learning_rate": 1.74e-05, "loss": 2.441, "step": 1304 }, { "epoch": 0.14568136387897562, "grad_norm": 0.22729064524173737, "learning_rate": 1.7375e-05, "loss": 2.4841, "step": 1305 }, { "epoch": 0.14579299710800167, "grad_norm": 0.23002713918685913, "learning_rate": 1.7349999999999998e-05, "loss": 2.2594, "step": 1306 }, { "epoch": 0.1459046303370277, "grad_norm": 0.22959677875041962, "learning_rate": 1.7325e-05, "loss": 2.3681, "step": 1307 }, { "epoch": 0.14601626356605374, "grad_norm": 0.23864157497882843, "learning_rate": 1.73e-05, "loss": 2.413, "step": 1308 }, { "epoch": 0.14612789679507976, "grad_norm": 0.2260824590921402, "learning_rate": 1.7275e-05, "loss": 2.4128, "step": 1309 }, { "epoch": 0.1462395300241058, "grad_norm": 0.23661834001541138, "learning_rate": 1.725e-05, "loss": 2.3452, "step": 1310 }, { "epoch": 0.14635116325313183, "grad_norm": 0.24539943039417267, "learning_rate": 1.7225e-05, "loss": 2.452, "step": 1311 }, { "epoch": 0.14646279648215788, "grad_norm": 0.24564427137374878, "learning_rate": 1.7199999999999998e-05, "loss": 2.2986, "step": 1312 }, { "epoch": 0.1465744297111839, "grad_norm": 0.22026251256465912, "learning_rate": 1.7175e-05, "loss": 2.4334, "step": 1313 }, { "epoch": 0.14668606294020994, "grad_norm": 0.22923065721988678, "learning_rate": 1.7150000000000004e-05, "loss": 2.3331, "step": 1314 }, { "epoch": 0.14679769616923596, "grad_norm": 0.26323196291923523, "learning_rate": 1.7125000000000003e-05, "loss": 2.2923, "step": 1315 }, { "epoch": 0.146909329398262, "grad_norm": 0.2375718653202057, "learning_rate": 1.7100000000000002e-05, "loss": 2.4087, "step": 1316 }, { "epoch": 0.14702096262728803, "grad_norm": 0.22378355264663696, "learning_rate": 1.7075e-05, "loss": 2.3135, "step": 1317 }, { "epoch": 0.14713259585631408, "grad_norm": 0.2237062305212021, "learning_rate": 1.705e-05, "loss": 2.2467, "step": 1318 }, { "epoch": 0.14724422908534013, "grad_norm": 0.24400672316551208, "learning_rate": 1.7025e-05, "loss": 2.1746, "step": 1319 }, { "epoch": 0.14735586231436615, "grad_norm": 0.22511416673660278, "learning_rate": 1.7000000000000003e-05, "loss": 2.4436, "step": 1320 }, { "epoch": 0.1474674955433922, "grad_norm": 0.23048102855682373, "learning_rate": 1.6975000000000003e-05, "loss": 2.3356, "step": 1321 }, { "epoch": 0.14757912877241822, "grad_norm": 0.2408643364906311, "learning_rate": 1.6950000000000002e-05, "loss": 2.2649, "step": 1322 }, { "epoch": 0.14769076200144426, "grad_norm": 0.2274709939956665, "learning_rate": 1.6925e-05, "loss": 2.3392, "step": 1323 }, { "epoch": 0.14780239523047028, "grad_norm": 0.22548739612102509, "learning_rate": 1.69e-05, "loss": 2.2553, "step": 1324 }, { "epoch": 0.14791402845949633, "grad_norm": 0.24427109956741333, "learning_rate": 1.6875000000000004e-05, "loss": 2.3549, "step": 1325 }, { "epoch": 0.14802566168852235, "grad_norm": 0.2285667061805725, "learning_rate": 1.6850000000000003e-05, "loss": 2.4703, "step": 1326 }, { "epoch": 0.1481372949175484, "grad_norm": 0.22185759246349335, "learning_rate": 1.6825000000000002e-05, "loss": 2.4652, "step": 1327 }, { "epoch": 0.14824892814657442, "grad_norm": 0.23354068398475647, "learning_rate": 1.6800000000000002e-05, "loss": 2.2939, "step": 1328 }, { "epoch": 0.14836056137560047, "grad_norm": 0.22763966023921967, "learning_rate": 1.6775e-05, "loss": 2.3484, "step": 1329 }, { "epoch": 0.1484721946046265, "grad_norm": 0.23366783559322357, "learning_rate": 1.675e-05, "loss": 2.2578, "step": 1330 }, { "epoch": 0.14858382783365254, "grad_norm": 0.22202405333518982, "learning_rate": 1.6725000000000003e-05, "loss": 2.4142, "step": 1331 }, { "epoch": 0.14869546106267856, "grad_norm": 0.499910831451416, "learning_rate": 1.6700000000000003e-05, "loss": 2.304, "step": 1332 }, { "epoch": 0.1488070942917046, "grad_norm": 0.22948618233203888, "learning_rate": 1.6675000000000002e-05, "loss": 2.3483, "step": 1333 }, { "epoch": 0.14891872752073063, "grad_norm": 0.2281409054994583, "learning_rate": 1.665e-05, "loss": 2.3, "step": 1334 }, { "epoch": 0.14903036074975667, "grad_norm": 0.22423812747001648, "learning_rate": 1.6625e-05, "loss": 2.3268, "step": 1335 }, { "epoch": 0.14914199397878272, "grad_norm": 0.22844356298446655, "learning_rate": 1.66e-05, "loss": 2.2694, "step": 1336 }, { "epoch": 0.14925362720780874, "grad_norm": 0.23927628993988037, "learning_rate": 1.6575000000000003e-05, "loss": 2.3171, "step": 1337 }, { "epoch": 0.1493652604368348, "grad_norm": 0.22286780178546906, "learning_rate": 1.6550000000000002e-05, "loss": 2.2984, "step": 1338 }, { "epoch": 0.1494768936658608, "grad_norm": 0.2628430128097534, "learning_rate": 1.6525000000000002e-05, "loss": 2.4076, "step": 1339 }, { "epoch": 0.14958852689488686, "grad_norm": 0.227376326918602, "learning_rate": 1.65e-05, "loss": 2.2738, "step": 1340 }, { "epoch": 0.14970016012391288, "grad_norm": 0.27796196937561035, "learning_rate": 1.6475e-05, "loss": 2.2671, "step": 1341 }, { "epoch": 0.14981179335293893, "grad_norm": 0.2336723506450653, "learning_rate": 1.645e-05, "loss": 2.3649, "step": 1342 }, { "epoch": 0.14992342658196495, "grad_norm": 0.2579742670059204, "learning_rate": 1.6425000000000003e-05, "loss": 2.4033, "step": 1343 }, { "epoch": 0.150035059810991, "grad_norm": 0.23226085305213928, "learning_rate": 1.6400000000000002e-05, "loss": 2.3702, "step": 1344 }, { "epoch": 0.15014669304001702, "grad_norm": 0.22210608422756195, "learning_rate": 1.6375e-05, "loss": 2.3124, "step": 1345 }, { "epoch": 0.15025832626904306, "grad_norm": 0.2297833412885666, "learning_rate": 1.635e-05, "loss": 2.311, "step": 1346 }, { "epoch": 0.15036995949806908, "grad_norm": 0.480037659406662, "learning_rate": 1.6325e-05, "loss": 2.3665, "step": 1347 }, { "epoch": 0.15048159272709513, "grad_norm": 0.22692376375198364, "learning_rate": 1.63e-05, "loss": 2.4278, "step": 1348 }, { "epoch": 0.15059322595612115, "grad_norm": 0.23020422458648682, "learning_rate": 1.6275000000000003e-05, "loss": 2.3182, "step": 1349 }, { "epoch": 0.1507048591851472, "grad_norm": 0.2347160279750824, "learning_rate": 1.6250000000000002e-05, "loss": 2.4645, "step": 1350 }, { "epoch": 0.15081649241417325, "grad_norm": 0.24892841279506683, "learning_rate": 1.6225e-05, "loss": 2.3542, "step": 1351 }, { "epoch": 0.15092812564319927, "grad_norm": 0.2282174974679947, "learning_rate": 1.62e-05, "loss": 2.4327, "step": 1352 }, { "epoch": 0.15103975887222532, "grad_norm": 0.22344060242176056, "learning_rate": 1.6175e-05, "loss": 2.3244, "step": 1353 }, { "epoch": 0.15115139210125134, "grad_norm": 0.2454865276813507, "learning_rate": 1.6150000000000003e-05, "loss": 2.4319, "step": 1354 }, { "epoch": 0.15126302533027738, "grad_norm": 0.238905668258667, "learning_rate": 1.6125000000000002e-05, "loss": 2.3431, "step": 1355 }, { "epoch": 0.1513746585593034, "grad_norm": 0.22759667038917542, "learning_rate": 1.6100000000000002e-05, "loss": 2.4397, "step": 1356 }, { "epoch": 0.15148629178832945, "grad_norm": 0.22996599972248077, "learning_rate": 1.6075e-05, "loss": 2.4378, "step": 1357 }, { "epoch": 0.15159792501735547, "grad_norm": 0.23668399453163147, "learning_rate": 1.605e-05, "loss": 2.3791, "step": 1358 }, { "epoch": 0.15170955824638152, "grad_norm": 0.23384137451648712, "learning_rate": 1.6025e-05, "loss": 2.2818, "step": 1359 }, { "epoch": 0.15182119147540754, "grad_norm": 0.23646210134029388, "learning_rate": 1.6000000000000003e-05, "loss": 2.306, "step": 1360 }, { "epoch": 0.1519328247044336, "grad_norm": 0.23297393321990967, "learning_rate": 1.5975000000000002e-05, "loss": 2.3061, "step": 1361 }, { "epoch": 0.1520444579334596, "grad_norm": 0.23053903877735138, "learning_rate": 1.595e-05, "loss": 2.2829, "step": 1362 }, { "epoch": 0.15215609116248566, "grad_norm": 0.22388771176338196, "learning_rate": 1.5925e-05, "loss": 2.2914, "step": 1363 }, { "epoch": 0.15226772439151168, "grad_norm": 0.22923487424850464, "learning_rate": 1.59e-05, "loss": 2.2915, "step": 1364 }, { "epoch": 0.15237935762053773, "grad_norm": 0.22146253287792206, "learning_rate": 1.5875e-05, "loss": 2.4338, "step": 1365 }, { "epoch": 0.15249099084956375, "grad_norm": 0.22123506665229797, "learning_rate": 1.5850000000000002e-05, "loss": 2.3959, "step": 1366 }, { "epoch": 0.1526026240785898, "grad_norm": 0.31164827942848206, "learning_rate": 1.5825000000000002e-05, "loss": 2.3052, "step": 1367 }, { "epoch": 0.15271425730761584, "grad_norm": 0.23311270773410797, "learning_rate": 1.58e-05, "loss": 2.343, "step": 1368 }, { "epoch": 0.15282589053664186, "grad_norm": 0.25240063667297363, "learning_rate": 1.5775e-05, "loss": 2.3322, "step": 1369 }, { "epoch": 0.1529375237656679, "grad_norm": 0.2153395712375641, "learning_rate": 1.575e-05, "loss": 2.3915, "step": 1370 }, { "epoch": 0.15304915699469393, "grad_norm": 0.23929548263549805, "learning_rate": 1.5725e-05, "loss": 2.382, "step": 1371 }, { "epoch": 0.15316079022371998, "grad_norm": 0.2313188761472702, "learning_rate": 1.5700000000000002e-05, "loss": 2.3818, "step": 1372 }, { "epoch": 0.153272423452746, "grad_norm": 0.22789841890335083, "learning_rate": 1.5675e-05, "loss": 2.3657, "step": 1373 }, { "epoch": 0.15338405668177205, "grad_norm": 0.2191040813922882, "learning_rate": 1.565e-05, "loss": 2.3667, "step": 1374 }, { "epoch": 0.15349568991079807, "grad_norm": 0.22538387775421143, "learning_rate": 1.5625e-05, "loss": 2.2652, "step": 1375 }, { "epoch": 0.15360732313982411, "grad_norm": 0.23191332817077637, "learning_rate": 1.56e-05, "loss": 2.4353, "step": 1376 }, { "epoch": 0.15371895636885013, "grad_norm": 0.23539386689662933, "learning_rate": 1.5575e-05, "loss": 2.3348, "step": 1377 }, { "epoch": 0.15383058959787618, "grad_norm": 0.2552396357059479, "learning_rate": 1.5550000000000002e-05, "loss": 2.2693, "step": 1378 }, { "epoch": 0.1539422228269022, "grad_norm": 0.2361995130777359, "learning_rate": 1.5525e-05, "loss": 2.3506, "step": 1379 }, { "epoch": 0.15405385605592825, "grad_norm": 0.2353193461894989, "learning_rate": 1.55e-05, "loss": 2.2858, "step": 1380 }, { "epoch": 0.15416548928495427, "grad_norm": 0.23434515297412872, "learning_rate": 1.5475e-05, "loss": 2.3942, "step": 1381 }, { "epoch": 0.15427712251398032, "grad_norm": 0.25974181294441223, "learning_rate": 1.545e-05, "loss": 2.3877, "step": 1382 }, { "epoch": 0.15438875574300634, "grad_norm": 0.2274274230003357, "learning_rate": 1.5425000000000002e-05, "loss": 2.4084, "step": 1383 }, { "epoch": 0.1545003889720324, "grad_norm": 0.23268136382102966, "learning_rate": 1.54e-05, "loss": 2.3343, "step": 1384 }, { "epoch": 0.15461202220105844, "grad_norm": 0.23196761310100555, "learning_rate": 1.5375e-05, "loss": 2.3164, "step": 1385 }, { "epoch": 0.15472365543008446, "grad_norm": 0.22755514085292816, "learning_rate": 1.535e-05, "loss": 2.3012, "step": 1386 }, { "epoch": 0.1548352886591105, "grad_norm": 0.21987079083919525, "learning_rate": 1.5325e-05, "loss": 2.3306, "step": 1387 }, { "epoch": 0.15494692188813652, "grad_norm": 0.22733251750469208, "learning_rate": 1.53e-05, "loss": 2.2869, "step": 1388 }, { "epoch": 0.15505855511716257, "grad_norm": 0.23861852288246155, "learning_rate": 1.5275000000000002e-05, "loss": 2.2962, "step": 1389 }, { "epoch": 0.1551701883461886, "grad_norm": 0.2321462631225586, "learning_rate": 1.525e-05, "loss": 2.2596, "step": 1390 }, { "epoch": 0.15528182157521464, "grad_norm": 0.2401476502418518, "learning_rate": 1.5225e-05, "loss": 2.3632, "step": 1391 }, { "epoch": 0.15539345480424066, "grad_norm": 0.2222481220960617, "learning_rate": 1.52e-05, "loss": 2.4276, "step": 1392 }, { "epoch": 0.1555050880332667, "grad_norm": 0.23532716929912567, "learning_rate": 1.5175e-05, "loss": 2.3198, "step": 1393 }, { "epoch": 0.15561672126229273, "grad_norm": 0.24865378439426422, "learning_rate": 1.515e-05, "loss": 2.3288, "step": 1394 }, { "epoch": 0.15572835449131878, "grad_norm": 0.23691876232624054, "learning_rate": 1.5125e-05, "loss": 2.3948, "step": 1395 }, { "epoch": 0.1558399877203448, "grad_norm": 0.21998995542526245, "learning_rate": 1.51e-05, "loss": 2.2551, "step": 1396 }, { "epoch": 0.15595162094937084, "grad_norm": 0.23590844869613647, "learning_rate": 1.5075e-05, "loss": 2.327, "step": 1397 }, { "epoch": 0.15606325417839687, "grad_norm": 0.227497398853302, "learning_rate": 1.505e-05, "loss": 2.2747, "step": 1398 }, { "epoch": 0.1561748874074229, "grad_norm": 0.23721876740455627, "learning_rate": 1.5025000000000001e-05, "loss": 2.2471, "step": 1399 }, { "epoch": 0.15628652063644893, "grad_norm": 0.23734678328037262, "learning_rate": 1.5e-05, "loss": 2.3562, "step": 1400 }, { "epoch": 0.15639815386547498, "grad_norm": 0.23354454338550568, "learning_rate": 1.4975e-05, "loss": 2.2572, "step": 1401 }, { "epoch": 0.15650978709450103, "grad_norm": 0.2284322828054428, "learning_rate": 1.4950000000000001e-05, "loss": 2.3481, "step": 1402 }, { "epoch": 0.15662142032352705, "grad_norm": 0.24697981774806976, "learning_rate": 1.4925e-05, "loss": 2.4658, "step": 1403 }, { "epoch": 0.1567330535525531, "grad_norm": 0.23272185027599335, "learning_rate": 1.49e-05, "loss": 2.4396, "step": 1404 }, { "epoch": 0.15684468678157912, "grad_norm": 0.22883062064647675, "learning_rate": 1.4875e-05, "loss": 2.4235, "step": 1405 }, { "epoch": 0.15695632001060517, "grad_norm": 0.23410239815711975, "learning_rate": 1.485e-05, "loss": 2.3252, "step": 1406 }, { "epoch": 0.1570679532396312, "grad_norm": 0.22218795120716095, "learning_rate": 1.4825e-05, "loss": 2.3215, "step": 1407 }, { "epoch": 0.15717958646865723, "grad_norm": 0.22799868881702423, "learning_rate": 1.48e-05, "loss": 2.2427, "step": 1408 }, { "epoch": 0.15729121969768325, "grad_norm": 0.2517700791358948, "learning_rate": 1.4775e-05, "loss": 2.3081, "step": 1409 }, { "epoch": 0.1574028529267093, "grad_norm": 0.23836027085781097, "learning_rate": 1.475e-05, "loss": 2.3097, "step": 1410 }, { "epoch": 0.15751448615573532, "grad_norm": 0.22234973311424255, "learning_rate": 1.4725e-05, "loss": 2.4121, "step": 1411 }, { "epoch": 0.15762611938476137, "grad_norm": 0.35192346572875977, "learning_rate": 1.47e-05, "loss": 2.3617, "step": 1412 }, { "epoch": 0.1577377526137874, "grad_norm": 0.2238391488790512, "learning_rate": 1.4675e-05, "loss": 2.3145, "step": 1413 }, { "epoch": 0.15784938584281344, "grad_norm": 0.2211901992559433, "learning_rate": 1.465e-05, "loss": 2.3678, "step": 1414 }, { "epoch": 0.15796101907183946, "grad_norm": 0.23561525344848633, "learning_rate": 1.4625e-05, "loss": 2.3846, "step": 1415 }, { "epoch": 0.1580726523008655, "grad_norm": 0.23191802203655243, "learning_rate": 1.4599999999999999e-05, "loss": 2.2361, "step": 1416 }, { "epoch": 0.15818428552989156, "grad_norm": 0.22749824821949005, "learning_rate": 1.4575e-05, "loss": 2.2809, "step": 1417 }, { "epoch": 0.15829591875891758, "grad_norm": 0.22720323503017426, "learning_rate": 1.455e-05, "loss": 2.4112, "step": 1418 }, { "epoch": 0.15840755198794362, "grad_norm": 0.22353364527225494, "learning_rate": 1.4524999999999999e-05, "loss": 2.3453, "step": 1419 }, { "epoch": 0.15851918521696964, "grad_norm": 0.22859126329421997, "learning_rate": 1.45e-05, "loss": 2.3357, "step": 1420 }, { "epoch": 0.1586308184459957, "grad_norm": 0.22939516603946686, "learning_rate": 1.4475e-05, "loss": 2.3544, "step": 1421 }, { "epoch": 0.1587424516750217, "grad_norm": 0.3586462736129761, "learning_rate": 1.4449999999999999e-05, "loss": 2.3628, "step": 1422 }, { "epoch": 0.15885408490404776, "grad_norm": 0.24023115634918213, "learning_rate": 1.4425e-05, "loss": 2.2797, "step": 1423 }, { "epoch": 0.15896571813307378, "grad_norm": 0.22737418115139008, "learning_rate": 1.44e-05, "loss": 2.3599, "step": 1424 }, { "epoch": 0.15907735136209983, "grad_norm": 0.29583752155303955, "learning_rate": 1.4374999999999999e-05, "loss": 2.3229, "step": 1425 }, { "epoch": 0.15918898459112585, "grad_norm": 0.22859321534633636, "learning_rate": 1.435e-05, "loss": 2.256, "step": 1426 }, { "epoch": 0.1593006178201519, "grad_norm": 0.2283511608839035, "learning_rate": 1.4325e-05, "loss": 2.3383, "step": 1427 }, { "epoch": 0.15941225104917792, "grad_norm": 0.24180850386619568, "learning_rate": 1.43e-05, "loss": 2.3953, "step": 1428 }, { "epoch": 0.15952388427820396, "grad_norm": 0.22382569313049316, "learning_rate": 1.4275e-05, "loss": 2.2684, "step": 1429 }, { "epoch": 0.15963551750722998, "grad_norm": 0.23606614768505096, "learning_rate": 1.4249999999999999e-05, "loss": 2.4133, "step": 1430 }, { "epoch": 0.15974715073625603, "grad_norm": 0.23414404690265656, "learning_rate": 1.4225e-05, "loss": 2.2983, "step": 1431 }, { "epoch": 0.15985878396528205, "grad_norm": 0.2279421091079712, "learning_rate": 1.42e-05, "loss": 2.3842, "step": 1432 }, { "epoch": 0.1599704171943081, "grad_norm": 0.22593237459659576, "learning_rate": 1.4174999999999999e-05, "loss": 2.3737, "step": 1433 }, { "epoch": 0.16008205042333415, "grad_norm": 0.2245030701160431, "learning_rate": 1.415e-05, "loss": 2.2731, "step": 1434 }, { "epoch": 0.16019368365236017, "grad_norm": 0.26113003492355347, "learning_rate": 1.4125e-05, "loss": 2.3804, "step": 1435 }, { "epoch": 0.16030531688138622, "grad_norm": 0.2301686853170395, "learning_rate": 1.4099999999999999e-05, "loss": 2.2143, "step": 1436 }, { "epoch": 0.16041695011041224, "grad_norm": 0.2274000197649002, "learning_rate": 1.4075e-05, "loss": 2.4118, "step": 1437 }, { "epoch": 0.16052858333943829, "grad_norm": 0.23563620448112488, "learning_rate": 1.4050000000000003e-05, "loss": 2.3734, "step": 1438 }, { "epoch": 0.1606402165684643, "grad_norm": 0.2320801466703415, "learning_rate": 1.4025000000000002e-05, "loss": 2.144, "step": 1439 }, { "epoch": 0.16075184979749035, "grad_norm": 0.2227548360824585, "learning_rate": 1.4000000000000001e-05, "loss": 2.3084, "step": 1440 }, { "epoch": 0.16086348302651637, "grad_norm": 0.22315476834774017, "learning_rate": 1.3975000000000003e-05, "loss": 2.2599, "step": 1441 }, { "epoch": 0.16097511625554242, "grad_norm": 0.2336408942937851, "learning_rate": 1.3950000000000002e-05, "loss": 2.4105, "step": 1442 }, { "epoch": 0.16108674948456844, "grad_norm": 0.22350838780403137, "learning_rate": 1.3925000000000001e-05, "loss": 2.2656, "step": 1443 }, { "epoch": 0.1611983827135945, "grad_norm": 0.2283545583486557, "learning_rate": 1.3900000000000002e-05, "loss": 2.3147, "step": 1444 }, { "epoch": 0.1613100159426205, "grad_norm": 0.22985070943832397, "learning_rate": 1.3875000000000002e-05, "loss": 2.2738, "step": 1445 }, { "epoch": 0.16142164917164656, "grad_norm": 0.22797125577926636, "learning_rate": 1.3850000000000001e-05, "loss": 2.3832, "step": 1446 }, { "epoch": 0.16153328240067258, "grad_norm": 0.22750802338123322, "learning_rate": 1.3825000000000002e-05, "loss": 2.3116, "step": 1447 }, { "epoch": 0.16164491562969863, "grad_norm": 0.23097005486488342, "learning_rate": 1.3800000000000002e-05, "loss": 2.391, "step": 1448 }, { "epoch": 0.16175654885872465, "grad_norm": 0.23328077793121338, "learning_rate": 1.3775000000000001e-05, "loss": 2.3447, "step": 1449 }, { "epoch": 0.1618681820877507, "grad_norm": 0.2308788150548935, "learning_rate": 1.3750000000000002e-05, "loss": 2.438, "step": 1450 }, { "epoch": 0.16197981531677674, "grad_norm": 0.22981509566307068, "learning_rate": 1.3725000000000002e-05, "loss": 2.3295, "step": 1451 }, { "epoch": 0.16209144854580276, "grad_norm": 0.23261301219463348, "learning_rate": 1.3700000000000001e-05, "loss": 2.3438, "step": 1452 }, { "epoch": 0.1622030817748288, "grad_norm": 0.22711966931819916, "learning_rate": 1.3675000000000002e-05, "loss": 2.3795, "step": 1453 }, { "epoch": 0.16231471500385483, "grad_norm": 0.22391866147518158, "learning_rate": 1.3650000000000001e-05, "loss": 2.2794, "step": 1454 }, { "epoch": 0.16242634823288088, "grad_norm": 0.22070038318634033, "learning_rate": 1.3625e-05, "loss": 2.3572, "step": 1455 }, { "epoch": 0.1625379814619069, "grad_norm": 0.2527678310871124, "learning_rate": 1.3600000000000002e-05, "loss": 2.4289, "step": 1456 }, { "epoch": 0.16264961469093295, "grad_norm": 0.22040753066539764, "learning_rate": 1.3575000000000001e-05, "loss": 2.3787, "step": 1457 }, { "epoch": 0.16276124791995897, "grad_norm": 0.2231033891439438, "learning_rate": 1.3550000000000002e-05, "loss": 2.3683, "step": 1458 }, { "epoch": 0.16287288114898502, "grad_norm": 0.22828775644302368, "learning_rate": 1.3525000000000002e-05, "loss": 2.3088, "step": 1459 }, { "epoch": 0.16298451437801104, "grad_norm": 0.22570709884166718, "learning_rate": 1.3500000000000001e-05, "loss": 2.307, "step": 1460 }, { "epoch": 0.16309614760703708, "grad_norm": 0.8901817798614502, "learning_rate": 1.3475000000000002e-05, "loss": 2.3083, "step": 1461 }, { "epoch": 0.1632077808360631, "grad_norm": 0.31848806142807007, "learning_rate": 1.3450000000000002e-05, "loss": 2.416, "step": 1462 }, { "epoch": 0.16331941406508915, "grad_norm": 0.7419360280036926, "learning_rate": 1.3425000000000001e-05, "loss": 2.3243, "step": 1463 }, { "epoch": 0.16343104729411517, "grad_norm": 0.22873902320861816, "learning_rate": 1.3400000000000002e-05, "loss": 2.344, "step": 1464 }, { "epoch": 0.16354268052314122, "grad_norm": 0.23168662190437317, "learning_rate": 1.3375000000000002e-05, "loss": 2.3418, "step": 1465 }, { "epoch": 0.16365431375216724, "grad_norm": 0.2376638650894165, "learning_rate": 1.3350000000000001e-05, "loss": 2.2826, "step": 1466 }, { "epoch": 0.1637659469811933, "grad_norm": 0.23183491826057434, "learning_rate": 1.3325000000000002e-05, "loss": 2.3392, "step": 1467 }, { "epoch": 0.16387758021021934, "grad_norm": 0.23273466527462006, "learning_rate": 1.3300000000000001e-05, "loss": 2.3427, "step": 1468 }, { "epoch": 0.16398921343924536, "grad_norm": 0.2245101034641266, "learning_rate": 1.3275e-05, "loss": 2.3461, "step": 1469 }, { "epoch": 0.1641008466682714, "grad_norm": 0.2254142016172409, "learning_rate": 1.3250000000000002e-05, "loss": 2.4757, "step": 1470 }, { "epoch": 0.16421247989729743, "grad_norm": 0.26567506790161133, "learning_rate": 1.3225000000000001e-05, "loss": 2.4493, "step": 1471 }, { "epoch": 0.16432411312632347, "grad_norm": 0.22109505534172058, "learning_rate": 1.32e-05, "loss": 2.2728, "step": 1472 }, { "epoch": 0.1644357463553495, "grad_norm": 0.2308911383152008, "learning_rate": 1.3175000000000002e-05, "loss": 2.2651, "step": 1473 }, { "epoch": 0.16454737958437554, "grad_norm": 0.3326926529407501, "learning_rate": 1.3150000000000001e-05, "loss": 2.3822, "step": 1474 }, { "epoch": 0.16465901281340156, "grad_norm": 0.2321234494447708, "learning_rate": 1.3125e-05, "loss": 2.3652, "step": 1475 }, { "epoch": 0.1647706460424276, "grad_norm": 0.22293105721473694, "learning_rate": 1.3100000000000002e-05, "loss": 2.3627, "step": 1476 }, { "epoch": 0.16488227927145363, "grad_norm": 0.22420254349708557, "learning_rate": 1.3075000000000001e-05, "loss": 2.3223, "step": 1477 }, { "epoch": 0.16499391250047968, "grad_norm": 0.24831721186637878, "learning_rate": 1.305e-05, "loss": 2.255, "step": 1478 }, { "epoch": 0.1651055457295057, "grad_norm": 0.23158615827560425, "learning_rate": 1.3025000000000002e-05, "loss": 2.3876, "step": 1479 }, { "epoch": 0.16521717895853175, "grad_norm": 0.2287922203540802, "learning_rate": 1.3000000000000001e-05, "loss": 2.3559, "step": 1480 }, { "epoch": 0.16532881218755777, "grad_norm": 0.26463231444358826, "learning_rate": 1.2975e-05, "loss": 2.375, "step": 1481 }, { "epoch": 0.16544044541658381, "grad_norm": 0.23984935879707336, "learning_rate": 1.2950000000000001e-05, "loss": 2.3024, "step": 1482 }, { "epoch": 0.16555207864560986, "grad_norm": 0.3325958251953125, "learning_rate": 1.2925e-05, "loss": 2.3871, "step": 1483 }, { "epoch": 0.16566371187463588, "grad_norm": 0.23081143200397491, "learning_rate": 1.29e-05, "loss": 2.4202, "step": 1484 }, { "epoch": 0.16577534510366193, "grad_norm": 0.2344684600830078, "learning_rate": 1.2875000000000001e-05, "loss": 2.2737, "step": 1485 }, { "epoch": 0.16588697833268795, "grad_norm": 0.22795481979846954, "learning_rate": 1.285e-05, "loss": 2.3237, "step": 1486 }, { "epoch": 0.165998611561714, "grad_norm": 0.2752438187599182, "learning_rate": 1.2825000000000002e-05, "loss": 2.3495, "step": 1487 }, { "epoch": 0.16611024479074002, "grad_norm": 0.22877170145511627, "learning_rate": 1.2800000000000001e-05, "loss": 2.3932, "step": 1488 }, { "epoch": 0.16622187801976607, "grad_norm": 0.23439621925354004, "learning_rate": 1.2775e-05, "loss": 2.3022, "step": 1489 }, { "epoch": 0.1663335112487921, "grad_norm": 0.2311062514781952, "learning_rate": 1.2750000000000002e-05, "loss": 2.2674, "step": 1490 }, { "epoch": 0.16644514447781814, "grad_norm": 0.2476395070552826, "learning_rate": 1.2725000000000001e-05, "loss": 2.41, "step": 1491 }, { "epoch": 0.16655677770684416, "grad_norm": 0.22706589102745056, "learning_rate": 1.27e-05, "loss": 2.2607, "step": 1492 }, { "epoch": 0.1666684109358702, "grad_norm": 0.22713389992713928, "learning_rate": 1.2675000000000001e-05, "loss": 2.3115, "step": 1493 }, { "epoch": 0.16678004416489622, "grad_norm": 0.2343800812959671, "learning_rate": 1.2650000000000001e-05, "loss": 2.3025, "step": 1494 }, { "epoch": 0.16689167739392227, "grad_norm": 0.23538857698440552, "learning_rate": 1.2625e-05, "loss": 2.2622, "step": 1495 }, { "epoch": 0.1670033106229483, "grad_norm": 0.2369028478860855, "learning_rate": 1.2600000000000001e-05, "loss": 2.4365, "step": 1496 }, { "epoch": 0.16711494385197434, "grad_norm": 0.2249869555234909, "learning_rate": 1.2575e-05, "loss": 2.3179, "step": 1497 }, { "epoch": 0.16722657708100036, "grad_norm": 0.22788971662521362, "learning_rate": 1.255e-05, "loss": 2.4045, "step": 1498 }, { "epoch": 0.1673382103100264, "grad_norm": 0.24183280766010284, "learning_rate": 1.2525000000000001e-05, "loss": 2.2165, "step": 1499 }, { "epoch": 0.16744984353905246, "grad_norm": 0.2287655472755432, "learning_rate": 1.25e-05, "loss": 2.4086, "step": 1500 }, { "epoch": 0.16756147676807848, "grad_norm": 0.22445043921470642, "learning_rate": 1.2475e-05, "loss": 2.3954, "step": 1501 }, { "epoch": 0.16767310999710452, "grad_norm": 0.23435929417610168, "learning_rate": 1.2450000000000001e-05, "loss": 2.335, "step": 1502 }, { "epoch": 0.16778474322613054, "grad_norm": 0.24711214005947113, "learning_rate": 1.2425e-05, "loss": 2.339, "step": 1503 }, { "epoch": 0.1678963764551566, "grad_norm": 0.2275834083557129, "learning_rate": 1.24e-05, "loss": 2.3166, "step": 1504 }, { "epoch": 0.1680080096841826, "grad_norm": 0.23408271372318268, "learning_rate": 1.2375000000000001e-05, "loss": 2.3536, "step": 1505 }, { "epoch": 0.16811964291320866, "grad_norm": 0.24109122157096863, "learning_rate": 1.235e-05, "loss": 2.244, "step": 1506 }, { "epoch": 0.16823127614223468, "grad_norm": 0.24764837324619293, "learning_rate": 1.2325e-05, "loss": 2.3222, "step": 1507 }, { "epoch": 0.16834290937126073, "grad_norm": 0.2633460462093353, "learning_rate": 1.23e-05, "loss": 2.2406, "step": 1508 }, { "epoch": 0.16845454260028675, "grad_norm": 0.23464356362819672, "learning_rate": 1.2275e-05, "loss": 2.4122, "step": 1509 }, { "epoch": 0.1685661758293128, "grad_norm": 0.23036916553974152, "learning_rate": 1.225e-05, "loss": 2.3748, "step": 1510 }, { "epoch": 0.16867780905833882, "grad_norm": 0.22803008556365967, "learning_rate": 1.2225e-05, "loss": 2.3297, "step": 1511 }, { "epoch": 0.16878944228736487, "grad_norm": 0.22997453808784485, "learning_rate": 1.22e-05, "loss": 2.4096, "step": 1512 }, { "epoch": 0.16890107551639089, "grad_norm": 0.2285720407962799, "learning_rate": 1.2175e-05, "loss": 2.3791, "step": 1513 }, { "epoch": 0.16901270874541693, "grad_norm": 0.23072819411754608, "learning_rate": 1.215e-05, "loss": 2.2956, "step": 1514 }, { "epoch": 0.16912434197444295, "grad_norm": 0.2504909336566925, "learning_rate": 1.2125e-05, "loss": 2.3782, "step": 1515 }, { "epoch": 0.169235975203469, "grad_norm": 0.2371397614479065, "learning_rate": 1.2100000000000001e-05, "loss": 2.2162, "step": 1516 }, { "epoch": 0.16934760843249505, "grad_norm": 0.3986074924468994, "learning_rate": 1.2075e-05, "loss": 2.4737, "step": 1517 }, { "epoch": 0.16945924166152107, "grad_norm": 0.24999715387821198, "learning_rate": 1.205e-05, "loss": 2.2256, "step": 1518 }, { "epoch": 0.16957087489054712, "grad_norm": 0.23626287281513214, "learning_rate": 1.2025000000000001e-05, "loss": 2.331, "step": 1519 }, { "epoch": 0.16968250811957314, "grad_norm": 0.24345801770687103, "learning_rate": 1.2e-05, "loss": 2.3845, "step": 1520 }, { "epoch": 0.1697941413485992, "grad_norm": 0.22939099371433258, "learning_rate": 1.1975e-05, "loss": 2.2987, "step": 1521 }, { "epoch": 0.1699057745776252, "grad_norm": 0.22851888835430145, "learning_rate": 1.195e-05, "loss": 2.2962, "step": 1522 }, { "epoch": 0.17001740780665126, "grad_norm": 0.23316575586795807, "learning_rate": 1.1925e-05, "loss": 2.2429, "step": 1523 }, { "epoch": 0.17012904103567728, "grad_norm": 0.23692427575588226, "learning_rate": 1.19e-05, "loss": 2.2889, "step": 1524 }, { "epoch": 0.17024067426470332, "grad_norm": 0.2287677824497223, "learning_rate": 1.1875e-05, "loss": 2.3004, "step": 1525 }, { "epoch": 0.17035230749372934, "grad_norm": 0.23153142631053925, "learning_rate": 1.185e-05, "loss": 2.3339, "step": 1526 }, { "epoch": 0.1704639407227554, "grad_norm": 0.23223859071731567, "learning_rate": 1.1825e-05, "loss": 2.3562, "step": 1527 }, { "epoch": 0.1705755739517814, "grad_norm": 0.2501643896102905, "learning_rate": 1.18e-05, "loss": 2.3524, "step": 1528 }, { "epoch": 0.17068720718080746, "grad_norm": 0.227057546377182, "learning_rate": 1.1775e-05, "loss": 2.3482, "step": 1529 }, { "epoch": 0.17079884040983348, "grad_norm": 0.23041006922721863, "learning_rate": 1.175e-05, "loss": 2.2926, "step": 1530 }, { "epoch": 0.17091047363885953, "grad_norm": 0.23476162552833557, "learning_rate": 1.1725e-05, "loss": 2.2397, "step": 1531 }, { "epoch": 0.17102210686788555, "grad_norm": 0.23537229001522064, "learning_rate": 1.1700000000000001e-05, "loss": 2.321, "step": 1532 }, { "epoch": 0.1711337400969116, "grad_norm": 0.22815583646297455, "learning_rate": 1.1675000000000001e-05, "loss": 2.3891, "step": 1533 }, { "epoch": 0.17124537332593764, "grad_norm": 0.23210576176643372, "learning_rate": 1.1650000000000002e-05, "loss": 2.2983, "step": 1534 }, { "epoch": 0.17135700655496366, "grad_norm": 0.2980906665325165, "learning_rate": 1.1625000000000001e-05, "loss": 2.3548, "step": 1535 }, { "epoch": 0.1714686397839897, "grad_norm": 0.2320065051317215, "learning_rate": 1.16e-05, "loss": 2.3705, "step": 1536 }, { "epoch": 0.17158027301301573, "grad_norm": 0.5941006541252136, "learning_rate": 1.1575000000000002e-05, "loss": 2.303, "step": 1537 }, { "epoch": 0.17169190624204178, "grad_norm": 0.23008465766906738, "learning_rate": 1.1550000000000001e-05, "loss": 2.2684, "step": 1538 }, { "epoch": 0.1718035394710678, "grad_norm": 0.2237699329853058, "learning_rate": 1.1525e-05, "loss": 2.3254, "step": 1539 }, { "epoch": 0.17191517270009385, "grad_norm": 0.2662789225578308, "learning_rate": 1.1500000000000002e-05, "loss": 2.3753, "step": 1540 }, { "epoch": 0.17202680592911987, "grad_norm": 0.4782852232456207, "learning_rate": 1.1475000000000001e-05, "loss": 2.2175, "step": 1541 }, { "epoch": 0.17213843915814592, "grad_norm": 0.23662321269512177, "learning_rate": 1.145e-05, "loss": 2.3435, "step": 1542 }, { "epoch": 0.17225007238717194, "grad_norm": 0.25570255517959595, "learning_rate": 1.1425000000000002e-05, "loss": 2.4225, "step": 1543 }, { "epoch": 0.17236170561619799, "grad_norm": 1.1005254983901978, "learning_rate": 1.1400000000000001e-05, "loss": 2.339, "step": 1544 }, { "epoch": 0.172473338845224, "grad_norm": 0.23017080128192902, "learning_rate": 1.1375e-05, "loss": 2.3182, "step": 1545 }, { "epoch": 0.17258497207425005, "grad_norm": 0.2425118386745453, "learning_rate": 1.1350000000000001e-05, "loss": 2.2469, "step": 1546 }, { "epoch": 0.17269660530327607, "grad_norm": 0.31308507919311523, "learning_rate": 1.1325e-05, "loss": 2.4381, "step": 1547 }, { "epoch": 0.17280823853230212, "grad_norm": 0.22850260138511658, "learning_rate": 1.13e-05, "loss": 2.401, "step": 1548 }, { "epoch": 0.17291987176132817, "grad_norm": 0.23523540794849396, "learning_rate": 1.1275000000000001e-05, "loss": 2.2428, "step": 1549 }, { "epoch": 0.1730315049903542, "grad_norm": 0.2316836565732956, "learning_rate": 1.125e-05, "loss": 2.2699, "step": 1550 }, { "epoch": 0.17314313821938024, "grad_norm": 0.23121745884418488, "learning_rate": 1.1225e-05, "loss": 2.3717, "step": 1551 }, { "epoch": 0.17325477144840626, "grad_norm": 0.2281220257282257, "learning_rate": 1.1200000000000001e-05, "loss": 2.3741, "step": 1552 }, { "epoch": 0.1733664046774323, "grad_norm": 0.2314029186964035, "learning_rate": 1.1175e-05, "loss": 2.3432, "step": 1553 }, { "epoch": 0.17347803790645833, "grad_norm": 0.229678675532341, "learning_rate": 1.115e-05, "loss": 2.4274, "step": 1554 }, { "epoch": 0.17358967113548437, "grad_norm": 0.23651176691055298, "learning_rate": 1.1125000000000001e-05, "loss": 2.47, "step": 1555 }, { "epoch": 0.1737013043645104, "grad_norm": 0.25404486060142517, "learning_rate": 1.11e-05, "loss": 2.2866, "step": 1556 }, { "epoch": 0.17381293759353644, "grad_norm": 0.2315002977848053, "learning_rate": 1.1075e-05, "loss": 2.323, "step": 1557 }, { "epoch": 0.17392457082256246, "grad_norm": 0.22683122754096985, "learning_rate": 1.1050000000000001e-05, "loss": 2.3863, "step": 1558 }, { "epoch": 0.1740362040515885, "grad_norm": 0.22364884614944458, "learning_rate": 1.1025e-05, "loss": 2.4092, "step": 1559 }, { "epoch": 0.17414783728061453, "grad_norm": 0.22706075012683868, "learning_rate": 1.1000000000000001e-05, "loss": 2.2995, "step": 1560 }, { "epoch": 0.17425947050964058, "grad_norm": 0.22615200281143188, "learning_rate": 1.0975e-05, "loss": 2.274, "step": 1561 }, { "epoch": 0.1743711037386666, "grad_norm": 0.23156067728996277, "learning_rate": 1.095e-05, "loss": 2.3346, "step": 1562 }, { "epoch": 0.17448273696769265, "grad_norm": 0.22800372540950775, "learning_rate": 1.0925000000000001e-05, "loss": 2.3311, "step": 1563 }, { "epoch": 0.17459437019671867, "grad_norm": 0.2314596325159073, "learning_rate": 1.09e-05, "loss": 2.4173, "step": 1564 }, { "epoch": 0.17470600342574472, "grad_norm": 0.22724230587482452, "learning_rate": 1.0875e-05, "loss": 2.2718, "step": 1565 }, { "epoch": 0.17481763665477076, "grad_norm": 0.22000856697559357, "learning_rate": 1.0850000000000001e-05, "loss": 2.2426, "step": 1566 }, { "epoch": 0.17492926988379678, "grad_norm": 0.22086068987846375, "learning_rate": 1.0825e-05, "loss": 2.343, "step": 1567 }, { "epoch": 0.17504090311282283, "grad_norm": 0.23039323091506958, "learning_rate": 1.08e-05, "loss": 2.3363, "step": 1568 }, { "epoch": 0.17515253634184885, "grad_norm": 0.23125073313713074, "learning_rate": 1.0775000000000001e-05, "loss": 2.3756, "step": 1569 }, { "epoch": 0.1752641695708749, "grad_norm": 0.2302304059267044, "learning_rate": 1.075e-05, "loss": 2.292, "step": 1570 }, { "epoch": 0.17537580279990092, "grad_norm": 0.2361575961112976, "learning_rate": 1.0725e-05, "loss": 2.3275, "step": 1571 }, { "epoch": 0.17548743602892697, "grad_norm": 0.2350195348262787, "learning_rate": 1.0700000000000001e-05, "loss": 2.3313, "step": 1572 }, { "epoch": 0.175599069257953, "grad_norm": 0.2335433065891266, "learning_rate": 1.0675e-05, "loss": 2.4255, "step": 1573 }, { "epoch": 0.17571070248697904, "grad_norm": 0.27532052993774414, "learning_rate": 1.065e-05, "loss": 2.4282, "step": 1574 }, { "epoch": 0.17582233571600506, "grad_norm": 0.23674359917640686, "learning_rate": 1.0625e-05, "loss": 2.3127, "step": 1575 }, { "epoch": 0.1759339689450311, "grad_norm": 0.22703874111175537, "learning_rate": 1.06e-05, "loss": 2.4281, "step": 1576 }, { "epoch": 0.17604560217405713, "grad_norm": 0.2311123013496399, "learning_rate": 1.0575e-05, "loss": 2.3588, "step": 1577 }, { "epoch": 0.17615723540308317, "grad_norm": 0.22461971640586853, "learning_rate": 1.055e-05, "loss": 2.3376, "step": 1578 }, { "epoch": 0.1762688686321092, "grad_norm": 0.2341393530368805, "learning_rate": 1.0525e-05, "loss": 2.3273, "step": 1579 }, { "epoch": 0.17638050186113524, "grad_norm": 0.24818100035190582, "learning_rate": 1.05e-05, "loss": 2.4611, "step": 1580 }, { "epoch": 0.17649213509016126, "grad_norm": 0.23257790505886078, "learning_rate": 1.0475e-05, "loss": 2.2885, "step": 1581 }, { "epoch": 0.1766037683191873, "grad_norm": 0.23357973992824554, "learning_rate": 1.045e-05, "loss": 2.2287, "step": 1582 }, { "epoch": 0.17671540154821336, "grad_norm": 0.22697918117046356, "learning_rate": 1.0425e-05, "loss": 2.2872, "step": 1583 }, { "epoch": 0.17682703477723938, "grad_norm": 0.2292308658361435, "learning_rate": 1.04e-05, "loss": 2.332, "step": 1584 }, { "epoch": 0.17693866800626543, "grad_norm": 0.235504150390625, "learning_rate": 1.0375e-05, "loss": 2.3714, "step": 1585 }, { "epoch": 0.17705030123529145, "grad_norm": 0.22994711995124817, "learning_rate": 1.035e-05, "loss": 2.3468, "step": 1586 }, { "epoch": 0.1771619344643175, "grad_norm": 0.23801718652248383, "learning_rate": 1.0325e-05, "loss": 2.3851, "step": 1587 }, { "epoch": 0.17727356769334351, "grad_norm": 0.22894729673862457, "learning_rate": 1.03e-05, "loss": 2.4219, "step": 1588 }, { "epoch": 0.17738520092236956, "grad_norm": 0.2282966822385788, "learning_rate": 1.0275e-05, "loss": 2.3184, "step": 1589 }, { "epoch": 0.17749683415139558, "grad_norm": 0.22595593333244324, "learning_rate": 1.025e-05, "loss": 2.3154, "step": 1590 }, { "epoch": 0.17760846738042163, "grad_norm": 0.2264798879623413, "learning_rate": 1.0225e-05, "loss": 2.2779, "step": 1591 }, { "epoch": 0.17772010060944765, "grad_norm": 0.22668862342834473, "learning_rate": 1.02e-05, "loss": 2.4036, "step": 1592 }, { "epoch": 0.1778317338384737, "grad_norm": 0.22386161983013153, "learning_rate": 1.0175e-05, "loss": 2.2603, "step": 1593 }, { "epoch": 0.17794336706749972, "grad_norm": 0.23508307337760925, "learning_rate": 1.0150000000000001e-05, "loss": 2.3921, "step": 1594 }, { "epoch": 0.17805500029652577, "grad_norm": 0.2235022783279419, "learning_rate": 1.0125e-05, "loss": 2.3779, "step": 1595 }, { "epoch": 0.1781666335255518, "grad_norm": 0.2393585741519928, "learning_rate": 1.0100000000000002e-05, "loss": 2.3471, "step": 1596 }, { "epoch": 0.17827826675457784, "grad_norm": 0.22901956737041473, "learning_rate": 1.0075000000000001e-05, "loss": 2.2271, "step": 1597 }, { "epoch": 0.17838989998360386, "grad_norm": 0.22379013895988464, "learning_rate": 1.005e-05, "loss": 2.3179, "step": 1598 }, { "epoch": 0.1785015332126299, "grad_norm": 0.23061637580394745, "learning_rate": 1.0025000000000001e-05, "loss": 2.4403, "step": 1599 }, { "epoch": 0.17861316644165595, "grad_norm": 0.22944380342960358, "learning_rate": 1e-05, "loss": 2.3587, "step": 1600 }, { "epoch": 0.17872479967068197, "grad_norm": 0.22462834417819977, "learning_rate": 9.975e-06, "loss": 2.366, "step": 1601 }, { "epoch": 0.17883643289970802, "grad_norm": 0.22976066172122955, "learning_rate": 9.950000000000001e-06, "loss": 2.3952, "step": 1602 }, { "epoch": 0.17894806612873404, "grad_norm": 0.22779542207717896, "learning_rate": 9.925e-06, "loss": 2.3513, "step": 1603 }, { "epoch": 0.1790596993577601, "grad_norm": 0.22328883409500122, "learning_rate": 9.900000000000002e-06, "loss": 2.3977, "step": 1604 }, { "epoch": 0.1791713325867861, "grad_norm": 0.2311943769454956, "learning_rate": 9.875000000000001e-06, "loss": 2.2658, "step": 1605 }, { "epoch": 0.17928296581581216, "grad_norm": 0.2562137246131897, "learning_rate": 9.85e-06, "loss": 2.3592, "step": 1606 }, { "epoch": 0.17939459904483818, "grad_norm": 0.2313629388809204, "learning_rate": 9.825000000000002e-06, "loss": 2.2974, "step": 1607 }, { "epoch": 0.17950623227386422, "grad_norm": 0.23150859773159027, "learning_rate": 9.800000000000001e-06, "loss": 2.428, "step": 1608 }, { "epoch": 0.17961786550289024, "grad_norm": 0.23653681576251984, "learning_rate": 9.775e-06, "loss": 2.2849, "step": 1609 }, { "epoch": 0.1797294987319163, "grad_norm": 0.23083879053592682, "learning_rate": 9.750000000000002e-06, "loss": 2.3614, "step": 1610 }, { "epoch": 0.1798411319609423, "grad_norm": 0.23122479021549225, "learning_rate": 9.725000000000001e-06, "loss": 2.3737, "step": 1611 }, { "epoch": 0.17995276518996836, "grad_norm": 0.22781018912792206, "learning_rate": 9.7e-06, "loss": 2.3419, "step": 1612 }, { "epoch": 0.18006439841899438, "grad_norm": 0.2270815223455429, "learning_rate": 9.675000000000001e-06, "loss": 2.3468, "step": 1613 }, { "epoch": 0.18017603164802043, "grad_norm": 0.2297106683254242, "learning_rate": 9.65e-06, "loss": 2.3164, "step": 1614 }, { "epoch": 0.18028766487704648, "grad_norm": 0.22484375536441803, "learning_rate": 9.625e-06, "loss": 2.5492, "step": 1615 }, { "epoch": 0.1803992981060725, "grad_norm": 0.230937659740448, "learning_rate": 9.600000000000001e-06, "loss": 2.3003, "step": 1616 }, { "epoch": 0.18051093133509855, "grad_norm": 0.23577038943767548, "learning_rate": 9.575e-06, "loss": 2.2976, "step": 1617 }, { "epoch": 0.18062256456412457, "grad_norm": 0.23040169477462769, "learning_rate": 9.55e-06, "loss": 2.3458, "step": 1618 }, { "epoch": 0.1807341977931506, "grad_norm": 0.23320811986923218, "learning_rate": 9.525000000000001e-06, "loss": 2.3712, "step": 1619 }, { "epoch": 0.18084583102217663, "grad_norm": 0.2216322273015976, "learning_rate": 9.5e-06, "loss": 2.3903, "step": 1620 }, { "epoch": 0.18095746425120268, "grad_norm": 0.22143380343914032, "learning_rate": 9.475e-06, "loss": 2.3709, "step": 1621 }, { "epoch": 0.1810690974802287, "grad_norm": 0.22373448312282562, "learning_rate": 9.450000000000001e-06, "loss": 2.3535, "step": 1622 }, { "epoch": 0.18118073070925475, "grad_norm": 0.22735032439231873, "learning_rate": 9.425e-06, "loss": 2.3133, "step": 1623 }, { "epoch": 0.18129236393828077, "grad_norm": 0.2304399609565735, "learning_rate": 9.4e-06, "loss": 2.3577, "step": 1624 }, { "epoch": 0.18140399716730682, "grad_norm": 0.22027041018009186, "learning_rate": 9.375000000000001e-06, "loss": 2.352, "step": 1625 }, { "epoch": 0.18151563039633284, "grad_norm": 0.23104159533977509, "learning_rate": 9.35e-06, "loss": 2.3603, "step": 1626 }, { "epoch": 0.1816272636253589, "grad_norm": 0.26686009764671326, "learning_rate": 9.325e-06, "loss": 2.328, "step": 1627 }, { "epoch": 0.1817388968543849, "grad_norm": 0.22924385964870453, "learning_rate": 9.3e-06, "loss": 2.3588, "step": 1628 }, { "epoch": 0.18185053008341096, "grad_norm": 0.22922426462173462, "learning_rate": 9.275e-06, "loss": 2.3412, "step": 1629 }, { "epoch": 0.18196216331243698, "grad_norm": 0.22747567296028137, "learning_rate": 9.25e-06, "loss": 2.3429, "step": 1630 }, { "epoch": 0.18207379654146302, "grad_norm": 0.2208503782749176, "learning_rate": 9.225e-06, "loss": 2.2673, "step": 1631 }, { "epoch": 0.18218542977048907, "grad_norm": 0.24815091490745544, "learning_rate": 9.2e-06, "loss": 2.4107, "step": 1632 }, { "epoch": 0.1822970629995151, "grad_norm": 0.23134523630142212, "learning_rate": 9.175000000000001e-06, "loss": 2.3822, "step": 1633 }, { "epoch": 0.18240869622854114, "grad_norm": 0.2306479662656784, "learning_rate": 9.15e-06, "loss": 2.3913, "step": 1634 }, { "epoch": 0.18252032945756716, "grad_norm": 0.22402219474315643, "learning_rate": 9.125e-06, "loss": 2.3625, "step": 1635 }, { "epoch": 0.1826319626865932, "grad_norm": 0.2309318333864212, "learning_rate": 9.100000000000001e-06, "loss": 2.4505, "step": 1636 }, { "epoch": 0.18274359591561923, "grad_norm": 0.22473448514938354, "learning_rate": 9.075e-06, "loss": 2.3079, "step": 1637 }, { "epoch": 0.18285522914464528, "grad_norm": 0.22978295385837555, "learning_rate": 9.05e-06, "loss": 2.3589, "step": 1638 }, { "epoch": 0.1829668623736713, "grad_norm": 0.22440096735954285, "learning_rate": 9.025e-06, "loss": 2.4401, "step": 1639 }, { "epoch": 0.18307849560269734, "grad_norm": 0.23170286417007446, "learning_rate": 9e-06, "loss": 2.3291, "step": 1640 }, { "epoch": 0.18319012883172336, "grad_norm": 0.24245621263980865, "learning_rate": 8.975e-06, "loss": 2.3008, "step": 1641 }, { "epoch": 0.1833017620607494, "grad_norm": 0.23115895688533783, "learning_rate": 8.95e-06, "loss": 2.3127, "step": 1642 }, { "epoch": 0.18341339528977543, "grad_norm": 0.2309594601392746, "learning_rate": 8.925e-06, "loss": 2.3205, "step": 1643 }, { "epoch": 0.18352502851880148, "grad_norm": 0.2410878688097, "learning_rate": 8.9e-06, "loss": 2.4235, "step": 1644 }, { "epoch": 0.1836366617478275, "grad_norm": 0.2556111216545105, "learning_rate": 8.875e-06, "loss": 2.4193, "step": 1645 }, { "epoch": 0.18374829497685355, "grad_norm": 0.23009131848812103, "learning_rate": 8.85e-06, "loss": 2.3439, "step": 1646 }, { "epoch": 0.18385992820587957, "grad_norm": 0.23371157050132751, "learning_rate": 8.825e-06, "loss": 2.3897, "step": 1647 }, { "epoch": 0.18397156143490562, "grad_norm": 0.23305875062942505, "learning_rate": 8.8e-06, "loss": 2.2757, "step": 1648 }, { "epoch": 0.18408319466393167, "grad_norm": 0.22758939862251282, "learning_rate": 8.775e-06, "loss": 2.3526, "step": 1649 }, { "epoch": 0.18419482789295769, "grad_norm": 0.28928419947624207, "learning_rate": 8.75e-06, "loss": 2.2889, "step": 1650 }, { "epoch": 0.18430646112198373, "grad_norm": 0.23369638621807098, "learning_rate": 8.725e-06, "loss": 2.3379, "step": 1651 }, { "epoch": 0.18441809435100975, "grad_norm": 0.23393464088439941, "learning_rate": 8.7e-06, "loss": 2.3827, "step": 1652 }, { "epoch": 0.1845297275800358, "grad_norm": 0.22967039048671722, "learning_rate": 8.674999999999999e-06, "loss": 2.4529, "step": 1653 }, { "epoch": 0.18464136080906182, "grad_norm": 0.311108261346817, "learning_rate": 8.65e-06, "loss": 2.2626, "step": 1654 }, { "epoch": 0.18475299403808787, "grad_norm": 0.23404546082019806, "learning_rate": 8.625e-06, "loss": 2.2972, "step": 1655 }, { "epoch": 0.1848646272671139, "grad_norm": 0.2330547720193863, "learning_rate": 8.599999999999999e-06, "loss": 2.3406, "step": 1656 }, { "epoch": 0.18497626049613994, "grad_norm": 0.22166459262371063, "learning_rate": 8.575000000000002e-06, "loss": 2.2893, "step": 1657 }, { "epoch": 0.18508789372516596, "grad_norm": 0.2245807945728302, "learning_rate": 8.550000000000001e-06, "loss": 2.1937, "step": 1658 }, { "epoch": 0.185199526954192, "grad_norm": 0.22609424591064453, "learning_rate": 8.525e-06, "loss": 2.3882, "step": 1659 }, { "epoch": 0.18531116018321803, "grad_norm": 0.22625020146369934, "learning_rate": 8.500000000000002e-06, "loss": 2.3893, "step": 1660 }, { "epoch": 0.18542279341224407, "grad_norm": 0.25080451369285583, "learning_rate": 8.475000000000001e-06, "loss": 2.2867, "step": 1661 }, { "epoch": 0.1855344266412701, "grad_norm": 0.23033380508422852, "learning_rate": 8.45e-06, "loss": 2.3523, "step": 1662 }, { "epoch": 0.18564605987029614, "grad_norm": 0.23714114725589752, "learning_rate": 8.425000000000001e-06, "loss": 2.3399, "step": 1663 }, { "epoch": 0.18575769309932216, "grad_norm": 0.23813962936401367, "learning_rate": 8.400000000000001e-06, "loss": 2.3729, "step": 1664 }, { "epoch": 0.1858693263283482, "grad_norm": 0.22549176216125488, "learning_rate": 8.375e-06, "loss": 2.3674, "step": 1665 }, { "epoch": 0.18598095955737426, "grad_norm": 0.2233189046382904, "learning_rate": 8.350000000000001e-06, "loss": 2.3293, "step": 1666 }, { "epoch": 0.18609259278640028, "grad_norm": 0.22096222639083862, "learning_rate": 8.325e-06, "loss": 2.3471, "step": 1667 }, { "epoch": 0.18620422601542633, "grad_norm": 0.2315039187669754, "learning_rate": 8.3e-06, "loss": 2.3451, "step": 1668 }, { "epoch": 0.18631585924445235, "grad_norm": 0.23510505259037018, "learning_rate": 8.275000000000001e-06, "loss": 2.3604, "step": 1669 }, { "epoch": 0.1864274924734784, "grad_norm": 0.23275664448738098, "learning_rate": 8.25e-06, "loss": 2.3314, "step": 1670 }, { "epoch": 0.18653912570250442, "grad_norm": 0.22906853258609772, "learning_rate": 8.225e-06, "loss": 2.3489, "step": 1671 }, { "epoch": 0.18665075893153046, "grad_norm": 0.223612442612648, "learning_rate": 8.200000000000001e-06, "loss": 2.4229, "step": 1672 }, { "epoch": 0.18676239216055648, "grad_norm": 0.22447071969509125, "learning_rate": 8.175e-06, "loss": 2.4356, "step": 1673 }, { "epoch": 0.18687402538958253, "grad_norm": 0.2319689691066742, "learning_rate": 8.15e-06, "loss": 2.3806, "step": 1674 }, { "epoch": 0.18698565861860855, "grad_norm": 0.22558631002902985, "learning_rate": 8.125000000000001e-06, "loss": 2.3246, "step": 1675 }, { "epoch": 0.1870972918476346, "grad_norm": 0.23677797615528107, "learning_rate": 8.1e-06, "loss": 2.3814, "step": 1676 }, { "epoch": 0.18720892507666062, "grad_norm": 0.2355109453201294, "learning_rate": 8.075000000000001e-06, "loss": 2.3477, "step": 1677 }, { "epoch": 0.18732055830568667, "grad_norm": 0.22588761150836945, "learning_rate": 8.050000000000001e-06, "loss": 2.2773, "step": 1678 }, { "epoch": 0.1874321915347127, "grad_norm": 0.22162844240665436, "learning_rate": 8.025e-06, "loss": 2.3412, "step": 1679 }, { "epoch": 0.18754382476373874, "grad_norm": 0.23201774060726166, "learning_rate": 8.000000000000001e-06, "loss": 2.3573, "step": 1680 }, { "epoch": 0.18765545799276478, "grad_norm": 0.22092294692993164, "learning_rate": 7.975e-06, "loss": 2.3214, "step": 1681 }, { "epoch": 0.1877670912217908, "grad_norm": 0.2279776781797409, "learning_rate": 7.95e-06, "loss": 2.3555, "step": 1682 }, { "epoch": 0.18787872445081685, "grad_norm": 0.23055800795555115, "learning_rate": 7.925000000000001e-06, "loss": 2.2783, "step": 1683 }, { "epoch": 0.18799035767984287, "grad_norm": 0.22860021889209747, "learning_rate": 7.9e-06, "loss": 2.3331, "step": 1684 }, { "epoch": 0.18810199090886892, "grad_norm": 0.22126097977161407, "learning_rate": 7.875e-06, "loss": 2.3946, "step": 1685 }, { "epoch": 0.18821362413789494, "grad_norm": 0.22869837284088135, "learning_rate": 7.850000000000001e-06, "loss": 2.2846, "step": 1686 }, { "epoch": 0.188325257366921, "grad_norm": 0.23146143555641174, "learning_rate": 7.825e-06, "loss": 2.3452, "step": 1687 }, { "epoch": 0.188436890595947, "grad_norm": 0.24097077548503876, "learning_rate": 7.8e-06, "loss": 2.2416, "step": 1688 }, { "epoch": 0.18854852382497306, "grad_norm": 0.23362663388252258, "learning_rate": 7.775000000000001e-06, "loss": 2.3445, "step": 1689 }, { "epoch": 0.18866015705399908, "grad_norm": 0.23741386830806732, "learning_rate": 7.75e-06, "loss": 2.4122, "step": 1690 }, { "epoch": 0.18877179028302513, "grad_norm": 0.2494947612285614, "learning_rate": 7.725e-06, "loss": 2.2177, "step": 1691 }, { "epoch": 0.18888342351205115, "grad_norm": 0.23337315022945404, "learning_rate": 7.7e-06, "loss": 2.3019, "step": 1692 }, { "epoch": 0.1889950567410772, "grad_norm": 0.2309933602809906, "learning_rate": 7.675e-06, "loss": 2.3468, "step": 1693 }, { "epoch": 0.18910668997010321, "grad_norm": 0.22482022643089294, "learning_rate": 7.65e-06, "loss": 2.2141, "step": 1694 }, { "epoch": 0.18921832319912926, "grad_norm": 0.24350865185260773, "learning_rate": 7.625e-06, "loss": 2.3257, "step": 1695 }, { "epoch": 0.18932995642815528, "grad_norm": 0.2380324900150299, "learning_rate": 7.6e-06, "loss": 2.3769, "step": 1696 }, { "epoch": 0.18944158965718133, "grad_norm": 0.23215071856975555, "learning_rate": 7.575e-06, "loss": 2.3028, "step": 1697 }, { "epoch": 0.18955322288620738, "grad_norm": 0.24625477194786072, "learning_rate": 7.55e-06, "loss": 2.3829, "step": 1698 }, { "epoch": 0.1896648561152334, "grad_norm": 0.22713284194469452, "learning_rate": 7.525e-06, "loss": 2.3282, "step": 1699 }, { "epoch": 0.18977648934425945, "grad_norm": 0.221211776137352, "learning_rate": 7.5e-06, "loss": 2.325, "step": 1700 }, { "epoch": 0.18988812257328547, "grad_norm": 0.23911140859127045, "learning_rate": 7.4750000000000004e-06, "loss": 2.2369, "step": 1701 }, { "epoch": 0.18999975580231152, "grad_norm": 0.22453325986862183, "learning_rate": 7.45e-06, "loss": 2.3757, "step": 1702 }, { "epoch": 0.19011138903133754, "grad_norm": 0.2336825579404831, "learning_rate": 7.425e-06, "loss": 2.2279, "step": 1703 }, { "epoch": 0.19022302226036358, "grad_norm": 0.2163713276386261, "learning_rate": 7.4e-06, "loss": 2.3206, "step": 1704 }, { "epoch": 0.1903346554893896, "grad_norm": 0.23673447966575623, "learning_rate": 7.375e-06, "loss": 2.3235, "step": 1705 }, { "epoch": 0.19044628871841565, "grad_norm": 0.23599569499492645, "learning_rate": 7.35e-06, "loss": 2.4519, "step": 1706 }, { "epoch": 0.19055792194744167, "grad_norm": 0.23098653554916382, "learning_rate": 7.325e-06, "loss": 2.344, "step": 1707 }, { "epoch": 0.19066955517646772, "grad_norm": 0.2260526716709137, "learning_rate": 7.2999999999999996e-06, "loss": 2.2734, "step": 1708 }, { "epoch": 0.19078118840549374, "grad_norm": 1.934260606765747, "learning_rate": 7.275e-06, "loss": 2.3161, "step": 1709 }, { "epoch": 0.1908928216345198, "grad_norm": 0.22405433654785156, "learning_rate": 7.25e-06, "loss": 2.3647, "step": 1710 }, { "epoch": 0.1910044548635458, "grad_norm": 0.22353483736515045, "learning_rate": 7.2249999999999994e-06, "loss": 2.3636, "step": 1711 }, { "epoch": 0.19111608809257186, "grad_norm": 0.2400137484073639, "learning_rate": 7.2e-06, "loss": 2.4212, "step": 1712 }, { "epoch": 0.19122772132159788, "grad_norm": 0.22043661773204803, "learning_rate": 7.175e-06, "loss": 2.33, "step": 1713 }, { "epoch": 0.19133935455062392, "grad_norm": 0.23072798550128937, "learning_rate": 7.15e-06, "loss": 2.3748, "step": 1714 }, { "epoch": 0.19145098777964997, "grad_norm": 0.22338813543319702, "learning_rate": 7.1249999999999995e-06, "loss": 2.3183, "step": 1715 }, { "epoch": 0.191562621008676, "grad_norm": 0.23001807928085327, "learning_rate": 7.1e-06, "loss": 2.31, "step": 1716 }, { "epoch": 0.19167425423770204, "grad_norm": 0.23304982483386993, "learning_rate": 7.075e-06, "loss": 2.3071, "step": 1717 }, { "epoch": 0.19178588746672806, "grad_norm": 0.2388128638267517, "learning_rate": 7.049999999999999e-06, "loss": 2.3441, "step": 1718 }, { "epoch": 0.1918975206957541, "grad_norm": 0.22486941516399384, "learning_rate": 7.025000000000001e-06, "loss": 2.3495, "step": 1719 }, { "epoch": 0.19200915392478013, "grad_norm": 0.23193509876728058, "learning_rate": 7.000000000000001e-06, "loss": 2.2735, "step": 1720 }, { "epoch": 0.19212078715380618, "grad_norm": 0.23028187453746796, "learning_rate": 6.975000000000001e-06, "loss": 2.2935, "step": 1721 }, { "epoch": 0.1922324203828322, "grad_norm": 0.22155210375785828, "learning_rate": 6.950000000000001e-06, "loss": 2.3499, "step": 1722 }, { "epoch": 0.19234405361185825, "grad_norm": 0.23238146305084229, "learning_rate": 6.925000000000001e-06, "loss": 2.3038, "step": 1723 }, { "epoch": 0.19245568684088427, "grad_norm": 0.2326563447713852, "learning_rate": 6.900000000000001e-06, "loss": 2.3849, "step": 1724 }, { "epoch": 0.1925673200699103, "grad_norm": 0.23055623471736908, "learning_rate": 6.875000000000001e-06, "loss": 2.294, "step": 1725 }, { "epoch": 0.19267895329893633, "grad_norm": 0.22514155507087708, "learning_rate": 6.8500000000000005e-06, "loss": 2.4518, "step": 1726 }, { "epoch": 0.19279058652796238, "grad_norm": 0.22970163822174072, "learning_rate": 6.825000000000001e-06, "loss": 2.3659, "step": 1727 }, { "epoch": 0.1929022197569884, "grad_norm": 0.22430215775966644, "learning_rate": 6.800000000000001e-06, "loss": 2.4064, "step": 1728 }, { "epoch": 0.19301385298601445, "grad_norm": 0.2254297137260437, "learning_rate": 6.775000000000001e-06, "loss": 2.2301, "step": 1729 }, { "epoch": 0.1931254862150405, "grad_norm": 0.23394359648227692, "learning_rate": 6.750000000000001e-06, "loss": 2.311, "step": 1730 }, { "epoch": 0.19323711944406652, "grad_norm": 0.23774048686027527, "learning_rate": 6.725000000000001e-06, "loss": 2.3392, "step": 1731 }, { "epoch": 0.19334875267309257, "grad_norm": 0.23314325511455536, "learning_rate": 6.700000000000001e-06, "loss": 2.2952, "step": 1732 }, { "epoch": 0.1934603859021186, "grad_norm": 0.22659903764724731, "learning_rate": 6.6750000000000005e-06, "loss": 2.3357, "step": 1733 }, { "epoch": 0.19357201913114463, "grad_norm": 0.2298395037651062, "learning_rate": 6.650000000000001e-06, "loss": 2.3717, "step": 1734 }, { "epoch": 0.19368365236017066, "grad_norm": 0.25462180376052856, "learning_rate": 6.625000000000001e-06, "loss": 2.3877, "step": 1735 }, { "epoch": 0.1937952855891967, "grad_norm": 0.22525596618652344, "learning_rate": 6.6e-06, "loss": 2.2066, "step": 1736 }, { "epoch": 0.19390691881822272, "grad_norm": 0.2275206297636032, "learning_rate": 6.5750000000000006e-06, "loss": 2.3948, "step": 1737 }, { "epoch": 0.19401855204724877, "grad_norm": 0.23473072052001953, "learning_rate": 6.550000000000001e-06, "loss": 2.3358, "step": 1738 }, { "epoch": 0.1941301852762748, "grad_norm": 0.2280065417289734, "learning_rate": 6.525e-06, "loss": 2.2785, "step": 1739 }, { "epoch": 0.19424181850530084, "grad_norm": 0.2484257072210312, "learning_rate": 6.5000000000000004e-06, "loss": 2.4323, "step": 1740 }, { "epoch": 0.19435345173432686, "grad_norm": 0.245782271027565, "learning_rate": 6.475000000000001e-06, "loss": 2.3596, "step": 1741 }, { "epoch": 0.1944650849633529, "grad_norm": 0.2271096259355545, "learning_rate": 6.45e-06, "loss": 2.3226, "step": 1742 }, { "epoch": 0.19457671819237893, "grad_norm": 0.23324838280677795, "learning_rate": 6.425e-06, "loss": 2.4395, "step": 1743 }, { "epoch": 0.19468835142140498, "grad_norm": 0.23519247770309448, "learning_rate": 6.4000000000000006e-06, "loss": 2.2311, "step": 1744 }, { "epoch": 0.194799984650431, "grad_norm": 0.22893019020557404, "learning_rate": 6.375000000000001e-06, "loss": 2.3042, "step": 1745 }, { "epoch": 0.19491161787945704, "grad_norm": 0.23204751312732697, "learning_rate": 6.35e-06, "loss": 2.3882, "step": 1746 }, { "epoch": 0.1950232511084831, "grad_norm": 0.23840852081775665, "learning_rate": 6.3250000000000004e-06, "loss": 2.3279, "step": 1747 }, { "epoch": 0.1951348843375091, "grad_norm": 0.2263709455728531, "learning_rate": 6.300000000000001e-06, "loss": 2.3376, "step": 1748 }, { "epoch": 0.19524651756653516, "grad_norm": 0.22863459587097168, "learning_rate": 6.275e-06, "loss": 2.3901, "step": 1749 }, { "epoch": 0.19535815079556118, "grad_norm": 0.3761008381843567, "learning_rate": 6.25e-06, "loss": 2.3816, "step": 1750 }, { "epoch": 0.19546978402458723, "grad_norm": 0.22513067722320557, "learning_rate": 6.2250000000000005e-06, "loss": 2.2964, "step": 1751 }, { "epoch": 0.19558141725361325, "grad_norm": 0.22629620134830475, "learning_rate": 6.2e-06, "loss": 2.4423, "step": 1752 }, { "epoch": 0.1956930504826393, "grad_norm": 0.24019905924797058, "learning_rate": 6.175e-06, "loss": 2.2727, "step": 1753 }, { "epoch": 0.19580468371166532, "grad_norm": 0.23448988795280457, "learning_rate": 6.15e-06, "loss": 2.3269, "step": 1754 }, { "epoch": 0.19591631694069137, "grad_norm": 0.22556640207767487, "learning_rate": 6.125e-06, "loss": 2.1874, "step": 1755 }, { "epoch": 0.19602795016971739, "grad_norm": 0.3827759027481079, "learning_rate": 6.1e-06, "loss": 2.3076, "step": 1756 }, { "epoch": 0.19613958339874343, "grad_norm": 0.22821791470050812, "learning_rate": 6.075e-06, "loss": 2.3521, "step": 1757 }, { "epoch": 0.19625121662776945, "grad_norm": 0.22949843108654022, "learning_rate": 6.0500000000000005e-06, "loss": 2.3864, "step": 1758 }, { "epoch": 0.1963628498567955, "grad_norm": 0.22942288219928741, "learning_rate": 6.025e-06, "loss": 2.3699, "step": 1759 }, { "epoch": 0.19647448308582152, "grad_norm": 0.2209300398826599, "learning_rate": 6e-06, "loss": 2.3237, "step": 1760 }, { "epoch": 0.19658611631484757, "grad_norm": 0.22622691094875336, "learning_rate": 5.975e-06, "loss": 2.3339, "step": 1761 }, { "epoch": 0.1966977495438736, "grad_norm": 0.23727689683437347, "learning_rate": 5.95e-06, "loss": 2.2707, "step": 1762 }, { "epoch": 0.19680938277289964, "grad_norm": 0.2961234450340271, "learning_rate": 5.925e-06, "loss": 2.2837, "step": 1763 }, { "epoch": 0.19692101600192569, "grad_norm": 0.24365472793579102, "learning_rate": 5.9e-06, "loss": 2.3899, "step": 1764 }, { "epoch": 0.1970326492309517, "grad_norm": 0.23506611585617065, "learning_rate": 5.875e-06, "loss": 2.3549, "step": 1765 }, { "epoch": 0.19714428245997775, "grad_norm": 0.22377490997314453, "learning_rate": 5.850000000000001e-06, "loss": 2.2019, "step": 1766 }, { "epoch": 0.19725591568900377, "grad_norm": 0.2170470505952835, "learning_rate": 5.825000000000001e-06, "loss": 2.3304, "step": 1767 }, { "epoch": 0.19736754891802982, "grad_norm": 0.23338063061237335, "learning_rate": 5.8e-06, "loss": 2.3452, "step": 1768 }, { "epoch": 0.19747918214705584, "grad_norm": 0.22321484982967377, "learning_rate": 5.775000000000001e-06, "loss": 2.321, "step": 1769 }, { "epoch": 0.1975908153760819, "grad_norm": 0.23838651180267334, "learning_rate": 5.750000000000001e-06, "loss": 2.3604, "step": 1770 }, { "epoch": 0.1977024486051079, "grad_norm": 0.23211868107318878, "learning_rate": 5.725e-06, "loss": 2.3697, "step": 1771 }, { "epoch": 0.19781408183413396, "grad_norm": 0.23045557737350464, "learning_rate": 5.7000000000000005e-06, "loss": 2.2953, "step": 1772 }, { "epoch": 0.19792571506315998, "grad_norm": 0.23346678912639618, "learning_rate": 5.675000000000001e-06, "loss": 2.3694, "step": 1773 }, { "epoch": 0.19803734829218603, "grad_norm": 0.23384539783000946, "learning_rate": 5.65e-06, "loss": 2.3231, "step": 1774 }, { "epoch": 0.19814898152121205, "grad_norm": 0.22029776871204376, "learning_rate": 5.625e-06, "loss": 2.2113, "step": 1775 }, { "epoch": 0.1982606147502381, "grad_norm": 0.24596329033374786, "learning_rate": 5.600000000000001e-06, "loss": 2.2872, "step": 1776 }, { "epoch": 0.19837224797926412, "grad_norm": 0.2208588421344757, "learning_rate": 5.575e-06, "loss": 2.2893, "step": 1777 }, { "epoch": 0.19848388120829016, "grad_norm": 0.27724799513816833, "learning_rate": 5.55e-06, "loss": 2.3499, "step": 1778 }, { "epoch": 0.19859551443731618, "grad_norm": 0.2821560502052307, "learning_rate": 5.5250000000000005e-06, "loss": 2.2631, "step": 1779 }, { "epoch": 0.19870714766634223, "grad_norm": 0.2291664034128189, "learning_rate": 5.500000000000001e-06, "loss": 2.4097, "step": 1780 }, { "epoch": 0.19881878089536828, "grad_norm": 0.2263956367969513, "learning_rate": 5.475e-06, "loss": 2.4372, "step": 1781 }, { "epoch": 0.1989304141243943, "grad_norm": 0.22560614347457886, "learning_rate": 5.45e-06, "loss": 2.3875, "step": 1782 }, { "epoch": 0.19904204735342035, "grad_norm": 0.3105284571647644, "learning_rate": 5.4250000000000006e-06, "loss": 2.201, "step": 1783 }, { "epoch": 0.19915368058244637, "grad_norm": 0.23622126877307892, "learning_rate": 5.4e-06, "loss": 2.3385, "step": 1784 }, { "epoch": 0.19926531381147242, "grad_norm": 0.23378053307533264, "learning_rate": 5.375e-06, "loss": 2.3863, "step": 1785 }, { "epoch": 0.19937694704049844, "grad_norm": 0.2226409614086151, "learning_rate": 5.3500000000000004e-06, "loss": 2.2851, "step": 1786 }, { "epoch": 0.19948858026952448, "grad_norm": 0.2296893298625946, "learning_rate": 5.325e-06, "loss": 2.3351, "step": 1787 }, { "epoch": 0.1996002134985505, "grad_norm": 0.22921861708164215, "learning_rate": 5.3e-06, "loss": 2.2895, "step": 1788 }, { "epoch": 0.19971184672757655, "grad_norm": 0.23278868198394775, "learning_rate": 5.275e-06, "loss": 2.2488, "step": 1789 }, { "epoch": 0.19982347995660257, "grad_norm": 0.2284834384918213, "learning_rate": 5.25e-06, "loss": 2.3363, "step": 1790 }, { "epoch": 0.19993511318562862, "grad_norm": 0.23466360569000244, "learning_rate": 5.225e-06, "loss": 2.2327, "step": 1791 }, { "epoch": 0.20004674641465464, "grad_norm": 0.23456375300884247, "learning_rate": 5.2e-06, "loss": 2.3441, "step": 1792 }, { "epoch": 0.2001583796436807, "grad_norm": 0.7794348001480103, "learning_rate": 5.175e-06, "loss": 2.2631, "step": 1793 }, { "epoch": 0.2002700128727067, "grad_norm": 0.22322069108486176, "learning_rate": 5.15e-06, "loss": 2.209, "step": 1794 }, { "epoch": 0.20038164610173276, "grad_norm": 0.2890242636203766, "learning_rate": 5.125e-06, "loss": 2.4408, "step": 1795 }, { "epoch": 0.2004932793307588, "grad_norm": 0.23017571866512299, "learning_rate": 5.1e-06, "loss": 2.3729, "step": 1796 }, { "epoch": 0.20060491255978483, "grad_norm": 0.24546416103839874, "learning_rate": 5.0750000000000005e-06, "loss": 2.3414, "step": 1797 }, { "epoch": 0.20071654578881087, "grad_norm": 0.6677102446556091, "learning_rate": 5.050000000000001e-06, "loss": 2.267, "step": 1798 }, { "epoch": 0.2008281790178369, "grad_norm": 0.22715935111045837, "learning_rate": 5.025e-06, "loss": 2.2485, "step": 1799 }, { "epoch": 0.20093981224686294, "grad_norm": 0.24433374404907227, "learning_rate": 5e-06, "loss": 2.2448, "step": 1800 }, { "epoch": 0.20105144547588896, "grad_norm": 0.2298104465007782, "learning_rate": 4.975000000000001e-06, "loss": 2.3481, "step": 1801 }, { "epoch": 0.201163078704915, "grad_norm": 0.2273014336824417, "learning_rate": 4.950000000000001e-06, "loss": 2.2275, "step": 1802 }, { "epoch": 0.20127471193394103, "grad_norm": 0.2342415452003479, "learning_rate": 4.925e-06, "loss": 2.3233, "step": 1803 }, { "epoch": 0.20138634516296708, "grad_norm": 0.23677963018417358, "learning_rate": 4.9000000000000005e-06, "loss": 2.34, "step": 1804 }, { "epoch": 0.2014979783919931, "grad_norm": 0.2304297685623169, "learning_rate": 4.875000000000001e-06, "loss": 2.4005, "step": 1805 }, { "epoch": 0.20160961162101915, "grad_norm": 0.22330938279628754, "learning_rate": 4.85e-06, "loss": 2.31, "step": 1806 }, { "epoch": 0.20172124485004517, "grad_norm": 0.2279394567012787, "learning_rate": 4.825e-06, "loss": 2.2854, "step": 1807 }, { "epoch": 0.20183287807907122, "grad_norm": 0.22466933727264404, "learning_rate": 4.800000000000001e-06, "loss": 2.3226, "step": 1808 }, { "epoch": 0.20194451130809724, "grad_norm": 0.2288239598274231, "learning_rate": 4.775e-06, "loss": 2.2858, "step": 1809 }, { "epoch": 0.20205614453712328, "grad_norm": 0.22833405435085297, "learning_rate": 4.75e-06, "loss": 2.3368, "step": 1810 }, { "epoch": 0.2021677777661493, "grad_norm": 0.23820479214191437, "learning_rate": 4.7250000000000005e-06, "loss": 2.3351, "step": 1811 }, { "epoch": 0.20227941099517535, "grad_norm": 0.22864072024822235, "learning_rate": 4.7e-06, "loss": 2.3744, "step": 1812 }, { "epoch": 0.2023910442242014, "grad_norm": 0.22456790506839752, "learning_rate": 4.675e-06, "loss": 2.2479, "step": 1813 }, { "epoch": 0.20250267745322742, "grad_norm": 0.22676309943199158, "learning_rate": 4.65e-06, "loss": 2.4175, "step": 1814 }, { "epoch": 0.20261431068225347, "grad_norm": 0.2293044477701187, "learning_rate": 4.625e-06, "loss": 2.3798, "step": 1815 }, { "epoch": 0.2027259439112795, "grad_norm": 0.23791825771331787, "learning_rate": 4.6e-06, "loss": 2.2226, "step": 1816 }, { "epoch": 0.20283757714030554, "grad_norm": 0.22495396435260773, "learning_rate": 4.575e-06, "loss": 2.3658, "step": 1817 }, { "epoch": 0.20294921036933156, "grad_norm": 0.22882862389087677, "learning_rate": 4.5500000000000005e-06, "loss": 2.3152, "step": 1818 }, { "epoch": 0.2030608435983576, "grad_norm": 0.24084174633026123, "learning_rate": 4.525e-06, "loss": 2.3789, "step": 1819 }, { "epoch": 0.20317247682738362, "grad_norm": 0.23165123164653778, "learning_rate": 4.5e-06, "loss": 2.3144, "step": 1820 }, { "epoch": 0.20328411005640967, "grad_norm": 0.22660130262374878, "learning_rate": 4.475e-06, "loss": 2.2471, "step": 1821 }, { "epoch": 0.2033957432854357, "grad_norm": 0.23442165553569794, "learning_rate": 4.45e-06, "loss": 2.2553, "step": 1822 }, { "epoch": 0.20350737651446174, "grad_norm": 0.2298109382390976, "learning_rate": 4.425e-06, "loss": 2.3108, "step": 1823 }, { "epoch": 0.20361900974348776, "grad_norm": 0.23466314375400543, "learning_rate": 4.4e-06, "loss": 2.2141, "step": 1824 }, { "epoch": 0.2037306429725138, "grad_norm": 0.2319631427526474, "learning_rate": 4.375e-06, "loss": 2.2492, "step": 1825 }, { "epoch": 0.20384227620153983, "grad_norm": 0.23453742265701294, "learning_rate": 4.35e-06, "loss": 2.2947, "step": 1826 }, { "epoch": 0.20395390943056588, "grad_norm": 0.23079949617385864, "learning_rate": 4.325e-06, "loss": 2.3735, "step": 1827 }, { "epoch": 0.2040655426595919, "grad_norm": 0.2320202887058258, "learning_rate": 4.2999999999999995e-06, "loss": 2.2555, "step": 1828 }, { "epoch": 0.20417717588861795, "grad_norm": 0.23471979796886444, "learning_rate": 4.2750000000000006e-06, "loss": 2.274, "step": 1829 }, { "epoch": 0.204288809117644, "grad_norm": 0.23321862518787384, "learning_rate": 4.250000000000001e-06, "loss": 2.2911, "step": 1830 }, { "epoch": 0.20440044234667, "grad_norm": 0.44062545895576477, "learning_rate": 4.225e-06, "loss": 2.3128, "step": 1831 }, { "epoch": 0.20451207557569606, "grad_norm": 0.29020875692367554, "learning_rate": 4.2000000000000004e-06, "loss": 2.3114, "step": 1832 }, { "epoch": 0.20462370880472208, "grad_norm": 0.2328851819038391, "learning_rate": 4.175000000000001e-06, "loss": 2.3953, "step": 1833 }, { "epoch": 0.20473534203374813, "grad_norm": 0.23879390954971313, "learning_rate": 4.15e-06, "loss": 2.2864, "step": 1834 }, { "epoch": 0.20484697526277415, "grad_norm": 0.2300870269536972, "learning_rate": 4.125e-06, "loss": 2.2513, "step": 1835 }, { "epoch": 0.2049586084918002, "grad_norm": 0.23073944449424744, "learning_rate": 4.1000000000000006e-06, "loss": 2.3797, "step": 1836 }, { "epoch": 0.20507024172082622, "grad_norm": 0.2415030598640442, "learning_rate": 4.075e-06, "loss": 2.2337, "step": 1837 }, { "epoch": 0.20518187494985227, "grad_norm": 0.22740036249160767, "learning_rate": 4.05e-06, "loss": 2.4097, "step": 1838 }, { "epoch": 0.2052935081788783, "grad_norm": 0.22621197998523712, "learning_rate": 4.0250000000000004e-06, "loss": 2.3186, "step": 1839 }, { "epoch": 0.20540514140790433, "grad_norm": 0.22574348747730255, "learning_rate": 4.000000000000001e-06, "loss": 2.268, "step": 1840 }, { "epoch": 0.20551677463693035, "grad_norm": 0.2359447032213211, "learning_rate": 3.975e-06, "loss": 2.3712, "step": 1841 }, { "epoch": 0.2056284078659564, "grad_norm": 0.2573161721229553, "learning_rate": 3.95e-06, "loss": 2.3214, "step": 1842 }, { "epoch": 0.20574004109498242, "grad_norm": 0.2339893877506256, "learning_rate": 3.9250000000000005e-06, "loss": 2.3017, "step": 1843 }, { "epoch": 0.20585167432400847, "grad_norm": 0.26078861951828003, "learning_rate": 3.9e-06, "loss": 2.3372, "step": 1844 }, { "epoch": 0.2059633075530345, "grad_norm": 0.23878271877765656, "learning_rate": 3.875e-06, "loss": 2.3236, "step": 1845 }, { "epoch": 0.20607494078206054, "grad_norm": 0.2323731631040573, "learning_rate": 3.85e-06, "loss": 2.2754, "step": 1846 }, { "epoch": 0.2061865740110866, "grad_norm": 0.23449857532978058, "learning_rate": 3.825e-06, "loss": 2.3331, "step": 1847 }, { "epoch": 0.2062982072401126, "grad_norm": 0.2303149700164795, "learning_rate": 3.8e-06, "loss": 2.2017, "step": 1848 }, { "epoch": 0.20640984046913866, "grad_norm": 0.21848297119140625, "learning_rate": 3.775e-06, "loss": 2.383, "step": 1849 }, { "epoch": 0.20652147369816468, "grad_norm": 0.22456365823745728, "learning_rate": 3.75e-06, "loss": 2.2835, "step": 1850 }, { "epoch": 0.20663310692719072, "grad_norm": 0.23150286078453064, "learning_rate": 3.725e-06, "loss": 2.3369, "step": 1851 }, { "epoch": 0.20674474015621674, "grad_norm": 0.2399853765964508, "learning_rate": 3.7e-06, "loss": 2.3235, "step": 1852 }, { "epoch": 0.2068563733852428, "grad_norm": 0.22788886725902557, "learning_rate": 3.675e-06, "loss": 2.3365, "step": 1853 }, { "epoch": 0.2069680066142688, "grad_norm": 0.22432076930999756, "learning_rate": 3.6499999999999998e-06, "loss": 2.2445, "step": 1854 }, { "epoch": 0.20707963984329486, "grad_norm": 0.230990469455719, "learning_rate": 3.625e-06, "loss": 2.3849, "step": 1855 }, { "epoch": 0.20719127307232088, "grad_norm": 0.22127412259578705, "learning_rate": 3.6e-06, "loss": 2.3069, "step": 1856 }, { "epoch": 0.20730290630134693, "grad_norm": 0.22421187162399292, "learning_rate": 3.575e-06, "loss": 2.3637, "step": 1857 }, { "epoch": 0.20741453953037295, "grad_norm": 0.22667351365089417, "learning_rate": 3.55e-06, "loss": 2.226, "step": 1858 }, { "epoch": 0.207526172759399, "grad_norm": 0.24504856765270233, "learning_rate": 3.5249999999999997e-06, "loss": 2.2467, "step": 1859 }, { "epoch": 0.20763780598842502, "grad_norm": 0.21391868591308594, "learning_rate": 3.5000000000000004e-06, "loss": 2.2939, "step": 1860 }, { "epoch": 0.20774943921745107, "grad_norm": 0.2345769703388214, "learning_rate": 3.4750000000000006e-06, "loss": 2.3104, "step": 1861 }, { "epoch": 0.2078610724464771, "grad_norm": 0.2319745272397995, "learning_rate": 3.4500000000000004e-06, "loss": 2.2999, "step": 1862 }, { "epoch": 0.20797270567550313, "grad_norm": 0.2287844866514206, "learning_rate": 3.4250000000000002e-06, "loss": 2.2385, "step": 1863 }, { "epoch": 0.20808433890452918, "grad_norm": 0.22918793559074402, "learning_rate": 3.4000000000000005e-06, "loss": 2.4109, "step": 1864 }, { "epoch": 0.2081959721335552, "grad_norm": 0.23404212296009064, "learning_rate": 3.3750000000000003e-06, "loss": 2.3283, "step": 1865 }, { "epoch": 0.20830760536258125, "grad_norm": 0.22943396866321564, "learning_rate": 3.3500000000000005e-06, "loss": 2.366, "step": 1866 }, { "epoch": 0.20841923859160727, "grad_norm": 0.2246488332748413, "learning_rate": 3.3250000000000004e-06, "loss": 2.3395, "step": 1867 }, { "epoch": 0.20853087182063332, "grad_norm": 0.24497540295124054, "learning_rate": 3.3e-06, "loss": 2.3761, "step": 1868 }, { "epoch": 0.20864250504965934, "grad_norm": 0.22545543313026428, "learning_rate": 3.2750000000000004e-06, "loss": 2.1653, "step": 1869 }, { "epoch": 0.20875413827868539, "grad_norm": 0.22694288194179535, "learning_rate": 3.2500000000000002e-06, "loss": 2.3608, "step": 1870 }, { "epoch": 0.2088657715077114, "grad_norm": 0.23120814561843872, "learning_rate": 3.225e-06, "loss": 2.1423, "step": 1871 }, { "epoch": 0.20897740473673745, "grad_norm": 0.22243161499500275, "learning_rate": 3.2000000000000003e-06, "loss": 2.3699, "step": 1872 }, { "epoch": 0.20908903796576347, "grad_norm": 0.233660027384758, "learning_rate": 3.175e-06, "loss": 2.213, "step": 1873 }, { "epoch": 0.20920067119478952, "grad_norm": 0.2548787593841553, "learning_rate": 3.1500000000000003e-06, "loss": 2.3905, "step": 1874 }, { "epoch": 0.20931230442381554, "grad_norm": 0.22625088691711426, "learning_rate": 3.125e-06, "loss": 2.3932, "step": 1875 }, { "epoch": 0.2094239376528416, "grad_norm": 0.214766263961792, "learning_rate": 3.1e-06, "loss": 2.2858, "step": 1876 }, { "epoch": 0.2095355708818676, "grad_norm": 0.22801010310649872, "learning_rate": 3.075e-06, "loss": 2.3836, "step": 1877 }, { "epoch": 0.20964720411089366, "grad_norm": 0.21751223504543304, "learning_rate": 3.05e-06, "loss": 2.3788, "step": 1878 }, { "epoch": 0.2097588373399197, "grad_norm": 0.23438245058059692, "learning_rate": 3.0250000000000003e-06, "loss": 2.3414, "step": 1879 }, { "epoch": 0.20987047056894573, "grad_norm": 0.23169319331645966, "learning_rate": 3e-06, "loss": 2.386, "step": 1880 }, { "epoch": 0.20998210379797178, "grad_norm": 0.46034878492355347, "learning_rate": 2.975e-06, "loss": 2.3816, "step": 1881 }, { "epoch": 0.2100937370269978, "grad_norm": 0.2378452867269516, "learning_rate": 2.95e-06, "loss": 2.3081, "step": 1882 }, { "epoch": 0.21020537025602384, "grad_norm": 0.23349280655384064, "learning_rate": 2.9250000000000004e-06, "loss": 2.3548, "step": 1883 }, { "epoch": 0.21031700348504986, "grad_norm": 0.2473306655883789, "learning_rate": 2.9e-06, "loss": 2.3103, "step": 1884 }, { "epoch": 0.2104286367140759, "grad_norm": 0.22535374760627747, "learning_rate": 2.8750000000000004e-06, "loss": 2.3863, "step": 1885 }, { "epoch": 0.21054026994310193, "grad_norm": 0.8349050283432007, "learning_rate": 2.8500000000000002e-06, "loss": 2.3809, "step": 1886 }, { "epoch": 0.21065190317212798, "grad_norm": 0.22305621206760406, "learning_rate": 2.825e-06, "loss": 2.3529, "step": 1887 }, { "epoch": 0.210763536401154, "grad_norm": 0.23024660348892212, "learning_rate": 2.8000000000000003e-06, "loss": 2.4181, "step": 1888 }, { "epoch": 0.21087516963018005, "grad_norm": 0.22768965363502502, "learning_rate": 2.775e-06, "loss": 2.2139, "step": 1889 }, { "epoch": 0.21098680285920607, "grad_norm": 0.22666402161121368, "learning_rate": 2.7500000000000004e-06, "loss": 2.3946, "step": 1890 }, { "epoch": 0.21109843608823212, "grad_norm": 0.24159802496433258, "learning_rate": 2.725e-06, "loss": 2.2891, "step": 1891 }, { "epoch": 0.21121006931725814, "grad_norm": 0.21914929151535034, "learning_rate": 2.7e-06, "loss": 2.4024, "step": 1892 }, { "epoch": 0.21132170254628418, "grad_norm": 0.2383839637041092, "learning_rate": 2.6750000000000002e-06, "loss": 2.3293, "step": 1893 }, { "epoch": 0.2114333357753102, "grad_norm": 0.24502025544643402, "learning_rate": 2.65e-06, "loss": 2.3402, "step": 1894 }, { "epoch": 0.21154496900433625, "grad_norm": 0.23778797686100006, "learning_rate": 2.625e-06, "loss": 2.4104, "step": 1895 }, { "epoch": 0.2116566022333623, "grad_norm": 0.2225978523492813, "learning_rate": 2.6e-06, "loss": 2.3451, "step": 1896 }, { "epoch": 0.21176823546238832, "grad_norm": 0.2651173174381256, "learning_rate": 2.575e-06, "loss": 2.3004, "step": 1897 }, { "epoch": 0.21187986869141437, "grad_norm": 0.21840894222259521, "learning_rate": 2.55e-06, "loss": 2.3576, "step": 1898 }, { "epoch": 0.2119915019204404, "grad_norm": 0.2707866132259369, "learning_rate": 2.5250000000000004e-06, "loss": 2.3301, "step": 1899 }, { "epoch": 0.21210313514946644, "grad_norm": 0.2320091277360916, "learning_rate": 2.5e-06, "loss": 2.4186, "step": 1900 }, { "epoch": 0.21221476837849246, "grad_norm": 0.22834016382694244, "learning_rate": 2.4750000000000004e-06, "loss": 2.3018, "step": 1901 }, { "epoch": 0.2123264016075185, "grad_norm": 0.2285088747739792, "learning_rate": 2.4500000000000003e-06, "loss": 2.3471, "step": 1902 }, { "epoch": 0.21243803483654453, "grad_norm": 0.2231675386428833, "learning_rate": 2.425e-06, "loss": 2.2663, "step": 1903 }, { "epoch": 0.21254966806557057, "grad_norm": 0.23813290894031525, "learning_rate": 2.4000000000000003e-06, "loss": 2.2843, "step": 1904 }, { "epoch": 0.2126613012945966, "grad_norm": 0.23623359203338623, "learning_rate": 2.375e-06, "loss": 2.3532, "step": 1905 }, { "epoch": 0.21277293452362264, "grad_norm": 0.24262453615665436, "learning_rate": 2.35e-06, "loss": 2.2889, "step": 1906 }, { "epoch": 0.21288456775264866, "grad_norm": 0.32345688343048096, "learning_rate": 2.325e-06, "loss": 2.3914, "step": 1907 }, { "epoch": 0.2129962009816747, "grad_norm": 0.23502293229103088, "learning_rate": 2.3e-06, "loss": 2.4402, "step": 1908 }, { "epoch": 0.21310783421070073, "grad_norm": 0.23672327399253845, "learning_rate": 2.2750000000000002e-06, "loss": 2.2715, "step": 1909 }, { "epoch": 0.21321946743972678, "grad_norm": 0.2367829978466034, "learning_rate": 2.25e-06, "loss": 2.264, "step": 1910 }, { "epoch": 0.2133311006687528, "grad_norm": 0.2541966140270233, "learning_rate": 2.225e-06, "loss": 2.3563, "step": 1911 }, { "epoch": 0.21344273389777885, "grad_norm": 0.238334059715271, "learning_rate": 2.2e-06, "loss": 2.4153, "step": 1912 }, { "epoch": 0.2135543671268049, "grad_norm": 0.2234756350517273, "learning_rate": 2.175e-06, "loss": 2.256, "step": 1913 }, { "epoch": 0.21366600035583092, "grad_norm": 0.24005568027496338, "learning_rate": 2.1499999999999997e-06, "loss": 2.23, "step": 1914 }, { "epoch": 0.21377763358485696, "grad_norm": 0.23683962225914001, "learning_rate": 2.1250000000000004e-06, "loss": 2.301, "step": 1915 }, { "epoch": 0.21388926681388298, "grad_norm": 0.23090413212776184, "learning_rate": 2.1000000000000002e-06, "loss": 2.3411, "step": 1916 }, { "epoch": 0.21400090004290903, "grad_norm": 0.3235081434249878, "learning_rate": 2.075e-06, "loss": 2.3339, "step": 1917 }, { "epoch": 0.21411253327193505, "grad_norm": 0.284463495016098, "learning_rate": 2.0500000000000003e-06, "loss": 2.3223, "step": 1918 }, { "epoch": 0.2142241665009611, "grad_norm": 0.24401739239692688, "learning_rate": 2.025e-06, "loss": 2.252, "step": 1919 }, { "epoch": 0.21433579972998712, "grad_norm": 0.22095558047294617, "learning_rate": 2.0000000000000003e-06, "loss": 2.1969, "step": 1920 }, { "epoch": 0.21444743295901317, "grad_norm": 0.2635052502155304, "learning_rate": 1.975e-06, "loss": 2.2948, "step": 1921 }, { "epoch": 0.2145590661880392, "grad_norm": 0.227258563041687, "learning_rate": 1.95e-06, "loss": 2.3566, "step": 1922 }, { "epoch": 0.21467069941706524, "grad_norm": 0.2326432466506958, "learning_rate": 1.925e-06, "loss": 2.2968, "step": 1923 }, { "epoch": 0.21478233264609126, "grad_norm": 0.23958462476730347, "learning_rate": 1.9e-06, "loss": 2.1632, "step": 1924 }, { "epoch": 0.2148939658751173, "grad_norm": 0.23388223350048065, "learning_rate": 1.875e-06, "loss": 2.363, "step": 1925 }, { "epoch": 0.21500559910414332, "grad_norm": 0.22733284533023834, "learning_rate": 1.85e-06, "loss": 2.3312, "step": 1926 }, { "epoch": 0.21511723233316937, "grad_norm": 0.24033506214618683, "learning_rate": 1.8249999999999999e-06, "loss": 2.2992, "step": 1927 }, { "epoch": 0.21522886556219542, "grad_norm": 0.23118910193443298, "learning_rate": 1.8e-06, "loss": 2.3916, "step": 1928 }, { "epoch": 0.21534049879122144, "grad_norm": 0.23018263280391693, "learning_rate": 1.775e-06, "loss": 2.2243, "step": 1929 }, { "epoch": 0.2154521320202475, "grad_norm": 0.2350279539823532, "learning_rate": 1.7500000000000002e-06, "loss": 2.3228, "step": 1930 }, { "epoch": 0.2155637652492735, "grad_norm": 0.23205231130123138, "learning_rate": 1.7250000000000002e-06, "loss": 2.3709, "step": 1931 }, { "epoch": 0.21567539847829956, "grad_norm": 0.23173397779464722, "learning_rate": 1.7000000000000002e-06, "loss": 2.3956, "step": 1932 }, { "epoch": 0.21578703170732558, "grad_norm": 0.23224657773971558, "learning_rate": 1.6750000000000003e-06, "loss": 2.3998, "step": 1933 }, { "epoch": 0.21589866493635163, "grad_norm": 0.23145684599876404, "learning_rate": 1.65e-06, "loss": 2.3027, "step": 1934 }, { "epoch": 0.21601029816537765, "grad_norm": 0.22632840275764465, "learning_rate": 1.6250000000000001e-06, "loss": 2.3666, "step": 1935 }, { "epoch": 0.2161219313944037, "grad_norm": 0.2313094288110733, "learning_rate": 1.6000000000000001e-06, "loss": 2.2143, "step": 1936 }, { "epoch": 0.2162335646234297, "grad_norm": 0.24646909534931183, "learning_rate": 1.5750000000000002e-06, "loss": 2.2228, "step": 1937 }, { "epoch": 0.21634519785245576, "grad_norm": 0.2413366138935089, "learning_rate": 1.55e-06, "loss": 2.4665, "step": 1938 }, { "epoch": 0.21645683108148178, "grad_norm": 0.22983328998088837, "learning_rate": 1.525e-06, "loss": 2.3433, "step": 1939 }, { "epoch": 0.21656846431050783, "grad_norm": 0.23430456221103668, "learning_rate": 1.5e-06, "loss": 2.2489, "step": 1940 }, { "epoch": 0.21668009753953385, "grad_norm": 0.2480006217956543, "learning_rate": 1.475e-06, "loss": 2.3227, "step": 1941 }, { "epoch": 0.2167917307685599, "grad_norm": 0.2268451601266861, "learning_rate": 1.45e-06, "loss": 2.4032, "step": 1942 }, { "epoch": 0.21690336399758592, "grad_norm": 0.30644339323043823, "learning_rate": 1.4250000000000001e-06, "loss": 2.3085, "step": 1943 }, { "epoch": 0.21701499722661197, "grad_norm": 0.2343900352716446, "learning_rate": 1.4000000000000001e-06, "loss": 2.291, "step": 1944 }, { "epoch": 0.21712663045563801, "grad_norm": 0.22589880228042603, "learning_rate": 1.3750000000000002e-06, "loss": 2.3656, "step": 1945 }, { "epoch": 0.21723826368466403, "grad_norm": 0.2340187281370163, "learning_rate": 1.35e-06, "loss": 2.4029, "step": 1946 }, { "epoch": 0.21734989691369008, "grad_norm": 0.23875071108341217, "learning_rate": 1.325e-06, "loss": 2.3215, "step": 1947 }, { "epoch": 0.2174615301427161, "grad_norm": 0.22057262063026428, "learning_rate": 1.3e-06, "loss": 2.4121, "step": 1948 }, { "epoch": 0.21757316337174215, "grad_norm": 0.2507198750972748, "learning_rate": 1.275e-06, "loss": 2.4401, "step": 1949 }, { "epoch": 0.21768479660076817, "grad_norm": 0.22903326153755188, "learning_rate": 1.25e-06, "loss": 2.4032, "step": 1950 }, { "epoch": 0.21779642982979422, "grad_norm": 0.22481852769851685, "learning_rate": 1.2250000000000001e-06, "loss": 2.4677, "step": 1951 }, { "epoch": 0.21790806305882024, "grad_norm": 0.23029015958309174, "learning_rate": 1.2000000000000002e-06, "loss": 2.2855, "step": 1952 }, { "epoch": 0.2180196962878463, "grad_norm": 0.2441205233335495, "learning_rate": 1.175e-06, "loss": 2.3884, "step": 1953 }, { "epoch": 0.2181313295168723, "grad_norm": 0.22523878514766693, "learning_rate": 1.15e-06, "loss": 2.341, "step": 1954 }, { "epoch": 0.21824296274589836, "grad_norm": 0.23141464591026306, "learning_rate": 1.125e-06, "loss": 2.3801, "step": 1955 }, { "epoch": 0.21835459597492438, "grad_norm": 0.23657923936843872, "learning_rate": 1.1e-06, "loss": 2.4242, "step": 1956 }, { "epoch": 0.21846622920395042, "grad_norm": 0.2270117551088333, "learning_rate": 1.0749999999999999e-06, "loss": 2.4428, "step": 1957 }, { "epoch": 0.21857786243297644, "grad_norm": 0.23563359677791595, "learning_rate": 1.0500000000000001e-06, "loss": 2.3945, "step": 1958 }, { "epoch": 0.2186894956620025, "grad_norm": 0.2353026121854782, "learning_rate": 1.0250000000000001e-06, "loss": 2.3557, "step": 1959 }, { "epoch": 0.2188011288910285, "grad_norm": 0.2238246351480484, "learning_rate": 1.0000000000000002e-06, "loss": 2.3597, "step": 1960 }, { "epoch": 0.21891276212005456, "grad_norm": 0.2265477329492569, "learning_rate": 9.75e-07, "loss": 2.3447, "step": 1961 }, { "epoch": 0.2190243953490806, "grad_norm": 0.3757992684841156, "learning_rate": 9.5e-07, "loss": 2.3917, "step": 1962 }, { "epoch": 0.21913602857810663, "grad_norm": 0.22358457744121552, "learning_rate": 9.25e-07, "loss": 2.4566, "step": 1963 }, { "epoch": 0.21924766180713268, "grad_norm": 0.2505660355091095, "learning_rate": 9e-07, "loss": 2.2797, "step": 1964 }, { "epoch": 0.2193592950361587, "grad_norm": 0.2312246561050415, "learning_rate": 8.750000000000001e-07, "loss": 2.3255, "step": 1965 }, { "epoch": 0.21947092826518474, "grad_norm": 0.23009879887104034, "learning_rate": 8.500000000000001e-07, "loss": 2.1822, "step": 1966 }, { "epoch": 0.21958256149421077, "grad_norm": 0.2438948005437851, "learning_rate": 8.25e-07, "loss": 2.3193, "step": 1967 }, { "epoch": 0.2196941947232368, "grad_norm": 0.23041820526123047, "learning_rate": 8.000000000000001e-07, "loss": 2.308, "step": 1968 }, { "epoch": 0.21980582795226283, "grad_norm": 0.22963666915893555, "learning_rate": 7.75e-07, "loss": 2.3703, "step": 1969 }, { "epoch": 0.21991746118128888, "grad_norm": 0.2525392174720764, "learning_rate": 7.5e-07, "loss": 2.3571, "step": 1970 }, { "epoch": 0.2200290944103149, "grad_norm": 0.2333141565322876, "learning_rate": 7.25e-07, "loss": 2.3727, "step": 1971 }, { "epoch": 0.22014072763934095, "grad_norm": 0.2308010309934616, "learning_rate": 7.000000000000001e-07, "loss": 2.2563, "step": 1972 }, { "epoch": 0.22025236086836697, "grad_norm": 0.2338629961013794, "learning_rate": 6.75e-07, "loss": 2.3202, "step": 1973 }, { "epoch": 0.22036399409739302, "grad_norm": 0.23211447894573212, "learning_rate": 6.5e-07, "loss": 2.4489, "step": 1974 }, { "epoch": 0.22047562732641904, "grad_norm": 0.2359444946050644, "learning_rate": 6.25e-07, "loss": 2.3855, "step": 1975 }, { "epoch": 0.22058726055544509, "grad_norm": 0.2295287847518921, "learning_rate": 6.000000000000001e-07, "loss": 2.2819, "step": 1976 }, { "epoch": 0.2206988937844711, "grad_norm": 0.2276148796081543, "learning_rate": 5.75e-07, "loss": 2.2884, "step": 1977 }, { "epoch": 0.22081052701349715, "grad_norm": 0.2364000827074051, "learning_rate": 5.5e-07, "loss": 2.3854, "step": 1978 }, { "epoch": 0.2209221602425232, "grad_norm": 0.22925642132759094, "learning_rate": 5.250000000000001e-07, "loss": 2.3348, "step": 1979 }, { "epoch": 0.22103379347154922, "grad_norm": 0.41490915417671204, "learning_rate": 5.000000000000001e-07, "loss": 2.3006, "step": 1980 }, { "epoch": 0.22114542670057527, "grad_norm": 0.23148655891418457, "learning_rate": 4.75e-07, "loss": 2.2691, "step": 1981 }, { "epoch": 0.2212570599296013, "grad_norm": 0.22861695289611816, "learning_rate": 4.5e-07, "loss": 2.3781, "step": 1982 }, { "epoch": 0.22136869315862734, "grad_norm": 0.23868124186992645, "learning_rate": 4.2500000000000006e-07, "loss": 2.3108, "step": 1983 }, { "epoch": 0.22148032638765336, "grad_norm": 0.2650975286960602, "learning_rate": 4.0000000000000003e-07, "loss": 2.3865, "step": 1984 }, { "epoch": 0.2215919596166794, "grad_norm": 0.230629101395607, "learning_rate": 3.75e-07, "loss": 2.3458, "step": 1985 }, { "epoch": 0.22170359284570543, "grad_norm": 0.2260117530822754, "learning_rate": 3.5000000000000004e-07, "loss": 2.3316, "step": 1986 }, { "epoch": 0.22181522607473148, "grad_norm": 0.2325662523508072, "learning_rate": 3.25e-07, "loss": 2.3991, "step": 1987 }, { "epoch": 0.2219268593037575, "grad_norm": 0.2188185751438141, "learning_rate": 3.0000000000000004e-07, "loss": 2.3246, "step": 1988 }, { "epoch": 0.22203849253278354, "grad_norm": 0.23791977763175964, "learning_rate": 2.75e-07, "loss": 2.3504, "step": 1989 }, { "epoch": 0.22215012576180956, "grad_norm": 0.3440607190132141, "learning_rate": 2.5000000000000004e-07, "loss": 2.3414, "step": 1990 }, { "epoch": 0.2222617589908356, "grad_norm": 0.21880482137203217, "learning_rate": 2.25e-07, "loss": 2.3769, "step": 1991 }, { "epoch": 0.22237339221986163, "grad_norm": 0.23022335767745972, "learning_rate": 2.0000000000000002e-07, "loss": 2.4113, "step": 1992 }, { "epoch": 0.22248502544888768, "grad_norm": 0.23188486695289612, "learning_rate": 1.7500000000000002e-07, "loss": 2.4116, "step": 1993 }, { "epoch": 0.22259665867791373, "grad_norm": 0.23768189549446106, "learning_rate": 1.5000000000000002e-07, "loss": 2.2946, "step": 1994 }, { "epoch": 0.22270829190693975, "grad_norm": 0.2431269735097885, "learning_rate": 1.2500000000000002e-07, "loss": 2.3861, "step": 1995 }, { "epoch": 0.2228199251359658, "grad_norm": 0.2846803665161133, "learning_rate": 1.0000000000000001e-07, "loss": 2.4809, "step": 1996 }, { "epoch": 0.22293155836499182, "grad_norm": 0.22916531562805176, "learning_rate": 7.500000000000001e-08, "loss": 2.3044, "step": 1997 }, { "epoch": 0.22304319159401786, "grad_norm": 0.25623413920402527, "learning_rate": 5.0000000000000004e-08, "loss": 2.3833, "step": 1998 }, { "epoch": 0.22315482482304388, "grad_norm": 0.22197332978248596, "learning_rate": 2.5000000000000002e-08, "loss": 2.3048, "step": 1999 }, { "epoch": 0.22326645805206993, "grad_norm": 0.2170470952987671, "learning_rate": 0.0, "loss": 2.3603, "step": 2000 }, { "epoch": 0.22326645805206993, "step": 2000, "total_flos": 1.04520375926784e+18, "train_loss": 2.3526821104288103, "train_runtime": 66080.1443, "train_samples_per_second": 0.969, "train_steps_per_second": 0.03 } ], "logging_steps": 1.0, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.04520375926784e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }