diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22326645805206993, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011163322902603496, + "grad_norm": 0.12030283361673355, + "learning_rate": 4.9975e-05, + "loss": 2.5157, + "step": 1 + }, + { + "epoch": 0.00022326645805206992, + "grad_norm": 0.10777310281991959, + "learning_rate": 4.995e-05, + "loss": 2.5461, + "step": 2 + }, + { + "epoch": 0.0003348996870781049, + "grad_norm": 0.1150362566113472, + "learning_rate": 4.992500000000001e-05, + "loss": 2.463, + "step": 3 + }, + { + "epoch": 0.00044653291610413984, + "grad_norm": 0.19681179523468018, + "learning_rate": 4.99e-05, + "loss": 2.6149, + "step": 4 + }, + { + "epoch": 0.0005581661451301748, + "grad_norm": 0.13096819818019867, + "learning_rate": 4.9875000000000006e-05, + "loss": 2.4514, + "step": 5 + }, + { + "epoch": 0.0006697993741562098, + "grad_norm": 0.13657382130622864, + "learning_rate": 4.9850000000000006e-05, + "loss": 2.6006, + "step": 6 + }, + { + "epoch": 0.0007814326031822447, + "grad_norm": 0.12205895781517029, + "learning_rate": 4.9825000000000005e-05, + "loss": 2.4963, + "step": 7 + }, + { + "epoch": 0.0008930658322082797, + "grad_norm": 0.12875059247016907, + "learning_rate": 4.9800000000000004e-05, + "loss": 2.4755, + "step": 8 + }, + { + "epoch": 0.0010046990612343147, + "grad_norm": 0.13755826652050018, + "learning_rate": 4.9775000000000004e-05, + "loss": 2.4421, + "step": 9 + }, + { + "epoch": 0.0011163322902603497, + "grad_norm": 0.14838671684265137, + "learning_rate": 4.975e-05, + "loss": 2.5355, + "step": 10 + }, + { + "epoch": 0.0012279655192863846, + "grad_norm": 0.20818187296390533, + "learning_rate": 4.9725e-05, + "loss": 2.4286, + "step": 11 + }, + { + "epoch": 0.0013395987483124196, + "grad_norm": 0.16847440600395203, + "learning_rate": 4.97e-05, + "loss": 2.4903, + "step": 12 + }, + { + "epoch": 0.0014512319773384544, + "grad_norm": 0.16514980792999268, + "learning_rate": 4.967500000000001e-05, + "loss": 2.3927, + "step": 13 + }, + { + "epoch": 0.0015628652063644894, + "grad_norm": 0.17463476955890656, + "learning_rate": 4.965e-05, + "loss": 2.4566, + "step": 14 + }, + { + "epoch": 0.0016744984353905244, + "grad_norm": 0.1806865930557251, + "learning_rate": 4.962500000000001e-05, + "loss": 2.3876, + "step": 15 + }, + { + "epoch": 0.0017861316644165594, + "grad_norm": 0.4340119957923889, + "learning_rate": 4.96e-05, + "loss": 2.5106, + "step": 16 + }, + { + "epoch": 0.0018977648934425943, + "grad_norm": 0.18326279520988464, + "learning_rate": 4.9575000000000006e-05, + "loss": 2.4918, + "step": 17 + }, + { + "epoch": 0.0020093981224686293, + "grad_norm": 0.1809333860874176, + "learning_rate": 4.9550000000000005e-05, + "loss": 2.478, + "step": 18 + }, + { + "epoch": 0.0021210313514946643, + "grad_norm": 0.1895465850830078, + "learning_rate": 4.9525000000000004e-05, + "loss": 2.5024, + "step": 19 + }, + { + "epoch": 0.0022326645805206993, + "grad_norm": 0.20786350965499878, + "learning_rate": 4.9500000000000004e-05, + "loss": 2.5115, + "step": 20 + }, + { + "epoch": 0.0023442978095467343, + "grad_norm": 0.1992110311985016, + "learning_rate": 4.9475e-05, + "loss": 2.4044, + "step": 21 + }, + { + "epoch": 0.0024559310385727693, + "grad_norm": 0.20190922915935516, + "learning_rate": 4.945e-05, + "loss": 2.4079, + "step": 22 + }, + { + "epoch": 0.0025675642675988043, + "grad_norm": 0.22950319945812225, + "learning_rate": 4.9425e-05, + "loss": 2.5367, + "step": 23 + }, + { + "epoch": 0.0026791974966248393, + "grad_norm": 0.2105780690908432, + "learning_rate": 4.94e-05, + "loss": 2.3135, + "step": 24 + }, + { + "epoch": 0.0027908307256508742, + "grad_norm": 0.21351750195026398, + "learning_rate": 4.937500000000001e-05, + "loss": 2.4224, + "step": 25 + }, + { + "epoch": 0.002902463954676909, + "grad_norm": 0.21121960878372192, + "learning_rate": 4.935e-05, + "loss": 2.4935, + "step": 26 + }, + { + "epoch": 0.003014097183702944, + "grad_norm": 0.21738024055957794, + "learning_rate": 4.9325000000000006e-05, + "loss": 2.4383, + "step": 27 + }, + { + "epoch": 0.0031257304127289788, + "grad_norm": 0.22073255479335785, + "learning_rate": 4.93e-05, + "loss": 2.4861, + "step": 28 + }, + { + "epoch": 0.0032373636417550138, + "grad_norm": 0.2213854044675827, + "learning_rate": 4.9275000000000005e-05, + "loss": 2.4449, + "step": 29 + }, + { + "epoch": 0.0033489968707810487, + "grad_norm": 0.22176030278205872, + "learning_rate": 4.9250000000000004e-05, + "loss": 2.3341, + "step": 30 + }, + { + "epoch": 0.0034606300998070837, + "grad_norm": 0.22657762467861176, + "learning_rate": 4.9225000000000004e-05, + "loss": 2.4111, + "step": 31 + }, + { + "epoch": 0.0035722633288331187, + "grad_norm": 0.23043128848075867, + "learning_rate": 4.92e-05, + "loss": 2.4449, + "step": 32 + }, + { + "epoch": 0.0036838965578591537, + "grad_norm": 0.22782647609710693, + "learning_rate": 4.9175e-05, + "loss": 2.496, + "step": 33 + }, + { + "epoch": 0.0037955297868851887, + "grad_norm": 0.2296725958585739, + "learning_rate": 4.915e-05, + "loss": 2.4957, + "step": 34 + }, + { + "epoch": 0.003907163015911224, + "grad_norm": 0.23677076399326324, + "learning_rate": 4.9125e-05, + "loss": 2.4031, + "step": 35 + }, + { + "epoch": 0.004018796244937259, + "grad_norm": 0.2547452747821808, + "learning_rate": 4.91e-05, + "loss": 2.4666, + "step": 36 + }, + { + "epoch": 0.004130429473963294, + "grad_norm": 0.24586506187915802, + "learning_rate": 4.907500000000001e-05, + "loss": 2.4721, + "step": 37 + }, + { + "epoch": 0.004242062702989329, + "grad_norm": 0.24286368489265442, + "learning_rate": 4.905e-05, + "loss": 2.4584, + "step": 38 + }, + { + "epoch": 0.004353695932015364, + "grad_norm": 0.2877066135406494, + "learning_rate": 4.9025000000000006e-05, + "loss": 2.3817, + "step": 39 + }, + { + "epoch": 0.004465329161041399, + "grad_norm": 0.36161181330680847, + "learning_rate": 4.9e-05, + "loss": 2.4246, + "step": 40 + }, + { + "epoch": 0.004576962390067434, + "grad_norm": 0.23599842190742493, + "learning_rate": 4.8975000000000005e-05, + "loss": 2.4416, + "step": 41 + }, + { + "epoch": 0.004688595619093469, + "grad_norm": 3.929527521133423, + "learning_rate": 4.8950000000000004e-05, + "loss": 2.536, + "step": 42 + }, + { + "epoch": 0.004800228848119504, + "grad_norm": 0.25463879108428955, + "learning_rate": 4.8925e-05, + "loss": 2.3839, + "step": 43 + }, + { + "epoch": 0.004911862077145539, + "grad_norm": 0.2564179599285126, + "learning_rate": 4.89e-05, + "loss": 2.5175, + "step": 44 + }, + { + "epoch": 0.0050234953061715736, + "grad_norm": 0.2552028298377991, + "learning_rate": 4.8875e-05, + "loss": 2.4055, + "step": 45 + }, + { + "epoch": 0.0051351285351976085, + "grad_norm": 0.27434229850769043, + "learning_rate": 4.885e-05, + "loss": 2.4139, + "step": 46 + }, + { + "epoch": 0.0052467617642236435, + "grad_norm": 0.2661423981189728, + "learning_rate": 4.8825e-05, + "loss": 2.4711, + "step": 47 + }, + { + "epoch": 0.0053583949932496785, + "grad_norm": 0.24605660140514374, + "learning_rate": 4.88e-05, + "loss": 2.3726, + "step": 48 + }, + { + "epoch": 0.0054700282222757135, + "grad_norm": 0.27371543645858765, + "learning_rate": 4.8775000000000007e-05, + "loss": 2.4012, + "step": 49 + }, + { + "epoch": 0.0055816614513017485, + "grad_norm": 0.26587924361228943, + "learning_rate": 4.875e-05, + "loss": 2.4409, + "step": 50 + }, + { + "epoch": 0.0056932946803277835, + "grad_norm": 0.28210633993148804, + "learning_rate": 4.8725000000000005e-05, + "loss": 2.5431, + "step": 51 + }, + { + "epoch": 0.005804927909353818, + "grad_norm": 0.2553481459617615, + "learning_rate": 4.87e-05, + "loss": 2.5122, + "step": 52 + }, + { + "epoch": 0.005916561138379853, + "grad_norm": 0.2604880928993225, + "learning_rate": 4.8675000000000004e-05, + "loss": 2.4656, + "step": 53 + }, + { + "epoch": 0.006028194367405888, + "grad_norm": 0.266725093126297, + "learning_rate": 4.8650000000000003e-05, + "loss": 2.307, + "step": 54 + }, + { + "epoch": 0.0061398275964319226, + "grad_norm": 5.911880016326904, + "learning_rate": 4.8625e-05, + "loss": 2.2753, + "step": 55 + }, + { + "epoch": 0.0062514608254579575, + "grad_norm": 0.2631521224975586, + "learning_rate": 4.86e-05, + "loss": 2.4475, + "step": 56 + }, + { + "epoch": 0.0063630940544839925, + "grad_norm": 0.2781185507774353, + "learning_rate": 4.8575e-05, + "loss": 2.2942, + "step": 57 + }, + { + "epoch": 0.0064747272835100275, + "grad_norm": 0.27916616201400757, + "learning_rate": 4.855e-05, + "loss": 2.4817, + "step": 58 + }, + { + "epoch": 0.0065863605125360625, + "grad_norm": 0.25466758012771606, + "learning_rate": 4.8525e-05, + "loss": 2.3986, + "step": 59 + }, + { + "epoch": 0.0066979937415620975, + "grad_norm": 0.3041671812534332, + "learning_rate": 4.85e-05, + "loss": 2.3411, + "step": 60 + }, + { + "epoch": 0.0068096269705881325, + "grad_norm": 0.26597726345062256, + "learning_rate": 4.8475000000000006e-05, + "loss": 2.3992, + "step": 61 + }, + { + "epoch": 0.0069212601996141675, + "grad_norm": 0.2860955595970154, + "learning_rate": 4.845e-05, + "loss": 2.535, + "step": 62 + }, + { + "epoch": 0.0070328934286402025, + "grad_norm": 0.2573089897632599, + "learning_rate": 4.8425000000000005e-05, + "loss": 2.4366, + "step": 63 + }, + { + "epoch": 0.0071445266576662374, + "grad_norm": 0.3139612376689911, + "learning_rate": 4.8400000000000004e-05, + "loss": 2.4112, + "step": 64 + }, + { + "epoch": 0.007256159886692272, + "grad_norm": 0.27238690853118896, + "learning_rate": 4.8375000000000004e-05, + "loss": 2.5764, + "step": 65 + }, + { + "epoch": 0.007367793115718307, + "grad_norm": 0.2645399272441864, + "learning_rate": 4.835e-05, + "loss": 2.3662, + "step": 66 + }, + { + "epoch": 0.007479426344744342, + "grad_norm": 0.2746032178401947, + "learning_rate": 4.8325e-05, + "loss": 2.3946, + "step": 67 + }, + { + "epoch": 0.007591059573770377, + "grad_norm": 0.2907489836215973, + "learning_rate": 4.83e-05, + "loss": 2.4238, + "step": 68 + }, + { + "epoch": 0.007702692802796412, + "grad_norm": 0.2683127522468567, + "learning_rate": 4.8275e-05, + "loss": 2.3503, + "step": 69 + }, + { + "epoch": 0.007814326031822447, + "grad_norm": 0.29885104298591614, + "learning_rate": 4.825e-05, + "loss": 2.3667, + "step": 70 + }, + { + "epoch": 0.007925959260848481, + "grad_norm": 0.35321930050849915, + "learning_rate": 4.822500000000001e-05, + "loss": 2.2999, + "step": 71 + }, + { + "epoch": 0.008037592489874517, + "grad_norm": 0.28377628326416016, + "learning_rate": 4.82e-05, + "loss": 2.3928, + "step": 72 + }, + { + "epoch": 0.008149225718900551, + "grad_norm": 0.28445249795913696, + "learning_rate": 4.8175000000000005e-05, + "loss": 2.4402, + "step": 73 + }, + { + "epoch": 0.008260858947926587, + "grad_norm": 0.26443931460380554, + "learning_rate": 4.815e-05, + "loss": 2.3292, + "step": 74 + }, + { + "epoch": 0.008372492176952621, + "grad_norm": 0.6532557010650635, + "learning_rate": 4.8125000000000004e-05, + "loss": 2.404, + "step": 75 + }, + { + "epoch": 0.008484125405978657, + "grad_norm": 0.27610066533088684, + "learning_rate": 4.8100000000000004e-05, + "loss": 2.3759, + "step": 76 + }, + { + "epoch": 0.008595758635004691, + "grad_norm": 0.6011192798614502, + "learning_rate": 4.8075e-05, + "loss": 2.4855, + "step": 77 + }, + { + "epoch": 0.008707391864030727, + "grad_norm": 0.3119165599346161, + "learning_rate": 4.805e-05, + "loss": 2.4206, + "step": 78 + }, + { + "epoch": 0.008819025093056761, + "grad_norm": 0.2999507486820221, + "learning_rate": 4.8025e-05, + "loss": 2.3795, + "step": 79 + }, + { + "epoch": 0.008930658322082797, + "grad_norm": 0.32682228088378906, + "learning_rate": 4.8e-05, + "loss": 2.4536, + "step": 80 + }, + { + "epoch": 0.009042291551108831, + "grad_norm": 0.2823121249675751, + "learning_rate": 4.7975e-05, + "loss": 2.3822, + "step": 81 + }, + { + "epoch": 0.009153924780134867, + "grad_norm": 0.29254743456840515, + "learning_rate": 4.795e-05, + "loss": 2.4421, + "step": 82 + }, + { + "epoch": 0.009265558009160901, + "grad_norm": 0.40489596128463745, + "learning_rate": 4.7925000000000006e-05, + "loss": 2.3872, + "step": 83 + }, + { + "epoch": 0.009377191238186937, + "grad_norm": 0.28919968008995056, + "learning_rate": 4.79e-05, + "loss": 2.3616, + "step": 84 + }, + { + "epoch": 0.009488824467212971, + "grad_norm": 0.2884703576564789, + "learning_rate": 4.7875000000000005e-05, + "loss": 2.4244, + "step": 85 + }, + { + "epoch": 0.009600457696239007, + "grad_norm": 0.27483540773391724, + "learning_rate": 4.785e-05, + "loss": 2.3592, + "step": 86 + }, + { + "epoch": 0.009712090925265041, + "grad_norm": 0.2799672484397888, + "learning_rate": 4.7825000000000004e-05, + "loss": 2.3994, + "step": 87 + }, + { + "epoch": 0.009823724154291077, + "grad_norm": 0.2779146134853363, + "learning_rate": 4.78e-05, + "loss": 2.5073, + "step": 88 + }, + { + "epoch": 0.009935357383317111, + "grad_norm": 0.283578097820282, + "learning_rate": 4.7775e-05, + "loss": 2.3933, + "step": 89 + }, + { + "epoch": 0.010046990612343147, + "grad_norm": 0.28255191445350647, + "learning_rate": 4.775e-05, + "loss": 2.5065, + "step": 90 + }, + { + "epoch": 0.010158623841369181, + "grad_norm": 0.9130760431289673, + "learning_rate": 4.7725e-05, + "loss": 2.353, + "step": 91 + }, + { + "epoch": 0.010270257070395217, + "grad_norm": 0.28134602308273315, + "learning_rate": 4.77e-05, + "loss": 2.465, + "step": 92 + }, + { + "epoch": 0.010381890299421251, + "grad_norm": 0.2763191759586334, + "learning_rate": 4.7675e-05, + "loss": 2.317, + "step": 93 + }, + { + "epoch": 0.010493523528447287, + "grad_norm": 0.2876272201538086, + "learning_rate": 4.765e-05, + "loss": 2.3916, + "step": 94 + }, + { + "epoch": 0.010605156757473321, + "grad_norm": 0.28046631813049316, + "learning_rate": 4.7625000000000006e-05, + "loss": 2.4663, + "step": 95 + }, + { + "epoch": 0.010716789986499357, + "grad_norm": 0.2862105667591095, + "learning_rate": 4.76e-05, + "loss": 2.4214, + "step": 96 + }, + { + "epoch": 0.010828423215525391, + "grad_norm": 0.27971214056015015, + "learning_rate": 4.7575000000000004e-05, + "loss": 2.4528, + "step": 97 + }, + { + "epoch": 0.010940056444551427, + "grad_norm": 0.29032212495803833, + "learning_rate": 4.755e-05, + "loss": 2.3312, + "step": 98 + }, + { + "epoch": 0.011051689673577461, + "grad_norm": 0.293649286031723, + "learning_rate": 4.7525e-05, + "loss": 2.3592, + "step": 99 + }, + { + "epoch": 0.011163322902603497, + "grad_norm": 0.277589350938797, + "learning_rate": 4.75e-05, + "loss": 2.392, + "step": 100 + }, + { + "epoch": 0.011274956131629531, + "grad_norm": 0.28249549865722656, + "learning_rate": 4.7475e-05, + "loss": 2.3154, + "step": 101 + }, + { + "epoch": 0.011386589360655567, + "grad_norm": 0.30689579248428345, + "learning_rate": 4.745e-05, + "loss": 2.3322, + "step": 102 + }, + { + "epoch": 0.011498222589681601, + "grad_norm": 0.2909144163131714, + "learning_rate": 4.7425e-05, + "loss": 2.4028, + "step": 103 + }, + { + "epoch": 0.011609855818707635, + "grad_norm": 0.2826705574989319, + "learning_rate": 4.74e-05, + "loss": 2.3969, + "step": 104 + }, + { + "epoch": 0.011721489047733671, + "grad_norm": 0.30071696639060974, + "learning_rate": 4.7375e-05, + "loss": 2.435, + "step": 105 + }, + { + "epoch": 0.011833122276759705, + "grad_norm": 0.29308071732521057, + "learning_rate": 4.735e-05, + "loss": 2.3299, + "step": 106 + }, + { + "epoch": 0.011944755505785741, + "grad_norm": 0.28309884667396545, + "learning_rate": 4.7325000000000005e-05, + "loss": 2.511, + "step": 107 + }, + { + "epoch": 0.012056388734811775, + "grad_norm": 0.3001827597618103, + "learning_rate": 4.73e-05, + "loss": 2.3804, + "step": 108 + }, + { + "epoch": 0.012168021963837811, + "grad_norm": 0.3125348687171936, + "learning_rate": 4.7275000000000004e-05, + "loss": 2.3186, + "step": 109 + }, + { + "epoch": 0.012279655192863845, + "grad_norm": 0.29325881600379944, + "learning_rate": 4.7249999999999997e-05, + "loss": 2.4714, + "step": 110 + }, + { + "epoch": 0.012391288421889881, + "grad_norm": 0.28102368116378784, + "learning_rate": 4.7225e-05, + "loss": 2.4592, + "step": 111 + }, + { + "epoch": 0.012502921650915915, + "grad_norm": 0.2798093259334564, + "learning_rate": 4.72e-05, + "loss": 2.398, + "step": 112 + }, + { + "epoch": 0.012614554879941951, + "grad_norm": 0.29487597942352295, + "learning_rate": 4.7175e-05, + "loss": 2.3144, + "step": 113 + }, + { + "epoch": 0.012726188108967985, + "grad_norm": 0.28528064489364624, + "learning_rate": 4.715e-05, + "loss": 2.4359, + "step": 114 + }, + { + "epoch": 0.012837821337994021, + "grad_norm": 0.29618656635284424, + "learning_rate": 4.7125e-05, + "loss": 2.3421, + "step": 115 + }, + { + "epoch": 0.012949454567020055, + "grad_norm": 0.27769914269447327, + "learning_rate": 4.71e-05, + "loss": 2.4087, + "step": 116 + }, + { + "epoch": 0.013061087796046091, + "grad_norm": 0.2721666991710663, + "learning_rate": 4.7075e-05, + "loss": 2.4001, + "step": 117 + }, + { + "epoch": 0.013172721025072125, + "grad_norm": 0.6449373960494995, + "learning_rate": 4.705e-05, + "loss": 2.4581, + "step": 118 + }, + { + "epoch": 0.013284354254098161, + "grad_norm": 0.28057020902633667, + "learning_rate": 4.7025000000000005e-05, + "loss": 2.3935, + "step": 119 + }, + { + "epoch": 0.013395987483124195, + "grad_norm": 0.2757243812084198, + "learning_rate": 4.7e-05, + "loss": 2.465, + "step": 120 + }, + { + "epoch": 0.01350762071215023, + "grad_norm": 0.2977396249771118, + "learning_rate": 4.6975000000000003e-05, + "loss": 2.4985, + "step": 121 + }, + { + "epoch": 0.013619253941176265, + "grad_norm": 0.27909162640571594, + "learning_rate": 4.695e-05, + "loss": 2.4484, + "step": 122 + }, + { + "epoch": 0.0137308871702023, + "grad_norm": 0.28472158312797546, + "learning_rate": 4.6925e-05, + "loss": 2.499, + "step": 123 + }, + { + "epoch": 0.013842520399228335, + "grad_norm": 0.2772194445133209, + "learning_rate": 4.69e-05, + "loss": 2.4161, + "step": 124 + }, + { + "epoch": 0.01395415362825437, + "grad_norm": 0.28007185459136963, + "learning_rate": 4.6875e-05, + "loss": 2.4538, + "step": 125 + }, + { + "epoch": 0.014065786857280405, + "grad_norm": 0.2890627086162567, + "learning_rate": 4.685000000000001e-05, + "loss": 2.3803, + "step": 126 + }, + { + "epoch": 0.01417742008630644, + "grad_norm": 0.28412866592407227, + "learning_rate": 4.6825e-05, + "loss": 2.4739, + "step": 127 + }, + { + "epoch": 0.014289053315332475, + "grad_norm": 0.28246861696243286, + "learning_rate": 4.6800000000000006e-05, + "loss": 2.4031, + "step": 128 + }, + { + "epoch": 0.01440068654435851, + "grad_norm": 0.27640506625175476, + "learning_rate": 4.6775000000000005e-05, + "loss": 2.3415, + "step": 129 + }, + { + "epoch": 0.014512319773384545, + "grad_norm": 0.27817410230636597, + "learning_rate": 4.6750000000000005e-05, + "loss": 2.5304, + "step": 130 + }, + { + "epoch": 0.01462395300241058, + "grad_norm": 0.27021604776382446, + "learning_rate": 4.6725000000000004e-05, + "loss": 2.34, + "step": 131 + }, + { + "epoch": 0.014735586231436615, + "grad_norm": 0.2793290913105011, + "learning_rate": 4.6700000000000003e-05, + "loss": 2.4935, + "step": 132 + }, + { + "epoch": 0.01484721946046265, + "grad_norm": 0.2610988914966583, + "learning_rate": 4.6675e-05, + "loss": 2.1694, + "step": 133 + }, + { + "epoch": 0.014958852689488685, + "grad_norm": 0.3299348056316376, + "learning_rate": 4.665e-05, + "loss": 2.3272, + "step": 134 + }, + { + "epoch": 0.01507048591851472, + "grad_norm": 0.27494046092033386, + "learning_rate": 4.6625e-05, + "loss": 2.4961, + "step": 135 + }, + { + "epoch": 0.015182119147540755, + "grad_norm": 0.8337180018424988, + "learning_rate": 4.660000000000001e-05, + "loss": 2.4057, + "step": 136 + }, + { + "epoch": 0.01529375237656679, + "grad_norm": 0.27909937500953674, + "learning_rate": 4.6575e-05, + "loss": 2.279, + "step": 137 + }, + { + "epoch": 0.015405385605592825, + "grad_norm": 0.3143457770347595, + "learning_rate": 4.655000000000001e-05, + "loss": 2.4057, + "step": 138 + }, + { + "epoch": 0.015517018834618859, + "grad_norm": 0.27124184370040894, + "learning_rate": 4.6525e-05, + "loss": 2.4525, + "step": 139 + }, + { + "epoch": 0.015628652063644895, + "grad_norm": 0.27924227714538574, + "learning_rate": 4.6500000000000005e-05, + "loss": 2.4891, + "step": 140 + }, + { + "epoch": 0.01574028529267093, + "grad_norm": 0.2761871814727783, + "learning_rate": 4.6475000000000005e-05, + "loss": 2.4127, + "step": 141 + }, + { + "epoch": 0.015851918521696963, + "grad_norm": 0.30836501717567444, + "learning_rate": 4.6450000000000004e-05, + "loss": 2.3762, + "step": 142 + }, + { + "epoch": 0.015963551750723, + "grad_norm": 0.2716349959373474, + "learning_rate": 4.6425000000000004e-05, + "loss": 2.3893, + "step": 143 + }, + { + "epoch": 0.016075184979749035, + "grad_norm": 0.27172204852104187, + "learning_rate": 4.64e-05, + "loss": 2.3451, + "step": 144 + }, + { + "epoch": 0.01618681820877507, + "grad_norm": 0.27586179971694946, + "learning_rate": 4.6375e-05, + "loss": 2.3197, + "step": 145 + }, + { + "epoch": 0.016298451437801103, + "grad_norm": 0.27401286363601685, + "learning_rate": 4.635e-05, + "loss": 2.3191, + "step": 146 + }, + { + "epoch": 0.01641008466682714, + "grad_norm": 0.27832385897636414, + "learning_rate": 4.6325e-05, + "loss": 2.4407, + "step": 147 + }, + { + "epoch": 0.016521717895853175, + "grad_norm": 0.29265516996383667, + "learning_rate": 4.630000000000001e-05, + "loss": 2.3436, + "step": 148 + }, + { + "epoch": 0.01663335112487921, + "grad_norm": 0.27826353907585144, + "learning_rate": 4.6275e-05, + "loss": 2.4081, + "step": 149 + }, + { + "epoch": 0.016744984353905243, + "grad_norm": 0.26623812317848206, + "learning_rate": 4.6250000000000006e-05, + "loss": 2.4566, + "step": 150 + }, + { + "epoch": 0.01685661758293128, + "grad_norm": 0.2699335217475891, + "learning_rate": 4.6225e-05, + "loss": 2.3, + "step": 151 + }, + { + "epoch": 0.016968250811957315, + "grad_norm": 0.27325987815856934, + "learning_rate": 4.6200000000000005e-05, + "loss": 2.3798, + "step": 152 + }, + { + "epoch": 0.01707988404098335, + "grad_norm": 0.29398098587989807, + "learning_rate": 4.6175000000000004e-05, + "loss": 2.2952, + "step": 153 + }, + { + "epoch": 0.017191517270009383, + "grad_norm": 0.264258474111557, + "learning_rate": 4.6150000000000004e-05, + "loss": 2.4543, + "step": 154 + }, + { + "epoch": 0.01730315049903542, + "grad_norm": 0.4211874008178711, + "learning_rate": 4.6125e-05, + "loss": 2.3015, + "step": 155 + }, + { + "epoch": 0.017414783728061455, + "grad_norm": 0.28733232617378235, + "learning_rate": 4.61e-05, + "loss": 2.4002, + "step": 156 + }, + { + "epoch": 0.01752641695708749, + "grad_norm": 0.2647246718406677, + "learning_rate": 4.6075e-05, + "loss": 2.2928, + "step": 157 + }, + { + "epoch": 0.017638050186113523, + "grad_norm": 0.2679901719093323, + "learning_rate": 4.605e-05, + "loss": 2.4328, + "step": 158 + }, + { + "epoch": 0.01774968341513956, + "grad_norm": 0.26848575472831726, + "learning_rate": 4.6025e-05, + "loss": 2.3402, + "step": 159 + }, + { + "epoch": 0.017861316644165594, + "grad_norm": 0.282953679561615, + "learning_rate": 4.600000000000001e-05, + "loss": 2.3454, + "step": 160 + }, + { + "epoch": 0.01797294987319163, + "grad_norm": 0.2788335084915161, + "learning_rate": 4.5975e-05, + "loss": 2.3779, + "step": 161 + }, + { + "epoch": 0.018084583102217663, + "grad_norm": 0.2903019189834595, + "learning_rate": 4.5950000000000006e-05, + "loss": 2.4031, + "step": 162 + }, + { + "epoch": 0.0181962163312437, + "grad_norm": 0.2800057828426361, + "learning_rate": 4.5925e-05, + "loss": 2.4273, + "step": 163 + }, + { + "epoch": 0.018307849560269734, + "grad_norm": 0.28730225563049316, + "learning_rate": 4.5900000000000004e-05, + "loss": 2.3713, + "step": 164 + }, + { + "epoch": 0.01841948278929577, + "grad_norm": 0.2722271978855133, + "learning_rate": 4.5875000000000004e-05, + "loss": 2.5114, + "step": 165 + }, + { + "epoch": 0.018531116018321803, + "grad_norm": 0.27777567505836487, + "learning_rate": 4.585e-05, + "loss": 2.3485, + "step": 166 + }, + { + "epoch": 0.01864274924734784, + "grad_norm": 0.2774522006511688, + "learning_rate": 4.5825e-05, + "loss": 2.4186, + "step": 167 + }, + { + "epoch": 0.018754382476373874, + "grad_norm": 0.2742158770561218, + "learning_rate": 4.58e-05, + "loss": 2.3706, + "step": 168 + }, + { + "epoch": 0.01886601570539991, + "grad_norm": 0.27442091703414917, + "learning_rate": 4.5775e-05, + "loss": 2.4425, + "step": 169 + }, + { + "epoch": 0.018977648934425943, + "grad_norm": 0.2682335376739502, + "learning_rate": 4.575e-05, + "loss": 2.3597, + "step": 170 + }, + { + "epoch": 0.01908928216345198, + "grad_norm": 0.27973178029060364, + "learning_rate": 4.5725e-05, + "loss": 2.2992, + "step": 171 + }, + { + "epoch": 0.019200915392478014, + "grad_norm": 0.3113536536693573, + "learning_rate": 4.5700000000000006e-05, + "loss": 2.4343, + "step": 172 + }, + { + "epoch": 0.01931254862150405, + "grad_norm": 0.27050501108169556, + "learning_rate": 4.5675e-05, + "loss": 2.4023, + "step": 173 + }, + { + "epoch": 0.019424181850530083, + "grad_norm": 0.5937790870666504, + "learning_rate": 4.5650000000000005e-05, + "loss": 2.4726, + "step": 174 + }, + { + "epoch": 0.019535815079556117, + "grad_norm": 0.27320945262908936, + "learning_rate": 4.5625e-05, + "loss": 2.4108, + "step": 175 + }, + { + "epoch": 0.019647448308582154, + "grad_norm": 0.2724778354167938, + "learning_rate": 4.5600000000000004e-05, + "loss": 2.2204, + "step": 176 + }, + { + "epoch": 0.01975908153760819, + "grad_norm": 0.2760343849658966, + "learning_rate": 4.5575e-05, + "loss": 2.4108, + "step": 177 + }, + { + "epoch": 0.019870714766634223, + "grad_norm": 0.27173370122909546, + "learning_rate": 4.555e-05, + "loss": 2.4278, + "step": 178 + }, + { + "epoch": 0.019982347995660257, + "grad_norm": 0.258478045463562, + "learning_rate": 4.5525e-05, + "loss": 2.3129, + "step": 179 + }, + { + "epoch": 0.020093981224686294, + "grad_norm": 0.2680318355560303, + "learning_rate": 4.55e-05, + "loss": 2.1764, + "step": 180 + }, + { + "epoch": 0.02020561445371233, + "grad_norm": 0.32532161474227905, + "learning_rate": 4.5475e-05, + "loss": 2.3988, + "step": 181 + }, + { + "epoch": 0.020317247682738362, + "grad_norm": 0.27205905318260193, + "learning_rate": 4.545000000000001e-05, + "loss": 2.3432, + "step": 182 + }, + { + "epoch": 0.020428880911764397, + "grad_norm": 0.3422660827636719, + "learning_rate": 4.5425e-05, + "loss": 2.4591, + "step": 183 + }, + { + "epoch": 0.020540514140790434, + "grad_norm": 0.2941705882549286, + "learning_rate": 4.5400000000000006e-05, + "loss": 2.4285, + "step": 184 + }, + { + "epoch": 0.02065214736981647, + "grad_norm": 0.27218639850616455, + "learning_rate": 4.5375e-05, + "loss": 2.3349, + "step": 185 + }, + { + "epoch": 0.020763780598842502, + "grad_norm": 0.26361867785453796, + "learning_rate": 4.5350000000000005e-05, + "loss": 2.3049, + "step": 186 + }, + { + "epoch": 0.020875413827868537, + "grad_norm": 0.47230780124664307, + "learning_rate": 4.5325000000000004e-05, + "loss": 2.3365, + "step": 187 + }, + { + "epoch": 0.020987047056894574, + "grad_norm": 0.26752769947052, + "learning_rate": 4.53e-05, + "loss": 2.3887, + "step": 188 + }, + { + "epoch": 0.021098680285920608, + "grad_norm": 0.27140894532203674, + "learning_rate": 4.5275e-05, + "loss": 2.4444, + "step": 189 + }, + { + "epoch": 0.021210313514946642, + "grad_norm": 0.4308728873729706, + "learning_rate": 4.525e-05, + "loss": 2.397, + "step": 190 + }, + { + "epoch": 0.021321946743972676, + "grad_norm": 0.6893981099128723, + "learning_rate": 4.5225e-05, + "loss": 2.3797, + "step": 191 + }, + { + "epoch": 0.021433579972998714, + "grad_norm": 0.27642473578453064, + "learning_rate": 4.52e-05, + "loss": 2.3374, + "step": 192 + }, + { + "epoch": 0.021545213202024748, + "grad_norm": 0.2684485912322998, + "learning_rate": 4.5175e-05, + "loss": 2.3368, + "step": 193 + }, + { + "epoch": 0.021656846431050782, + "grad_norm": 0.2804638743400574, + "learning_rate": 4.5150000000000006e-05, + "loss": 2.3378, + "step": 194 + }, + { + "epoch": 0.021768479660076816, + "grad_norm": 0.41617271304130554, + "learning_rate": 4.5125e-05, + "loss": 2.3908, + "step": 195 + }, + { + "epoch": 0.021880112889102854, + "grad_norm": 0.26013967394828796, + "learning_rate": 4.5100000000000005e-05, + "loss": 2.3378, + "step": 196 + }, + { + "epoch": 0.021991746118128888, + "grad_norm": 0.2883068919181824, + "learning_rate": 4.5075e-05, + "loss": 2.296, + "step": 197 + }, + { + "epoch": 0.022103379347154922, + "grad_norm": 0.27083417773246765, + "learning_rate": 4.5050000000000004e-05, + "loss": 2.3917, + "step": 198 + }, + { + "epoch": 0.022215012576180956, + "grad_norm": 0.26112979650497437, + "learning_rate": 4.5025000000000003e-05, + "loss": 2.4321, + "step": 199 + }, + { + "epoch": 0.022326645805206994, + "grad_norm": 0.2797684669494629, + "learning_rate": 4.5e-05, + "loss": 2.4221, + "step": 200 + }, + { + "epoch": 0.022438279034233028, + "grad_norm": 0.28574231266975403, + "learning_rate": 4.4975e-05, + "loss": 2.4143, + "step": 201 + }, + { + "epoch": 0.022549912263259062, + "grad_norm": 0.3054039180278778, + "learning_rate": 4.495e-05, + "loss": 2.3744, + "step": 202 + }, + { + "epoch": 0.022661545492285096, + "grad_norm": 0.2859933376312256, + "learning_rate": 4.4925e-05, + "loss": 2.3158, + "step": 203 + }, + { + "epoch": 0.022773178721311134, + "grad_norm": 0.30749940872192383, + "learning_rate": 4.49e-05, + "loss": 2.3274, + "step": 204 + }, + { + "epoch": 0.022884811950337168, + "grad_norm": 0.34303340315818787, + "learning_rate": 4.4875e-05, + "loss": 2.3822, + "step": 205 + }, + { + "epoch": 0.022996445179363202, + "grad_norm": 0.6377202868461609, + "learning_rate": 4.4850000000000006e-05, + "loss": 2.3162, + "step": 206 + }, + { + "epoch": 0.023108078408389236, + "grad_norm": 0.27729642391204834, + "learning_rate": 4.4825e-05, + "loss": 2.3117, + "step": 207 + }, + { + "epoch": 0.02321971163741527, + "grad_norm": 0.2766030728816986, + "learning_rate": 4.4800000000000005e-05, + "loss": 2.4142, + "step": 208 + }, + { + "epoch": 0.023331344866441308, + "grad_norm": 0.5488070249557495, + "learning_rate": 4.4775e-05, + "loss": 2.3999, + "step": 209 + }, + { + "epoch": 0.023442978095467342, + "grad_norm": 0.27591627836227417, + "learning_rate": 4.4750000000000004e-05, + "loss": 2.4271, + "step": 210 + }, + { + "epoch": 0.023554611324493376, + "grad_norm": 0.3173430562019348, + "learning_rate": 4.4725e-05, + "loss": 2.4497, + "step": 211 + }, + { + "epoch": 0.02366624455351941, + "grad_norm": 0.31145554780960083, + "learning_rate": 4.47e-05, + "loss": 2.4128, + "step": 212 + }, + { + "epoch": 0.023777877782545448, + "grad_norm": 0.27966129779815674, + "learning_rate": 4.4675e-05, + "loss": 2.4699, + "step": 213 + }, + { + "epoch": 0.023889511011571482, + "grad_norm": 0.31131359934806824, + "learning_rate": 4.465e-05, + "loss": 2.3269, + "step": 214 + }, + { + "epoch": 0.024001144240597516, + "grad_norm": 0.26797181367874146, + "learning_rate": 4.4625e-05, + "loss": 2.4169, + "step": 215 + }, + { + "epoch": 0.02411277746962355, + "grad_norm": 0.28204289078712463, + "learning_rate": 4.46e-05, + "loss": 2.2361, + "step": 216 + }, + { + "epoch": 0.024224410698649588, + "grad_norm": 0.2600002586841583, + "learning_rate": 4.4575e-05, + "loss": 2.3357, + "step": 217 + }, + { + "epoch": 0.024336043927675622, + "grad_norm": 0.2576424777507782, + "learning_rate": 4.4550000000000005e-05, + "loss": 2.3424, + "step": 218 + }, + { + "epoch": 0.024447677156701656, + "grad_norm": 0.3016074299812317, + "learning_rate": 4.4525e-05, + "loss": 2.3781, + "step": 219 + }, + { + "epoch": 0.02455931038572769, + "grad_norm": 0.2684342563152313, + "learning_rate": 4.4500000000000004e-05, + "loss": 2.3167, + "step": 220 + }, + { + "epoch": 0.024670943614753728, + "grad_norm": 0.2625711262226105, + "learning_rate": 4.4475e-05, + "loss": 2.4207, + "step": 221 + }, + { + "epoch": 0.024782576843779762, + "grad_norm": 0.29141953587532043, + "learning_rate": 4.445e-05, + "loss": 2.4231, + "step": 222 + }, + { + "epoch": 0.024894210072805796, + "grad_norm": 0.2682022452354431, + "learning_rate": 4.4425e-05, + "loss": 2.4098, + "step": 223 + }, + { + "epoch": 0.02500584330183183, + "grad_norm": 0.2591974437236786, + "learning_rate": 4.44e-05, + "loss": 2.4387, + "step": 224 + }, + { + "epoch": 0.025117476530857868, + "grad_norm": 0.2656046152114868, + "learning_rate": 4.4375e-05, + "loss": 2.415, + "step": 225 + }, + { + "epoch": 0.025229109759883902, + "grad_norm": 0.2568715214729309, + "learning_rate": 4.435e-05, + "loss": 2.3837, + "step": 226 + }, + { + "epoch": 0.025340742988909936, + "grad_norm": 0.3120313286781311, + "learning_rate": 4.4325e-05, + "loss": 2.4028, + "step": 227 + }, + { + "epoch": 0.02545237621793597, + "grad_norm": 0.36895328760147095, + "learning_rate": 4.43e-05, + "loss": 2.4162, + "step": 228 + }, + { + "epoch": 0.025564009446962008, + "grad_norm": 0.2681656777858734, + "learning_rate": 4.4275e-05, + "loss": 2.4289, + "step": 229 + }, + { + "epoch": 0.025675642675988042, + "grad_norm": 0.2715415358543396, + "learning_rate": 4.4250000000000005e-05, + "loss": 2.3207, + "step": 230 + }, + { + "epoch": 0.025787275905014076, + "grad_norm": 0.2677493989467621, + "learning_rate": 4.4225e-05, + "loss": 2.3856, + "step": 231 + }, + { + "epoch": 0.02589890913404011, + "grad_norm": 0.25962767004966736, + "learning_rate": 4.4200000000000004e-05, + "loss": 2.3136, + "step": 232 + }, + { + "epoch": 0.026010542363066148, + "grad_norm": 0.3225052058696747, + "learning_rate": 4.4174999999999996e-05, + "loss": 2.2765, + "step": 233 + }, + { + "epoch": 0.026122175592092182, + "grad_norm": 0.3049544095993042, + "learning_rate": 4.415e-05, + "loss": 2.2127, + "step": 234 + }, + { + "epoch": 0.026233808821118216, + "grad_norm": 0.25900280475616455, + "learning_rate": 4.4125e-05, + "loss": 2.3794, + "step": 235 + }, + { + "epoch": 0.02634544205014425, + "grad_norm": 0.26174089312553406, + "learning_rate": 4.41e-05, + "loss": 2.4024, + "step": 236 + }, + { + "epoch": 0.026457075279170288, + "grad_norm": 0.26936131715774536, + "learning_rate": 4.4075e-05, + "loss": 2.5543, + "step": 237 + }, + { + "epoch": 0.026568708508196322, + "grad_norm": 0.2539876103401184, + "learning_rate": 4.405e-05, + "loss": 2.4051, + "step": 238 + }, + { + "epoch": 0.026680341737222356, + "grad_norm": 0.26477983593940735, + "learning_rate": 4.4025e-05, + "loss": 2.3512, + "step": 239 + }, + { + "epoch": 0.02679197496624839, + "grad_norm": 0.2632873058319092, + "learning_rate": 4.4000000000000006e-05, + "loss": 2.3378, + "step": 240 + }, + { + "epoch": 0.026903608195274428, + "grad_norm": 0.6845733523368835, + "learning_rate": 4.3975e-05, + "loss": 2.3815, + "step": 241 + }, + { + "epoch": 0.02701524142430046, + "grad_norm": 0.25974923372268677, + "learning_rate": 4.3950000000000004e-05, + "loss": 2.3816, + "step": 242 + }, + { + "epoch": 0.027126874653326496, + "grad_norm": 0.2636438012123108, + "learning_rate": 4.3925e-05, + "loss": 2.3288, + "step": 243 + }, + { + "epoch": 0.02723850788235253, + "grad_norm": 0.2576185464859009, + "learning_rate": 4.39e-05, + "loss": 2.4314, + "step": 244 + }, + { + "epoch": 0.027350141111378564, + "grad_norm": 0.2600337862968445, + "learning_rate": 4.3875e-05, + "loss": 2.4147, + "step": 245 + }, + { + "epoch": 0.0274617743404046, + "grad_norm": 0.2605160176753998, + "learning_rate": 4.385e-05, + "loss": 2.327, + "step": 246 + }, + { + "epoch": 0.027573407569430636, + "grad_norm": 0.25381141901016235, + "learning_rate": 4.3825e-05, + "loss": 2.309, + "step": 247 + }, + { + "epoch": 0.02768504079845667, + "grad_norm": 0.253326416015625, + "learning_rate": 4.38e-05, + "loss": 2.2849, + "step": 248 + }, + { + "epoch": 0.027796674027482704, + "grad_norm": 0.3015645444393158, + "learning_rate": 4.3775e-05, + "loss": 2.3882, + "step": 249 + }, + { + "epoch": 0.02790830725650874, + "grad_norm": 0.2591153383255005, + "learning_rate": 4.375e-05, + "loss": 2.3108, + "step": 250 + }, + { + "epoch": 0.028019940485534776, + "grad_norm": 0.255209743976593, + "learning_rate": 4.3725000000000006e-05, + "loss": 2.3026, + "step": 251 + }, + { + "epoch": 0.02813157371456081, + "grad_norm": 0.2543400228023529, + "learning_rate": 4.3700000000000005e-05, + "loss": 2.3949, + "step": 252 + }, + { + "epoch": 0.028243206943586844, + "grad_norm": 0.2584831118583679, + "learning_rate": 4.3675000000000005e-05, + "loss": 2.4274, + "step": 253 + }, + { + "epoch": 0.02835484017261288, + "grad_norm": 0.24946770071983337, + "learning_rate": 4.3650000000000004e-05, + "loss": 2.378, + "step": 254 + }, + { + "epoch": 0.028466473401638916, + "grad_norm": 0.2595466673374176, + "learning_rate": 4.3625e-05, + "loss": 2.4601, + "step": 255 + }, + { + "epoch": 0.02857810663066495, + "grad_norm": 0.25328534841537476, + "learning_rate": 4.36e-05, + "loss": 2.2783, + "step": 256 + }, + { + "epoch": 0.028689739859690984, + "grad_norm": 0.2810356616973877, + "learning_rate": 4.3575e-05, + "loss": 2.282, + "step": 257 + }, + { + "epoch": 0.02880137308871702, + "grad_norm": 0.2603547275066376, + "learning_rate": 4.355e-05, + "loss": 2.3768, + "step": 258 + }, + { + "epoch": 0.028913006317743056, + "grad_norm": 0.25737640261650085, + "learning_rate": 4.352500000000001e-05, + "loss": 2.4577, + "step": 259 + }, + { + "epoch": 0.02902463954676909, + "grad_norm": 0.25266796350479126, + "learning_rate": 4.35e-05, + "loss": 2.3293, + "step": 260 + }, + { + "epoch": 0.029136272775795124, + "grad_norm": 0.35959863662719727, + "learning_rate": 4.3475000000000006e-05, + "loss": 2.2827, + "step": 261 + }, + { + "epoch": 0.02924790600482116, + "grad_norm": 0.2564973831176758, + "learning_rate": 4.345e-05, + "loss": 2.3925, + "step": 262 + }, + { + "epoch": 0.029359539233847196, + "grad_norm": 0.2848140597343445, + "learning_rate": 4.3425000000000005e-05, + "loss": 2.3275, + "step": 263 + }, + { + "epoch": 0.02947117246287323, + "grad_norm": 0.26803499460220337, + "learning_rate": 4.3400000000000005e-05, + "loss": 2.4576, + "step": 264 + }, + { + "epoch": 0.029582805691899264, + "grad_norm": 0.24815724790096283, + "learning_rate": 4.3375000000000004e-05, + "loss": 2.611, + "step": 265 + }, + { + "epoch": 0.0296944389209253, + "grad_norm": 0.26036569476127625, + "learning_rate": 4.335e-05, + "loss": 2.3574, + "step": 266 + }, + { + "epoch": 0.029806072149951335, + "grad_norm": 0.26427459716796875, + "learning_rate": 4.3325e-05, + "loss": 2.3884, + "step": 267 + }, + { + "epoch": 0.02991770537897737, + "grad_norm": 0.25177526473999023, + "learning_rate": 4.33e-05, + "loss": 2.3812, + "step": 268 + }, + { + "epoch": 0.030029338608003404, + "grad_norm": 0.2582986354827881, + "learning_rate": 4.3275e-05, + "loss": 2.3621, + "step": 269 + }, + { + "epoch": 0.03014097183702944, + "grad_norm": 0.26251325011253357, + "learning_rate": 4.325e-05, + "loss": 2.4779, + "step": 270 + }, + { + "epoch": 0.030252605066055475, + "grad_norm": 0.2560170590877533, + "learning_rate": 4.322500000000001e-05, + "loss": 2.4519, + "step": 271 + }, + { + "epoch": 0.03036423829508151, + "grad_norm": 0.25769442319869995, + "learning_rate": 4.32e-05, + "loss": 2.255, + "step": 272 + }, + { + "epoch": 0.030475871524107544, + "grad_norm": 0.2584100067615509, + "learning_rate": 4.3175000000000006e-05, + "loss": 2.4195, + "step": 273 + }, + { + "epoch": 0.03058750475313358, + "grad_norm": 0.26976278424263, + "learning_rate": 4.315e-05, + "loss": 2.3927, + "step": 274 + }, + { + "epoch": 0.030699137982159615, + "grad_norm": 0.2528376877307892, + "learning_rate": 4.3125000000000005e-05, + "loss": 2.3596, + "step": 275 + }, + { + "epoch": 0.03081077121118565, + "grad_norm": 0.25755786895751953, + "learning_rate": 4.3100000000000004e-05, + "loss": 2.4775, + "step": 276 + }, + { + "epoch": 0.030922404440211684, + "grad_norm": 0.25737857818603516, + "learning_rate": 4.3075000000000003e-05, + "loss": 2.3443, + "step": 277 + }, + { + "epoch": 0.031034037669237718, + "grad_norm": 0.2632676362991333, + "learning_rate": 4.305e-05, + "loss": 2.3128, + "step": 278 + }, + { + "epoch": 0.031145670898263755, + "grad_norm": 0.28831636905670166, + "learning_rate": 4.3025e-05, + "loss": 2.3624, + "step": 279 + }, + { + "epoch": 0.03125730412728979, + "grad_norm": 0.2593206763267517, + "learning_rate": 4.3e-05, + "loss": 2.3546, + "step": 280 + }, + { + "epoch": 0.031368937356315824, + "grad_norm": 0.25221961736679077, + "learning_rate": 4.2975e-05, + "loss": 2.3056, + "step": 281 + }, + { + "epoch": 0.03148057058534186, + "grad_norm": 0.26562732458114624, + "learning_rate": 4.295e-05, + "loss": 2.2979, + "step": 282 + }, + { + "epoch": 0.03159220381436789, + "grad_norm": 0.2667911648750305, + "learning_rate": 4.2925000000000007e-05, + "loss": 2.5051, + "step": 283 + }, + { + "epoch": 0.031703837043393926, + "grad_norm": 0.2637230455875397, + "learning_rate": 4.29e-05, + "loss": 2.2956, + "step": 284 + }, + { + "epoch": 0.03181547027241997, + "grad_norm": 0.2591506540775299, + "learning_rate": 4.2875000000000005e-05, + "loss": 2.4331, + "step": 285 + }, + { + "epoch": 0.031927103501446, + "grad_norm": 0.25945866107940674, + "learning_rate": 4.285e-05, + "loss": 2.3397, + "step": 286 + }, + { + "epoch": 0.032038736730472035, + "grad_norm": 0.28612184524536133, + "learning_rate": 4.2825000000000004e-05, + "loss": 2.4108, + "step": 287 + }, + { + "epoch": 0.03215036995949807, + "grad_norm": 0.26425275206565857, + "learning_rate": 4.2800000000000004e-05, + "loss": 2.4308, + "step": 288 + }, + { + "epoch": 0.032262003188524103, + "grad_norm": 0.2575188875198364, + "learning_rate": 4.2775e-05, + "loss": 2.3774, + "step": 289 + }, + { + "epoch": 0.03237363641755014, + "grad_norm": 0.25762301683425903, + "learning_rate": 4.275e-05, + "loss": 2.4266, + "step": 290 + }, + { + "epoch": 0.03248526964657617, + "grad_norm": 0.2451835572719574, + "learning_rate": 4.2725e-05, + "loss": 2.2542, + "step": 291 + }, + { + "epoch": 0.032596902875602206, + "grad_norm": 0.25459688901901245, + "learning_rate": 4.27e-05, + "loss": 2.3772, + "step": 292 + }, + { + "epoch": 0.03270853610462825, + "grad_norm": 0.3179572820663452, + "learning_rate": 4.2675e-05, + "loss": 2.4139, + "step": 293 + }, + { + "epoch": 0.03282016933365428, + "grad_norm": 0.25751742720603943, + "learning_rate": 4.265e-05, + "loss": 2.2372, + "step": 294 + }, + { + "epoch": 0.032931802562680315, + "grad_norm": 0.26437416672706604, + "learning_rate": 4.2625000000000006e-05, + "loss": 2.4512, + "step": 295 + }, + { + "epoch": 0.03304343579170635, + "grad_norm": 0.26216599345207214, + "learning_rate": 4.26e-05, + "loss": 2.3041, + "step": 296 + }, + { + "epoch": 0.03315506902073238, + "grad_norm": 0.2558748126029968, + "learning_rate": 4.2575000000000005e-05, + "loss": 2.3464, + "step": 297 + }, + { + "epoch": 0.03326670224975842, + "grad_norm": 0.25759217143058777, + "learning_rate": 4.2550000000000004e-05, + "loss": 2.38, + "step": 298 + }, + { + "epoch": 0.03337833547878445, + "grad_norm": 0.2515701353549957, + "learning_rate": 4.2525000000000004e-05, + "loss": 2.347, + "step": 299 + }, + { + "epoch": 0.033489968707810486, + "grad_norm": 0.25506308674812317, + "learning_rate": 4.25e-05, + "loss": 2.4563, + "step": 300 + }, + { + "epoch": 0.03360160193683653, + "grad_norm": 0.24955067038536072, + "learning_rate": 4.2475e-05, + "loss": 2.3467, + "step": 301 + }, + { + "epoch": 0.03371323516586256, + "grad_norm": 0.26849493384361267, + "learning_rate": 4.245e-05, + "loss": 2.318, + "step": 302 + }, + { + "epoch": 0.033824868394888595, + "grad_norm": 0.2560291886329651, + "learning_rate": 4.2425e-05, + "loss": 2.3567, + "step": 303 + }, + { + "epoch": 0.03393650162391463, + "grad_norm": 0.2685459554195404, + "learning_rate": 4.24e-05, + "loss": 2.3929, + "step": 304 + }, + { + "epoch": 0.03404813485294066, + "grad_norm": 0.2724890410900116, + "learning_rate": 4.237500000000001e-05, + "loss": 2.321, + "step": 305 + }, + { + "epoch": 0.0341597680819667, + "grad_norm": 0.3363018333911896, + "learning_rate": 4.235e-05, + "loss": 2.2429, + "step": 306 + }, + { + "epoch": 0.03427140131099273, + "grad_norm": 0.2732946276664734, + "learning_rate": 4.2325000000000006e-05, + "loss": 2.4269, + "step": 307 + }, + { + "epoch": 0.034383034540018766, + "grad_norm": 0.25203391909599304, + "learning_rate": 4.23e-05, + "loss": 2.3134, + "step": 308 + }, + { + "epoch": 0.03449466776904481, + "grad_norm": 0.27153274416923523, + "learning_rate": 4.2275000000000004e-05, + "loss": 2.3181, + "step": 309 + }, + { + "epoch": 0.03460630099807084, + "grad_norm": 0.25666430592536926, + "learning_rate": 4.2250000000000004e-05, + "loss": 2.437, + "step": 310 + }, + { + "epoch": 0.034717934227096875, + "grad_norm": 0.255957692861557, + "learning_rate": 4.2225e-05, + "loss": 2.3226, + "step": 311 + }, + { + "epoch": 0.03482956745612291, + "grad_norm": 0.2525959014892578, + "learning_rate": 4.22e-05, + "loss": 2.4727, + "step": 312 + }, + { + "epoch": 0.03494120068514894, + "grad_norm": 0.34017762541770935, + "learning_rate": 4.2175e-05, + "loss": 2.2816, + "step": 313 + }, + { + "epoch": 0.03505283391417498, + "grad_norm": 0.25117596983909607, + "learning_rate": 4.215e-05, + "loss": 2.3801, + "step": 314 + }, + { + "epoch": 0.03516446714320101, + "grad_norm": 0.2531397044658661, + "learning_rate": 4.2125e-05, + "loss": 2.4461, + "step": 315 + }, + { + "epoch": 0.035276100372227046, + "grad_norm": 0.2584179639816284, + "learning_rate": 4.21e-05, + "loss": 2.2489, + "step": 316 + }, + { + "epoch": 0.03538773360125308, + "grad_norm": 0.25839826464653015, + "learning_rate": 4.2075000000000006e-05, + "loss": 2.3748, + "step": 317 + }, + { + "epoch": 0.03549936683027912, + "grad_norm": 0.2568003833293915, + "learning_rate": 4.205e-05, + "loss": 2.3484, + "step": 318 + }, + { + "epoch": 0.035611000059305155, + "grad_norm": 0.2569085657596588, + "learning_rate": 4.2025000000000005e-05, + "loss": 2.4397, + "step": 319 + }, + { + "epoch": 0.03572263328833119, + "grad_norm": 0.24986624717712402, + "learning_rate": 4.2e-05, + "loss": 2.3008, + "step": 320 + }, + { + "epoch": 0.03583426651735722, + "grad_norm": 0.26116469502449036, + "learning_rate": 4.1975000000000004e-05, + "loss": 2.2436, + "step": 321 + }, + { + "epoch": 0.03594589974638326, + "grad_norm": 0.3085286021232605, + "learning_rate": 4.195e-05, + "loss": 2.2419, + "step": 322 + }, + { + "epoch": 0.03605753297540929, + "grad_norm": 1.1896109580993652, + "learning_rate": 4.1925e-05, + "loss": 2.3528, + "step": 323 + }, + { + "epoch": 0.036169166204435325, + "grad_norm": 0.26796790957450867, + "learning_rate": 4.19e-05, + "loss": 2.314, + "step": 324 + }, + { + "epoch": 0.03628079943346136, + "grad_norm": 0.27274981141090393, + "learning_rate": 4.1875e-05, + "loss": 2.3397, + "step": 325 + }, + { + "epoch": 0.0363924326624874, + "grad_norm": 0.2676389813423157, + "learning_rate": 4.185e-05, + "loss": 2.4168, + "step": 326 + }, + { + "epoch": 0.036504065891513435, + "grad_norm": 0.4810822904109955, + "learning_rate": 4.1825e-05, + "loss": 2.3375, + "step": 327 + }, + { + "epoch": 0.03661569912053947, + "grad_norm": 0.28411370515823364, + "learning_rate": 4.18e-05, + "loss": 2.2704, + "step": 328 + }, + { + "epoch": 0.0367273323495655, + "grad_norm": 0.27649131417274475, + "learning_rate": 4.1775000000000006e-05, + "loss": 2.3377, + "step": 329 + }, + { + "epoch": 0.03683896557859154, + "grad_norm": 0.25533196330070496, + "learning_rate": 4.175e-05, + "loss": 2.2652, + "step": 330 + }, + { + "epoch": 0.03695059880761757, + "grad_norm": 0.2640891969203949, + "learning_rate": 4.1725000000000005e-05, + "loss": 2.3504, + "step": 331 + }, + { + "epoch": 0.037062232036643605, + "grad_norm": 0.2534253001213074, + "learning_rate": 4.17e-05, + "loss": 2.4219, + "step": 332 + }, + { + "epoch": 0.03717386526566964, + "grad_norm": 0.2603084146976471, + "learning_rate": 4.1675e-05, + "loss": 2.3582, + "step": 333 + }, + { + "epoch": 0.03728549849469568, + "grad_norm": 0.2546936273574829, + "learning_rate": 4.165e-05, + "loss": 2.3509, + "step": 334 + }, + { + "epoch": 0.037397131723721715, + "grad_norm": 0.2537340819835663, + "learning_rate": 4.1625e-05, + "loss": 2.3683, + "step": 335 + }, + { + "epoch": 0.03750876495274775, + "grad_norm": 0.26713791489601135, + "learning_rate": 4.16e-05, + "loss": 2.3843, + "step": 336 + }, + { + "epoch": 0.03762039818177378, + "grad_norm": 0.2577325105667114, + "learning_rate": 4.1575e-05, + "loss": 2.4786, + "step": 337 + }, + { + "epoch": 0.03773203141079982, + "grad_norm": 0.24003548920154572, + "learning_rate": 4.155e-05, + "loss": 2.4079, + "step": 338 + }, + { + "epoch": 0.03784366463982585, + "grad_norm": 0.26992398500442505, + "learning_rate": 4.1525e-05, + "loss": 2.382, + "step": 339 + }, + { + "epoch": 0.037955297868851885, + "grad_norm": 0.2569391131401062, + "learning_rate": 4.15e-05, + "loss": 2.2762, + "step": 340 + }, + { + "epoch": 0.03806693109787792, + "grad_norm": 0.2576649785041809, + "learning_rate": 4.1475000000000005e-05, + "loss": 2.3492, + "step": 341 + }, + { + "epoch": 0.03817856432690396, + "grad_norm": 0.2680610716342926, + "learning_rate": 4.145e-05, + "loss": 2.3769, + "step": 342 + }, + { + "epoch": 0.038290197555929995, + "grad_norm": 0.2511936128139496, + "learning_rate": 4.1425000000000004e-05, + "loss": 2.2633, + "step": 343 + }, + { + "epoch": 0.03840183078495603, + "grad_norm": 0.2549611032009125, + "learning_rate": 4.14e-05, + "loss": 2.4439, + "step": 344 + }, + { + "epoch": 0.03851346401398206, + "grad_norm": 0.24939557909965515, + "learning_rate": 4.1375e-05, + "loss": 2.3102, + "step": 345 + }, + { + "epoch": 0.0386250972430081, + "grad_norm": 0.2684389650821686, + "learning_rate": 4.135e-05, + "loss": 2.3636, + "step": 346 + }, + { + "epoch": 0.03873673047203413, + "grad_norm": 0.24813228845596313, + "learning_rate": 4.1325e-05, + "loss": 2.3439, + "step": 347 + }, + { + "epoch": 0.038848363701060165, + "grad_norm": 0.24416686594486237, + "learning_rate": 4.13e-05, + "loss": 2.256, + "step": 348 + }, + { + "epoch": 0.0389599969300862, + "grad_norm": 0.24512575566768646, + "learning_rate": 4.1275e-05, + "loss": 2.3966, + "step": 349 + }, + { + "epoch": 0.03907163015911223, + "grad_norm": 0.25613388419151306, + "learning_rate": 4.125e-05, + "loss": 2.3726, + "step": 350 + }, + { + "epoch": 0.039183263388138274, + "grad_norm": 0.2553405463695526, + "learning_rate": 4.1225e-05, + "loss": 2.4718, + "step": 351 + }, + { + "epoch": 0.03929489661716431, + "grad_norm": 0.2818881869316101, + "learning_rate": 4.12e-05, + "loss": 2.2828, + "step": 352 + }, + { + "epoch": 0.03940652984619034, + "grad_norm": 0.2522308826446533, + "learning_rate": 4.1175000000000005e-05, + "loss": 2.3791, + "step": 353 + }, + { + "epoch": 0.03951816307521638, + "grad_norm": 0.2561878561973572, + "learning_rate": 4.115e-05, + "loss": 2.4064, + "step": 354 + }, + { + "epoch": 0.03962979630424241, + "grad_norm": 0.2658417522907257, + "learning_rate": 4.1125000000000004e-05, + "loss": 2.4249, + "step": 355 + }, + { + "epoch": 0.039741429533268445, + "grad_norm": 0.3040316700935364, + "learning_rate": 4.11e-05, + "loss": 2.3588, + "step": 356 + }, + { + "epoch": 0.03985306276229448, + "grad_norm": 0.24854056537151337, + "learning_rate": 4.1075e-05, + "loss": 2.3466, + "step": 357 + }, + { + "epoch": 0.03996469599132051, + "grad_norm": 0.2535928785800934, + "learning_rate": 4.105e-05, + "loss": 2.2637, + "step": 358 + }, + { + "epoch": 0.040076329220346554, + "grad_norm": 0.2553252577781677, + "learning_rate": 4.1025e-05, + "loss": 2.3413, + "step": 359 + }, + { + "epoch": 0.04018796244937259, + "grad_norm": 0.25706836581230164, + "learning_rate": 4.1e-05, + "loss": 2.3426, + "step": 360 + }, + { + "epoch": 0.04029959567839862, + "grad_norm": 0.2549241781234741, + "learning_rate": 4.0975e-05, + "loss": 2.4028, + "step": 361 + }, + { + "epoch": 0.04041122890742466, + "grad_norm": 0.25191277265548706, + "learning_rate": 4.095e-05, + "loss": 2.3353, + "step": 362 + }, + { + "epoch": 0.04052286213645069, + "grad_norm": 0.2870761454105377, + "learning_rate": 4.0925000000000005e-05, + "loss": 2.2961, + "step": 363 + }, + { + "epoch": 0.040634495365476725, + "grad_norm": 0.26251882314682007, + "learning_rate": 4.09e-05, + "loss": 2.3897, + "step": 364 + }, + { + "epoch": 0.04074612859450276, + "grad_norm": 0.24886254966259003, + "learning_rate": 4.0875000000000004e-05, + "loss": 2.3752, + "step": 365 + }, + { + "epoch": 0.04085776182352879, + "grad_norm": 0.24439434707164764, + "learning_rate": 4.085e-05, + "loss": 2.3435, + "step": 366 + }, + { + "epoch": 0.040969395052554834, + "grad_norm": 0.41298142075538635, + "learning_rate": 4.0825e-05, + "loss": 2.4568, + "step": 367 + }, + { + "epoch": 0.04108102828158087, + "grad_norm": 0.2539288401603699, + "learning_rate": 4.08e-05, + "loss": 2.3788, + "step": 368 + }, + { + "epoch": 0.0411926615106069, + "grad_norm": 0.2483907788991928, + "learning_rate": 4.0775e-05, + "loss": 2.2744, + "step": 369 + }, + { + "epoch": 0.04130429473963294, + "grad_norm": 0.2610059678554535, + "learning_rate": 4.075e-05, + "loss": 2.4475, + "step": 370 + }, + { + "epoch": 0.04141592796865897, + "grad_norm": 0.2610619068145752, + "learning_rate": 4.0725e-05, + "loss": 2.4049, + "step": 371 + }, + { + "epoch": 0.041527561197685005, + "grad_norm": 0.26267480850219727, + "learning_rate": 4.07e-05, + "loss": 2.3503, + "step": 372 + }, + { + "epoch": 0.04163919442671104, + "grad_norm": 0.2515001893043518, + "learning_rate": 4.0675e-05, + "loss": 2.3344, + "step": 373 + }, + { + "epoch": 0.04175082765573707, + "grad_norm": 0.27069345116615295, + "learning_rate": 4.065e-05, + "loss": 2.3766, + "step": 374 + }, + { + "epoch": 0.041862460884763114, + "grad_norm": 0.2560361921787262, + "learning_rate": 4.0625000000000005e-05, + "loss": 2.3775, + "step": 375 + }, + { + "epoch": 0.04197409411378915, + "grad_norm": 0.2536779046058655, + "learning_rate": 4.0600000000000004e-05, + "loss": 2.3652, + "step": 376 + }, + { + "epoch": 0.04208572734281518, + "grad_norm": 0.25629979372024536, + "learning_rate": 4.0575000000000004e-05, + "loss": 2.3127, + "step": 377 + }, + { + "epoch": 0.042197360571841216, + "grad_norm": 0.25751662254333496, + "learning_rate": 4.055e-05, + "loss": 2.3268, + "step": 378 + }, + { + "epoch": 0.04230899380086725, + "grad_norm": 0.25083836913108826, + "learning_rate": 4.0525e-05, + "loss": 2.3929, + "step": 379 + }, + { + "epoch": 0.042420627029893285, + "grad_norm": 0.2616216242313385, + "learning_rate": 4.05e-05, + "loss": 2.2566, + "step": 380 + }, + { + "epoch": 0.04253226025891932, + "grad_norm": 0.2545231580734253, + "learning_rate": 4.0475e-05, + "loss": 2.3916, + "step": 381 + }, + { + "epoch": 0.04264389348794535, + "grad_norm": 0.24984431266784668, + "learning_rate": 4.045000000000001e-05, + "loss": 2.3001, + "step": 382 + }, + { + "epoch": 0.04275552671697139, + "grad_norm": 0.2487059086561203, + "learning_rate": 4.0425e-05, + "loss": 2.4398, + "step": 383 + }, + { + "epoch": 0.04286715994599743, + "grad_norm": 0.2508711516857147, + "learning_rate": 4.0400000000000006e-05, + "loss": 2.3868, + "step": 384 + }, + { + "epoch": 0.04297879317502346, + "grad_norm": 0.250318706035614, + "learning_rate": 4.0375e-05, + "loss": 2.3627, + "step": 385 + }, + { + "epoch": 0.043090426404049496, + "grad_norm": 0.27126437425613403, + "learning_rate": 4.0350000000000005e-05, + "loss": 2.3592, + "step": 386 + }, + { + "epoch": 0.04320205963307553, + "grad_norm": 0.2641567289829254, + "learning_rate": 4.0325000000000004e-05, + "loss": 2.3391, + "step": 387 + }, + { + "epoch": 0.043313692862101565, + "grad_norm": 0.24686801433563232, + "learning_rate": 4.0300000000000004e-05, + "loss": 2.4642, + "step": 388 + }, + { + "epoch": 0.0434253260911276, + "grad_norm": 0.260929137468338, + "learning_rate": 4.0275e-05, + "loss": 2.2533, + "step": 389 + }, + { + "epoch": 0.04353695932015363, + "grad_norm": 0.2568647265434265, + "learning_rate": 4.025e-05, + "loss": 2.3352, + "step": 390 + }, + { + "epoch": 0.04364859254917967, + "grad_norm": 0.24345262348651886, + "learning_rate": 4.0225e-05, + "loss": 2.3717, + "step": 391 + }, + { + "epoch": 0.04376022577820571, + "grad_norm": 0.24727144837379456, + "learning_rate": 4.02e-05, + "loss": 2.3759, + "step": 392 + }, + { + "epoch": 0.04387185900723174, + "grad_norm": 0.24666303396224976, + "learning_rate": 4.0175e-05, + "loss": 2.3249, + "step": 393 + }, + { + "epoch": 0.043983492236257776, + "grad_norm": 0.26533442735671997, + "learning_rate": 4.015000000000001e-05, + "loss": 2.4041, + "step": 394 + }, + { + "epoch": 0.04409512546528381, + "grad_norm": 0.25406232476234436, + "learning_rate": 4.0125e-05, + "loss": 2.3457, + "step": 395 + }, + { + "epoch": 0.044206758694309845, + "grad_norm": 0.250957190990448, + "learning_rate": 4.0100000000000006e-05, + "loss": 2.3591, + "step": 396 + }, + { + "epoch": 0.04431839192333588, + "grad_norm": 0.25459203124046326, + "learning_rate": 4.0075e-05, + "loss": 2.4105, + "step": 397 + }, + { + "epoch": 0.04443002515236191, + "grad_norm": 0.25286054611206055, + "learning_rate": 4.0050000000000004e-05, + "loss": 2.3757, + "step": 398 + }, + { + "epoch": 0.04454165838138795, + "grad_norm": 0.2448861449956894, + "learning_rate": 4.0025000000000004e-05, + "loss": 2.4299, + "step": 399 + }, + { + "epoch": 0.04465329161041399, + "grad_norm": 0.24877607822418213, + "learning_rate": 4e-05, + "loss": 2.4102, + "step": 400 + }, + { + "epoch": 0.04476492483944002, + "grad_norm": 0.2525503635406494, + "learning_rate": 3.9975e-05, + "loss": 2.4452, + "step": 401 + }, + { + "epoch": 0.044876558068466056, + "grad_norm": 0.24728810787200928, + "learning_rate": 3.995e-05, + "loss": 2.354, + "step": 402 + }, + { + "epoch": 0.04498819129749209, + "grad_norm": 0.26556339859962463, + "learning_rate": 3.9925e-05, + "loss": 2.3333, + "step": 403 + }, + { + "epoch": 0.045099824526518124, + "grad_norm": 0.25951698422431946, + "learning_rate": 3.99e-05, + "loss": 2.2839, + "step": 404 + }, + { + "epoch": 0.04521145775554416, + "grad_norm": 0.25703710317611694, + "learning_rate": 3.9875e-05, + "loss": 2.3381, + "step": 405 + }, + { + "epoch": 0.04532309098457019, + "grad_norm": 0.2780836820602417, + "learning_rate": 3.9850000000000006e-05, + "loss": 2.3457, + "step": 406 + }, + { + "epoch": 0.04543472421359623, + "grad_norm": 0.252136766910553, + "learning_rate": 3.9825e-05, + "loss": 2.3855, + "step": 407 + }, + { + "epoch": 0.04554635744262227, + "grad_norm": 0.2834818661212921, + "learning_rate": 3.9800000000000005e-05, + "loss": 2.4514, + "step": 408 + }, + { + "epoch": 0.0456579906716483, + "grad_norm": 0.2503218948841095, + "learning_rate": 3.9775e-05, + "loss": 2.3824, + "step": 409 + }, + { + "epoch": 0.045769623900674336, + "grad_norm": 0.2580338418483734, + "learning_rate": 3.9750000000000004e-05, + "loss": 2.3869, + "step": 410 + }, + { + "epoch": 0.04588125712970037, + "grad_norm": 0.2537950277328491, + "learning_rate": 3.9725e-05, + "loss": 2.3022, + "step": 411 + }, + { + "epoch": 0.045992890358726404, + "grad_norm": 0.2560393810272217, + "learning_rate": 3.97e-05, + "loss": 2.3521, + "step": 412 + }, + { + "epoch": 0.04610452358775244, + "grad_norm": 0.2616626024246216, + "learning_rate": 3.9675e-05, + "loss": 2.3337, + "step": 413 + }, + { + "epoch": 0.04621615681677847, + "grad_norm": 0.3034763038158417, + "learning_rate": 3.965e-05, + "loss": 2.2924, + "step": 414 + }, + { + "epoch": 0.04632779004580451, + "grad_norm": 0.2462148219347, + "learning_rate": 3.9625e-05, + "loss": 2.4056, + "step": 415 + }, + { + "epoch": 0.04643942327483054, + "grad_norm": 0.2612819969654083, + "learning_rate": 3.960000000000001e-05, + "loss": 2.2965, + "step": 416 + }, + { + "epoch": 0.04655105650385658, + "grad_norm": 0.2580772638320923, + "learning_rate": 3.9575e-05, + "loss": 2.3585, + "step": 417 + }, + { + "epoch": 0.046662689732882616, + "grad_norm": 0.25281044840812683, + "learning_rate": 3.9550000000000006e-05, + "loss": 2.4224, + "step": 418 + }, + { + "epoch": 0.04677432296190865, + "grad_norm": 0.2507364749908447, + "learning_rate": 3.9525e-05, + "loss": 2.3973, + "step": 419 + }, + { + "epoch": 0.046885956190934684, + "grad_norm": 0.25551095604896545, + "learning_rate": 3.9500000000000005e-05, + "loss": 2.3345, + "step": 420 + }, + { + "epoch": 0.04699758941996072, + "grad_norm": 0.276262104511261, + "learning_rate": 3.9475000000000004e-05, + "loss": 2.3373, + "step": 421 + }, + { + "epoch": 0.04710922264898675, + "grad_norm": 0.30092892050743103, + "learning_rate": 3.9450000000000003e-05, + "loss": 2.3881, + "step": 422 + }, + { + "epoch": 0.04722085587801279, + "grad_norm": 0.25903210043907166, + "learning_rate": 3.9425e-05, + "loss": 2.3793, + "step": 423 + }, + { + "epoch": 0.04733248910703882, + "grad_norm": 0.2587510347366333, + "learning_rate": 3.94e-05, + "loss": 2.2823, + "step": 424 + }, + { + "epoch": 0.04744412233606486, + "grad_norm": 0.2502042055130005, + "learning_rate": 3.9375e-05, + "loss": 2.3279, + "step": 425 + }, + { + "epoch": 0.047555755565090896, + "grad_norm": 0.43736714124679565, + "learning_rate": 3.935e-05, + "loss": 2.3215, + "step": 426 + }, + { + "epoch": 0.04766738879411693, + "grad_norm": 0.2547898590564728, + "learning_rate": 3.9325e-05, + "loss": 2.2728, + "step": 427 + }, + { + "epoch": 0.047779022023142964, + "grad_norm": 0.24751374125480652, + "learning_rate": 3.9300000000000007e-05, + "loss": 2.3808, + "step": 428 + }, + { + "epoch": 0.047890655252169, + "grad_norm": 0.2624378800392151, + "learning_rate": 3.9275e-05, + "loss": 2.3256, + "step": 429 + }, + { + "epoch": 0.04800228848119503, + "grad_norm": 0.4183436632156372, + "learning_rate": 3.9250000000000005e-05, + "loss": 2.3553, + "step": 430 + }, + { + "epoch": 0.048113921710221066, + "grad_norm": 0.2535308599472046, + "learning_rate": 3.9225e-05, + "loss": 2.3025, + "step": 431 + }, + { + "epoch": 0.0482255549392471, + "grad_norm": 0.3438050448894501, + "learning_rate": 3.9200000000000004e-05, + "loss": 2.322, + "step": 432 + }, + { + "epoch": 0.04833718816827314, + "grad_norm": 0.2605397403240204, + "learning_rate": 3.9175000000000004e-05, + "loss": 2.3356, + "step": 433 + }, + { + "epoch": 0.048448821397299176, + "grad_norm": 0.24819597601890564, + "learning_rate": 3.915e-05, + "loss": 2.4338, + "step": 434 + }, + { + "epoch": 0.04856045462632521, + "grad_norm": 0.24737314879894257, + "learning_rate": 3.9125e-05, + "loss": 2.3618, + "step": 435 + }, + { + "epoch": 0.048672087855351244, + "grad_norm": 0.25205180048942566, + "learning_rate": 3.91e-05, + "loss": 2.3555, + "step": 436 + }, + { + "epoch": 0.04878372108437728, + "grad_norm": 0.24479413032531738, + "learning_rate": 3.9075e-05, + "loss": 2.3788, + "step": 437 + }, + { + "epoch": 0.04889535431340331, + "grad_norm": 0.26318883895874023, + "learning_rate": 3.905e-05, + "loss": 2.4028, + "step": 438 + }, + { + "epoch": 0.049006987542429346, + "grad_norm": 0.24493621289730072, + "learning_rate": 3.9025e-05, + "loss": 2.3734, + "step": 439 + }, + { + "epoch": 0.04911862077145538, + "grad_norm": 0.2888612151145935, + "learning_rate": 3.9000000000000006e-05, + "loss": 2.3424, + "step": 440 + }, + { + "epoch": 0.04923025400048142, + "grad_norm": 0.2574605941772461, + "learning_rate": 3.8975e-05, + "loss": 2.4289, + "step": 441 + }, + { + "epoch": 0.049341887229507456, + "grad_norm": 0.329041987657547, + "learning_rate": 3.8950000000000005e-05, + "loss": 2.2217, + "step": 442 + }, + { + "epoch": 0.04945352045853349, + "grad_norm": 0.29988110065460205, + "learning_rate": 3.8925e-05, + "loss": 2.4389, + "step": 443 + }, + { + "epoch": 0.049565153687559524, + "grad_norm": 0.2377348095178604, + "learning_rate": 3.8900000000000004e-05, + "loss": 2.4175, + "step": 444 + }, + { + "epoch": 0.04967678691658556, + "grad_norm": 0.28037315607070923, + "learning_rate": 3.8875e-05, + "loss": 2.241, + "step": 445 + }, + { + "epoch": 0.04978842014561159, + "grad_norm": 0.2584727704524994, + "learning_rate": 3.885e-05, + "loss": 2.3034, + "step": 446 + }, + { + "epoch": 0.049900053374637626, + "grad_norm": 0.24890342354774475, + "learning_rate": 3.8825e-05, + "loss": 2.3393, + "step": 447 + }, + { + "epoch": 0.05001168660366366, + "grad_norm": 0.24033311009407043, + "learning_rate": 3.88e-05, + "loss": 2.339, + "step": 448 + }, + { + "epoch": 0.0501233198326897, + "grad_norm": 0.2352409064769745, + "learning_rate": 3.8775e-05, + "loss": 2.337, + "step": 449 + }, + { + "epoch": 0.050234953061715736, + "grad_norm": 0.8847767114639282, + "learning_rate": 3.875e-05, + "loss": 2.2669, + "step": 450 + }, + { + "epoch": 0.05034658629074177, + "grad_norm": 0.2545109987258911, + "learning_rate": 3.8725e-05, + "loss": 2.3277, + "step": 451 + }, + { + "epoch": 0.050458219519767804, + "grad_norm": 0.2856300175189972, + "learning_rate": 3.8700000000000006e-05, + "loss": 2.3928, + "step": 452 + }, + { + "epoch": 0.05056985274879384, + "grad_norm": 0.24699796736240387, + "learning_rate": 3.8675e-05, + "loss": 2.4304, + "step": 453 + }, + { + "epoch": 0.05068148597781987, + "grad_norm": 0.2530304193496704, + "learning_rate": 3.8650000000000004e-05, + "loss": 2.3304, + "step": 454 + }, + { + "epoch": 0.050793119206845906, + "grad_norm": 0.25681325793266296, + "learning_rate": 3.8625e-05, + "loss": 2.4092, + "step": 455 + }, + { + "epoch": 0.05090475243587194, + "grad_norm": 0.2615368962287903, + "learning_rate": 3.86e-05, + "loss": 2.3721, + "step": 456 + }, + { + "epoch": 0.051016385664897974, + "grad_norm": 0.2456715703010559, + "learning_rate": 3.8575e-05, + "loss": 2.3512, + "step": 457 + }, + { + "epoch": 0.051128018893924015, + "grad_norm": 0.26264017820358276, + "learning_rate": 3.855e-05, + "loss": 2.419, + "step": 458 + }, + { + "epoch": 0.05123965212295005, + "grad_norm": 0.2601991295814514, + "learning_rate": 3.8525e-05, + "loss": 2.3389, + "step": 459 + }, + { + "epoch": 0.051351285351976084, + "grad_norm": 0.24692246317863464, + "learning_rate": 3.85e-05, + "loss": 2.1237, + "step": 460 + }, + { + "epoch": 0.05146291858100212, + "grad_norm": 0.25850188732147217, + "learning_rate": 3.8475e-05, + "loss": 2.3413, + "step": 461 + }, + { + "epoch": 0.05157455181002815, + "grad_norm": 0.2406020611524582, + "learning_rate": 3.845e-05, + "loss": 2.36, + "step": 462 + }, + { + "epoch": 0.051686185039054186, + "grad_norm": 0.25938504934310913, + "learning_rate": 3.8425e-05, + "loss": 2.3512, + "step": 463 + }, + { + "epoch": 0.05179781826808022, + "grad_norm": 0.2700086236000061, + "learning_rate": 3.8400000000000005e-05, + "loss": 2.3883, + "step": 464 + }, + { + "epoch": 0.051909451497106254, + "grad_norm": 0.24675601720809937, + "learning_rate": 3.8375e-05, + "loss": 2.3614, + "step": 465 + }, + { + "epoch": 0.052021084726132295, + "grad_norm": 0.3309410512447357, + "learning_rate": 3.8350000000000004e-05, + "loss": 2.4193, + "step": 466 + }, + { + "epoch": 0.05213271795515833, + "grad_norm": 0.24233734607696533, + "learning_rate": 3.8324999999999996e-05, + "loss": 2.3038, + "step": 467 + }, + { + "epoch": 0.052244351184184364, + "grad_norm": 0.24930906295776367, + "learning_rate": 3.83e-05, + "loss": 2.3087, + "step": 468 + }, + { + "epoch": 0.0523559844132104, + "grad_norm": 0.2506902813911438, + "learning_rate": 3.8275e-05, + "loss": 2.4125, + "step": 469 + }, + { + "epoch": 0.05246761764223643, + "grad_norm": 0.4915805757045746, + "learning_rate": 3.825e-05, + "loss": 2.4222, + "step": 470 + }, + { + "epoch": 0.052579250871262466, + "grad_norm": 0.24434614181518555, + "learning_rate": 3.8225e-05, + "loss": 2.4653, + "step": 471 + }, + { + "epoch": 0.0526908841002885, + "grad_norm": 0.24511779844760895, + "learning_rate": 3.82e-05, + "loss": 2.3369, + "step": 472 + }, + { + "epoch": 0.052802517329314534, + "grad_norm": 0.24757201969623566, + "learning_rate": 3.8175e-05, + "loss": 2.2323, + "step": 473 + }, + { + "epoch": 0.052914150558340575, + "grad_norm": 0.2527662515640259, + "learning_rate": 3.8150000000000006e-05, + "loss": 2.4045, + "step": 474 + }, + { + "epoch": 0.05302578378736661, + "grad_norm": 0.2481798529624939, + "learning_rate": 3.8125e-05, + "loss": 2.3615, + "step": 475 + }, + { + "epoch": 0.053137417016392643, + "grad_norm": 0.25394219160079956, + "learning_rate": 3.8100000000000005e-05, + "loss": 2.3536, + "step": 476 + }, + { + "epoch": 0.05324905024541868, + "grad_norm": 0.2457951009273529, + "learning_rate": 3.8075e-05, + "loss": 2.2919, + "step": 477 + }, + { + "epoch": 0.05336068347444471, + "grad_norm": 0.43115633726119995, + "learning_rate": 3.805e-05, + "loss": 2.4222, + "step": 478 + }, + { + "epoch": 0.053472316703470746, + "grad_norm": 0.257254958152771, + "learning_rate": 3.8025e-05, + "loss": 2.3744, + "step": 479 + }, + { + "epoch": 0.05358394993249678, + "grad_norm": 0.2672825753688812, + "learning_rate": 3.8e-05, + "loss": 2.3315, + "step": 480 + }, + { + "epoch": 0.053695583161522814, + "grad_norm": 0.24192893505096436, + "learning_rate": 3.7975e-05, + "loss": 2.4601, + "step": 481 + }, + { + "epoch": 0.053807216390548855, + "grad_norm": 0.24756518006324768, + "learning_rate": 3.795e-05, + "loss": 2.2872, + "step": 482 + }, + { + "epoch": 0.05391884961957489, + "grad_norm": 0.25323203206062317, + "learning_rate": 3.7925e-05, + "loss": 2.4142, + "step": 483 + }, + { + "epoch": 0.05403048284860092, + "grad_norm": 0.25254499912261963, + "learning_rate": 3.79e-05, + "loss": 2.3528, + "step": 484 + }, + { + "epoch": 0.05414211607762696, + "grad_norm": 0.25069230794906616, + "learning_rate": 3.7875e-05, + "loss": 2.2906, + "step": 485 + }, + { + "epoch": 0.05425374930665299, + "grad_norm": 0.26814520359039307, + "learning_rate": 3.7850000000000005e-05, + "loss": 2.3368, + "step": 486 + }, + { + "epoch": 0.054365382535679026, + "grad_norm": 0.24452055990695953, + "learning_rate": 3.7825e-05, + "loss": 2.3667, + "step": 487 + }, + { + "epoch": 0.05447701576470506, + "grad_norm": 0.26742151379585266, + "learning_rate": 3.7800000000000004e-05, + "loss": 2.323, + "step": 488 + }, + { + "epoch": 0.054588648993731094, + "grad_norm": 0.25766825675964355, + "learning_rate": 3.7775e-05, + "loss": 2.3761, + "step": 489 + }, + { + "epoch": 0.05470028222275713, + "grad_norm": 0.29029610753059387, + "learning_rate": 3.775e-05, + "loss": 2.3216, + "step": 490 + }, + { + "epoch": 0.05481191545178317, + "grad_norm": 0.25023818016052246, + "learning_rate": 3.7725e-05, + "loss": 2.3441, + "step": 491 + }, + { + "epoch": 0.0549235486808092, + "grad_norm": 0.3373621702194214, + "learning_rate": 3.77e-05, + "loss": 2.3166, + "step": 492 + }, + { + "epoch": 0.05503518190983524, + "grad_norm": 0.23989304900169373, + "learning_rate": 3.7675e-05, + "loss": 2.2795, + "step": 493 + }, + { + "epoch": 0.05514681513886127, + "grad_norm": 0.3192415237426758, + "learning_rate": 3.765e-05, + "loss": 2.3273, + "step": 494 + }, + { + "epoch": 0.055258448367887306, + "grad_norm": 0.24007610976696014, + "learning_rate": 3.7625e-05, + "loss": 2.413, + "step": 495 + }, + { + "epoch": 0.05537008159691334, + "grad_norm": 0.36170458793640137, + "learning_rate": 3.76e-05, + "loss": 2.2216, + "step": 496 + }, + { + "epoch": 0.055481714825939374, + "grad_norm": 0.24714718759059906, + "learning_rate": 3.7575e-05, + "loss": 2.3394, + "step": 497 + }, + { + "epoch": 0.05559334805496541, + "grad_norm": 0.238433837890625, + "learning_rate": 3.7550000000000005e-05, + "loss": 2.3685, + "step": 498 + }, + { + "epoch": 0.05570498128399145, + "grad_norm": 0.24975870549678802, + "learning_rate": 3.7525e-05, + "loss": 2.241, + "step": 499 + }, + { + "epoch": 0.05581661451301748, + "grad_norm": 0.24853730201721191, + "learning_rate": 3.7500000000000003e-05, + "loss": 2.4173, + "step": 500 + }, + { + "epoch": 0.05592824774204352, + "grad_norm": 0.24904385209083557, + "learning_rate": 3.7475e-05, + "loss": 2.4291, + "step": 501 + }, + { + "epoch": 0.05603988097106955, + "grad_norm": 0.23549965023994446, + "learning_rate": 3.745e-05, + "loss": 2.3054, + "step": 502 + }, + { + "epoch": 0.056151514200095586, + "grad_norm": 0.2464476376771927, + "learning_rate": 3.7425e-05, + "loss": 2.4886, + "step": 503 + }, + { + "epoch": 0.05626314742912162, + "grad_norm": 0.25297120213508606, + "learning_rate": 3.74e-05, + "loss": 2.3384, + "step": 504 + }, + { + "epoch": 0.056374780658147654, + "grad_norm": 0.2817951440811157, + "learning_rate": 3.737500000000001e-05, + "loss": 2.3319, + "step": 505 + }, + { + "epoch": 0.05648641388717369, + "grad_norm": 0.24703608453273773, + "learning_rate": 3.735e-05, + "loss": 2.4513, + "step": 506 + }, + { + "epoch": 0.05659804711619973, + "grad_norm": 0.25609731674194336, + "learning_rate": 3.7325000000000006e-05, + "loss": 2.4268, + "step": 507 + }, + { + "epoch": 0.05670968034522576, + "grad_norm": 0.2358425110578537, + "learning_rate": 3.73e-05, + "loss": 2.3388, + "step": 508 + }, + { + "epoch": 0.0568213135742518, + "grad_norm": 0.2538928687572479, + "learning_rate": 3.7275000000000005e-05, + "loss": 2.4332, + "step": 509 + }, + { + "epoch": 0.05693294680327783, + "grad_norm": 0.2499266117811203, + "learning_rate": 3.7250000000000004e-05, + "loss": 2.3393, + "step": 510 + }, + { + "epoch": 0.057044580032303865, + "grad_norm": 0.24814729392528534, + "learning_rate": 3.7225000000000004e-05, + "loss": 2.3558, + "step": 511 + }, + { + "epoch": 0.0571562132613299, + "grad_norm": 0.24115900695323944, + "learning_rate": 3.72e-05, + "loss": 2.33, + "step": 512 + }, + { + "epoch": 0.057267846490355934, + "grad_norm": 0.24013325572013855, + "learning_rate": 3.7175e-05, + "loss": 2.4107, + "step": 513 + }, + { + "epoch": 0.05737947971938197, + "grad_norm": 0.24541234970092773, + "learning_rate": 3.715e-05, + "loss": 2.3866, + "step": 514 + }, + { + "epoch": 0.05749111294840801, + "grad_norm": 0.23506753146648407, + "learning_rate": 3.7125e-05, + "loss": 2.3321, + "step": 515 + }, + { + "epoch": 0.05760274617743404, + "grad_norm": 0.24425718188285828, + "learning_rate": 3.71e-05, + "loss": 2.4, + "step": 516 + }, + { + "epoch": 0.05771437940646008, + "grad_norm": 0.2441939115524292, + "learning_rate": 3.707500000000001e-05, + "loss": 2.3855, + "step": 517 + }, + { + "epoch": 0.05782601263548611, + "grad_norm": 0.23702003061771393, + "learning_rate": 3.705e-05, + "loss": 2.3171, + "step": 518 + }, + { + "epoch": 0.057937645864512145, + "grad_norm": 0.2465049922466278, + "learning_rate": 3.7025000000000005e-05, + "loss": 2.431, + "step": 519 + }, + { + "epoch": 0.05804927909353818, + "grad_norm": 0.22516661882400513, + "learning_rate": 3.7e-05, + "loss": 2.3307, + "step": 520 + }, + { + "epoch": 0.058160912322564214, + "grad_norm": 0.2372903823852539, + "learning_rate": 3.6975000000000004e-05, + "loss": 2.2791, + "step": 521 + }, + { + "epoch": 0.05827254555159025, + "grad_norm": 0.2584116458892822, + "learning_rate": 3.6950000000000004e-05, + "loss": 2.4296, + "step": 522 + }, + { + "epoch": 0.05838417878061628, + "grad_norm": 0.24023661017417908, + "learning_rate": 3.6925e-05, + "loss": 2.4579, + "step": 523 + }, + { + "epoch": 0.05849581200964232, + "grad_norm": 0.26791271567344666, + "learning_rate": 3.69e-05, + "loss": 2.3662, + "step": 524 + }, + { + "epoch": 0.05860744523866836, + "grad_norm": 0.23485226929187775, + "learning_rate": 3.6875e-05, + "loss": 2.1954, + "step": 525 + }, + { + "epoch": 0.05871907846769439, + "grad_norm": 0.24312959611415863, + "learning_rate": 3.685e-05, + "loss": 2.4423, + "step": 526 + }, + { + "epoch": 0.058830711696720425, + "grad_norm": 0.2558239996433258, + "learning_rate": 3.6825e-05, + "loss": 2.3661, + "step": 527 + }, + { + "epoch": 0.05894234492574646, + "grad_norm": 0.26516368985176086, + "learning_rate": 3.68e-05, + "loss": 2.3181, + "step": 528 + }, + { + "epoch": 0.05905397815477249, + "grad_norm": 0.24487736821174622, + "learning_rate": 3.6775000000000006e-05, + "loss": 2.3667, + "step": 529 + }, + { + "epoch": 0.05916561138379853, + "grad_norm": 0.23648685216903687, + "learning_rate": 3.675e-05, + "loss": 2.3438, + "step": 530 + }, + { + "epoch": 0.05927724461282456, + "grad_norm": 0.2509547173976898, + "learning_rate": 3.6725000000000005e-05, + "loss": 2.3128, + "step": 531 + }, + { + "epoch": 0.0593888778418506, + "grad_norm": 0.24765925109386444, + "learning_rate": 3.6700000000000004e-05, + "loss": 2.3556, + "step": 532 + }, + { + "epoch": 0.05950051107087664, + "grad_norm": 0.23786590993404388, + "learning_rate": 3.6675000000000004e-05, + "loss": 2.4546, + "step": 533 + }, + { + "epoch": 0.05961214429990267, + "grad_norm": 0.3275027573108673, + "learning_rate": 3.665e-05, + "loss": 2.3161, + "step": 534 + }, + { + "epoch": 0.059723777528928705, + "grad_norm": 0.24953784048557281, + "learning_rate": 3.6625e-05, + "loss": 2.3206, + "step": 535 + }, + { + "epoch": 0.05983541075795474, + "grad_norm": 0.24241773784160614, + "learning_rate": 3.66e-05, + "loss": 2.2838, + "step": 536 + }, + { + "epoch": 0.05994704398698077, + "grad_norm": 0.23624862730503082, + "learning_rate": 3.6575e-05, + "loss": 2.3082, + "step": 537 + }, + { + "epoch": 0.06005867721600681, + "grad_norm": 0.24654389917850494, + "learning_rate": 3.655e-05, + "loss": 2.3327, + "step": 538 + }, + { + "epoch": 0.06017031044503284, + "grad_norm": 0.24037936329841614, + "learning_rate": 3.652500000000001e-05, + "loss": 2.336, + "step": 539 + }, + { + "epoch": 0.06028194367405888, + "grad_norm": 0.25897306203842163, + "learning_rate": 3.65e-05, + "loss": 2.3064, + "step": 540 + }, + { + "epoch": 0.06039357690308492, + "grad_norm": 0.24670930206775665, + "learning_rate": 3.6475000000000006e-05, + "loss": 2.2893, + "step": 541 + }, + { + "epoch": 0.06050521013211095, + "grad_norm": 0.23652324080467224, + "learning_rate": 3.645e-05, + "loss": 2.3516, + "step": 542 + }, + { + "epoch": 0.060616843361136985, + "grad_norm": 0.23609410226345062, + "learning_rate": 3.6425000000000004e-05, + "loss": 2.3463, + "step": 543 + }, + { + "epoch": 0.06072847659016302, + "grad_norm": 0.24355870485305786, + "learning_rate": 3.6400000000000004e-05, + "loss": 2.4344, + "step": 544 + }, + { + "epoch": 0.06084010981918905, + "grad_norm": 0.24142801761627197, + "learning_rate": 3.6375e-05, + "loss": 2.3728, + "step": 545 + }, + { + "epoch": 0.06095174304821509, + "grad_norm": 0.24296842515468597, + "learning_rate": 3.635e-05, + "loss": 2.3984, + "step": 546 + }, + { + "epoch": 0.06106337627724112, + "grad_norm": 0.24469731748104095, + "learning_rate": 3.6325e-05, + "loss": 2.3478, + "step": 547 + }, + { + "epoch": 0.06117500950626716, + "grad_norm": 0.25056391954421997, + "learning_rate": 3.63e-05, + "loss": 2.2224, + "step": 548 + }, + { + "epoch": 0.0612866427352932, + "grad_norm": 0.23732852935791016, + "learning_rate": 3.6275e-05, + "loss": 2.4875, + "step": 549 + }, + { + "epoch": 0.06139827596431923, + "grad_norm": 0.2487863302230835, + "learning_rate": 3.625e-05, + "loss": 2.27, + "step": 550 + }, + { + "epoch": 0.061509909193345265, + "grad_norm": 0.3302936851978302, + "learning_rate": 3.6225000000000006e-05, + "loss": 2.2132, + "step": 551 + }, + { + "epoch": 0.0616215424223713, + "grad_norm": 0.24820421636104584, + "learning_rate": 3.62e-05, + "loss": 2.3219, + "step": 552 + }, + { + "epoch": 0.06173317565139733, + "grad_norm": 0.24405966699123383, + "learning_rate": 3.6175000000000005e-05, + "loss": 2.3802, + "step": 553 + }, + { + "epoch": 0.06184480888042337, + "grad_norm": 0.24143864214420319, + "learning_rate": 3.615e-05, + "loss": 2.3897, + "step": 554 + }, + { + "epoch": 0.0619564421094494, + "grad_norm": 0.25099068880081177, + "learning_rate": 3.6125000000000004e-05, + "loss": 2.3671, + "step": 555 + }, + { + "epoch": 0.062068075338475436, + "grad_norm": 0.2503535747528076, + "learning_rate": 3.61e-05, + "loss": 2.4688, + "step": 556 + }, + { + "epoch": 0.06217970856750148, + "grad_norm": 0.2651500999927521, + "learning_rate": 3.6075e-05, + "loss": 2.2332, + "step": 557 + }, + { + "epoch": 0.06229134179652751, + "grad_norm": 0.2445705533027649, + "learning_rate": 3.605e-05, + "loss": 2.4157, + "step": 558 + }, + { + "epoch": 0.062402975025553545, + "grad_norm": 0.3407208025455475, + "learning_rate": 3.6025e-05, + "loss": 2.2068, + "step": 559 + }, + { + "epoch": 0.06251460825457958, + "grad_norm": 0.23573793470859528, + "learning_rate": 3.6e-05, + "loss": 2.3832, + "step": 560 + }, + { + "epoch": 0.06262624148360561, + "grad_norm": 0.23974989354610443, + "learning_rate": 3.5975e-05, + "loss": 2.4782, + "step": 561 + }, + { + "epoch": 0.06273787471263165, + "grad_norm": 0.24216674268245697, + "learning_rate": 3.595e-05, + "loss": 2.3656, + "step": 562 + }, + { + "epoch": 0.06284950794165768, + "grad_norm": 0.23898907005786896, + "learning_rate": 3.5925000000000006e-05, + "loss": 2.4015, + "step": 563 + }, + { + "epoch": 0.06296114117068372, + "grad_norm": 0.229814350605011, + "learning_rate": 3.59e-05, + "loss": 2.2476, + "step": 564 + }, + { + "epoch": 0.06307277439970975, + "grad_norm": 0.37582337856292725, + "learning_rate": 3.5875000000000005e-05, + "loss": 2.2959, + "step": 565 + }, + { + "epoch": 0.06318440762873578, + "grad_norm": 0.23359467089176178, + "learning_rate": 3.585e-05, + "loss": 2.3982, + "step": 566 + }, + { + "epoch": 0.06329604085776182, + "grad_norm": 0.23306161165237427, + "learning_rate": 3.5825000000000003e-05, + "loss": 2.3396, + "step": 567 + }, + { + "epoch": 0.06340767408678785, + "grad_norm": 0.2388441115617752, + "learning_rate": 3.58e-05, + "loss": 2.3826, + "step": 568 + }, + { + "epoch": 0.0635193073158139, + "grad_norm": 0.24095419049263, + "learning_rate": 3.5775e-05, + "loss": 2.3553, + "step": 569 + }, + { + "epoch": 0.06363094054483993, + "grad_norm": 0.2408561408519745, + "learning_rate": 3.575e-05, + "loss": 2.417, + "step": 570 + }, + { + "epoch": 0.06374257377386597, + "grad_norm": 0.23157310485839844, + "learning_rate": 3.5725e-05, + "loss": 2.3161, + "step": 571 + }, + { + "epoch": 0.063854207002892, + "grad_norm": 0.23947376012802124, + "learning_rate": 3.57e-05, + "loss": 2.3105, + "step": 572 + }, + { + "epoch": 0.06396584023191804, + "grad_norm": 0.24618233740329742, + "learning_rate": 3.5675e-05, + "loss": 2.4877, + "step": 573 + }, + { + "epoch": 0.06407747346094407, + "grad_norm": 0.24552257359027863, + "learning_rate": 3.565e-05, + "loss": 2.3758, + "step": 574 + }, + { + "epoch": 0.0641891066899701, + "grad_norm": 0.24329356849193573, + "learning_rate": 3.5625000000000005e-05, + "loss": 2.4084, + "step": 575 + }, + { + "epoch": 0.06430073991899614, + "grad_norm": 0.24198521673679352, + "learning_rate": 3.56e-05, + "loss": 2.4133, + "step": 576 + }, + { + "epoch": 0.06441237314802217, + "grad_norm": 0.23678706586360931, + "learning_rate": 3.5575000000000004e-05, + "loss": 2.378, + "step": 577 + }, + { + "epoch": 0.06452400637704821, + "grad_norm": 0.248667374253273, + "learning_rate": 3.555e-05, + "loss": 2.384, + "step": 578 + }, + { + "epoch": 0.06463563960607424, + "grad_norm": 0.24898695945739746, + "learning_rate": 3.5525e-05, + "loss": 2.3269, + "step": 579 + }, + { + "epoch": 0.06474727283510028, + "grad_norm": 0.240862175822258, + "learning_rate": 3.55e-05, + "loss": 2.381, + "step": 580 + }, + { + "epoch": 0.06485890606412631, + "grad_norm": 0.23366478085517883, + "learning_rate": 3.5475e-05, + "loss": 2.3628, + "step": 581 + }, + { + "epoch": 0.06497053929315234, + "grad_norm": 0.24220991134643555, + "learning_rate": 3.545e-05, + "loss": 2.2952, + "step": 582 + }, + { + "epoch": 0.06508217252217838, + "grad_norm": 0.24667523801326752, + "learning_rate": 3.5425e-05, + "loss": 2.4119, + "step": 583 + }, + { + "epoch": 0.06519380575120441, + "grad_norm": 0.24743525683879852, + "learning_rate": 3.54e-05, + "loss": 2.3073, + "step": 584 + }, + { + "epoch": 0.06530543898023046, + "grad_norm": 0.908301591873169, + "learning_rate": 3.5375e-05, + "loss": 2.3342, + "step": 585 + }, + { + "epoch": 0.0654170722092565, + "grad_norm": 0.2384248673915863, + "learning_rate": 3.535e-05, + "loss": 2.387, + "step": 586 + }, + { + "epoch": 0.06552870543828253, + "grad_norm": 0.29484090209007263, + "learning_rate": 3.5325000000000005e-05, + "loss": 2.4215, + "step": 587 + }, + { + "epoch": 0.06564033866730856, + "grad_norm": 0.2425820678472519, + "learning_rate": 3.53e-05, + "loss": 2.4056, + "step": 588 + }, + { + "epoch": 0.0657519718963346, + "grad_norm": 0.256686270236969, + "learning_rate": 3.5275000000000004e-05, + "loss": 2.3973, + "step": 589 + }, + { + "epoch": 0.06586360512536063, + "grad_norm": 0.23785726726055145, + "learning_rate": 3.525e-05, + "loss": 2.378, + "step": 590 + }, + { + "epoch": 0.06597523835438666, + "grad_norm": 0.2742824852466583, + "learning_rate": 3.5225e-05, + "loss": 2.3745, + "step": 591 + }, + { + "epoch": 0.0660868715834127, + "grad_norm": 0.2250916063785553, + "learning_rate": 3.52e-05, + "loss": 2.321, + "step": 592 + }, + { + "epoch": 0.06619850481243873, + "grad_norm": 0.24400900304317474, + "learning_rate": 3.5175e-05, + "loss": 2.4296, + "step": 593 + }, + { + "epoch": 0.06631013804146477, + "grad_norm": 0.24581992626190186, + "learning_rate": 3.515e-05, + "loss": 2.3375, + "step": 594 + }, + { + "epoch": 0.0664217712704908, + "grad_norm": 0.2356722503900528, + "learning_rate": 3.5125e-05, + "loss": 2.4229, + "step": 595 + }, + { + "epoch": 0.06653340449951683, + "grad_norm": 0.24524928629398346, + "learning_rate": 3.51e-05, + "loss": 2.3744, + "step": 596 + }, + { + "epoch": 0.06664503772854287, + "grad_norm": 0.24158966541290283, + "learning_rate": 3.5075000000000006e-05, + "loss": 2.3752, + "step": 597 + }, + { + "epoch": 0.0667566709575689, + "grad_norm": 0.2346276342868805, + "learning_rate": 3.505e-05, + "loss": 2.346, + "step": 598 + }, + { + "epoch": 0.06686830418659494, + "grad_norm": 0.24407444894313812, + "learning_rate": 3.5025000000000004e-05, + "loss": 2.3559, + "step": 599 + }, + { + "epoch": 0.06697993741562097, + "grad_norm": 0.24137824773788452, + "learning_rate": 3.5e-05, + "loss": 2.3965, + "step": 600 + }, + { + "epoch": 0.067091570644647, + "grad_norm": 0.24912434816360474, + "learning_rate": 3.4975e-05, + "loss": 2.3882, + "step": 601 + }, + { + "epoch": 0.06720320387367305, + "grad_norm": 0.24506230652332306, + "learning_rate": 3.495e-05, + "loss": 2.2875, + "step": 602 + }, + { + "epoch": 0.06731483710269909, + "grad_norm": 0.2423059046268463, + "learning_rate": 3.4925e-05, + "loss": 2.2828, + "step": 603 + }, + { + "epoch": 0.06742647033172512, + "grad_norm": 0.2598559856414795, + "learning_rate": 3.49e-05, + "loss": 2.288, + "step": 604 + }, + { + "epoch": 0.06753810356075116, + "grad_norm": 0.2390177696943283, + "learning_rate": 3.4875e-05, + "loss": 2.4369, + "step": 605 + }, + { + "epoch": 0.06764973678977719, + "grad_norm": 0.24242550134658813, + "learning_rate": 3.485e-05, + "loss": 2.2285, + "step": 606 + }, + { + "epoch": 0.06776137001880322, + "grad_norm": 0.23738925158977509, + "learning_rate": 3.4825e-05, + "loss": 2.4707, + "step": 607 + }, + { + "epoch": 0.06787300324782926, + "grad_norm": 0.23820523917675018, + "learning_rate": 3.48e-05, + "loss": 2.4274, + "step": 608 + }, + { + "epoch": 0.06798463647685529, + "grad_norm": 0.2343960702419281, + "learning_rate": 3.4775000000000005e-05, + "loss": 2.2894, + "step": 609 + }, + { + "epoch": 0.06809626970588133, + "grad_norm": 0.241934671998024, + "learning_rate": 3.475e-05, + "loss": 2.4438, + "step": 610 + }, + { + "epoch": 0.06820790293490736, + "grad_norm": 0.24266451597213745, + "learning_rate": 3.4725000000000004e-05, + "loss": 2.4135, + "step": 611 + }, + { + "epoch": 0.0683195361639334, + "grad_norm": 0.2379053235054016, + "learning_rate": 3.4699999999999996e-05, + "loss": 2.2596, + "step": 612 + }, + { + "epoch": 0.06843116939295943, + "grad_norm": 0.2375876009464264, + "learning_rate": 3.4675e-05, + "loss": 2.3748, + "step": 613 + }, + { + "epoch": 0.06854280262198546, + "grad_norm": 0.23114004731178284, + "learning_rate": 3.465e-05, + "loss": 2.4376, + "step": 614 + }, + { + "epoch": 0.0686544358510115, + "grad_norm": 0.38774147629737854, + "learning_rate": 3.4625e-05, + "loss": 2.381, + "step": 615 + }, + { + "epoch": 0.06876606908003753, + "grad_norm": 0.6831827163696289, + "learning_rate": 3.46e-05, + "loss": 2.3815, + "step": 616 + }, + { + "epoch": 0.06887770230906357, + "grad_norm": 0.25311988592147827, + "learning_rate": 3.4575e-05, + "loss": 2.3525, + "step": 617 + }, + { + "epoch": 0.06898933553808961, + "grad_norm": 0.2384188175201416, + "learning_rate": 3.455e-05, + "loss": 2.4084, + "step": 618 + }, + { + "epoch": 0.06910096876711565, + "grad_norm": 0.2516483962535858, + "learning_rate": 3.4525e-05, + "loss": 2.3143, + "step": 619 + }, + { + "epoch": 0.06921260199614168, + "grad_norm": 0.2384801059961319, + "learning_rate": 3.45e-05, + "loss": 2.3075, + "step": 620 + }, + { + "epoch": 0.06932423522516772, + "grad_norm": 0.23414179682731628, + "learning_rate": 3.4475000000000005e-05, + "loss": 2.3035, + "step": 621 + }, + { + "epoch": 0.06943586845419375, + "grad_norm": 0.25081461668014526, + "learning_rate": 3.445e-05, + "loss": 2.3292, + "step": 622 + }, + { + "epoch": 0.06954750168321978, + "grad_norm": 0.22738482058048248, + "learning_rate": 3.4425e-05, + "loss": 2.3412, + "step": 623 + }, + { + "epoch": 0.06965913491224582, + "grad_norm": 0.2414780855178833, + "learning_rate": 3.4399999999999996e-05, + "loss": 2.3674, + "step": 624 + }, + { + "epoch": 0.06977076814127185, + "grad_norm": 0.25145867466926575, + "learning_rate": 3.4375e-05, + "loss": 2.3414, + "step": 625 + }, + { + "epoch": 0.06988240137029789, + "grad_norm": 0.23038849234580994, + "learning_rate": 3.435e-05, + "loss": 2.3156, + "step": 626 + }, + { + "epoch": 0.06999403459932392, + "grad_norm": 0.23264938592910767, + "learning_rate": 3.4325e-05, + "loss": 2.2536, + "step": 627 + }, + { + "epoch": 0.07010566782834995, + "grad_norm": 0.24372529983520508, + "learning_rate": 3.430000000000001e-05, + "loss": 2.3152, + "step": 628 + }, + { + "epoch": 0.07021730105737599, + "grad_norm": 0.23900210857391357, + "learning_rate": 3.4275e-05, + "loss": 2.343, + "step": 629 + }, + { + "epoch": 0.07032893428640202, + "grad_norm": 0.236933633685112, + "learning_rate": 3.4250000000000006e-05, + "loss": 2.4373, + "step": 630 + }, + { + "epoch": 0.07044056751542806, + "grad_norm": 0.23982471227645874, + "learning_rate": 3.4225e-05, + "loss": 2.3498, + "step": 631 + }, + { + "epoch": 0.07055220074445409, + "grad_norm": 0.2617717385292053, + "learning_rate": 3.4200000000000005e-05, + "loss": 2.2457, + "step": 632 + }, + { + "epoch": 0.07066383397348013, + "grad_norm": 0.24056993424892426, + "learning_rate": 3.4175000000000004e-05, + "loss": 2.3543, + "step": 633 + }, + { + "epoch": 0.07077546720250616, + "grad_norm": 0.2435135543346405, + "learning_rate": 3.415e-05, + "loss": 2.415, + "step": 634 + }, + { + "epoch": 0.07088710043153221, + "grad_norm": 0.2404133677482605, + "learning_rate": 3.4125e-05, + "loss": 2.3479, + "step": 635 + }, + { + "epoch": 0.07099873366055824, + "grad_norm": 0.22524316608905792, + "learning_rate": 3.41e-05, + "loss": 2.3252, + "step": 636 + }, + { + "epoch": 0.07111036688958428, + "grad_norm": 0.23215116560459137, + "learning_rate": 3.4075e-05, + "loss": 2.3494, + "step": 637 + }, + { + "epoch": 0.07122200011861031, + "grad_norm": 0.24920037388801575, + "learning_rate": 3.405e-05, + "loss": 2.3705, + "step": 638 + }, + { + "epoch": 0.07133363334763634, + "grad_norm": 0.2321121245622635, + "learning_rate": 3.4025e-05, + "loss": 2.2816, + "step": 639 + }, + { + "epoch": 0.07144526657666238, + "grad_norm": 0.2596079111099243, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.3672, + "step": 640 + }, + { + "epoch": 0.07155689980568841, + "grad_norm": 0.24075333774089813, + "learning_rate": 3.3975e-05, + "loss": 2.1981, + "step": 641 + }, + { + "epoch": 0.07166853303471445, + "grad_norm": 0.24256588518619537, + "learning_rate": 3.3950000000000005e-05, + "loss": 2.4488, + "step": 642 + }, + { + "epoch": 0.07178016626374048, + "grad_norm": 0.24132193624973297, + "learning_rate": 3.3925e-05, + "loss": 2.394, + "step": 643 + }, + { + "epoch": 0.07189179949276651, + "grad_norm": 0.23571936786174774, + "learning_rate": 3.3900000000000004e-05, + "loss": 2.3527, + "step": 644 + }, + { + "epoch": 0.07200343272179255, + "grad_norm": 0.24922168254852295, + "learning_rate": 3.3875000000000003e-05, + "loss": 2.504, + "step": 645 + }, + { + "epoch": 0.07211506595081858, + "grad_norm": 0.2349790334701538, + "learning_rate": 3.385e-05, + "loss": 2.375, + "step": 646 + }, + { + "epoch": 0.07222669917984462, + "grad_norm": 0.23674091696739197, + "learning_rate": 3.3825e-05, + "loss": 2.453, + "step": 647 + }, + { + "epoch": 0.07233833240887065, + "grad_norm": 0.24707044661045074, + "learning_rate": 3.38e-05, + "loss": 2.3144, + "step": 648 + }, + { + "epoch": 0.07244996563789668, + "grad_norm": 0.24542857706546783, + "learning_rate": 3.3775e-05, + "loss": 2.4941, + "step": 649 + }, + { + "epoch": 0.07256159886692272, + "grad_norm": 0.233075812458992, + "learning_rate": 3.375000000000001e-05, + "loss": 2.3719, + "step": 650 + }, + { + "epoch": 0.07267323209594877, + "grad_norm": 0.23478782176971436, + "learning_rate": 3.3725e-05, + "loss": 2.4255, + "step": 651 + }, + { + "epoch": 0.0727848653249748, + "grad_norm": 0.2310231477022171, + "learning_rate": 3.3700000000000006e-05, + "loss": 2.2017, + "step": 652 + }, + { + "epoch": 0.07289649855400084, + "grad_norm": 0.23209823668003082, + "learning_rate": 3.3675e-05, + "loss": 2.2443, + "step": 653 + }, + { + "epoch": 0.07300813178302687, + "grad_norm": 0.2376200407743454, + "learning_rate": 3.3650000000000005e-05, + "loss": 2.446, + "step": 654 + }, + { + "epoch": 0.0731197650120529, + "grad_norm": 0.23651504516601562, + "learning_rate": 3.3625000000000004e-05, + "loss": 2.4459, + "step": 655 + }, + { + "epoch": 0.07323139824107894, + "grad_norm": 0.25088030099868774, + "learning_rate": 3.3600000000000004e-05, + "loss": 2.4193, + "step": 656 + }, + { + "epoch": 0.07334303147010497, + "grad_norm": 0.24074560403823853, + "learning_rate": 3.3575e-05, + "loss": 2.2942, + "step": 657 + }, + { + "epoch": 0.073454664699131, + "grad_norm": 0.24596726894378662, + "learning_rate": 3.355e-05, + "loss": 2.3474, + "step": 658 + }, + { + "epoch": 0.07356629792815704, + "grad_norm": 0.23518167436122894, + "learning_rate": 3.3525e-05, + "loss": 2.4345, + "step": 659 + }, + { + "epoch": 0.07367793115718307, + "grad_norm": 0.23677384853363037, + "learning_rate": 3.35e-05, + "loss": 2.178, + "step": 660 + }, + { + "epoch": 0.07378956438620911, + "grad_norm": 0.24126943945884705, + "learning_rate": 3.3475e-05, + "loss": 2.3258, + "step": 661 + }, + { + "epoch": 0.07390119761523514, + "grad_norm": 0.24277062714099884, + "learning_rate": 3.345000000000001e-05, + "loss": 2.3129, + "step": 662 + }, + { + "epoch": 0.07401283084426118, + "grad_norm": 0.2481059432029724, + "learning_rate": 3.3425e-05, + "loss": 2.3619, + "step": 663 + }, + { + "epoch": 0.07412446407328721, + "grad_norm": 0.23601798713207245, + "learning_rate": 3.3400000000000005e-05, + "loss": 2.346, + "step": 664 + }, + { + "epoch": 0.07423609730231324, + "grad_norm": 0.23907043039798737, + "learning_rate": 3.3375e-05, + "loss": 2.333, + "step": 665 + }, + { + "epoch": 0.07434773053133928, + "grad_norm": 0.2404210865497589, + "learning_rate": 3.3350000000000004e-05, + "loss": 2.3695, + "step": 666 + }, + { + "epoch": 0.07445936376036531, + "grad_norm": 0.23311296105384827, + "learning_rate": 3.3325000000000004e-05, + "loss": 2.2827, + "step": 667 + }, + { + "epoch": 0.07457099698939136, + "grad_norm": 0.24998392164707184, + "learning_rate": 3.33e-05, + "loss": 2.2346, + "step": 668 + }, + { + "epoch": 0.0746826302184174, + "grad_norm": 0.23921620845794678, + "learning_rate": 3.3275e-05, + "loss": 2.4188, + "step": 669 + }, + { + "epoch": 0.07479426344744343, + "grad_norm": 0.23314505815505981, + "learning_rate": 3.325e-05, + "loss": 2.3997, + "step": 670 + }, + { + "epoch": 0.07490589667646946, + "grad_norm": 0.2334446907043457, + "learning_rate": 3.3225e-05, + "loss": 2.397, + "step": 671 + }, + { + "epoch": 0.0750175299054955, + "grad_norm": 0.23645921051502228, + "learning_rate": 3.32e-05, + "loss": 2.4039, + "step": 672 + }, + { + "epoch": 0.07512916313452153, + "grad_norm": 0.2596314251422882, + "learning_rate": 3.3175e-05, + "loss": 2.4392, + "step": 673 + }, + { + "epoch": 0.07524079636354757, + "grad_norm": 0.24032291769981384, + "learning_rate": 3.3150000000000006e-05, + "loss": 2.3858, + "step": 674 + }, + { + "epoch": 0.0753524295925736, + "grad_norm": 0.2412300407886505, + "learning_rate": 3.3125e-05, + "loss": 2.278, + "step": 675 + }, + { + "epoch": 0.07546406282159963, + "grad_norm": 0.225362628698349, + "learning_rate": 3.3100000000000005e-05, + "loss": 2.4296, + "step": 676 + }, + { + "epoch": 0.07557569605062567, + "grad_norm": 0.24731768667697906, + "learning_rate": 3.3075e-05, + "loss": 2.5028, + "step": 677 + }, + { + "epoch": 0.0756873292796517, + "grad_norm": 0.24850989878177643, + "learning_rate": 3.3050000000000004e-05, + "loss": 2.4462, + "step": 678 + }, + { + "epoch": 0.07579896250867774, + "grad_norm": 0.23657028377056122, + "learning_rate": 3.3025e-05, + "loss": 2.3677, + "step": 679 + }, + { + "epoch": 0.07591059573770377, + "grad_norm": 0.23830410838127136, + "learning_rate": 3.3e-05, + "loss": 2.3859, + "step": 680 + }, + { + "epoch": 0.0760222289667298, + "grad_norm": 0.23412565886974335, + "learning_rate": 3.2975e-05, + "loss": 2.3559, + "step": 681 + }, + { + "epoch": 0.07613386219575584, + "grad_norm": 0.2358182668685913, + "learning_rate": 3.295e-05, + "loss": 2.3619, + "step": 682 + }, + { + "epoch": 0.07624549542478187, + "grad_norm": 0.24088707566261292, + "learning_rate": 3.2925e-05, + "loss": 2.2953, + "step": 683 + }, + { + "epoch": 0.07635712865380792, + "grad_norm": 0.23123978078365326, + "learning_rate": 3.29e-05, + "loss": 2.33, + "step": 684 + }, + { + "epoch": 0.07646876188283395, + "grad_norm": 0.24349573254585266, + "learning_rate": 3.2875e-05, + "loss": 2.3789, + "step": 685 + }, + { + "epoch": 0.07658039511185999, + "grad_norm": 0.23393647372722626, + "learning_rate": 3.2850000000000006e-05, + "loss": 2.4458, + "step": 686 + }, + { + "epoch": 0.07669202834088602, + "grad_norm": 0.2344076782464981, + "learning_rate": 3.2825e-05, + "loss": 2.2232, + "step": 687 + }, + { + "epoch": 0.07680366156991206, + "grad_norm": 0.23426808416843414, + "learning_rate": 3.2800000000000004e-05, + "loss": 2.55, + "step": 688 + }, + { + "epoch": 0.07691529479893809, + "grad_norm": 0.24055257439613342, + "learning_rate": 3.2775e-05, + "loss": 2.4083, + "step": 689 + }, + { + "epoch": 0.07702692802796413, + "grad_norm": 0.23331885039806366, + "learning_rate": 3.275e-05, + "loss": 2.3025, + "step": 690 + }, + { + "epoch": 0.07713856125699016, + "grad_norm": 0.24212725460529327, + "learning_rate": 3.2725e-05, + "loss": 2.3205, + "step": 691 + }, + { + "epoch": 0.0772501944860162, + "grad_norm": 0.32642829418182373, + "learning_rate": 3.27e-05, + "loss": 2.2472, + "step": 692 + }, + { + "epoch": 0.07736182771504223, + "grad_norm": 0.30700212717056274, + "learning_rate": 3.2675e-05, + "loss": 2.3868, + "step": 693 + }, + { + "epoch": 0.07747346094406826, + "grad_norm": 0.23343181610107422, + "learning_rate": 3.265e-05, + "loss": 2.345, + "step": 694 + }, + { + "epoch": 0.0775850941730943, + "grad_norm": 0.23056460916996002, + "learning_rate": 3.2625e-05, + "loss": 2.3514, + "step": 695 + }, + { + "epoch": 0.07769672740212033, + "grad_norm": 0.24498891830444336, + "learning_rate": 3.26e-05, + "loss": 2.4623, + "step": 696 + }, + { + "epoch": 0.07780836063114636, + "grad_norm": 0.2408551424741745, + "learning_rate": 3.2575e-05, + "loss": 2.3371, + "step": 697 + }, + { + "epoch": 0.0779199938601724, + "grad_norm": 0.24386224150657654, + "learning_rate": 3.2550000000000005e-05, + "loss": 2.4246, + "step": 698 + }, + { + "epoch": 0.07803162708919843, + "grad_norm": 0.23228052258491516, + "learning_rate": 3.2525e-05, + "loss": 2.2636, + "step": 699 + }, + { + "epoch": 0.07814326031822447, + "grad_norm": 0.24233461916446686, + "learning_rate": 3.2500000000000004e-05, + "loss": 2.4194, + "step": 700 + }, + { + "epoch": 0.07825489354725051, + "grad_norm": 0.23138944804668427, + "learning_rate": 3.2474999999999997e-05, + "loss": 2.3944, + "step": 701 + }, + { + "epoch": 0.07836652677627655, + "grad_norm": 0.23382620513439178, + "learning_rate": 3.245e-05, + "loss": 2.2998, + "step": 702 + }, + { + "epoch": 0.07847816000530258, + "grad_norm": 0.2373252511024475, + "learning_rate": 3.2425e-05, + "loss": 2.3178, + "step": 703 + }, + { + "epoch": 0.07858979323432862, + "grad_norm": 0.22690844535827637, + "learning_rate": 3.24e-05, + "loss": 2.4387, + "step": 704 + }, + { + "epoch": 0.07870142646335465, + "grad_norm": 0.22850820422172546, + "learning_rate": 3.2375e-05, + "loss": 2.3459, + "step": 705 + }, + { + "epoch": 0.07881305969238069, + "grad_norm": 0.23736946284770966, + "learning_rate": 3.235e-05, + "loss": 2.2847, + "step": 706 + }, + { + "epoch": 0.07892469292140672, + "grad_norm": 0.241333469748497, + "learning_rate": 3.2325e-05, + "loss": 2.3021, + "step": 707 + }, + { + "epoch": 0.07903632615043275, + "grad_norm": 0.23721295595169067, + "learning_rate": 3.2300000000000006e-05, + "loss": 2.3129, + "step": 708 + }, + { + "epoch": 0.07914795937945879, + "grad_norm": 0.2383483350276947, + "learning_rate": 3.2275e-05, + "loss": 2.3561, + "step": 709 + }, + { + "epoch": 0.07925959260848482, + "grad_norm": 0.24383005499839783, + "learning_rate": 3.2250000000000005e-05, + "loss": 2.4278, + "step": 710 + }, + { + "epoch": 0.07937122583751086, + "grad_norm": 0.24906353652477264, + "learning_rate": 3.2225e-05, + "loss": 2.3474, + "step": 711 + }, + { + "epoch": 0.07948285906653689, + "grad_norm": 0.2293633371591568, + "learning_rate": 3.2200000000000003e-05, + "loss": 2.3338, + "step": 712 + }, + { + "epoch": 0.07959449229556292, + "grad_norm": 0.23942622542381287, + "learning_rate": 3.2175e-05, + "loss": 2.3455, + "step": 713 + }, + { + "epoch": 0.07970612552458896, + "grad_norm": 0.23788371682167053, + "learning_rate": 3.215e-05, + "loss": 2.3026, + "step": 714 + }, + { + "epoch": 0.07981775875361499, + "grad_norm": 0.2387734055519104, + "learning_rate": 3.2125e-05, + "loss": 2.316, + "step": 715 + }, + { + "epoch": 0.07992939198264103, + "grad_norm": 0.242325097322464, + "learning_rate": 3.21e-05, + "loss": 2.2869, + "step": 716 + }, + { + "epoch": 0.08004102521166707, + "grad_norm": 0.23410271108150482, + "learning_rate": 3.2075e-05, + "loss": 2.3509, + "step": 717 + }, + { + "epoch": 0.08015265844069311, + "grad_norm": 0.23277893662452698, + "learning_rate": 3.205e-05, + "loss": 2.3282, + "step": 718 + }, + { + "epoch": 0.08026429166971914, + "grad_norm": 1.3853453397750854, + "learning_rate": 3.2025e-05, + "loss": 2.3211, + "step": 719 + }, + { + "epoch": 0.08037592489874518, + "grad_norm": 0.23764613270759583, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.4642, + "step": 720 + }, + { + "epoch": 0.08048755812777121, + "grad_norm": 0.2404954433441162, + "learning_rate": 3.1975e-05, + "loss": 2.3829, + "step": 721 + }, + { + "epoch": 0.08059919135679725, + "grad_norm": 0.2323405146598816, + "learning_rate": 3.1950000000000004e-05, + "loss": 2.4254, + "step": 722 + }, + { + "epoch": 0.08071082458582328, + "grad_norm": 0.24208155274391174, + "learning_rate": 3.1925e-05, + "loss": 2.3304, + "step": 723 + }, + { + "epoch": 0.08082245781484931, + "grad_norm": 0.2440410554409027, + "learning_rate": 3.19e-05, + "loss": 2.4517, + "step": 724 + }, + { + "epoch": 0.08093409104387535, + "grad_norm": 0.23571601510047913, + "learning_rate": 3.1875e-05, + "loss": 2.3367, + "step": 725 + }, + { + "epoch": 0.08104572427290138, + "grad_norm": 0.23201195895671844, + "learning_rate": 3.185e-05, + "loss": 2.3973, + "step": 726 + }, + { + "epoch": 0.08115735750192742, + "grad_norm": 0.23855586349964142, + "learning_rate": 3.1825e-05, + "loss": 2.3875, + "step": 727 + }, + { + "epoch": 0.08126899073095345, + "grad_norm": 0.23818083107471466, + "learning_rate": 3.18e-05, + "loss": 2.3538, + "step": 728 + }, + { + "epoch": 0.08138062395997948, + "grad_norm": 0.2586081326007843, + "learning_rate": 3.1775e-05, + "loss": 2.3232, + "step": 729 + }, + { + "epoch": 0.08149225718900552, + "grad_norm": 0.23521441221237183, + "learning_rate": 3.175e-05, + "loss": 2.2694, + "step": 730 + }, + { + "epoch": 0.08160389041803155, + "grad_norm": 0.4255395829677582, + "learning_rate": 3.1725e-05, + "loss": 2.3881, + "step": 731 + }, + { + "epoch": 0.08171552364705759, + "grad_norm": 0.22463633120059967, + "learning_rate": 3.1700000000000005e-05, + "loss": 2.3078, + "step": 732 + }, + { + "epoch": 0.08182715687608362, + "grad_norm": 0.2440212070941925, + "learning_rate": 3.1675e-05, + "loss": 2.3463, + "step": 733 + }, + { + "epoch": 0.08193879010510967, + "grad_norm": 0.22708596289157867, + "learning_rate": 3.1650000000000004e-05, + "loss": 2.3567, + "step": 734 + }, + { + "epoch": 0.0820504233341357, + "grad_norm": 0.26711180806159973, + "learning_rate": 3.1624999999999996e-05, + "loss": 2.4591, + "step": 735 + }, + { + "epoch": 0.08216205656316174, + "grad_norm": 0.23102666437625885, + "learning_rate": 3.16e-05, + "loss": 2.3309, + "step": 736 + }, + { + "epoch": 0.08227368979218777, + "grad_norm": 0.24265794456005096, + "learning_rate": 3.1575e-05, + "loss": 2.3203, + "step": 737 + }, + { + "epoch": 0.0823853230212138, + "grad_norm": 0.23661485314369202, + "learning_rate": 3.155e-05, + "loss": 2.3478, + "step": 738 + }, + { + "epoch": 0.08249695625023984, + "grad_norm": 0.23216097056865692, + "learning_rate": 3.1525e-05, + "loss": 2.4765, + "step": 739 + }, + { + "epoch": 0.08260858947926587, + "grad_norm": 0.2339819222688675, + "learning_rate": 3.15e-05, + "loss": 2.2802, + "step": 740 + }, + { + "epoch": 0.08272022270829191, + "grad_norm": 0.25498029589653015, + "learning_rate": 3.1475e-05, + "loss": 2.4064, + "step": 741 + }, + { + "epoch": 0.08283185593731794, + "grad_norm": 0.23578105866909027, + "learning_rate": 3.145e-05, + "loss": 2.3542, + "step": 742 + }, + { + "epoch": 0.08294348916634398, + "grad_norm": 0.23970484733581543, + "learning_rate": 3.1425e-05, + "loss": 2.3676, + "step": 743 + }, + { + "epoch": 0.08305512239537001, + "grad_norm": 0.2514541447162628, + "learning_rate": 3.1400000000000004e-05, + "loss": 2.4229, + "step": 744 + }, + { + "epoch": 0.08316675562439604, + "grad_norm": 0.23850007355213165, + "learning_rate": 3.1375e-05, + "loss": 2.3794, + "step": 745 + }, + { + "epoch": 0.08327838885342208, + "grad_norm": 0.30746811628341675, + "learning_rate": 3.135e-05, + "loss": 2.383, + "step": 746 + }, + { + "epoch": 0.08339002208244811, + "grad_norm": 0.23188377916812897, + "learning_rate": 3.1324999999999996e-05, + "loss": 2.336, + "step": 747 + }, + { + "epoch": 0.08350165531147415, + "grad_norm": 0.23534120619297028, + "learning_rate": 3.13e-05, + "loss": 2.398, + "step": 748 + }, + { + "epoch": 0.08361328854050018, + "grad_norm": 0.2442607581615448, + "learning_rate": 3.1275e-05, + "loss": 2.4357, + "step": 749 + }, + { + "epoch": 0.08372492176952623, + "grad_norm": 0.2328692376613617, + "learning_rate": 3.125e-05, + "loss": 2.3239, + "step": 750 + }, + { + "epoch": 0.08383655499855226, + "grad_norm": 0.29631370306015015, + "learning_rate": 3.122500000000001e-05, + "loss": 2.3165, + "step": 751 + }, + { + "epoch": 0.0839481882275783, + "grad_norm": 0.2432224601507187, + "learning_rate": 3.12e-05, + "loss": 2.372, + "step": 752 + }, + { + "epoch": 0.08405982145660433, + "grad_norm": 0.2435504049062729, + "learning_rate": 3.1175000000000006e-05, + "loss": 2.4804, + "step": 753 + }, + { + "epoch": 0.08417145468563036, + "grad_norm": 0.2799118161201477, + "learning_rate": 3.115e-05, + "loss": 2.3138, + "step": 754 + }, + { + "epoch": 0.0842830879146564, + "grad_norm": 0.23647980391979218, + "learning_rate": 3.1125000000000004e-05, + "loss": 2.4502, + "step": 755 + }, + { + "epoch": 0.08439472114368243, + "grad_norm": 0.23339681327342987, + "learning_rate": 3.1100000000000004e-05, + "loss": 2.3515, + "step": 756 + }, + { + "epoch": 0.08450635437270847, + "grad_norm": 0.2358621060848236, + "learning_rate": 3.1075e-05, + "loss": 2.3384, + "step": 757 + }, + { + "epoch": 0.0846179876017345, + "grad_norm": 0.226596862077713, + "learning_rate": 3.105e-05, + "loss": 2.4022, + "step": 758 + }, + { + "epoch": 0.08472962083076054, + "grad_norm": 0.24496255815029144, + "learning_rate": 3.1025e-05, + "loss": 2.4273, + "step": 759 + }, + { + "epoch": 0.08484125405978657, + "grad_norm": 0.24479149281978607, + "learning_rate": 3.1e-05, + "loss": 2.3464, + "step": 760 + }, + { + "epoch": 0.0849528872888126, + "grad_norm": 0.23078912496566772, + "learning_rate": 3.0975e-05, + "loss": 2.3267, + "step": 761 + }, + { + "epoch": 0.08506452051783864, + "grad_norm": 0.23886540532112122, + "learning_rate": 3.095e-05, + "loss": 2.243, + "step": 762 + }, + { + "epoch": 0.08517615374686467, + "grad_norm": 0.2510230839252472, + "learning_rate": 3.0925000000000006e-05, + "loss": 2.3675, + "step": 763 + }, + { + "epoch": 0.0852877869758907, + "grad_norm": 0.23274515569210052, + "learning_rate": 3.09e-05, + "loss": 2.3207, + "step": 764 + }, + { + "epoch": 0.08539942020491674, + "grad_norm": 0.26892465353012085, + "learning_rate": 3.0875000000000005e-05, + "loss": 2.4453, + "step": 765 + }, + { + "epoch": 0.08551105343394277, + "grad_norm": 0.24123644828796387, + "learning_rate": 3.0850000000000004e-05, + "loss": 2.287, + "step": 766 + }, + { + "epoch": 0.08562268666296882, + "grad_norm": 0.2416788935661316, + "learning_rate": 3.0825000000000004e-05, + "loss": 2.301, + "step": 767 + }, + { + "epoch": 0.08573431989199486, + "grad_norm": 0.23173028230667114, + "learning_rate": 3.08e-05, + "loss": 2.2896, + "step": 768 + }, + { + "epoch": 0.08584595312102089, + "grad_norm": 0.2286095917224884, + "learning_rate": 3.0775e-05, + "loss": 2.341, + "step": 769 + }, + { + "epoch": 0.08595758635004692, + "grad_norm": 0.24451370537281036, + "learning_rate": 3.075e-05, + "loss": 2.196, + "step": 770 + }, + { + "epoch": 0.08606921957907296, + "grad_norm": 0.26589682698249817, + "learning_rate": 3.0725e-05, + "loss": 2.2902, + "step": 771 + }, + { + "epoch": 0.08618085280809899, + "grad_norm": 0.26289981603622437, + "learning_rate": 3.07e-05, + "loss": 2.505, + "step": 772 + }, + { + "epoch": 0.08629248603712503, + "grad_norm": 0.2402532994747162, + "learning_rate": 3.067500000000001e-05, + "loss": 2.3008, + "step": 773 + }, + { + "epoch": 0.08640411926615106, + "grad_norm": 0.23948536813259125, + "learning_rate": 3.065e-05, + "loss": 2.4653, + "step": 774 + }, + { + "epoch": 0.0865157524951771, + "grad_norm": 0.2507634162902832, + "learning_rate": 3.0625000000000006e-05, + "loss": 2.2703, + "step": 775 + }, + { + "epoch": 0.08662738572420313, + "grad_norm": 0.24213804304599762, + "learning_rate": 3.06e-05, + "loss": 2.4244, + "step": 776 + }, + { + "epoch": 0.08673901895322916, + "grad_norm": 0.24074698984622955, + "learning_rate": 3.0575000000000005e-05, + "loss": 2.3714, + "step": 777 + }, + { + "epoch": 0.0868506521822552, + "grad_norm": 0.2556532323360443, + "learning_rate": 3.0550000000000004e-05, + "loss": 2.2702, + "step": 778 + }, + { + "epoch": 0.08696228541128123, + "grad_norm": 0.23637044429779053, + "learning_rate": 3.0525e-05, + "loss": 2.3382, + "step": 779 + }, + { + "epoch": 0.08707391864030727, + "grad_norm": 0.2363051176071167, + "learning_rate": 3.05e-05, + "loss": 2.2893, + "step": 780 + }, + { + "epoch": 0.0871855518693333, + "grad_norm": 0.23347924649715424, + "learning_rate": 3.0475000000000002e-05, + "loss": 2.4117, + "step": 781 + }, + { + "epoch": 0.08729718509835933, + "grad_norm": 0.37233465909957886, + "learning_rate": 3.045e-05, + "loss": 2.3556, + "step": 782 + }, + { + "epoch": 0.08740881832738538, + "grad_norm": 0.25742578506469727, + "learning_rate": 3.0425000000000004e-05, + "loss": 2.4087, + "step": 783 + }, + { + "epoch": 0.08752045155641142, + "grad_norm": 0.24065518379211426, + "learning_rate": 3.04e-05, + "loss": 2.2688, + "step": 784 + }, + { + "epoch": 0.08763208478543745, + "grad_norm": 0.2395828366279602, + "learning_rate": 3.0375000000000003e-05, + "loss": 2.3376, + "step": 785 + }, + { + "epoch": 0.08774371801446348, + "grad_norm": 0.30660635232925415, + "learning_rate": 3.035e-05, + "loss": 2.3353, + "step": 786 + }, + { + "epoch": 0.08785535124348952, + "grad_norm": 0.23347756266593933, + "learning_rate": 3.0325000000000002e-05, + "loss": 2.2991, + "step": 787 + }, + { + "epoch": 0.08796698447251555, + "grad_norm": 0.22873973846435547, + "learning_rate": 3.03e-05, + "loss": 2.3047, + "step": 788 + }, + { + "epoch": 0.08807861770154159, + "grad_norm": 0.23164629936218262, + "learning_rate": 3.0275000000000004e-05, + "loss": 2.3264, + "step": 789 + }, + { + "epoch": 0.08819025093056762, + "grad_norm": 0.2329261600971222, + "learning_rate": 3.025e-05, + "loss": 2.2868, + "step": 790 + }, + { + "epoch": 0.08830188415959365, + "grad_norm": 0.22929538786411285, + "learning_rate": 3.0225000000000003e-05, + "loss": 2.3671, + "step": 791 + }, + { + "epoch": 0.08841351738861969, + "grad_norm": 0.24005673825740814, + "learning_rate": 3.02e-05, + "loss": 2.4117, + "step": 792 + }, + { + "epoch": 0.08852515061764572, + "grad_norm": 0.22921884059906006, + "learning_rate": 3.0175e-05, + "loss": 2.4082, + "step": 793 + }, + { + "epoch": 0.08863678384667176, + "grad_norm": 0.2490616887807846, + "learning_rate": 3.015e-05, + "loss": 2.3136, + "step": 794 + }, + { + "epoch": 0.08874841707569779, + "grad_norm": 0.24170029163360596, + "learning_rate": 3.0125000000000004e-05, + "loss": 2.3903, + "step": 795 + }, + { + "epoch": 0.08886005030472383, + "grad_norm": 0.24534232914447784, + "learning_rate": 3.01e-05, + "loss": 2.3231, + "step": 796 + }, + { + "epoch": 0.08897168353374986, + "grad_norm": 0.23491844534873962, + "learning_rate": 3.0075000000000003e-05, + "loss": 2.3657, + "step": 797 + }, + { + "epoch": 0.0890833167627759, + "grad_norm": 0.23918871581554413, + "learning_rate": 3.0050000000000002e-05, + "loss": 2.3655, + "step": 798 + }, + { + "epoch": 0.08919494999180193, + "grad_norm": 0.23670734465122223, + "learning_rate": 3.0025000000000005e-05, + "loss": 2.3264, + "step": 799 + }, + { + "epoch": 0.08930658322082798, + "grad_norm": 0.2383037805557251, + "learning_rate": 3e-05, + "loss": 2.3465, + "step": 800 + }, + { + "epoch": 0.08941821644985401, + "grad_norm": 0.38153019547462463, + "learning_rate": 2.9975000000000004e-05, + "loss": 2.4639, + "step": 801 + }, + { + "epoch": 0.08952984967888004, + "grad_norm": 0.25272589921951294, + "learning_rate": 2.995e-05, + "loss": 2.253, + "step": 802 + }, + { + "epoch": 0.08964148290790608, + "grad_norm": 0.24903567135334015, + "learning_rate": 2.9925000000000002e-05, + "loss": 2.3757, + "step": 803 + }, + { + "epoch": 0.08975311613693211, + "grad_norm": 0.23128022253513336, + "learning_rate": 2.9900000000000002e-05, + "loss": 2.287, + "step": 804 + }, + { + "epoch": 0.08986474936595815, + "grad_norm": 0.23053546249866486, + "learning_rate": 2.9875000000000004e-05, + "loss": 2.4614, + "step": 805 + }, + { + "epoch": 0.08997638259498418, + "grad_norm": 0.3717436194419861, + "learning_rate": 2.985e-05, + "loss": 2.268, + "step": 806 + }, + { + "epoch": 0.09008801582401021, + "grad_norm": 0.4570637047290802, + "learning_rate": 2.9825000000000003e-05, + "loss": 2.3794, + "step": 807 + }, + { + "epoch": 0.09019964905303625, + "grad_norm": 0.2312982678413391, + "learning_rate": 2.98e-05, + "loss": 2.4141, + "step": 808 + }, + { + "epoch": 0.09031128228206228, + "grad_norm": 0.2467602640390396, + "learning_rate": 2.9775000000000002e-05, + "loss": 2.3287, + "step": 809 + }, + { + "epoch": 0.09042291551108832, + "grad_norm": 0.2299824357032776, + "learning_rate": 2.975e-05, + "loss": 2.4114, + "step": 810 + }, + { + "epoch": 0.09053454874011435, + "grad_norm": 3.2160747051239014, + "learning_rate": 2.9725000000000004e-05, + "loss": 2.4177, + "step": 811 + }, + { + "epoch": 0.09064618196914039, + "grad_norm": 0.2425236701965332, + "learning_rate": 2.97e-05, + "loss": 2.2538, + "step": 812 + }, + { + "epoch": 0.09075781519816642, + "grad_norm": 0.2397298812866211, + "learning_rate": 2.9675000000000003e-05, + "loss": 2.4145, + "step": 813 + }, + { + "epoch": 0.09086944842719245, + "grad_norm": 0.23437006771564484, + "learning_rate": 2.965e-05, + "loss": 2.2899, + "step": 814 + }, + { + "epoch": 0.09098108165621849, + "grad_norm": 0.2314252257347107, + "learning_rate": 2.9625000000000002e-05, + "loss": 2.3063, + "step": 815 + }, + { + "epoch": 0.09109271488524454, + "grad_norm": 0.24189533293247223, + "learning_rate": 2.96e-05, + "loss": 2.3931, + "step": 816 + }, + { + "epoch": 0.09120434811427057, + "grad_norm": 0.2523435950279236, + "learning_rate": 2.9575000000000004e-05, + "loss": 2.337, + "step": 817 + }, + { + "epoch": 0.0913159813432966, + "grad_norm": 0.26922857761383057, + "learning_rate": 2.955e-05, + "loss": 2.3536, + "step": 818 + }, + { + "epoch": 0.09142761457232264, + "grad_norm": 0.24634380638599396, + "learning_rate": 2.9525000000000003e-05, + "loss": 2.4549, + "step": 819 + }, + { + "epoch": 0.09153924780134867, + "grad_norm": 0.25171083211898804, + "learning_rate": 2.95e-05, + "loss": 2.2972, + "step": 820 + }, + { + "epoch": 0.0916508810303747, + "grad_norm": 0.6297338604927063, + "learning_rate": 2.9475e-05, + "loss": 2.3931, + "step": 821 + }, + { + "epoch": 0.09176251425940074, + "grad_norm": 0.2340405136346817, + "learning_rate": 2.945e-05, + "loss": 2.3981, + "step": 822 + }, + { + "epoch": 0.09187414748842677, + "grad_norm": 0.2873988449573517, + "learning_rate": 2.9425000000000004e-05, + "loss": 2.3315, + "step": 823 + }, + { + "epoch": 0.09198578071745281, + "grad_norm": 0.25433048605918884, + "learning_rate": 2.94e-05, + "loss": 2.2073, + "step": 824 + }, + { + "epoch": 0.09209741394647884, + "grad_norm": 0.23423077166080475, + "learning_rate": 2.9375000000000003e-05, + "loss": 2.3346, + "step": 825 + }, + { + "epoch": 0.09220904717550488, + "grad_norm": 0.2264171838760376, + "learning_rate": 2.935e-05, + "loss": 2.4019, + "step": 826 + }, + { + "epoch": 0.09232068040453091, + "grad_norm": 0.27230480313301086, + "learning_rate": 2.9325e-05, + "loss": 2.3569, + "step": 827 + }, + { + "epoch": 0.09243231363355695, + "grad_norm": 0.23226654529571533, + "learning_rate": 2.93e-05, + "loss": 2.4407, + "step": 828 + }, + { + "epoch": 0.09254394686258298, + "grad_norm": 0.23659475147724152, + "learning_rate": 2.9275000000000003e-05, + "loss": 2.3602, + "step": 829 + }, + { + "epoch": 0.09265558009160901, + "grad_norm": 0.2345886379480362, + "learning_rate": 2.925e-05, + "loss": 2.3828, + "step": 830 + }, + { + "epoch": 0.09276721332063505, + "grad_norm": 0.23473899066448212, + "learning_rate": 2.9225000000000002e-05, + "loss": 2.2976, + "step": 831 + }, + { + "epoch": 0.09287884654966108, + "grad_norm": 0.23485642671585083, + "learning_rate": 2.9199999999999998e-05, + "loss": 2.3747, + "step": 832 + }, + { + "epoch": 0.09299047977868713, + "grad_norm": 0.24248026311397552, + "learning_rate": 2.9175e-05, + "loss": 2.3549, + "step": 833 + }, + { + "epoch": 0.09310211300771316, + "grad_norm": 0.23264911770820618, + "learning_rate": 2.915e-05, + "loss": 2.4257, + "step": 834 + }, + { + "epoch": 0.0932137462367392, + "grad_norm": 0.25659480690956116, + "learning_rate": 2.9125000000000003e-05, + "loss": 2.229, + "step": 835 + }, + { + "epoch": 0.09332537946576523, + "grad_norm": 0.23583489656448364, + "learning_rate": 2.91e-05, + "loss": 2.3548, + "step": 836 + }, + { + "epoch": 0.09343701269479127, + "grad_norm": 0.23372192680835724, + "learning_rate": 2.9075000000000002e-05, + "loss": 2.2546, + "step": 837 + }, + { + "epoch": 0.0935486459238173, + "grad_norm": 0.2368670254945755, + "learning_rate": 2.9049999999999998e-05, + "loss": 2.2415, + "step": 838 + }, + { + "epoch": 0.09366027915284333, + "grad_norm": 0.24544133245944977, + "learning_rate": 2.9025e-05, + "loss": 2.3511, + "step": 839 + }, + { + "epoch": 0.09377191238186937, + "grad_norm": 0.2858797013759613, + "learning_rate": 2.9e-05, + "loss": 2.4491, + "step": 840 + }, + { + "epoch": 0.0938835456108954, + "grad_norm": 0.24954313039779663, + "learning_rate": 2.8975000000000003e-05, + "loss": 2.5859, + "step": 841 + }, + { + "epoch": 0.09399517883992144, + "grad_norm": 0.23716183006763458, + "learning_rate": 2.895e-05, + "loss": 2.2792, + "step": 842 + }, + { + "epoch": 0.09410681206894747, + "grad_norm": 0.23809632658958435, + "learning_rate": 2.8925000000000002e-05, + "loss": 2.2958, + "step": 843 + }, + { + "epoch": 0.0942184452979735, + "grad_norm": 0.23545247316360474, + "learning_rate": 2.8899999999999998e-05, + "loss": 2.3113, + "step": 844 + }, + { + "epoch": 0.09433007852699954, + "grad_norm": 0.2312520146369934, + "learning_rate": 2.8875e-05, + "loss": 2.2242, + "step": 845 + }, + { + "epoch": 0.09444171175602557, + "grad_norm": 0.23694823682308197, + "learning_rate": 2.885e-05, + "loss": 2.3629, + "step": 846 + }, + { + "epoch": 0.09455334498505161, + "grad_norm": 0.22986574470996857, + "learning_rate": 2.8825000000000003e-05, + "loss": 2.3428, + "step": 847 + }, + { + "epoch": 0.09466497821407764, + "grad_norm": 0.2305915206670761, + "learning_rate": 2.88e-05, + "loss": 2.291, + "step": 848 + }, + { + "epoch": 0.09477661144310369, + "grad_norm": 0.23731227219104767, + "learning_rate": 2.8775e-05, + "loss": 2.3141, + "step": 849 + }, + { + "epoch": 0.09488824467212972, + "grad_norm": 0.24682384729385376, + "learning_rate": 2.8749999999999997e-05, + "loss": 2.2654, + "step": 850 + }, + { + "epoch": 0.09499987790115576, + "grad_norm": 0.232358917593956, + "learning_rate": 2.8725e-05, + "loss": 2.3816, + "step": 851 + }, + { + "epoch": 0.09511151113018179, + "grad_norm": 0.23460093140602112, + "learning_rate": 2.87e-05, + "loss": 2.4469, + "step": 852 + }, + { + "epoch": 0.09522314435920783, + "grad_norm": 0.28891122341156006, + "learning_rate": 2.8675000000000002e-05, + "loss": 2.3192, + "step": 853 + }, + { + "epoch": 0.09533477758823386, + "grad_norm": 0.22858241200447083, + "learning_rate": 2.865e-05, + "loss": 2.3903, + "step": 854 + }, + { + "epoch": 0.0954464108172599, + "grad_norm": 0.2979806065559387, + "learning_rate": 2.8625e-05, + "loss": 2.3428, + "step": 855 + }, + { + "epoch": 0.09555804404628593, + "grad_norm": 0.22742506861686707, + "learning_rate": 2.86e-05, + "loss": 2.3086, + "step": 856 + }, + { + "epoch": 0.09566967727531196, + "grad_norm": 0.2474931925535202, + "learning_rate": 2.8575000000000003e-05, + "loss": 2.2446, + "step": 857 + }, + { + "epoch": 0.095781310504338, + "grad_norm": 0.23535043001174927, + "learning_rate": 2.855e-05, + "loss": 2.3, + "step": 858 + }, + { + "epoch": 0.09589294373336403, + "grad_norm": 0.2337024062871933, + "learning_rate": 2.8525000000000002e-05, + "loss": 2.3222, + "step": 859 + }, + { + "epoch": 0.09600457696239006, + "grad_norm": 0.23335273563861847, + "learning_rate": 2.8499999999999998e-05, + "loss": 2.3336, + "step": 860 + }, + { + "epoch": 0.0961162101914161, + "grad_norm": 0.23024383187294006, + "learning_rate": 2.8475e-05, + "loss": 2.3664, + "step": 861 + }, + { + "epoch": 0.09622784342044213, + "grad_norm": 0.27251824736595154, + "learning_rate": 2.845e-05, + "loss": 2.25, + "step": 862 + }, + { + "epoch": 0.09633947664946817, + "grad_norm": 0.23650218546390533, + "learning_rate": 2.8425000000000003e-05, + "loss": 2.3429, + "step": 863 + }, + { + "epoch": 0.0964511098784942, + "grad_norm": 0.23651504516601562, + "learning_rate": 2.84e-05, + "loss": 2.4, + "step": 864 + }, + { + "epoch": 0.09656274310752025, + "grad_norm": 0.2359198033809662, + "learning_rate": 2.8375000000000002e-05, + "loss": 2.3672, + "step": 865 + }, + { + "epoch": 0.09667437633654628, + "grad_norm": 0.2200855314731598, + "learning_rate": 2.8349999999999998e-05, + "loss": 2.3336, + "step": 866 + }, + { + "epoch": 0.09678600956557232, + "grad_norm": 0.23432110249996185, + "learning_rate": 2.8325e-05, + "loss": 2.3277, + "step": 867 + }, + { + "epoch": 0.09689764279459835, + "grad_norm": 0.2440565526485443, + "learning_rate": 2.83e-05, + "loss": 2.3388, + "step": 868 + }, + { + "epoch": 0.09700927602362439, + "grad_norm": 0.23634879291057587, + "learning_rate": 2.8275000000000003e-05, + "loss": 2.3551, + "step": 869 + }, + { + "epoch": 0.09712090925265042, + "grad_norm": 0.23254720866680145, + "learning_rate": 2.825e-05, + "loss": 2.4071, + "step": 870 + }, + { + "epoch": 0.09723254248167645, + "grad_norm": 0.25411373376846313, + "learning_rate": 2.8225e-05, + "loss": 2.3591, + "step": 871 + }, + { + "epoch": 0.09734417571070249, + "grad_norm": 0.23345574736595154, + "learning_rate": 2.8199999999999998e-05, + "loss": 2.2857, + "step": 872 + }, + { + "epoch": 0.09745580893972852, + "grad_norm": 0.23449824750423431, + "learning_rate": 2.8175e-05, + "loss": 2.297, + "step": 873 + }, + { + "epoch": 0.09756744216875456, + "grad_norm": 0.23664528131484985, + "learning_rate": 2.815e-05, + "loss": 2.4227, + "step": 874 + }, + { + "epoch": 0.09767907539778059, + "grad_norm": 0.22787226736545563, + "learning_rate": 2.8125000000000003e-05, + "loss": 2.4532, + "step": 875 + }, + { + "epoch": 0.09779070862680662, + "grad_norm": 0.22756721079349518, + "learning_rate": 2.8100000000000005e-05, + "loss": 2.2942, + "step": 876 + }, + { + "epoch": 0.09790234185583266, + "grad_norm": 0.23187048733234406, + "learning_rate": 2.8075e-05, + "loss": 2.4004, + "step": 877 + }, + { + "epoch": 0.09801397508485869, + "grad_norm": 0.23409034311771393, + "learning_rate": 2.8050000000000004e-05, + "loss": 2.3854, + "step": 878 + }, + { + "epoch": 0.09812560831388473, + "grad_norm": 0.23317299783229828, + "learning_rate": 2.8025e-05, + "loss": 2.365, + "step": 879 + }, + { + "epoch": 0.09823724154291076, + "grad_norm": 0.4648565948009491, + "learning_rate": 2.8000000000000003e-05, + "loss": 2.3482, + "step": 880 + }, + { + "epoch": 0.0983488747719368, + "grad_norm": 0.23336485028266907, + "learning_rate": 2.7975000000000002e-05, + "loss": 2.454, + "step": 881 + }, + { + "epoch": 0.09846050800096284, + "grad_norm": 0.22770002484321594, + "learning_rate": 2.7950000000000005e-05, + "loss": 2.2243, + "step": 882 + }, + { + "epoch": 0.09857214122998888, + "grad_norm": 0.240431547164917, + "learning_rate": 2.7925e-05, + "loss": 2.3568, + "step": 883 + }, + { + "epoch": 0.09868377445901491, + "grad_norm": 0.23338884115219116, + "learning_rate": 2.7900000000000004e-05, + "loss": 2.3459, + "step": 884 + }, + { + "epoch": 0.09879540768804095, + "grad_norm": 0.23423053324222565, + "learning_rate": 2.7875e-05, + "loss": 2.4424, + "step": 885 + }, + { + "epoch": 0.09890704091706698, + "grad_norm": 0.2247275859117508, + "learning_rate": 2.7850000000000003e-05, + "loss": 2.4401, + "step": 886 + }, + { + "epoch": 0.09901867414609301, + "grad_norm": 0.23924924433231354, + "learning_rate": 2.7825000000000002e-05, + "loss": 2.3376, + "step": 887 + }, + { + "epoch": 0.09913030737511905, + "grad_norm": 0.23396386206150055, + "learning_rate": 2.7800000000000005e-05, + "loss": 2.2607, + "step": 888 + }, + { + "epoch": 0.09924194060414508, + "grad_norm": 0.23403829336166382, + "learning_rate": 2.7775e-05, + "loss": 2.2647, + "step": 889 + }, + { + "epoch": 0.09935357383317112, + "grad_norm": 0.250621497631073, + "learning_rate": 2.7750000000000004e-05, + "loss": 2.3554, + "step": 890 + }, + { + "epoch": 0.09946520706219715, + "grad_norm": 0.24255424737930298, + "learning_rate": 2.7725e-05, + "loss": 2.3057, + "step": 891 + }, + { + "epoch": 0.09957684029122318, + "grad_norm": 0.23455750942230225, + "learning_rate": 2.7700000000000002e-05, + "loss": 2.405, + "step": 892 + }, + { + "epoch": 0.09968847352024922, + "grad_norm": 0.24191993474960327, + "learning_rate": 2.7675000000000002e-05, + "loss": 2.3389, + "step": 893 + }, + { + "epoch": 0.09980010674927525, + "grad_norm": 0.23159699141979218, + "learning_rate": 2.7650000000000005e-05, + "loss": 2.4599, + "step": 894 + }, + { + "epoch": 0.09991173997830129, + "grad_norm": 0.2873140871524811, + "learning_rate": 2.7625e-05, + "loss": 2.2902, + "step": 895 + }, + { + "epoch": 0.10002337320732732, + "grad_norm": 0.2346268594264984, + "learning_rate": 2.7600000000000003e-05, + "loss": 2.3954, + "step": 896 + }, + { + "epoch": 0.10013500643635335, + "grad_norm": 0.2276250720024109, + "learning_rate": 2.7575e-05, + "loss": 2.3288, + "step": 897 + }, + { + "epoch": 0.1002466396653794, + "grad_norm": 0.22809089720249176, + "learning_rate": 2.7550000000000002e-05, + "loss": 2.3797, + "step": 898 + }, + { + "epoch": 0.10035827289440544, + "grad_norm": 0.2589645981788635, + "learning_rate": 2.7525e-05, + "loss": 2.235, + "step": 899 + }, + { + "epoch": 0.10046990612343147, + "grad_norm": 0.26668986678123474, + "learning_rate": 2.7500000000000004e-05, + "loss": 2.4543, + "step": 900 + }, + { + "epoch": 0.1005815393524575, + "grad_norm": 0.231545552611351, + "learning_rate": 2.7475e-05, + "loss": 2.2753, + "step": 901 + }, + { + "epoch": 0.10069317258148354, + "grad_norm": 0.23608693480491638, + "learning_rate": 2.7450000000000003e-05, + "loss": 2.4207, + "step": 902 + }, + { + "epoch": 0.10080480581050957, + "grad_norm": 0.22971975803375244, + "learning_rate": 2.7425e-05, + "loss": 2.4783, + "step": 903 + }, + { + "epoch": 0.10091643903953561, + "grad_norm": 0.24497389793395996, + "learning_rate": 2.7400000000000002e-05, + "loss": 2.3443, + "step": 904 + }, + { + "epoch": 0.10102807226856164, + "grad_norm": 0.22968074679374695, + "learning_rate": 2.7375e-05, + "loss": 2.3163, + "step": 905 + }, + { + "epoch": 0.10113970549758768, + "grad_norm": 0.23204439878463745, + "learning_rate": 2.7350000000000004e-05, + "loss": 2.3681, + "step": 906 + }, + { + "epoch": 0.10125133872661371, + "grad_norm": 0.24272161722183228, + "learning_rate": 2.7325e-05, + "loss": 2.3472, + "step": 907 + }, + { + "epoch": 0.10136297195563974, + "grad_norm": 0.22961987555027008, + "learning_rate": 2.7300000000000003e-05, + "loss": 2.3187, + "step": 908 + }, + { + "epoch": 0.10147460518466578, + "grad_norm": 0.23540601134300232, + "learning_rate": 2.7275e-05, + "loss": 2.289, + "step": 909 + }, + { + "epoch": 0.10158623841369181, + "grad_norm": 0.260650634765625, + "learning_rate": 2.725e-05, + "loss": 2.3203, + "step": 910 + }, + { + "epoch": 0.10169787164271785, + "grad_norm": 0.24257007241249084, + "learning_rate": 2.7225e-05, + "loss": 2.3648, + "step": 911 + }, + { + "epoch": 0.10180950487174388, + "grad_norm": 0.2477046102285385, + "learning_rate": 2.7200000000000004e-05, + "loss": 2.273, + "step": 912 + }, + { + "epoch": 0.10192113810076991, + "grad_norm": 0.23077093064785004, + "learning_rate": 2.7175e-05, + "loss": 2.3581, + "step": 913 + }, + { + "epoch": 0.10203277132979595, + "grad_norm": 0.229270800948143, + "learning_rate": 2.7150000000000003e-05, + "loss": 2.3241, + "step": 914 + }, + { + "epoch": 0.102144404558822, + "grad_norm": 0.2295754998922348, + "learning_rate": 2.7125000000000002e-05, + "loss": 2.3711, + "step": 915 + }, + { + "epoch": 0.10225603778784803, + "grad_norm": 0.23900440335273743, + "learning_rate": 2.7100000000000005e-05, + "loss": 2.3576, + "step": 916 + }, + { + "epoch": 0.10236767101687407, + "grad_norm": 0.234444722533226, + "learning_rate": 2.7075e-05, + "loss": 2.3537, + "step": 917 + }, + { + "epoch": 0.1024793042459001, + "grad_norm": 0.2370821088552475, + "learning_rate": 2.7050000000000004e-05, + "loss": 2.3126, + "step": 918 + }, + { + "epoch": 0.10259093747492613, + "grad_norm": 0.24210244417190552, + "learning_rate": 2.7025e-05, + "loss": 2.2511, + "step": 919 + }, + { + "epoch": 0.10270257070395217, + "grad_norm": 0.23555943369865417, + "learning_rate": 2.7000000000000002e-05, + "loss": 2.3868, + "step": 920 + }, + { + "epoch": 0.1028142039329782, + "grad_norm": 0.2252027690410614, + "learning_rate": 2.6975000000000002e-05, + "loss": 2.3263, + "step": 921 + }, + { + "epoch": 0.10292583716200424, + "grad_norm": 0.22942887246608734, + "learning_rate": 2.6950000000000005e-05, + "loss": 2.3358, + "step": 922 + }, + { + "epoch": 0.10303747039103027, + "grad_norm": 0.23425108194351196, + "learning_rate": 2.6925e-05, + "loss": 2.279, + "step": 923 + }, + { + "epoch": 0.1031491036200563, + "grad_norm": 0.23959554731845856, + "learning_rate": 2.6900000000000003e-05, + "loss": 2.3456, + "step": 924 + }, + { + "epoch": 0.10326073684908234, + "grad_norm": 0.22143711149692535, + "learning_rate": 2.6875e-05, + "loss": 2.3346, + "step": 925 + }, + { + "epoch": 0.10337237007810837, + "grad_norm": 0.2322838306427002, + "learning_rate": 2.6850000000000002e-05, + "loss": 2.3597, + "step": 926 + }, + { + "epoch": 0.1034840033071344, + "grad_norm": 0.22317776083946228, + "learning_rate": 2.6825e-05, + "loss": 2.4395, + "step": 927 + }, + { + "epoch": 0.10359563653616044, + "grad_norm": 0.2381390780210495, + "learning_rate": 2.6800000000000004e-05, + "loss": 2.335, + "step": 928 + }, + { + "epoch": 0.10370726976518647, + "grad_norm": 0.2249373197555542, + "learning_rate": 2.6775e-05, + "loss": 2.3763, + "step": 929 + }, + { + "epoch": 0.10381890299421251, + "grad_norm": 0.23083436489105225, + "learning_rate": 2.6750000000000003e-05, + "loss": 2.2986, + "step": 930 + }, + { + "epoch": 0.10393053622323856, + "grad_norm": 0.23313601315021515, + "learning_rate": 2.6725e-05, + "loss": 2.4076, + "step": 931 + }, + { + "epoch": 0.10404216945226459, + "grad_norm": 0.22721858322620392, + "learning_rate": 2.6700000000000002e-05, + "loss": 2.2987, + "step": 932 + }, + { + "epoch": 0.10415380268129062, + "grad_norm": 0.23775102198123932, + "learning_rate": 2.6675e-05, + "loss": 2.3398, + "step": 933 + }, + { + "epoch": 0.10426543591031666, + "grad_norm": 0.22878248989582062, + "learning_rate": 2.6650000000000004e-05, + "loss": 2.3369, + "step": 934 + }, + { + "epoch": 0.10437706913934269, + "grad_norm": 0.22213736176490784, + "learning_rate": 2.6625e-05, + "loss": 2.4302, + "step": 935 + }, + { + "epoch": 0.10448870236836873, + "grad_norm": 0.23703357577323914, + "learning_rate": 2.6600000000000003e-05, + "loss": 2.2731, + "step": 936 + }, + { + "epoch": 0.10460033559739476, + "grad_norm": 0.2916199266910553, + "learning_rate": 2.6575e-05, + "loss": 2.2714, + "step": 937 + }, + { + "epoch": 0.1047119688264208, + "grad_norm": 0.22936727106571198, + "learning_rate": 2.655e-05, + "loss": 2.4308, + "step": 938 + }, + { + "epoch": 0.10482360205544683, + "grad_norm": 0.2333354949951172, + "learning_rate": 2.6525e-05, + "loss": 2.3236, + "step": 939 + }, + { + "epoch": 0.10493523528447286, + "grad_norm": 0.23450367152690887, + "learning_rate": 2.6500000000000004e-05, + "loss": 2.3854, + "step": 940 + }, + { + "epoch": 0.1050468685134989, + "grad_norm": 0.2431698888540268, + "learning_rate": 2.6475e-05, + "loss": 2.2789, + "step": 941 + }, + { + "epoch": 0.10515850174252493, + "grad_norm": 0.22219637036323547, + "learning_rate": 2.6450000000000003e-05, + "loss": 2.3675, + "step": 942 + }, + { + "epoch": 0.10527013497155097, + "grad_norm": 0.233125239610672, + "learning_rate": 2.6425e-05, + "loss": 2.4779, + "step": 943 + }, + { + "epoch": 0.105381768200577, + "grad_norm": 0.22824883460998535, + "learning_rate": 2.64e-05, + "loss": 2.3445, + "step": 944 + }, + { + "epoch": 0.10549340142960303, + "grad_norm": 0.22547942399978638, + "learning_rate": 2.6375e-05, + "loss": 2.234, + "step": 945 + }, + { + "epoch": 0.10560503465862907, + "grad_norm": 0.2325771003961563, + "learning_rate": 2.6350000000000004e-05, + "loss": 2.4215, + "step": 946 + }, + { + "epoch": 0.1057166678876551, + "grad_norm": 0.233683779835701, + "learning_rate": 2.6325e-05, + "loss": 2.4137, + "step": 947 + }, + { + "epoch": 0.10582830111668115, + "grad_norm": 0.24647918343544006, + "learning_rate": 2.6300000000000002e-05, + "loss": 2.3459, + "step": 948 + }, + { + "epoch": 0.10593993434570718, + "grad_norm": 0.22863587737083435, + "learning_rate": 2.6275e-05, + "loss": 2.3867, + "step": 949 + }, + { + "epoch": 0.10605156757473322, + "grad_norm": 0.23412172496318817, + "learning_rate": 2.625e-05, + "loss": 2.3621, + "step": 950 + }, + { + "epoch": 0.10616320080375925, + "grad_norm": 0.22972947359085083, + "learning_rate": 2.6225e-05, + "loss": 2.4242, + "step": 951 + }, + { + "epoch": 0.10627483403278529, + "grad_norm": 0.23886039853096008, + "learning_rate": 2.6200000000000003e-05, + "loss": 2.3375, + "step": 952 + }, + { + "epoch": 0.10638646726181132, + "grad_norm": 0.24040424823760986, + "learning_rate": 2.6175e-05, + "loss": 2.3518, + "step": 953 + }, + { + "epoch": 0.10649810049083736, + "grad_norm": 0.22699175775051117, + "learning_rate": 2.6150000000000002e-05, + "loss": 2.3435, + "step": 954 + }, + { + "epoch": 0.10660973371986339, + "grad_norm": 0.23695823550224304, + "learning_rate": 2.6124999999999998e-05, + "loss": 2.4036, + "step": 955 + }, + { + "epoch": 0.10672136694888942, + "grad_norm": 0.23027122020721436, + "learning_rate": 2.61e-05, + "loss": 2.286, + "step": 956 + }, + { + "epoch": 0.10683300017791546, + "grad_norm": 0.24779526889324188, + "learning_rate": 2.6075e-05, + "loss": 2.3144, + "step": 957 + }, + { + "epoch": 0.10694463340694149, + "grad_norm": 0.23636764287948608, + "learning_rate": 2.6050000000000003e-05, + "loss": 2.3114, + "step": 958 + }, + { + "epoch": 0.10705626663596753, + "grad_norm": 0.22950230538845062, + "learning_rate": 2.6025e-05, + "loss": 2.3594, + "step": 959 + }, + { + "epoch": 0.10716789986499356, + "grad_norm": 0.2355276644229889, + "learning_rate": 2.6000000000000002e-05, + "loss": 2.2385, + "step": 960 + }, + { + "epoch": 0.1072795330940196, + "grad_norm": 0.25076955556869507, + "learning_rate": 2.5974999999999998e-05, + "loss": 2.2637, + "step": 961 + }, + { + "epoch": 0.10739116632304563, + "grad_norm": 0.2247258871793747, + "learning_rate": 2.595e-05, + "loss": 2.2486, + "step": 962 + }, + { + "epoch": 0.10750279955207166, + "grad_norm": 0.21740855276584625, + "learning_rate": 2.5925e-05, + "loss": 2.2609, + "step": 963 + }, + { + "epoch": 0.10761443278109771, + "grad_norm": 0.2570677697658539, + "learning_rate": 2.5900000000000003e-05, + "loss": 2.442, + "step": 964 + }, + { + "epoch": 0.10772606601012374, + "grad_norm": 0.2322109192609787, + "learning_rate": 2.5875e-05, + "loss": 2.3026, + "step": 965 + }, + { + "epoch": 0.10783769923914978, + "grad_norm": 0.24020114541053772, + "learning_rate": 2.585e-05, + "loss": 2.351, + "step": 966 + }, + { + "epoch": 0.10794933246817581, + "grad_norm": 0.2280672937631607, + "learning_rate": 2.5824999999999998e-05, + "loss": 2.4016, + "step": 967 + }, + { + "epoch": 0.10806096569720185, + "grad_norm": 0.2309507429599762, + "learning_rate": 2.58e-05, + "loss": 2.3733, + "step": 968 + }, + { + "epoch": 0.10817259892622788, + "grad_norm": 0.2293708324432373, + "learning_rate": 2.5775e-05, + "loss": 2.3583, + "step": 969 + }, + { + "epoch": 0.10828423215525391, + "grad_norm": 0.23410175740718842, + "learning_rate": 2.5750000000000002e-05, + "loss": 2.352, + "step": 970 + }, + { + "epoch": 0.10839586538427995, + "grad_norm": 0.22972838580608368, + "learning_rate": 2.5725e-05, + "loss": 2.3434, + "step": 971 + }, + { + "epoch": 0.10850749861330598, + "grad_norm": 0.22537364065647125, + "learning_rate": 2.57e-05, + "loss": 2.3875, + "step": 972 + }, + { + "epoch": 0.10861913184233202, + "grad_norm": 0.23455362021923065, + "learning_rate": 2.5675e-05, + "loss": 2.2759, + "step": 973 + }, + { + "epoch": 0.10873076507135805, + "grad_norm": 0.23852132260799408, + "learning_rate": 2.5650000000000003e-05, + "loss": 2.3408, + "step": 974 + }, + { + "epoch": 0.10884239830038409, + "grad_norm": 0.23479855060577393, + "learning_rate": 2.5625e-05, + "loss": 2.3161, + "step": 975 + }, + { + "epoch": 0.10895403152941012, + "grad_norm": 0.2664550244808197, + "learning_rate": 2.5600000000000002e-05, + "loss": 2.387, + "step": 976 + }, + { + "epoch": 0.10906566475843615, + "grad_norm": 0.22990471124649048, + "learning_rate": 2.5574999999999998e-05, + "loss": 2.3906, + "step": 977 + }, + { + "epoch": 0.10917729798746219, + "grad_norm": 0.2335597425699234, + "learning_rate": 2.555e-05, + "loss": 2.3989, + "step": 978 + }, + { + "epoch": 0.10928893121648822, + "grad_norm": 0.23832087218761444, + "learning_rate": 2.5525e-05, + "loss": 2.2881, + "step": 979 + }, + { + "epoch": 0.10940056444551426, + "grad_norm": 0.23258844017982483, + "learning_rate": 2.5500000000000003e-05, + "loss": 2.4174, + "step": 980 + }, + { + "epoch": 0.1095121976745403, + "grad_norm": 0.23663833737373352, + "learning_rate": 2.5475e-05, + "loss": 2.2888, + "step": 981 + }, + { + "epoch": 0.10962383090356634, + "grad_norm": 0.22592462599277496, + "learning_rate": 2.5450000000000002e-05, + "loss": 2.3676, + "step": 982 + }, + { + "epoch": 0.10973546413259237, + "grad_norm": 0.23445037007331848, + "learning_rate": 2.5424999999999998e-05, + "loss": 2.3627, + "step": 983 + }, + { + "epoch": 0.1098470973616184, + "grad_norm": 0.2289026379585266, + "learning_rate": 2.54e-05, + "loss": 2.4259, + "step": 984 + }, + { + "epoch": 0.10995873059064444, + "grad_norm": 0.2335384339094162, + "learning_rate": 2.5375e-05, + "loss": 2.2903, + "step": 985 + }, + { + "epoch": 0.11007036381967047, + "grad_norm": 0.24791677296161652, + "learning_rate": 2.5350000000000003e-05, + "loss": 2.3497, + "step": 986 + }, + { + "epoch": 0.11018199704869651, + "grad_norm": 0.21857081353664398, + "learning_rate": 2.5325e-05, + "loss": 2.3938, + "step": 987 + }, + { + "epoch": 0.11029363027772254, + "grad_norm": 0.23194223642349243, + "learning_rate": 2.5300000000000002e-05, + "loss": 2.3752, + "step": 988 + }, + { + "epoch": 0.11040526350674858, + "grad_norm": 0.22554685175418854, + "learning_rate": 2.5274999999999998e-05, + "loss": 2.3892, + "step": 989 + }, + { + "epoch": 0.11051689673577461, + "grad_norm": 0.2444494664669037, + "learning_rate": 2.525e-05, + "loss": 2.3854, + "step": 990 + }, + { + "epoch": 0.11062852996480065, + "grad_norm": 0.24324128031730652, + "learning_rate": 2.5225e-05, + "loss": 2.2129, + "step": 991 + }, + { + "epoch": 0.11074016319382668, + "grad_norm": 0.23157966136932373, + "learning_rate": 2.5200000000000003e-05, + "loss": 2.3688, + "step": 992 + }, + { + "epoch": 0.11085179642285271, + "grad_norm": 0.23496349155902863, + "learning_rate": 2.5175e-05, + "loss": 2.2166, + "step": 993 + }, + { + "epoch": 0.11096342965187875, + "grad_norm": 0.22501815855503082, + "learning_rate": 2.515e-05, + "loss": 2.3563, + "step": 994 + }, + { + "epoch": 0.11107506288090478, + "grad_norm": 0.3032657206058502, + "learning_rate": 2.5124999999999997e-05, + "loss": 2.2783, + "step": 995 + }, + { + "epoch": 0.11118669610993082, + "grad_norm": 0.24155639111995697, + "learning_rate": 2.51e-05, + "loss": 2.3681, + "step": 996 + }, + { + "epoch": 0.11129832933895686, + "grad_norm": 0.22777613997459412, + "learning_rate": 2.5075e-05, + "loss": 2.3981, + "step": 997 + }, + { + "epoch": 0.1114099625679829, + "grad_norm": 0.24148933589458466, + "learning_rate": 2.5050000000000002e-05, + "loss": 2.2894, + "step": 998 + }, + { + "epoch": 0.11152159579700893, + "grad_norm": 0.23705993592739105, + "learning_rate": 2.5025e-05, + "loss": 2.3486, + "step": 999 + }, + { + "epoch": 0.11163322902603497, + "grad_norm": 0.23454095423221588, + "learning_rate": 2.5e-05, + "loss": 2.4232, + "step": 1000 + }, + { + "epoch": 0.111744862255061, + "grad_norm": 0.23154820501804352, + "learning_rate": 2.4975e-05, + "loss": 2.2269, + "step": 1001 + }, + { + "epoch": 0.11185649548408703, + "grad_norm": 0.22418555617332458, + "learning_rate": 2.495e-05, + "loss": 2.2112, + "step": 1002 + }, + { + "epoch": 0.11196812871311307, + "grad_norm": 0.2538329064846039, + "learning_rate": 2.4925000000000003e-05, + "loss": 2.3927, + "step": 1003 + }, + { + "epoch": 0.1120797619421391, + "grad_norm": 0.23294506967067719, + "learning_rate": 2.4900000000000002e-05, + "loss": 2.3024, + "step": 1004 + }, + { + "epoch": 0.11219139517116514, + "grad_norm": 0.22844459116458893, + "learning_rate": 2.4875e-05, + "loss": 2.2385, + "step": 1005 + }, + { + "epoch": 0.11230302840019117, + "grad_norm": 0.22828292846679688, + "learning_rate": 2.485e-05, + "loss": 2.2897, + "step": 1006 + }, + { + "epoch": 0.1124146616292172, + "grad_norm": 0.2693067491054535, + "learning_rate": 2.4825e-05, + "loss": 2.3818, + "step": 1007 + }, + { + "epoch": 0.11252629485824324, + "grad_norm": 0.22305937111377716, + "learning_rate": 2.48e-05, + "loss": 2.3505, + "step": 1008 + }, + { + "epoch": 0.11263792808726927, + "grad_norm": 0.23566830158233643, + "learning_rate": 2.4775000000000003e-05, + "loss": 2.3571, + "step": 1009 + }, + { + "epoch": 0.11274956131629531, + "grad_norm": 0.23289407789707184, + "learning_rate": 2.4750000000000002e-05, + "loss": 2.3021, + "step": 1010 + }, + { + "epoch": 0.11286119454532134, + "grad_norm": 0.2305288016796112, + "learning_rate": 2.4725e-05, + "loss": 2.3082, + "step": 1011 + }, + { + "epoch": 0.11297282777434738, + "grad_norm": 0.2262507677078247, + "learning_rate": 2.47e-05, + "loss": 2.3995, + "step": 1012 + }, + { + "epoch": 0.11308446100337341, + "grad_norm": 0.2277233600616455, + "learning_rate": 2.4675e-05, + "loss": 2.3759, + "step": 1013 + }, + { + "epoch": 0.11319609423239946, + "grad_norm": 0.24634131789207458, + "learning_rate": 2.465e-05, + "loss": 2.3487, + "step": 1014 + }, + { + "epoch": 0.11330772746142549, + "grad_norm": 0.31584781408309937, + "learning_rate": 2.4625000000000002e-05, + "loss": 2.3787, + "step": 1015 + }, + { + "epoch": 0.11341936069045153, + "grad_norm": 0.2360941767692566, + "learning_rate": 2.46e-05, + "loss": 2.2368, + "step": 1016 + }, + { + "epoch": 0.11353099391947756, + "grad_norm": 0.24138571321964264, + "learning_rate": 2.4575e-05, + "loss": 2.2329, + "step": 1017 + }, + { + "epoch": 0.1136426271485036, + "grad_norm": 0.24359650909900665, + "learning_rate": 2.455e-05, + "loss": 2.352, + "step": 1018 + }, + { + "epoch": 0.11375426037752963, + "grad_norm": 0.22765910625457764, + "learning_rate": 2.4525e-05, + "loss": 2.4487, + "step": 1019 + }, + { + "epoch": 0.11386589360655566, + "grad_norm": 0.22311876714229584, + "learning_rate": 2.45e-05, + "loss": 2.3413, + "step": 1020 + }, + { + "epoch": 0.1139775268355817, + "grad_norm": 0.3245187997817993, + "learning_rate": 2.4475000000000002e-05, + "loss": 2.352, + "step": 1021 + }, + { + "epoch": 0.11408916006460773, + "grad_norm": 0.267455130815506, + "learning_rate": 2.445e-05, + "loss": 2.4273, + "step": 1022 + }, + { + "epoch": 0.11420079329363376, + "grad_norm": 0.23805475234985352, + "learning_rate": 2.4425e-05, + "loss": 2.3511, + "step": 1023 + }, + { + "epoch": 0.1143124265226598, + "grad_norm": 0.23947173357009888, + "learning_rate": 2.44e-05, + "loss": 2.3554, + "step": 1024 + }, + { + "epoch": 0.11442405975168583, + "grad_norm": 0.23551489412784576, + "learning_rate": 2.4375e-05, + "loss": 2.3869, + "step": 1025 + }, + { + "epoch": 0.11453569298071187, + "grad_norm": 0.22201520204544067, + "learning_rate": 2.435e-05, + "loss": 2.351, + "step": 1026 + }, + { + "epoch": 0.1146473262097379, + "grad_norm": 0.23246638476848602, + "learning_rate": 2.4325000000000002e-05, + "loss": 2.3709, + "step": 1027 + }, + { + "epoch": 0.11475895943876394, + "grad_norm": 0.29901397228240967, + "learning_rate": 2.43e-05, + "loss": 2.33, + "step": 1028 + }, + { + "epoch": 0.11487059266778997, + "grad_norm": 0.2317001223564148, + "learning_rate": 2.4275e-05, + "loss": 2.4711, + "step": 1029 + }, + { + "epoch": 0.11498222589681602, + "grad_norm": 0.2264910489320755, + "learning_rate": 2.425e-05, + "loss": 2.4613, + "step": 1030 + }, + { + "epoch": 0.11509385912584205, + "grad_norm": 0.23516049981117249, + "learning_rate": 2.4225e-05, + "loss": 2.3831, + "step": 1031 + }, + { + "epoch": 0.11520549235486809, + "grad_norm": 0.23533384501934052, + "learning_rate": 2.4200000000000002e-05, + "loss": 2.4065, + "step": 1032 + }, + { + "epoch": 0.11531712558389412, + "grad_norm": 0.22421786189079285, + "learning_rate": 2.4175e-05, + "loss": 2.4121, + "step": 1033 + }, + { + "epoch": 0.11542875881292015, + "grad_norm": 0.23271812498569489, + "learning_rate": 2.415e-05, + "loss": 2.3377, + "step": 1034 + }, + { + "epoch": 0.11554039204194619, + "grad_norm": 0.22628925740718842, + "learning_rate": 2.4125e-05, + "loss": 2.3516, + "step": 1035 + }, + { + "epoch": 0.11565202527097222, + "grad_norm": 0.23225760459899902, + "learning_rate": 2.41e-05, + "loss": 2.4267, + "step": 1036 + }, + { + "epoch": 0.11576365849999826, + "grad_norm": 0.24704919755458832, + "learning_rate": 2.4075e-05, + "loss": 2.348, + "step": 1037 + }, + { + "epoch": 0.11587529172902429, + "grad_norm": 0.3677544593811035, + "learning_rate": 2.4050000000000002e-05, + "loss": 2.3355, + "step": 1038 + }, + { + "epoch": 0.11598692495805032, + "grad_norm": 0.2303479164838791, + "learning_rate": 2.4025e-05, + "loss": 2.3829, + "step": 1039 + }, + { + "epoch": 0.11609855818707636, + "grad_norm": 0.5053055882453918, + "learning_rate": 2.4e-05, + "loss": 2.2684, + "step": 1040 + }, + { + "epoch": 0.11621019141610239, + "grad_norm": 0.23576150834560394, + "learning_rate": 2.3975e-05, + "loss": 2.3628, + "step": 1041 + }, + { + "epoch": 0.11632182464512843, + "grad_norm": 0.22880171239376068, + "learning_rate": 2.395e-05, + "loss": 2.33, + "step": 1042 + }, + { + "epoch": 0.11643345787415446, + "grad_norm": 0.225894957780838, + "learning_rate": 2.3925e-05, + "loss": 2.2474, + "step": 1043 + }, + { + "epoch": 0.1165450911031805, + "grad_norm": 0.2277292162179947, + "learning_rate": 2.39e-05, + "loss": 2.32, + "step": 1044 + }, + { + "epoch": 0.11665672433220653, + "grad_norm": 0.22571003437042236, + "learning_rate": 2.3875e-05, + "loss": 2.3058, + "step": 1045 + }, + { + "epoch": 0.11676835756123256, + "grad_norm": 0.2490728795528412, + "learning_rate": 2.385e-05, + "loss": 2.38, + "step": 1046 + }, + { + "epoch": 0.11687999079025861, + "grad_norm": 0.23154285550117493, + "learning_rate": 2.3825e-05, + "loss": 2.3528, + "step": 1047 + }, + { + "epoch": 0.11699162401928465, + "grad_norm": 0.23180274665355682, + "learning_rate": 2.38e-05, + "loss": 2.349, + "step": 1048 + }, + { + "epoch": 0.11710325724831068, + "grad_norm": 0.2314680814743042, + "learning_rate": 2.3775e-05, + "loss": 2.2209, + "step": 1049 + }, + { + "epoch": 0.11721489047733671, + "grad_norm": 0.22533871233463287, + "learning_rate": 2.375e-05, + "loss": 2.2946, + "step": 1050 + }, + { + "epoch": 0.11732652370636275, + "grad_norm": 0.23475436866283417, + "learning_rate": 2.3725e-05, + "loss": 2.3839, + "step": 1051 + }, + { + "epoch": 0.11743815693538878, + "grad_norm": 0.22763217985630035, + "learning_rate": 2.37e-05, + "loss": 2.3039, + "step": 1052 + }, + { + "epoch": 0.11754979016441482, + "grad_norm": 0.22953568398952484, + "learning_rate": 2.3675e-05, + "loss": 2.4163, + "step": 1053 + }, + { + "epoch": 0.11766142339344085, + "grad_norm": 0.2377011775970459, + "learning_rate": 2.365e-05, + "loss": 2.3473, + "step": 1054 + }, + { + "epoch": 0.11777305662246688, + "grad_norm": 0.23042835295200348, + "learning_rate": 2.3624999999999998e-05, + "loss": 2.3909, + "step": 1055 + }, + { + "epoch": 0.11788468985149292, + "grad_norm": 0.23968364298343658, + "learning_rate": 2.36e-05, + "loss": 2.3023, + "step": 1056 + }, + { + "epoch": 0.11799632308051895, + "grad_norm": 0.23102299869060516, + "learning_rate": 2.3575e-05, + "loss": 2.3963, + "step": 1057 + }, + { + "epoch": 0.11810795630954499, + "grad_norm": 0.42533349990844727, + "learning_rate": 2.355e-05, + "loss": 2.396, + "step": 1058 + }, + { + "epoch": 0.11821958953857102, + "grad_norm": 0.2384020835161209, + "learning_rate": 2.3525e-05, + "loss": 2.4379, + "step": 1059 + }, + { + "epoch": 0.11833122276759706, + "grad_norm": 0.2910504639148712, + "learning_rate": 2.35e-05, + "loss": 2.3607, + "step": 1060 + }, + { + "epoch": 0.11844285599662309, + "grad_norm": 0.23686139285564423, + "learning_rate": 2.3475e-05, + "loss": 2.3374, + "step": 1061 + }, + { + "epoch": 0.11855448922564912, + "grad_norm": 0.27755534648895264, + "learning_rate": 2.345e-05, + "loss": 2.3719, + "step": 1062 + }, + { + "epoch": 0.11866612245467517, + "grad_norm": 0.23504361510276794, + "learning_rate": 2.3425000000000004e-05, + "loss": 2.4556, + "step": 1063 + }, + { + "epoch": 0.1187777556837012, + "grad_norm": 0.21698464453220367, + "learning_rate": 2.3400000000000003e-05, + "loss": 2.3708, + "step": 1064 + }, + { + "epoch": 0.11888938891272724, + "grad_norm": 0.2256624847650528, + "learning_rate": 2.3375000000000002e-05, + "loss": 2.2417, + "step": 1065 + }, + { + "epoch": 0.11900102214175327, + "grad_norm": 0.29147782921791077, + "learning_rate": 2.3350000000000002e-05, + "loss": 2.3808, + "step": 1066 + }, + { + "epoch": 0.11911265537077931, + "grad_norm": 0.3733128607273102, + "learning_rate": 2.3325e-05, + "loss": 2.3823, + "step": 1067 + }, + { + "epoch": 0.11922428859980534, + "grad_norm": 0.2350093573331833, + "learning_rate": 2.3300000000000004e-05, + "loss": 2.2919, + "step": 1068 + }, + { + "epoch": 0.11933592182883138, + "grad_norm": 0.23306138813495636, + "learning_rate": 2.3275000000000003e-05, + "loss": 2.3852, + "step": 1069 + }, + { + "epoch": 0.11944755505785741, + "grad_norm": 0.2352742701768875, + "learning_rate": 2.3250000000000003e-05, + "loss": 2.2804, + "step": 1070 + }, + { + "epoch": 0.11955918828688344, + "grad_norm": 0.2168835997581482, + "learning_rate": 2.3225000000000002e-05, + "loss": 2.3675, + "step": 1071 + }, + { + "epoch": 0.11967082151590948, + "grad_norm": 0.23424020409584045, + "learning_rate": 2.32e-05, + "loss": 2.4122, + "step": 1072 + }, + { + "epoch": 0.11978245474493551, + "grad_norm": 0.21874938905239105, + "learning_rate": 2.3175e-05, + "loss": 2.3381, + "step": 1073 + }, + { + "epoch": 0.11989408797396155, + "grad_norm": 0.2390536516904831, + "learning_rate": 2.3150000000000004e-05, + "loss": 2.2756, + "step": 1074 + }, + { + "epoch": 0.12000572120298758, + "grad_norm": 0.2291589379310608, + "learning_rate": 2.3125000000000003e-05, + "loss": 2.3482, + "step": 1075 + }, + { + "epoch": 0.12011735443201361, + "grad_norm": 0.2258395105600357, + "learning_rate": 2.3100000000000002e-05, + "loss": 2.3838, + "step": 1076 + }, + { + "epoch": 0.12022898766103965, + "grad_norm": 0.25954297184944153, + "learning_rate": 2.3075000000000002e-05, + "loss": 2.2275, + "step": 1077 + }, + { + "epoch": 0.12034062089006568, + "grad_norm": 0.23486246168613434, + "learning_rate": 2.305e-05, + "loss": 2.3866, + "step": 1078 + }, + { + "epoch": 0.12045225411909172, + "grad_norm": 0.22818194329738617, + "learning_rate": 2.3025e-05, + "loss": 2.3244, + "step": 1079 + }, + { + "epoch": 0.12056388734811777, + "grad_norm": 0.2317325472831726, + "learning_rate": 2.3000000000000003e-05, + "loss": 2.2603, + "step": 1080 + }, + { + "epoch": 0.1206755205771438, + "grad_norm": 0.24170775711536407, + "learning_rate": 2.2975000000000003e-05, + "loss": 2.3409, + "step": 1081 + }, + { + "epoch": 0.12078715380616983, + "grad_norm": 0.23717226088047028, + "learning_rate": 2.2950000000000002e-05, + "loss": 2.2956, + "step": 1082 + }, + { + "epoch": 0.12089878703519587, + "grad_norm": 0.21983082592487335, + "learning_rate": 2.2925e-05, + "loss": 2.3578, + "step": 1083 + }, + { + "epoch": 0.1210104202642219, + "grad_norm": 0.23536662757396698, + "learning_rate": 2.29e-05, + "loss": 2.3317, + "step": 1084 + }, + { + "epoch": 0.12112205349324794, + "grad_norm": 0.2405475676059723, + "learning_rate": 2.2875e-05, + "loss": 2.3663, + "step": 1085 + }, + { + "epoch": 0.12123368672227397, + "grad_norm": 0.23267629742622375, + "learning_rate": 2.2850000000000003e-05, + "loss": 2.3376, + "step": 1086 + }, + { + "epoch": 0.1213453199513, + "grad_norm": 0.22715060412883759, + "learning_rate": 2.2825000000000003e-05, + "loss": 2.5256, + "step": 1087 + }, + { + "epoch": 0.12145695318032604, + "grad_norm": 0.23004890978336334, + "learning_rate": 2.2800000000000002e-05, + "loss": 2.3706, + "step": 1088 + }, + { + "epoch": 0.12156858640935207, + "grad_norm": 0.22569864988327026, + "learning_rate": 2.2775e-05, + "loss": 2.29, + "step": 1089 + }, + { + "epoch": 0.1216802196383781, + "grad_norm": 0.22686640918254852, + "learning_rate": 2.275e-05, + "loss": 2.3978, + "step": 1090 + }, + { + "epoch": 0.12179185286740414, + "grad_norm": 0.26110976934432983, + "learning_rate": 2.2725000000000003e-05, + "loss": 2.4019, + "step": 1091 + }, + { + "epoch": 0.12190348609643017, + "grad_norm": 0.22089050710201263, + "learning_rate": 2.2700000000000003e-05, + "loss": 2.3078, + "step": 1092 + }, + { + "epoch": 0.12201511932545621, + "grad_norm": 0.22513218224048615, + "learning_rate": 2.2675000000000002e-05, + "loss": 2.3088, + "step": 1093 + }, + { + "epoch": 0.12212675255448224, + "grad_norm": 0.2333805114030838, + "learning_rate": 2.265e-05, + "loss": 2.2955, + "step": 1094 + }, + { + "epoch": 0.12223838578350828, + "grad_norm": 0.22828614711761475, + "learning_rate": 2.2625e-05, + "loss": 2.2746, + "step": 1095 + }, + { + "epoch": 0.12235001901253433, + "grad_norm": 0.23725035786628723, + "learning_rate": 2.26e-05, + "loss": 2.5234, + "step": 1096 + }, + { + "epoch": 0.12246165224156036, + "grad_norm": 0.24143311381340027, + "learning_rate": 2.2575000000000003e-05, + "loss": 2.2837, + "step": 1097 + }, + { + "epoch": 0.1225732854705864, + "grad_norm": 0.22875793278217316, + "learning_rate": 2.2550000000000003e-05, + "loss": 2.2778, + "step": 1098 + }, + { + "epoch": 0.12268491869961243, + "grad_norm": 0.2337283492088318, + "learning_rate": 2.2525000000000002e-05, + "loss": 2.2373, + "step": 1099 + }, + { + "epoch": 0.12279655192863846, + "grad_norm": 0.22070764005184174, + "learning_rate": 2.25e-05, + "loss": 2.2863, + "step": 1100 + }, + { + "epoch": 0.1229081851576645, + "grad_norm": 0.23479975759983063, + "learning_rate": 2.2475e-05, + "loss": 2.3971, + "step": 1101 + }, + { + "epoch": 0.12301981838669053, + "grad_norm": 0.22529123723506927, + "learning_rate": 2.245e-05, + "loss": 2.4699, + "step": 1102 + }, + { + "epoch": 0.12313145161571656, + "grad_norm": 0.24018734693527222, + "learning_rate": 2.2425000000000003e-05, + "loss": 2.4539, + "step": 1103 + }, + { + "epoch": 0.1232430848447426, + "grad_norm": 0.22801434993743896, + "learning_rate": 2.2400000000000002e-05, + "loss": 2.3764, + "step": 1104 + }, + { + "epoch": 0.12335471807376863, + "grad_norm": 0.32481849193573, + "learning_rate": 2.2375000000000002e-05, + "loss": 2.4011, + "step": 1105 + }, + { + "epoch": 0.12346635130279467, + "grad_norm": 0.22494614124298096, + "learning_rate": 2.235e-05, + "loss": 2.2996, + "step": 1106 + }, + { + "epoch": 0.1235779845318207, + "grad_norm": 0.2304266095161438, + "learning_rate": 2.2325e-05, + "loss": 2.2708, + "step": 1107 + }, + { + "epoch": 0.12368961776084673, + "grad_norm": 0.22332698106765747, + "learning_rate": 2.23e-05, + "loss": 2.2825, + "step": 1108 + }, + { + "epoch": 0.12380125098987277, + "grad_norm": 0.2219666987657547, + "learning_rate": 2.2275000000000003e-05, + "loss": 2.3805, + "step": 1109 + }, + { + "epoch": 0.1239128842188988, + "grad_norm": 0.2652471363544464, + "learning_rate": 2.2250000000000002e-05, + "loss": 2.2467, + "step": 1110 + }, + { + "epoch": 0.12402451744792484, + "grad_norm": 0.2186206877231598, + "learning_rate": 2.2225e-05, + "loss": 2.2924, + "step": 1111 + }, + { + "epoch": 0.12413615067695087, + "grad_norm": 0.23667913675308228, + "learning_rate": 2.22e-05, + "loss": 2.3545, + "step": 1112 + }, + { + "epoch": 0.12424778390597692, + "grad_norm": 0.3729536831378937, + "learning_rate": 2.2175e-05, + "loss": 2.2641, + "step": 1113 + }, + { + "epoch": 0.12435941713500295, + "grad_norm": 0.22672784328460693, + "learning_rate": 2.215e-05, + "loss": 2.2501, + "step": 1114 + }, + { + "epoch": 0.12447105036402899, + "grad_norm": 0.2219839245080948, + "learning_rate": 2.2125000000000002e-05, + "loss": 2.2016, + "step": 1115 + }, + { + "epoch": 0.12458268359305502, + "grad_norm": 0.24350149929523468, + "learning_rate": 2.2100000000000002e-05, + "loss": 2.4358, + "step": 1116 + }, + { + "epoch": 0.12469431682208106, + "grad_norm": 0.2482176274061203, + "learning_rate": 2.2075e-05, + "loss": 2.2623, + "step": 1117 + }, + { + "epoch": 0.12480595005110709, + "grad_norm": 0.23272433876991272, + "learning_rate": 2.205e-05, + "loss": 2.3408, + "step": 1118 + }, + { + "epoch": 0.12491758328013312, + "grad_norm": 0.23357626795768738, + "learning_rate": 2.2025e-05, + "loss": 2.231, + "step": 1119 + }, + { + "epoch": 0.12502921650915916, + "grad_norm": 0.22280560433864594, + "learning_rate": 2.2000000000000003e-05, + "loss": 2.3045, + "step": 1120 + }, + { + "epoch": 0.1251408497381852, + "grad_norm": 0.22206202149391174, + "learning_rate": 2.1975000000000002e-05, + "loss": 2.3103, + "step": 1121 + }, + { + "epoch": 0.12525248296721123, + "grad_norm": 0.31398919224739075, + "learning_rate": 2.195e-05, + "loss": 2.3665, + "step": 1122 + }, + { + "epoch": 0.12536411619623727, + "grad_norm": 0.43255481123924255, + "learning_rate": 2.1925e-05, + "loss": 2.3153, + "step": 1123 + }, + { + "epoch": 0.1254757494252633, + "grad_norm": 0.22663763165473938, + "learning_rate": 2.19e-05, + "loss": 2.3682, + "step": 1124 + }, + { + "epoch": 0.12558738265428934, + "grad_norm": 0.2514352798461914, + "learning_rate": 2.1875e-05, + "loss": 2.3727, + "step": 1125 + }, + { + "epoch": 0.12569901588331536, + "grad_norm": 0.22935128211975098, + "learning_rate": 2.1850000000000003e-05, + "loss": 2.2161, + "step": 1126 + }, + { + "epoch": 0.1258106491123414, + "grad_norm": 0.2275882214307785, + "learning_rate": 2.1825000000000002e-05, + "loss": 2.3793, + "step": 1127 + }, + { + "epoch": 0.12592228234136743, + "grad_norm": 0.2203671634197235, + "learning_rate": 2.18e-05, + "loss": 2.3371, + "step": 1128 + }, + { + "epoch": 0.12603391557039348, + "grad_norm": 0.22310760617256165, + "learning_rate": 2.1775e-05, + "loss": 2.2956, + "step": 1129 + }, + { + "epoch": 0.1261455487994195, + "grad_norm": 0.24273249506950378, + "learning_rate": 2.175e-05, + "loss": 2.3288, + "step": 1130 + }, + { + "epoch": 0.12625718202844555, + "grad_norm": 0.22426392138004303, + "learning_rate": 2.1725e-05, + "loss": 2.387, + "step": 1131 + }, + { + "epoch": 0.12636881525747157, + "grad_norm": 0.2271021008491516, + "learning_rate": 2.1700000000000002e-05, + "loss": 2.3739, + "step": 1132 + }, + { + "epoch": 0.12648044848649762, + "grad_norm": 0.2271476835012436, + "learning_rate": 2.1675e-05, + "loss": 2.3942, + "step": 1133 + }, + { + "epoch": 0.12659208171552364, + "grad_norm": 0.2212740033864975, + "learning_rate": 2.165e-05, + "loss": 2.3511, + "step": 1134 + }, + { + "epoch": 0.12670371494454968, + "grad_norm": 0.23425845801830292, + "learning_rate": 2.1625e-05, + "loss": 2.3256, + "step": 1135 + }, + { + "epoch": 0.1268153481735757, + "grad_norm": 0.22545567154884338, + "learning_rate": 2.16e-05, + "loss": 2.1847, + "step": 1136 + }, + { + "epoch": 0.12692698140260175, + "grad_norm": 0.25626057386398315, + "learning_rate": 2.1575e-05, + "loss": 2.3058, + "step": 1137 + }, + { + "epoch": 0.1270386146316278, + "grad_norm": 0.23703870177268982, + "learning_rate": 2.1550000000000002e-05, + "loss": 2.3709, + "step": 1138 + }, + { + "epoch": 0.12715024786065382, + "grad_norm": 0.2455843836069107, + "learning_rate": 2.1525e-05, + "loss": 2.3468, + "step": 1139 + }, + { + "epoch": 0.12726188108967987, + "grad_norm": 0.22671882808208466, + "learning_rate": 2.15e-05, + "loss": 2.3554, + "step": 1140 + }, + { + "epoch": 0.1273735143187059, + "grad_norm": 0.22661983966827393, + "learning_rate": 2.1475e-05, + "loss": 2.2919, + "step": 1141 + }, + { + "epoch": 0.12748514754773194, + "grad_norm": 0.23259443044662476, + "learning_rate": 2.145e-05, + "loss": 2.3463, + "step": 1142 + }, + { + "epoch": 0.12759678077675796, + "grad_norm": 0.23297019302845, + "learning_rate": 2.1425e-05, + "loss": 2.3198, + "step": 1143 + }, + { + "epoch": 0.127708414005784, + "grad_norm": 0.22478660941123962, + "learning_rate": 2.1400000000000002e-05, + "loss": 2.2642, + "step": 1144 + }, + { + "epoch": 0.12782004723481002, + "grad_norm": 0.23042532801628113, + "learning_rate": 2.1375e-05, + "loss": 2.3056, + "step": 1145 + }, + { + "epoch": 0.12793168046383607, + "grad_norm": 0.2341778576374054, + "learning_rate": 2.135e-05, + "loss": 2.3774, + "step": 1146 + }, + { + "epoch": 0.1280433136928621, + "grad_norm": 0.22689583897590637, + "learning_rate": 2.1325e-05, + "loss": 2.3493, + "step": 1147 + }, + { + "epoch": 0.12815494692188814, + "grad_norm": 0.2260814756155014, + "learning_rate": 2.13e-05, + "loss": 2.3548, + "step": 1148 + }, + { + "epoch": 0.12826658015091416, + "grad_norm": 0.24112841486930847, + "learning_rate": 2.1275000000000002e-05, + "loss": 2.3739, + "step": 1149 + }, + { + "epoch": 0.1283782133799402, + "grad_norm": 0.22553539276123047, + "learning_rate": 2.125e-05, + "loss": 2.3762, + "step": 1150 + }, + { + "epoch": 0.12848984660896623, + "grad_norm": 0.2698231041431427, + "learning_rate": 2.1225e-05, + "loss": 2.3527, + "step": 1151 + }, + { + "epoch": 0.12860147983799228, + "grad_norm": 0.2280593365430832, + "learning_rate": 2.12e-05, + "loss": 2.3621, + "step": 1152 + }, + { + "epoch": 0.1287131130670183, + "grad_norm": 0.23123127222061157, + "learning_rate": 2.1175e-05, + "loss": 2.4369, + "step": 1153 + }, + { + "epoch": 0.12882474629604435, + "grad_norm": 0.2183208465576172, + "learning_rate": 2.115e-05, + "loss": 2.3615, + "step": 1154 + }, + { + "epoch": 0.1289363795250704, + "grad_norm": 0.3457687199115753, + "learning_rate": 2.1125000000000002e-05, + "loss": 2.3151, + "step": 1155 + }, + { + "epoch": 0.12904801275409641, + "grad_norm": 0.24221017956733704, + "learning_rate": 2.11e-05, + "loss": 2.383, + "step": 1156 + }, + { + "epoch": 0.12915964598312246, + "grad_norm": 0.31406766176223755, + "learning_rate": 2.1075e-05, + "loss": 2.3564, + "step": 1157 + }, + { + "epoch": 0.12927127921214848, + "grad_norm": 0.2336696982383728, + "learning_rate": 2.105e-05, + "loss": 2.3295, + "step": 1158 + }, + { + "epoch": 0.12938291244117453, + "grad_norm": 0.2292354553937912, + "learning_rate": 2.1025e-05, + "loss": 2.3124, + "step": 1159 + }, + { + "epoch": 0.12949454567020055, + "grad_norm": 0.22350043058395386, + "learning_rate": 2.1e-05, + "loss": 2.3077, + "step": 1160 + }, + { + "epoch": 0.1296061788992266, + "grad_norm": 0.22331391274929047, + "learning_rate": 2.0975e-05, + "loss": 2.2535, + "step": 1161 + }, + { + "epoch": 0.12971781212825262, + "grad_norm": 0.23815131187438965, + "learning_rate": 2.095e-05, + "loss": 2.4475, + "step": 1162 + }, + { + "epoch": 0.12982944535727867, + "grad_norm": 0.29236698150634766, + "learning_rate": 2.0925e-05, + "loss": 2.312, + "step": 1163 + }, + { + "epoch": 0.1299410785863047, + "grad_norm": 0.23702973127365112, + "learning_rate": 2.09e-05, + "loss": 2.4403, + "step": 1164 + }, + { + "epoch": 0.13005271181533073, + "grad_norm": 0.23101966083049774, + "learning_rate": 2.0875e-05, + "loss": 2.4587, + "step": 1165 + }, + { + "epoch": 0.13016434504435676, + "grad_norm": 0.2322998195886612, + "learning_rate": 2.085e-05, + "loss": 2.421, + "step": 1166 + }, + { + "epoch": 0.1302759782733828, + "grad_norm": 0.236577570438385, + "learning_rate": 2.0825e-05, + "loss": 2.3525, + "step": 1167 + }, + { + "epoch": 0.13038761150240882, + "grad_norm": 0.2353632152080536, + "learning_rate": 2.08e-05, + "loss": 2.4235, + "step": 1168 + }, + { + "epoch": 0.13049924473143487, + "grad_norm": 0.23410721123218536, + "learning_rate": 2.0775e-05, + "loss": 2.3678, + "step": 1169 + }, + { + "epoch": 0.13061087796046092, + "grad_norm": 0.22162692248821259, + "learning_rate": 2.075e-05, + "loss": 2.4682, + "step": 1170 + }, + { + "epoch": 0.13072251118948694, + "grad_norm": 0.2268594205379486, + "learning_rate": 2.0725e-05, + "loss": 2.3006, + "step": 1171 + }, + { + "epoch": 0.130834144418513, + "grad_norm": 0.22843024134635925, + "learning_rate": 2.07e-05, + "loss": 2.3072, + "step": 1172 + }, + { + "epoch": 0.130945777647539, + "grad_norm": 0.2605232298374176, + "learning_rate": 2.0675e-05, + "loss": 2.3408, + "step": 1173 + }, + { + "epoch": 0.13105741087656506, + "grad_norm": 0.2354726940393448, + "learning_rate": 2.065e-05, + "loss": 2.3971, + "step": 1174 + }, + { + "epoch": 0.13116904410559108, + "grad_norm": 0.225637286901474, + "learning_rate": 2.0625e-05, + "loss": 2.314, + "step": 1175 + }, + { + "epoch": 0.13128067733461712, + "grad_norm": 0.22693420946598053, + "learning_rate": 2.06e-05, + "loss": 2.4122, + "step": 1176 + }, + { + "epoch": 0.13139231056364314, + "grad_norm": 0.22487470507621765, + "learning_rate": 2.0575e-05, + "loss": 2.2592, + "step": 1177 + }, + { + "epoch": 0.1315039437926692, + "grad_norm": 0.2332613468170166, + "learning_rate": 2.055e-05, + "loss": 2.3281, + "step": 1178 + }, + { + "epoch": 0.1316155770216952, + "grad_norm": 0.3483419716358185, + "learning_rate": 2.0525e-05, + "loss": 2.2564, + "step": 1179 + }, + { + "epoch": 0.13172721025072126, + "grad_norm": 0.23920704424381256, + "learning_rate": 2.05e-05, + "loss": 2.3591, + "step": 1180 + }, + { + "epoch": 0.13183884347974728, + "grad_norm": 0.2232007533311844, + "learning_rate": 2.0475e-05, + "loss": 2.4216, + "step": 1181 + }, + { + "epoch": 0.13195047670877333, + "grad_norm": 0.23171833157539368, + "learning_rate": 2.045e-05, + "loss": 2.2861, + "step": 1182 + }, + { + "epoch": 0.13206210993779935, + "grad_norm": 0.22283758223056793, + "learning_rate": 2.0425e-05, + "loss": 2.274, + "step": 1183 + }, + { + "epoch": 0.1321737431668254, + "grad_norm": 0.21574443578720093, + "learning_rate": 2.04e-05, + "loss": 2.4139, + "step": 1184 + }, + { + "epoch": 0.13228537639585142, + "grad_norm": 0.23107174038887024, + "learning_rate": 2.0375e-05, + "loss": 2.2474, + "step": 1185 + }, + { + "epoch": 0.13239700962487747, + "grad_norm": 0.2264859676361084, + "learning_rate": 2.035e-05, + "loss": 2.3655, + "step": 1186 + }, + { + "epoch": 0.1325086428539035, + "grad_norm": 0.23183952271938324, + "learning_rate": 2.0325e-05, + "loss": 2.4179, + "step": 1187 + }, + { + "epoch": 0.13262027608292953, + "grad_norm": 0.2878219187259674, + "learning_rate": 2.0300000000000002e-05, + "loss": 2.3113, + "step": 1188 + }, + { + "epoch": 0.13273190931195558, + "grad_norm": 0.2385031133890152, + "learning_rate": 2.0275e-05, + "loss": 2.3088, + "step": 1189 + }, + { + "epoch": 0.1328435425409816, + "grad_norm": 0.2221747487783432, + "learning_rate": 2.025e-05, + "loss": 2.3643, + "step": 1190 + }, + { + "epoch": 0.13295517577000765, + "grad_norm": 0.23417770862579346, + "learning_rate": 2.0225000000000004e-05, + "loss": 2.293, + "step": 1191 + }, + { + "epoch": 0.13306680899903367, + "grad_norm": 0.22443942725658417, + "learning_rate": 2.0200000000000003e-05, + "loss": 2.1929, + "step": 1192 + }, + { + "epoch": 0.13317844222805972, + "grad_norm": 0.22454610466957092, + "learning_rate": 2.0175000000000003e-05, + "loss": 2.2178, + "step": 1193 + }, + { + "epoch": 0.13329007545708574, + "grad_norm": 0.2309190183877945, + "learning_rate": 2.0150000000000002e-05, + "loss": 2.3352, + "step": 1194 + }, + { + "epoch": 0.1334017086861118, + "grad_norm": 0.23848609626293182, + "learning_rate": 2.0125e-05, + "loss": 2.3348, + "step": 1195 + }, + { + "epoch": 0.1335133419151378, + "grad_norm": 0.23410527408123016, + "learning_rate": 2.01e-05, + "loss": 2.1928, + "step": 1196 + }, + { + "epoch": 0.13362497514416385, + "grad_norm": 0.24377335608005524, + "learning_rate": 2.0075000000000003e-05, + "loss": 2.2982, + "step": 1197 + }, + { + "epoch": 0.13373660837318987, + "grad_norm": 0.22369886934757233, + "learning_rate": 2.0050000000000003e-05, + "loss": 2.2788, + "step": 1198 + }, + { + "epoch": 0.13384824160221592, + "grad_norm": 0.22385212779045105, + "learning_rate": 2.0025000000000002e-05, + "loss": 2.3247, + "step": 1199 + }, + { + "epoch": 0.13395987483124194, + "grad_norm": 0.22769370675086975, + "learning_rate": 2e-05, + "loss": 2.3075, + "step": 1200 + }, + { + "epoch": 0.134071508060268, + "grad_norm": 0.2851308286190033, + "learning_rate": 1.9975e-05, + "loss": 2.317, + "step": 1201 + }, + { + "epoch": 0.134183141289294, + "grad_norm": 0.24706991016864777, + "learning_rate": 1.995e-05, + "loss": 2.2672, + "step": 1202 + }, + { + "epoch": 0.13429477451832006, + "grad_norm": 0.21826080977916718, + "learning_rate": 1.9925000000000003e-05, + "loss": 2.3373, + "step": 1203 + }, + { + "epoch": 0.1344064077473461, + "grad_norm": 0.23195642232894897, + "learning_rate": 1.9900000000000003e-05, + "loss": 2.271, + "step": 1204 + }, + { + "epoch": 0.13451804097637213, + "grad_norm": 0.24489794671535492, + "learning_rate": 1.9875000000000002e-05, + "loss": 2.3, + "step": 1205 + }, + { + "epoch": 0.13462967420539818, + "grad_norm": 0.23644982278347015, + "learning_rate": 1.985e-05, + "loss": 2.2082, + "step": 1206 + }, + { + "epoch": 0.1347413074344242, + "grad_norm": 0.22759714722633362, + "learning_rate": 1.9825e-05, + "loss": 2.3624, + "step": 1207 + }, + { + "epoch": 0.13485294066345024, + "grad_norm": 0.23812660574913025, + "learning_rate": 1.9800000000000004e-05, + "loss": 2.3198, + "step": 1208 + }, + { + "epoch": 0.13496457389247626, + "grad_norm": 0.22150012850761414, + "learning_rate": 1.9775000000000003e-05, + "loss": 2.2949, + "step": 1209 + }, + { + "epoch": 0.1350762071215023, + "grad_norm": 0.23681719601154327, + "learning_rate": 1.9750000000000002e-05, + "loss": 2.4068, + "step": 1210 + }, + { + "epoch": 0.13518784035052833, + "grad_norm": 0.22480838000774384, + "learning_rate": 1.9725000000000002e-05, + "loss": 2.4212, + "step": 1211 + }, + { + "epoch": 0.13529947357955438, + "grad_norm": 0.2516225576400757, + "learning_rate": 1.97e-05, + "loss": 2.3919, + "step": 1212 + }, + { + "epoch": 0.1354111068085804, + "grad_norm": 0.22985118627548218, + "learning_rate": 1.9675e-05, + "loss": 2.3349, + "step": 1213 + }, + { + "epoch": 0.13552274003760645, + "grad_norm": 0.24210740625858307, + "learning_rate": 1.9650000000000003e-05, + "loss": 2.3892, + "step": 1214 + }, + { + "epoch": 0.13563437326663247, + "grad_norm": 0.23913182318210602, + "learning_rate": 1.9625000000000003e-05, + "loss": 2.3997, + "step": 1215 + }, + { + "epoch": 0.13574600649565852, + "grad_norm": 0.2336951494216919, + "learning_rate": 1.9600000000000002e-05, + "loss": 2.2605, + "step": 1216 + }, + { + "epoch": 0.13585763972468454, + "grad_norm": 0.24042119085788727, + "learning_rate": 1.9575e-05, + "loss": 2.28, + "step": 1217 + }, + { + "epoch": 0.13596927295371058, + "grad_norm": 0.24132166802883148, + "learning_rate": 1.955e-05, + "loss": 2.3416, + "step": 1218 + }, + { + "epoch": 0.1360809061827366, + "grad_norm": 0.22675567865371704, + "learning_rate": 1.9525e-05, + "loss": 2.4432, + "step": 1219 + }, + { + "epoch": 0.13619253941176265, + "grad_norm": 0.2387012392282486, + "learning_rate": 1.9500000000000003e-05, + "loss": 2.3219, + "step": 1220 + }, + { + "epoch": 0.1363041726407887, + "grad_norm": 0.22851043939590454, + "learning_rate": 1.9475000000000002e-05, + "loss": 2.3047, + "step": 1221 + }, + { + "epoch": 0.13641580586981472, + "grad_norm": 0.229843869805336, + "learning_rate": 1.9450000000000002e-05, + "loss": 2.3684, + "step": 1222 + }, + { + "epoch": 0.13652743909884077, + "grad_norm": 0.22853203117847443, + "learning_rate": 1.9425e-05, + "loss": 2.3583, + "step": 1223 + }, + { + "epoch": 0.1366390723278668, + "grad_norm": 0.22501179575920105, + "learning_rate": 1.94e-05, + "loss": 2.3816, + "step": 1224 + }, + { + "epoch": 0.13675070555689284, + "grad_norm": 0.2398713231086731, + "learning_rate": 1.9375e-05, + "loss": 2.2891, + "step": 1225 + }, + { + "epoch": 0.13686233878591886, + "grad_norm": 0.23787732422351837, + "learning_rate": 1.9350000000000003e-05, + "loss": 2.3867, + "step": 1226 + }, + { + "epoch": 0.1369739720149449, + "grad_norm": 0.2299504280090332, + "learning_rate": 1.9325000000000002e-05, + "loss": 2.358, + "step": 1227 + }, + { + "epoch": 0.13708560524397093, + "grad_norm": 0.23067769408226013, + "learning_rate": 1.93e-05, + "loss": 2.2818, + "step": 1228 + }, + { + "epoch": 0.13719723847299697, + "grad_norm": 0.22578711807727814, + "learning_rate": 1.9275e-05, + "loss": 2.3077, + "step": 1229 + }, + { + "epoch": 0.137308871702023, + "grad_norm": 0.2431231141090393, + "learning_rate": 1.925e-05, + "loss": 2.448, + "step": 1230 + }, + { + "epoch": 0.13742050493104904, + "grad_norm": 0.22373032569885254, + "learning_rate": 1.9225e-05, + "loss": 2.3161, + "step": 1231 + }, + { + "epoch": 0.13753213816007506, + "grad_norm": 0.22712518274784088, + "learning_rate": 1.9200000000000003e-05, + "loss": 2.327, + "step": 1232 + }, + { + "epoch": 0.1376437713891011, + "grad_norm": 0.23568111658096313, + "learning_rate": 1.9175000000000002e-05, + "loss": 2.2981, + "step": 1233 + }, + { + "epoch": 0.13775540461812713, + "grad_norm": 0.4218859076499939, + "learning_rate": 1.915e-05, + "loss": 2.3714, + "step": 1234 + }, + { + "epoch": 0.13786703784715318, + "grad_norm": 0.25230151414871216, + "learning_rate": 1.9125e-05, + "loss": 2.3094, + "step": 1235 + }, + { + "epoch": 0.13797867107617923, + "grad_norm": 0.2286907434463501, + "learning_rate": 1.91e-05, + "loss": 2.3385, + "step": 1236 + }, + { + "epoch": 0.13809030430520525, + "grad_norm": 0.22605471312999725, + "learning_rate": 1.9075000000000003e-05, + "loss": 2.3362, + "step": 1237 + }, + { + "epoch": 0.1382019375342313, + "grad_norm": 0.34180840849876404, + "learning_rate": 1.9050000000000002e-05, + "loss": 2.409, + "step": 1238 + }, + { + "epoch": 0.13831357076325732, + "grad_norm": 0.23643071949481964, + "learning_rate": 1.9025e-05, + "loss": 2.3804, + "step": 1239 + }, + { + "epoch": 0.13842520399228336, + "grad_norm": 0.23849943280220032, + "learning_rate": 1.9e-05, + "loss": 2.2734, + "step": 1240 + }, + { + "epoch": 0.13853683722130938, + "grad_norm": 0.22949104011058807, + "learning_rate": 1.8975e-05, + "loss": 2.3875, + "step": 1241 + }, + { + "epoch": 0.13864847045033543, + "grad_norm": 0.22790773212909698, + "learning_rate": 1.895e-05, + "loss": 2.2737, + "step": 1242 + }, + { + "epoch": 0.13876010367936145, + "grad_norm": 0.23416166007518768, + "learning_rate": 1.8925000000000003e-05, + "loss": 2.2871, + "step": 1243 + }, + { + "epoch": 0.1388717369083875, + "grad_norm": 0.22195836901664734, + "learning_rate": 1.8900000000000002e-05, + "loss": 2.281, + "step": 1244 + }, + { + "epoch": 0.13898337013741352, + "grad_norm": 0.2336248904466629, + "learning_rate": 1.8875e-05, + "loss": 2.416, + "step": 1245 + }, + { + "epoch": 0.13909500336643957, + "grad_norm": 0.23051492869853973, + "learning_rate": 1.885e-05, + "loss": 2.2719, + "step": 1246 + }, + { + "epoch": 0.1392066365954656, + "grad_norm": 0.2252211719751358, + "learning_rate": 1.8825e-05, + "loss": 2.4248, + "step": 1247 + }, + { + "epoch": 0.13931826982449164, + "grad_norm": 0.21714623272418976, + "learning_rate": 1.88e-05, + "loss": 2.3514, + "step": 1248 + }, + { + "epoch": 0.13942990305351766, + "grad_norm": 0.23050609230995178, + "learning_rate": 1.8775000000000002e-05, + "loss": 2.3583, + "step": 1249 + }, + { + "epoch": 0.1395415362825437, + "grad_norm": 0.2242250144481659, + "learning_rate": 1.8750000000000002e-05, + "loss": 2.2994, + "step": 1250 + }, + { + "epoch": 0.13965316951156972, + "grad_norm": 0.23751282691955566, + "learning_rate": 1.8725e-05, + "loss": 2.2898, + "step": 1251 + }, + { + "epoch": 0.13976480274059577, + "grad_norm": 0.240337535738945, + "learning_rate": 1.87e-05, + "loss": 2.3338, + "step": 1252 + }, + { + "epoch": 0.13987643596962182, + "grad_norm": 0.2228308618068695, + "learning_rate": 1.8675e-05, + "loss": 2.3378, + "step": 1253 + }, + { + "epoch": 0.13998806919864784, + "grad_norm": 0.22002586722373962, + "learning_rate": 1.865e-05, + "loss": 2.3768, + "step": 1254 + }, + { + "epoch": 0.1400997024276739, + "grad_norm": 0.22754792869091034, + "learning_rate": 1.8625000000000002e-05, + "loss": 2.3875, + "step": 1255 + }, + { + "epoch": 0.1402113356566999, + "grad_norm": 0.22698433697223663, + "learning_rate": 1.86e-05, + "loss": 2.3624, + "step": 1256 + }, + { + "epoch": 0.14032296888572596, + "grad_norm": 0.2388381063938141, + "learning_rate": 1.8575e-05, + "loss": 2.3388, + "step": 1257 + }, + { + "epoch": 0.14043460211475198, + "grad_norm": 0.2522680461406708, + "learning_rate": 1.855e-05, + "loss": 2.3738, + "step": 1258 + }, + { + "epoch": 0.14054623534377803, + "grad_norm": 0.22983700037002563, + "learning_rate": 1.8525e-05, + "loss": 2.4546, + "step": 1259 + }, + { + "epoch": 0.14065786857280405, + "grad_norm": 0.23299263417720795, + "learning_rate": 1.85e-05, + "loss": 2.3668, + "step": 1260 + }, + { + "epoch": 0.1407695018018301, + "grad_norm": 0.29829832911491394, + "learning_rate": 1.8475000000000002e-05, + "loss": 2.3765, + "step": 1261 + }, + { + "epoch": 0.14088113503085611, + "grad_norm": 0.2792015075683594, + "learning_rate": 1.845e-05, + "loss": 2.2744, + "step": 1262 + }, + { + "epoch": 0.14099276825988216, + "grad_norm": 0.2371360808610916, + "learning_rate": 1.8425e-05, + "loss": 2.3681, + "step": 1263 + }, + { + "epoch": 0.14110440148890818, + "grad_norm": 0.22529280185699463, + "learning_rate": 1.84e-05, + "loss": 2.2995, + "step": 1264 + }, + { + "epoch": 0.14121603471793423, + "grad_norm": 0.23717930912971497, + "learning_rate": 1.8375e-05, + "loss": 2.3641, + "step": 1265 + }, + { + "epoch": 0.14132766794696025, + "grad_norm": 0.23806947469711304, + "learning_rate": 1.8350000000000002e-05, + "loss": 2.2403, + "step": 1266 + }, + { + "epoch": 0.1414393011759863, + "grad_norm": 0.2357507348060608, + "learning_rate": 1.8325e-05, + "loss": 2.3355, + "step": 1267 + }, + { + "epoch": 0.14155093440501232, + "grad_norm": 0.2314206063747406, + "learning_rate": 1.83e-05, + "loss": 2.3525, + "step": 1268 + }, + { + "epoch": 0.14166256763403837, + "grad_norm": 0.23410086333751678, + "learning_rate": 1.8275e-05, + "loss": 2.3385, + "step": 1269 + }, + { + "epoch": 0.14177420086306441, + "grad_norm": 0.23491892218589783, + "learning_rate": 1.825e-05, + "loss": 2.443, + "step": 1270 + }, + { + "epoch": 0.14188583409209043, + "grad_norm": 0.224997416138649, + "learning_rate": 1.8225e-05, + "loss": 2.3451, + "step": 1271 + }, + { + "epoch": 0.14199746732111648, + "grad_norm": 0.2989085614681244, + "learning_rate": 1.8200000000000002e-05, + "loss": 2.3746, + "step": 1272 + }, + { + "epoch": 0.1421091005501425, + "grad_norm": 0.2321501523256302, + "learning_rate": 1.8175e-05, + "loss": 2.1922, + "step": 1273 + }, + { + "epoch": 0.14222073377916855, + "grad_norm": 0.2198849767446518, + "learning_rate": 1.815e-05, + "loss": 2.2675, + "step": 1274 + }, + { + "epoch": 0.14233236700819457, + "grad_norm": 0.22828496992588043, + "learning_rate": 1.8125e-05, + "loss": 2.3126, + "step": 1275 + }, + { + "epoch": 0.14244400023722062, + "grad_norm": 0.23503446578979492, + "learning_rate": 1.81e-05, + "loss": 2.325, + "step": 1276 + }, + { + "epoch": 0.14255563346624664, + "grad_norm": 0.22161200642585754, + "learning_rate": 1.8075e-05, + "loss": 2.4076, + "step": 1277 + }, + { + "epoch": 0.1426672666952727, + "grad_norm": 0.22467480599880219, + "learning_rate": 1.805e-05, + "loss": 2.4028, + "step": 1278 + }, + { + "epoch": 0.1427788999242987, + "grad_norm": 0.23092903196811676, + "learning_rate": 1.8025e-05, + "loss": 2.3841, + "step": 1279 + }, + { + "epoch": 0.14289053315332476, + "grad_norm": 0.23917998373508453, + "learning_rate": 1.8e-05, + "loss": 2.3114, + "step": 1280 + }, + { + "epoch": 0.14300216638235078, + "grad_norm": 0.24001942574977875, + "learning_rate": 1.7975e-05, + "loss": 2.2419, + "step": 1281 + }, + { + "epoch": 0.14311379961137682, + "grad_norm": 0.22311803698539734, + "learning_rate": 1.795e-05, + "loss": 2.35, + "step": 1282 + }, + { + "epoch": 0.14322543284040284, + "grad_norm": 0.23798157274723053, + "learning_rate": 1.7925e-05, + "loss": 2.404, + "step": 1283 + }, + { + "epoch": 0.1433370660694289, + "grad_norm": 0.22350966930389404, + "learning_rate": 1.79e-05, + "loss": 2.4122, + "step": 1284 + }, + { + "epoch": 0.1434486992984549, + "grad_norm": 0.2272355854511261, + "learning_rate": 1.7875e-05, + "loss": 2.4368, + "step": 1285 + }, + { + "epoch": 0.14356033252748096, + "grad_norm": 0.21722903847694397, + "learning_rate": 1.785e-05, + "loss": 2.3516, + "step": 1286 + }, + { + "epoch": 0.143671965756507, + "grad_norm": 0.23155128955841064, + "learning_rate": 1.7825e-05, + "loss": 2.3527, + "step": 1287 + }, + { + "epoch": 0.14378359898553303, + "grad_norm": 0.22640927135944366, + "learning_rate": 1.78e-05, + "loss": 2.4748, + "step": 1288 + }, + { + "epoch": 0.14389523221455908, + "grad_norm": 0.23734673857688904, + "learning_rate": 1.7775e-05, + "loss": 2.2844, + "step": 1289 + }, + { + "epoch": 0.1440068654435851, + "grad_norm": 0.2328357696533203, + "learning_rate": 1.775e-05, + "loss": 2.3389, + "step": 1290 + }, + { + "epoch": 0.14411849867261114, + "grad_norm": 0.21968986093997955, + "learning_rate": 1.7725e-05, + "loss": 2.3395, + "step": 1291 + }, + { + "epoch": 0.14423013190163717, + "grad_norm": 0.21870984137058258, + "learning_rate": 1.77e-05, + "loss": 2.3882, + "step": 1292 + }, + { + "epoch": 0.1443417651306632, + "grad_norm": 0.23083637654781342, + "learning_rate": 1.7675e-05, + "loss": 2.3653, + "step": 1293 + }, + { + "epoch": 0.14445339835968923, + "grad_norm": 0.23204779624938965, + "learning_rate": 1.765e-05, + "loss": 2.4653, + "step": 1294 + }, + { + "epoch": 0.14456503158871528, + "grad_norm": 0.23749560117721558, + "learning_rate": 1.7625e-05, + "loss": 2.3105, + "step": 1295 + }, + { + "epoch": 0.1446766648177413, + "grad_norm": 0.23485244810581207, + "learning_rate": 1.76e-05, + "loss": 2.3721, + "step": 1296 + }, + { + "epoch": 0.14478829804676735, + "grad_norm": 0.24895891547203064, + "learning_rate": 1.7575e-05, + "loss": 2.3752, + "step": 1297 + }, + { + "epoch": 0.14489993127579337, + "grad_norm": 0.24030578136444092, + "learning_rate": 1.755e-05, + "loss": 2.3025, + "step": 1298 + }, + { + "epoch": 0.14501156450481942, + "grad_norm": 0.23427051305770874, + "learning_rate": 1.7525e-05, + "loss": 2.431, + "step": 1299 + }, + { + "epoch": 0.14512319773384544, + "grad_norm": 0.29638248682022095, + "learning_rate": 1.75e-05, + "loss": 2.3806, + "step": 1300 + }, + { + "epoch": 0.1452348309628715, + "grad_norm": 0.24296362698078156, + "learning_rate": 1.7475e-05, + "loss": 2.3058, + "step": 1301 + }, + { + "epoch": 0.14534646419189753, + "grad_norm": 0.21712899208068848, + "learning_rate": 1.745e-05, + "loss": 2.2896, + "step": 1302 + }, + { + "epoch": 0.14545809742092355, + "grad_norm": 0.22417020797729492, + "learning_rate": 1.7425e-05, + "loss": 2.3631, + "step": 1303 + }, + { + "epoch": 0.1455697306499496, + "grad_norm": 0.22780650854110718, + "learning_rate": 1.74e-05, + "loss": 2.441, + "step": 1304 + }, + { + "epoch": 0.14568136387897562, + "grad_norm": 0.22729064524173737, + "learning_rate": 1.7375e-05, + "loss": 2.4841, + "step": 1305 + }, + { + "epoch": 0.14579299710800167, + "grad_norm": 0.23002713918685913, + "learning_rate": 1.7349999999999998e-05, + "loss": 2.2594, + "step": 1306 + }, + { + "epoch": 0.1459046303370277, + "grad_norm": 0.22959677875041962, + "learning_rate": 1.7325e-05, + "loss": 2.3681, + "step": 1307 + }, + { + "epoch": 0.14601626356605374, + "grad_norm": 0.23864157497882843, + "learning_rate": 1.73e-05, + "loss": 2.413, + "step": 1308 + }, + { + "epoch": 0.14612789679507976, + "grad_norm": 0.2260824590921402, + "learning_rate": 1.7275e-05, + "loss": 2.4128, + "step": 1309 + }, + { + "epoch": 0.1462395300241058, + "grad_norm": 0.23661834001541138, + "learning_rate": 1.725e-05, + "loss": 2.3452, + "step": 1310 + }, + { + "epoch": 0.14635116325313183, + "grad_norm": 0.24539943039417267, + "learning_rate": 1.7225e-05, + "loss": 2.452, + "step": 1311 + }, + { + "epoch": 0.14646279648215788, + "grad_norm": 0.24564427137374878, + "learning_rate": 1.7199999999999998e-05, + "loss": 2.2986, + "step": 1312 + }, + { + "epoch": 0.1465744297111839, + "grad_norm": 0.22026251256465912, + "learning_rate": 1.7175e-05, + "loss": 2.4334, + "step": 1313 + }, + { + "epoch": 0.14668606294020994, + "grad_norm": 0.22923065721988678, + "learning_rate": 1.7150000000000004e-05, + "loss": 2.3331, + "step": 1314 + }, + { + "epoch": 0.14679769616923596, + "grad_norm": 0.26323196291923523, + "learning_rate": 1.7125000000000003e-05, + "loss": 2.2923, + "step": 1315 + }, + { + "epoch": 0.146909329398262, + "grad_norm": 0.2375718653202057, + "learning_rate": 1.7100000000000002e-05, + "loss": 2.4087, + "step": 1316 + }, + { + "epoch": 0.14702096262728803, + "grad_norm": 0.22378355264663696, + "learning_rate": 1.7075e-05, + "loss": 2.3135, + "step": 1317 + }, + { + "epoch": 0.14713259585631408, + "grad_norm": 0.2237062305212021, + "learning_rate": 1.705e-05, + "loss": 2.2467, + "step": 1318 + }, + { + "epoch": 0.14724422908534013, + "grad_norm": 0.24400672316551208, + "learning_rate": 1.7025e-05, + "loss": 2.1746, + "step": 1319 + }, + { + "epoch": 0.14735586231436615, + "grad_norm": 0.22511416673660278, + "learning_rate": 1.7000000000000003e-05, + "loss": 2.4436, + "step": 1320 + }, + { + "epoch": 0.1474674955433922, + "grad_norm": 0.23048102855682373, + "learning_rate": 1.6975000000000003e-05, + "loss": 2.3356, + "step": 1321 + }, + { + "epoch": 0.14757912877241822, + "grad_norm": 0.2408643364906311, + "learning_rate": 1.6950000000000002e-05, + "loss": 2.2649, + "step": 1322 + }, + { + "epoch": 0.14769076200144426, + "grad_norm": 0.2274709939956665, + "learning_rate": 1.6925e-05, + "loss": 2.3392, + "step": 1323 + }, + { + "epoch": 0.14780239523047028, + "grad_norm": 0.22548739612102509, + "learning_rate": 1.69e-05, + "loss": 2.2553, + "step": 1324 + }, + { + "epoch": 0.14791402845949633, + "grad_norm": 0.24427109956741333, + "learning_rate": 1.6875000000000004e-05, + "loss": 2.3549, + "step": 1325 + }, + { + "epoch": 0.14802566168852235, + "grad_norm": 0.2285667061805725, + "learning_rate": 1.6850000000000003e-05, + "loss": 2.4703, + "step": 1326 + }, + { + "epoch": 0.1481372949175484, + "grad_norm": 0.22185759246349335, + "learning_rate": 1.6825000000000002e-05, + "loss": 2.4652, + "step": 1327 + }, + { + "epoch": 0.14824892814657442, + "grad_norm": 0.23354068398475647, + "learning_rate": 1.6800000000000002e-05, + "loss": 2.2939, + "step": 1328 + }, + { + "epoch": 0.14836056137560047, + "grad_norm": 0.22763966023921967, + "learning_rate": 1.6775e-05, + "loss": 2.3484, + "step": 1329 + }, + { + "epoch": 0.1484721946046265, + "grad_norm": 0.23366783559322357, + "learning_rate": 1.675e-05, + "loss": 2.2578, + "step": 1330 + }, + { + "epoch": 0.14858382783365254, + "grad_norm": 0.22202405333518982, + "learning_rate": 1.6725000000000003e-05, + "loss": 2.4142, + "step": 1331 + }, + { + "epoch": 0.14869546106267856, + "grad_norm": 0.499910831451416, + "learning_rate": 1.6700000000000003e-05, + "loss": 2.304, + "step": 1332 + }, + { + "epoch": 0.1488070942917046, + "grad_norm": 0.22948618233203888, + "learning_rate": 1.6675000000000002e-05, + "loss": 2.3483, + "step": 1333 + }, + { + "epoch": 0.14891872752073063, + "grad_norm": 0.2281409054994583, + "learning_rate": 1.665e-05, + "loss": 2.3, + "step": 1334 + }, + { + "epoch": 0.14903036074975667, + "grad_norm": 0.22423812747001648, + "learning_rate": 1.6625e-05, + "loss": 2.3268, + "step": 1335 + }, + { + "epoch": 0.14914199397878272, + "grad_norm": 0.22844356298446655, + "learning_rate": 1.66e-05, + "loss": 2.2694, + "step": 1336 + }, + { + "epoch": 0.14925362720780874, + "grad_norm": 0.23927628993988037, + "learning_rate": 1.6575000000000003e-05, + "loss": 2.3171, + "step": 1337 + }, + { + "epoch": 0.1493652604368348, + "grad_norm": 0.22286780178546906, + "learning_rate": 1.6550000000000002e-05, + "loss": 2.2984, + "step": 1338 + }, + { + "epoch": 0.1494768936658608, + "grad_norm": 0.2628430128097534, + "learning_rate": 1.6525000000000002e-05, + "loss": 2.4076, + "step": 1339 + }, + { + "epoch": 0.14958852689488686, + "grad_norm": 0.227376326918602, + "learning_rate": 1.65e-05, + "loss": 2.2738, + "step": 1340 + }, + { + "epoch": 0.14970016012391288, + "grad_norm": 0.27796196937561035, + "learning_rate": 1.6475e-05, + "loss": 2.2671, + "step": 1341 + }, + { + "epoch": 0.14981179335293893, + "grad_norm": 0.2336723506450653, + "learning_rate": 1.645e-05, + "loss": 2.3649, + "step": 1342 + }, + { + "epoch": 0.14992342658196495, + "grad_norm": 0.2579742670059204, + "learning_rate": 1.6425000000000003e-05, + "loss": 2.4033, + "step": 1343 + }, + { + "epoch": 0.150035059810991, + "grad_norm": 0.23226085305213928, + "learning_rate": 1.6400000000000002e-05, + "loss": 2.3702, + "step": 1344 + }, + { + "epoch": 0.15014669304001702, + "grad_norm": 0.22210608422756195, + "learning_rate": 1.6375e-05, + "loss": 2.3124, + "step": 1345 + }, + { + "epoch": 0.15025832626904306, + "grad_norm": 0.2297833412885666, + "learning_rate": 1.635e-05, + "loss": 2.311, + "step": 1346 + }, + { + "epoch": 0.15036995949806908, + "grad_norm": 0.480037659406662, + "learning_rate": 1.6325e-05, + "loss": 2.3665, + "step": 1347 + }, + { + "epoch": 0.15048159272709513, + "grad_norm": 0.22692376375198364, + "learning_rate": 1.63e-05, + "loss": 2.4278, + "step": 1348 + }, + { + "epoch": 0.15059322595612115, + "grad_norm": 0.23020422458648682, + "learning_rate": 1.6275000000000003e-05, + "loss": 2.3182, + "step": 1349 + }, + { + "epoch": 0.1507048591851472, + "grad_norm": 0.2347160279750824, + "learning_rate": 1.6250000000000002e-05, + "loss": 2.4645, + "step": 1350 + }, + { + "epoch": 0.15081649241417325, + "grad_norm": 0.24892841279506683, + "learning_rate": 1.6225e-05, + "loss": 2.3542, + "step": 1351 + }, + { + "epoch": 0.15092812564319927, + "grad_norm": 0.2282174974679947, + "learning_rate": 1.62e-05, + "loss": 2.4327, + "step": 1352 + }, + { + "epoch": 0.15103975887222532, + "grad_norm": 0.22344060242176056, + "learning_rate": 1.6175e-05, + "loss": 2.3244, + "step": 1353 + }, + { + "epoch": 0.15115139210125134, + "grad_norm": 0.2454865276813507, + "learning_rate": 1.6150000000000003e-05, + "loss": 2.4319, + "step": 1354 + }, + { + "epoch": 0.15126302533027738, + "grad_norm": 0.238905668258667, + "learning_rate": 1.6125000000000002e-05, + "loss": 2.3431, + "step": 1355 + }, + { + "epoch": 0.1513746585593034, + "grad_norm": 0.22759667038917542, + "learning_rate": 1.6100000000000002e-05, + "loss": 2.4397, + "step": 1356 + }, + { + "epoch": 0.15148629178832945, + "grad_norm": 0.22996599972248077, + "learning_rate": 1.6075e-05, + "loss": 2.4378, + "step": 1357 + }, + { + "epoch": 0.15159792501735547, + "grad_norm": 0.23668399453163147, + "learning_rate": 1.605e-05, + "loss": 2.3791, + "step": 1358 + }, + { + "epoch": 0.15170955824638152, + "grad_norm": 0.23384137451648712, + "learning_rate": 1.6025e-05, + "loss": 2.2818, + "step": 1359 + }, + { + "epoch": 0.15182119147540754, + "grad_norm": 0.23646210134029388, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.306, + "step": 1360 + }, + { + "epoch": 0.1519328247044336, + "grad_norm": 0.23297393321990967, + "learning_rate": 1.5975000000000002e-05, + "loss": 2.3061, + "step": 1361 + }, + { + "epoch": 0.1520444579334596, + "grad_norm": 0.23053903877735138, + "learning_rate": 1.595e-05, + "loss": 2.2829, + "step": 1362 + }, + { + "epoch": 0.15215609116248566, + "grad_norm": 0.22388771176338196, + "learning_rate": 1.5925e-05, + "loss": 2.2914, + "step": 1363 + }, + { + "epoch": 0.15226772439151168, + "grad_norm": 0.22923487424850464, + "learning_rate": 1.59e-05, + "loss": 2.2915, + "step": 1364 + }, + { + "epoch": 0.15237935762053773, + "grad_norm": 0.22146253287792206, + "learning_rate": 1.5875e-05, + "loss": 2.4338, + "step": 1365 + }, + { + "epoch": 0.15249099084956375, + "grad_norm": 0.22123506665229797, + "learning_rate": 1.5850000000000002e-05, + "loss": 2.3959, + "step": 1366 + }, + { + "epoch": 0.1526026240785898, + "grad_norm": 0.31164827942848206, + "learning_rate": 1.5825000000000002e-05, + "loss": 2.3052, + "step": 1367 + }, + { + "epoch": 0.15271425730761584, + "grad_norm": 0.23311270773410797, + "learning_rate": 1.58e-05, + "loss": 2.343, + "step": 1368 + }, + { + "epoch": 0.15282589053664186, + "grad_norm": 0.25240063667297363, + "learning_rate": 1.5775e-05, + "loss": 2.3322, + "step": 1369 + }, + { + "epoch": 0.1529375237656679, + "grad_norm": 0.2153395712375641, + "learning_rate": 1.575e-05, + "loss": 2.3915, + "step": 1370 + }, + { + "epoch": 0.15304915699469393, + "grad_norm": 0.23929548263549805, + "learning_rate": 1.5725e-05, + "loss": 2.382, + "step": 1371 + }, + { + "epoch": 0.15316079022371998, + "grad_norm": 0.2313188761472702, + "learning_rate": 1.5700000000000002e-05, + "loss": 2.3818, + "step": 1372 + }, + { + "epoch": 0.153272423452746, + "grad_norm": 0.22789841890335083, + "learning_rate": 1.5675e-05, + "loss": 2.3657, + "step": 1373 + }, + { + "epoch": 0.15338405668177205, + "grad_norm": 0.2191040813922882, + "learning_rate": 1.565e-05, + "loss": 2.3667, + "step": 1374 + }, + { + "epoch": 0.15349568991079807, + "grad_norm": 0.22538387775421143, + "learning_rate": 1.5625e-05, + "loss": 2.2652, + "step": 1375 + }, + { + "epoch": 0.15360732313982411, + "grad_norm": 0.23191332817077637, + "learning_rate": 1.56e-05, + "loss": 2.4353, + "step": 1376 + }, + { + "epoch": 0.15371895636885013, + "grad_norm": 0.23539386689662933, + "learning_rate": 1.5575e-05, + "loss": 2.3348, + "step": 1377 + }, + { + "epoch": 0.15383058959787618, + "grad_norm": 0.2552396357059479, + "learning_rate": 1.5550000000000002e-05, + "loss": 2.2693, + "step": 1378 + }, + { + "epoch": 0.1539422228269022, + "grad_norm": 0.2361995130777359, + "learning_rate": 1.5525e-05, + "loss": 2.3506, + "step": 1379 + }, + { + "epoch": 0.15405385605592825, + "grad_norm": 0.2353193461894989, + "learning_rate": 1.55e-05, + "loss": 2.2858, + "step": 1380 + }, + { + "epoch": 0.15416548928495427, + "grad_norm": 0.23434515297412872, + "learning_rate": 1.5475e-05, + "loss": 2.3942, + "step": 1381 + }, + { + "epoch": 0.15427712251398032, + "grad_norm": 0.25974181294441223, + "learning_rate": 1.545e-05, + "loss": 2.3877, + "step": 1382 + }, + { + "epoch": 0.15438875574300634, + "grad_norm": 0.2274274230003357, + "learning_rate": 1.5425000000000002e-05, + "loss": 2.4084, + "step": 1383 + }, + { + "epoch": 0.1545003889720324, + "grad_norm": 0.23268136382102966, + "learning_rate": 1.54e-05, + "loss": 2.3343, + "step": 1384 + }, + { + "epoch": 0.15461202220105844, + "grad_norm": 0.23196761310100555, + "learning_rate": 1.5375e-05, + "loss": 2.3164, + "step": 1385 + }, + { + "epoch": 0.15472365543008446, + "grad_norm": 0.22755514085292816, + "learning_rate": 1.535e-05, + "loss": 2.3012, + "step": 1386 + }, + { + "epoch": 0.1548352886591105, + "grad_norm": 0.21987079083919525, + "learning_rate": 1.5325e-05, + "loss": 2.3306, + "step": 1387 + }, + { + "epoch": 0.15494692188813652, + "grad_norm": 0.22733251750469208, + "learning_rate": 1.53e-05, + "loss": 2.2869, + "step": 1388 + }, + { + "epoch": 0.15505855511716257, + "grad_norm": 0.23861852288246155, + "learning_rate": 1.5275000000000002e-05, + "loss": 2.2962, + "step": 1389 + }, + { + "epoch": 0.1551701883461886, + "grad_norm": 0.2321462631225586, + "learning_rate": 1.525e-05, + "loss": 2.2596, + "step": 1390 + }, + { + "epoch": 0.15528182157521464, + "grad_norm": 0.2401476502418518, + "learning_rate": 1.5225e-05, + "loss": 2.3632, + "step": 1391 + }, + { + "epoch": 0.15539345480424066, + "grad_norm": 0.2222481220960617, + "learning_rate": 1.52e-05, + "loss": 2.4276, + "step": 1392 + }, + { + "epoch": 0.1555050880332667, + "grad_norm": 0.23532716929912567, + "learning_rate": 1.5175e-05, + "loss": 2.3198, + "step": 1393 + }, + { + "epoch": 0.15561672126229273, + "grad_norm": 0.24865378439426422, + "learning_rate": 1.515e-05, + "loss": 2.3288, + "step": 1394 + }, + { + "epoch": 0.15572835449131878, + "grad_norm": 0.23691876232624054, + "learning_rate": 1.5125e-05, + "loss": 2.3948, + "step": 1395 + }, + { + "epoch": 0.1558399877203448, + "grad_norm": 0.21998995542526245, + "learning_rate": 1.51e-05, + "loss": 2.2551, + "step": 1396 + }, + { + "epoch": 0.15595162094937084, + "grad_norm": 0.23590844869613647, + "learning_rate": 1.5075e-05, + "loss": 2.327, + "step": 1397 + }, + { + "epoch": 0.15606325417839687, + "grad_norm": 0.227497398853302, + "learning_rate": 1.505e-05, + "loss": 2.2747, + "step": 1398 + }, + { + "epoch": 0.1561748874074229, + "grad_norm": 0.23721876740455627, + "learning_rate": 1.5025000000000001e-05, + "loss": 2.2471, + "step": 1399 + }, + { + "epoch": 0.15628652063644893, + "grad_norm": 0.23734678328037262, + "learning_rate": 1.5e-05, + "loss": 2.3562, + "step": 1400 + }, + { + "epoch": 0.15639815386547498, + "grad_norm": 0.23354454338550568, + "learning_rate": 1.4975e-05, + "loss": 2.2572, + "step": 1401 + }, + { + "epoch": 0.15650978709450103, + "grad_norm": 0.2284322828054428, + "learning_rate": 1.4950000000000001e-05, + "loss": 2.3481, + "step": 1402 + }, + { + "epoch": 0.15662142032352705, + "grad_norm": 0.24697981774806976, + "learning_rate": 1.4925e-05, + "loss": 2.4658, + "step": 1403 + }, + { + "epoch": 0.1567330535525531, + "grad_norm": 0.23272185027599335, + "learning_rate": 1.49e-05, + "loss": 2.4396, + "step": 1404 + }, + { + "epoch": 0.15684468678157912, + "grad_norm": 0.22883062064647675, + "learning_rate": 1.4875e-05, + "loss": 2.4235, + "step": 1405 + }, + { + "epoch": 0.15695632001060517, + "grad_norm": 0.23410239815711975, + "learning_rate": 1.485e-05, + "loss": 2.3252, + "step": 1406 + }, + { + "epoch": 0.1570679532396312, + "grad_norm": 0.22218795120716095, + "learning_rate": 1.4825e-05, + "loss": 2.3215, + "step": 1407 + }, + { + "epoch": 0.15717958646865723, + "grad_norm": 0.22799868881702423, + "learning_rate": 1.48e-05, + "loss": 2.2427, + "step": 1408 + }, + { + "epoch": 0.15729121969768325, + "grad_norm": 0.2517700791358948, + "learning_rate": 1.4775e-05, + "loss": 2.3081, + "step": 1409 + }, + { + "epoch": 0.1574028529267093, + "grad_norm": 0.23836027085781097, + "learning_rate": 1.475e-05, + "loss": 2.3097, + "step": 1410 + }, + { + "epoch": 0.15751448615573532, + "grad_norm": 0.22234973311424255, + "learning_rate": 1.4725e-05, + "loss": 2.4121, + "step": 1411 + }, + { + "epoch": 0.15762611938476137, + "grad_norm": 0.35192346572875977, + "learning_rate": 1.47e-05, + "loss": 2.3617, + "step": 1412 + }, + { + "epoch": 0.1577377526137874, + "grad_norm": 0.2238391488790512, + "learning_rate": 1.4675e-05, + "loss": 2.3145, + "step": 1413 + }, + { + "epoch": 0.15784938584281344, + "grad_norm": 0.2211901992559433, + "learning_rate": 1.465e-05, + "loss": 2.3678, + "step": 1414 + }, + { + "epoch": 0.15796101907183946, + "grad_norm": 0.23561525344848633, + "learning_rate": 1.4625e-05, + "loss": 2.3846, + "step": 1415 + }, + { + "epoch": 0.1580726523008655, + "grad_norm": 0.23191802203655243, + "learning_rate": 1.4599999999999999e-05, + "loss": 2.2361, + "step": 1416 + }, + { + "epoch": 0.15818428552989156, + "grad_norm": 0.22749824821949005, + "learning_rate": 1.4575e-05, + "loss": 2.2809, + "step": 1417 + }, + { + "epoch": 0.15829591875891758, + "grad_norm": 0.22720323503017426, + "learning_rate": 1.455e-05, + "loss": 2.4112, + "step": 1418 + }, + { + "epoch": 0.15840755198794362, + "grad_norm": 0.22353364527225494, + "learning_rate": 1.4524999999999999e-05, + "loss": 2.3453, + "step": 1419 + }, + { + "epoch": 0.15851918521696964, + "grad_norm": 0.22859126329421997, + "learning_rate": 1.45e-05, + "loss": 2.3357, + "step": 1420 + }, + { + "epoch": 0.1586308184459957, + "grad_norm": 0.22939516603946686, + "learning_rate": 1.4475e-05, + "loss": 2.3544, + "step": 1421 + }, + { + "epoch": 0.1587424516750217, + "grad_norm": 0.3586462736129761, + "learning_rate": 1.4449999999999999e-05, + "loss": 2.3628, + "step": 1422 + }, + { + "epoch": 0.15885408490404776, + "grad_norm": 0.24023115634918213, + "learning_rate": 1.4425e-05, + "loss": 2.2797, + "step": 1423 + }, + { + "epoch": 0.15896571813307378, + "grad_norm": 0.22737418115139008, + "learning_rate": 1.44e-05, + "loss": 2.3599, + "step": 1424 + }, + { + "epoch": 0.15907735136209983, + "grad_norm": 0.29583752155303955, + "learning_rate": 1.4374999999999999e-05, + "loss": 2.3229, + "step": 1425 + }, + { + "epoch": 0.15918898459112585, + "grad_norm": 0.22859321534633636, + "learning_rate": 1.435e-05, + "loss": 2.256, + "step": 1426 + }, + { + "epoch": 0.1593006178201519, + "grad_norm": 0.2283511608839035, + "learning_rate": 1.4325e-05, + "loss": 2.3383, + "step": 1427 + }, + { + "epoch": 0.15941225104917792, + "grad_norm": 0.24180850386619568, + "learning_rate": 1.43e-05, + "loss": 2.3953, + "step": 1428 + }, + { + "epoch": 0.15952388427820396, + "grad_norm": 0.22382569313049316, + "learning_rate": 1.4275e-05, + "loss": 2.2684, + "step": 1429 + }, + { + "epoch": 0.15963551750722998, + "grad_norm": 0.23606614768505096, + "learning_rate": 1.4249999999999999e-05, + "loss": 2.4133, + "step": 1430 + }, + { + "epoch": 0.15974715073625603, + "grad_norm": 0.23414404690265656, + "learning_rate": 1.4225e-05, + "loss": 2.2983, + "step": 1431 + }, + { + "epoch": 0.15985878396528205, + "grad_norm": 0.2279421091079712, + "learning_rate": 1.42e-05, + "loss": 2.3842, + "step": 1432 + }, + { + "epoch": 0.1599704171943081, + "grad_norm": 0.22593237459659576, + "learning_rate": 1.4174999999999999e-05, + "loss": 2.3737, + "step": 1433 + }, + { + "epoch": 0.16008205042333415, + "grad_norm": 0.2245030701160431, + "learning_rate": 1.415e-05, + "loss": 2.2731, + "step": 1434 + }, + { + "epoch": 0.16019368365236017, + "grad_norm": 0.26113003492355347, + "learning_rate": 1.4125e-05, + "loss": 2.3804, + "step": 1435 + }, + { + "epoch": 0.16030531688138622, + "grad_norm": 0.2301686853170395, + "learning_rate": 1.4099999999999999e-05, + "loss": 2.2143, + "step": 1436 + }, + { + "epoch": 0.16041695011041224, + "grad_norm": 0.2274000197649002, + "learning_rate": 1.4075e-05, + "loss": 2.4118, + "step": 1437 + }, + { + "epoch": 0.16052858333943829, + "grad_norm": 0.23563620448112488, + "learning_rate": 1.4050000000000003e-05, + "loss": 2.3734, + "step": 1438 + }, + { + "epoch": 0.1606402165684643, + "grad_norm": 0.2320801466703415, + "learning_rate": 1.4025000000000002e-05, + "loss": 2.144, + "step": 1439 + }, + { + "epoch": 0.16075184979749035, + "grad_norm": 0.2227548360824585, + "learning_rate": 1.4000000000000001e-05, + "loss": 2.3084, + "step": 1440 + }, + { + "epoch": 0.16086348302651637, + "grad_norm": 0.22315476834774017, + "learning_rate": 1.3975000000000003e-05, + "loss": 2.2599, + "step": 1441 + }, + { + "epoch": 0.16097511625554242, + "grad_norm": 0.2336408942937851, + "learning_rate": 1.3950000000000002e-05, + "loss": 2.4105, + "step": 1442 + }, + { + "epoch": 0.16108674948456844, + "grad_norm": 0.22350838780403137, + "learning_rate": 1.3925000000000001e-05, + "loss": 2.2656, + "step": 1443 + }, + { + "epoch": 0.1611983827135945, + "grad_norm": 0.2283545583486557, + "learning_rate": 1.3900000000000002e-05, + "loss": 2.3147, + "step": 1444 + }, + { + "epoch": 0.1613100159426205, + "grad_norm": 0.22985070943832397, + "learning_rate": 1.3875000000000002e-05, + "loss": 2.2738, + "step": 1445 + }, + { + "epoch": 0.16142164917164656, + "grad_norm": 0.22797125577926636, + "learning_rate": 1.3850000000000001e-05, + "loss": 2.3832, + "step": 1446 + }, + { + "epoch": 0.16153328240067258, + "grad_norm": 0.22750802338123322, + "learning_rate": 1.3825000000000002e-05, + "loss": 2.3116, + "step": 1447 + }, + { + "epoch": 0.16164491562969863, + "grad_norm": 0.23097005486488342, + "learning_rate": 1.3800000000000002e-05, + "loss": 2.391, + "step": 1448 + }, + { + "epoch": 0.16175654885872465, + "grad_norm": 0.23328077793121338, + "learning_rate": 1.3775000000000001e-05, + "loss": 2.3447, + "step": 1449 + }, + { + "epoch": 0.1618681820877507, + "grad_norm": 0.2308788150548935, + "learning_rate": 1.3750000000000002e-05, + "loss": 2.438, + "step": 1450 + }, + { + "epoch": 0.16197981531677674, + "grad_norm": 0.22981509566307068, + "learning_rate": 1.3725000000000002e-05, + "loss": 2.3295, + "step": 1451 + }, + { + "epoch": 0.16209144854580276, + "grad_norm": 0.23261301219463348, + "learning_rate": 1.3700000000000001e-05, + "loss": 2.3438, + "step": 1452 + }, + { + "epoch": 0.1622030817748288, + "grad_norm": 0.22711966931819916, + "learning_rate": 1.3675000000000002e-05, + "loss": 2.3795, + "step": 1453 + }, + { + "epoch": 0.16231471500385483, + "grad_norm": 0.22391866147518158, + "learning_rate": 1.3650000000000001e-05, + "loss": 2.2794, + "step": 1454 + }, + { + "epoch": 0.16242634823288088, + "grad_norm": 0.22070038318634033, + "learning_rate": 1.3625e-05, + "loss": 2.3572, + "step": 1455 + }, + { + "epoch": 0.1625379814619069, + "grad_norm": 0.2527678310871124, + "learning_rate": 1.3600000000000002e-05, + "loss": 2.4289, + "step": 1456 + }, + { + "epoch": 0.16264961469093295, + "grad_norm": 0.22040753066539764, + "learning_rate": 1.3575000000000001e-05, + "loss": 2.3787, + "step": 1457 + }, + { + "epoch": 0.16276124791995897, + "grad_norm": 0.2231033891439438, + "learning_rate": 1.3550000000000002e-05, + "loss": 2.3683, + "step": 1458 + }, + { + "epoch": 0.16287288114898502, + "grad_norm": 0.22828775644302368, + "learning_rate": 1.3525000000000002e-05, + "loss": 2.3088, + "step": 1459 + }, + { + "epoch": 0.16298451437801104, + "grad_norm": 0.22570709884166718, + "learning_rate": 1.3500000000000001e-05, + "loss": 2.307, + "step": 1460 + }, + { + "epoch": 0.16309614760703708, + "grad_norm": 0.8901817798614502, + "learning_rate": 1.3475000000000002e-05, + "loss": 2.3083, + "step": 1461 + }, + { + "epoch": 0.1632077808360631, + "grad_norm": 0.31848806142807007, + "learning_rate": 1.3450000000000002e-05, + "loss": 2.416, + "step": 1462 + }, + { + "epoch": 0.16331941406508915, + "grad_norm": 0.7419360280036926, + "learning_rate": 1.3425000000000001e-05, + "loss": 2.3243, + "step": 1463 + }, + { + "epoch": 0.16343104729411517, + "grad_norm": 0.22873902320861816, + "learning_rate": 1.3400000000000002e-05, + "loss": 2.344, + "step": 1464 + }, + { + "epoch": 0.16354268052314122, + "grad_norm": 0.23168662190437317, + "learning_rate": 1.3375000000000002e-05, + "loss": 2.3418, + "step": 1465 + }, + { + "epoch": 0.16365431375216724, + "grad_norm": 0.2376638650894165, + "learning_rate": 1.3350000000000001e-05, + "loss": 2.2826, + "step": 1466 + }, + { + "epoch": 0.1637659469811933, + "grad_norm": 0.23183491826057434, + "learning_rate": 1.3325000000000002e-05, + "loss": 2.3392, + "step": 1467 + }, + { + "epoch": 0.16387758021021934, + "grad_norm": 0.23273466527462006, + "learning_rate": 1.3300000000000001e-05, + "loss": 2.3427, + "step": 1468 + }, + { + "epoch": 0.16398921343924536, + "grad_norm": 0.2245101034641266, + "learning_rate": 1.3275e-05, + "loss": 2.3461, + "step": 1469 + }, + { + "epoch": 0.1641008466682714, + "grad_norm": 0.2254142016172409, + "learning_rate": 1.3250000000000002e-05, + "loss": 2.4757, + "step": 1470 + }, + { + "epoch": 0.16421247989729743, + "grad_norm": 0.26567506790161133, + "learning_rate": 1.3225000000000001e-05, + "loss": 2.4493, + "step": 1471 + }, + { + "epoch": 0.16432411312632347, + "grad_norm": 0.22109505534172058, + "learning_rate": 1.32e-05, + "loss": 2.2728, + "step": 1472 + }, + { + "epoch": 0.1644357463553495, + "grad_norm": 0.2308911383152008, + "learning_rate": 1.3175000000000002e-05, + "loss": 2.2651, + "step": 1473 + }, + { + "epoch": 0.16454737958437554, + "grad_norm": 0.3326926529407501, + "learning_rate": 1.3150000000000001e-05, + "loss": 2.3822, + "step": 1474 + }, + { + "epoch": 0.16465901281340156, + "grad_norm": 0.2321234494447708, + "learning_rate": 1.3125e-05, + "loss": 2.3652, + "step": 1475 + }, + { + "epoch": 0.1647706460424276, + "grad_norm": 0.22293105721473694, + "learning_rate": 1.3100000000000002e-05, + "loss": 2.3627, + "step": 1476 + }, + { + "epoch": 0.16488227927145363, + "grad_norm": 0.22420254349708557, + "learning_rate": 1.3075000000000001e-05, + "loss": 2.3223, + "step": 1477 + }, + { + "epoch": 0.16499391250047968, + "grad_norm": 0.24831721186637878, + "learning_rate": 1.305e-05, + "loss": 2.255, + "step": 1478 + }, + { + "epoch": 0.1651055457295057, + "grad_norm": 0.23158615827560425, + "learning_rate": 1.3025000000000002e-05, + "loss": 2.3876, + "step": 1479 + }, + { + "epoch": 0.16521717895853175, + "grad_norm": 0.2287922203540802, + "learning_rate": 1.3000000000000001e-05, + "loss": 2.3559, + "step": 1480 + }, + { + "epoch": 0.16532881218755777, + "grad_norm": 0.26463231444358826, + "learning_rate": 1.2975e-05, + "loss": 2.375, + "step": 1481 + }, + { + "epoch": 0.16544044541658381, + "grad_norm": 0.23984935879707336, + "learning_rate": 1.2950000000000001e-05, + "loss": 2.3024, + "step": 1482 + }, + { + "epoch": 0.16555207864560986, + "grad_norm": 0.3325958251953125, + "learning_rate": 1.2925e-05, + "loss": 2.3871, + "step": 1483 + }, + { + "epoch": 0.16566371187463588, + "grad_norm": 0.23081143200397491, + "learning_rate": 1.29e-05, + "loss": 2.4202, + "step": 1484 + }, + { + "epoch": 0.16577534510366193, + "grad_norm": 0.2344684600830078, + "learning_rate": 1.2875000000000001e-05, + "loss": 2.2737, + "step": 1485 + }, + { + "epoch": 0.16588697833268795, + "grad_norm": 0.22795481979846954, + "learning_rate": 1.285e-05, + "loss": 2.3237, + "step": 1486 + }, + { + "epoch": 0.165998611561714, + "grad_norm": 0.2752438187599182, + "learning_rate": 1.2825000000000002e-05, + "loss": 2.3495, + "step": 1487 + }, + { + "epoch": 0.16611024479074002, + "grad_norm": 0.22877170145511627, + "learning_rate": 1.2800000000000001e-05, + "loss": 2.3932, + "step": 1488 + }, + { + "epoch": 0.16622187801976607, + "grad_norm": 0.23439621925354004, + "learning_rate": 1.2775e-05, + "loss": 2.3022, + "step": 1489 + }, + { + "epoch": 0.1663335112487921, + "grad_norm": 0.2311062514781952, + "learning_rate": 1.2750000000000002e-05, + "loss": 2.2674, + "step": 1490 + }, + { + "epoch": 0.16644514447781814, + "grad_norm": 0.2476395070552826, + "learning_rate": 1.2725000000000001e-05, + "loss": 2.41, + "step": 1491 + }, + { + "epoch": 0.16655677770684416, + "grad_norm": 0.22706589102745056, + "learning_rate": 1.27e-05, + "loss": 2.2607, + "step": 1492 + }, + { + "epoch": 0.1666684109358702, + "grad_norm": 0.22713389992713928, + "learning_rate": 1.2675000000000001e-05, + "loss": 2.3115, + "step": 1493 + }, + { + "epoch": 0.16678004416489622, + "grad_norm": 0.2343800812959671, + "learning_rate": 1.2650000000000001e-05, + "loss": 2.3025, + "step": 1494 + }, + { + "epoch": 0.16689167739392227, + "grad_norm": 0.23538857698440552, + "learning_rate": 1.2625e-05, + "loss": 2.2622, + "step": 1495 + }, + { + "epoch": 0.1670033106229483, + "grad_norm": 0.2369028478860855, + "learning_rate": 1.2600000000000001e-05, + "loss": 2.4365, + "step": 1496 + }, + { + "epoch": 0.16711494385197434, + "grad_norm": 0.2249869555234909, + "learning_rate": 1.2575e-05, + "loss": 2.3179, + "step": 1497 + }, + { + "epoch": 0.16722657708100036, + "grad_norm": 0.22788971662521362, + "learning_rate": 1.255e-05, + "loss": 2.4045, + "step": 1498 + }, + { + "epoch": 0.1673382103100264, + "grad_norm": 0.24183280766010284, + "learning_rate": 1.2525000000000001e-05, + "loss": 2.2165, + "step": 1499 + }, + { + "epoch": 0.16744984353905246, + "grad_norm": 0.2287655472755432, + "learning_rate": 1.25e-05, + "loss": 2.4086, + "step": 1500 + }, + { + "epoch": 0.16756147676807848, + "grad_norm": 0.22445043921470642, + "learning_rate": 1.2475e-05, + "loss": 2.3954, + "step": 1501 + }, + { + "epoch": 0.16767310999710452, + "grad_norm": 0.23435929417610168, + "learning_rate": 1.2450000000000001e-05, + "loss": 2.335, + "step": 1502 + }, + { + "epoch": 0.16778474322613054, + "grad_norm": 0.24711214005947113, + "learning_rate": 1.2425e-05, + "loss": 2.339, + "step": 1503 + }, + { + "epoch": 0.1678963764551566, + "grad_norm": 0.2275834083557129, + "learning_rate": 1.24e-05, + "loss": 2.3166, + "step": 1504 + }, + { + "epoch": 0.1680080096841826, + "grad_norm": 0.23408271372318268, + "learning_rate": 1.2375000000000001e-05, + "loss": 2.3536, + "step": 1505 + }, + { + "epoch": 0.16811964291320866, + "grad_norm": 0.24109122157096863, + "learning_rate": 1.235e-05, + "loss": 2.244, + "step": 1506 + }, + { + "epoch": 0.16823127614223468, + "grad_norm": 0.24764837324619293, + "learning_rate": 1.2325e-05, + "loss": 2.3222, + "step": 1507 + }, + { + "epoch": 0.16834290937126073, + "grad_norm": 0.2633460462093353, + "learning_rate": 1.23e-05, + "loss": 2.2406, + "step": 1508 + }, + { + "epoch": 0.16845454260028675, + "grad_norm": 0.23464356362819672, + "learning_rate": 1.2275e-05, + "loss": 2.4122, + "step": 1509 + }, + { + "epoch": 0.1685661758293128, + "grad_norm": 0.23036916553974152, + "learning_rate": 1.225e-05, + "loss": 2.3748, + "step": 1510 + }, + { + "epoch": 0.16867780905833882, + "grad_norm": 0.22803008556365967, + "learning_rate": 1.2225e-05, + "loss": 2.3297, + "step": 1511 + }, + { + "epoch": 0.16878944228736487, + "grad_norm": 0.22997453808784485, + "learning_rate": 1.22e-05, + "loss": 2.4096, + "step": 1512 + }, + { + "epoch": 0.16890107551639089, + "grad_norm": 0.2285720407962799, + "learning_rate": 1.2175e-05, + "loss": 2.3791, + "step": 1513 + }, + { + "epoch": 0.16901270874541693, + "grad_norm": 0.23072819411754608, + "learning_rate": 1.215e-05, + "loss": 2.2956, + "step": 1514 + }, + { + "epoch": 0.16912434197444295, + "grad_norm": 0.2504909336566925, + "learning_rate": 1.2125e-05, + "loss": 2.3782, + "step": 1515 + }, + { + "epoch": 0.169235975203469, + "grad_norm": 0.2371397614479065, + "learning_rate": 1.2100000000000001e-05, + "loss": 2.2162, + "step": 1516 + }, + { + "epoch": 0.16934760843249505, + "grad_norm": 0.3986074924468994, + "learning_rate": 1.2075e-05, + "loss": 2.4737, + "step": 1517 + }, + { + "epoch": 0.16945924166152107, + "grad_norm": 0.24999715387821198, + "learning_rate": 1.205e-05, + "loss": 2.2256, + "step": 1518 + }, + { + "epoch": 0.16957087489054712, + "grad_norm": 0.23626287281513214, + "learning_rate": 1.2025000000000001e-05, + "loss": 2.331, + "step": 1519 + }, + { + "epoch": 0.16968250811957314, + "grad_norm": 0.24345801770687103, + "learning_rate": 1.2e-05, + "loss": 2.3845, + "step": 1520 + }, + { + "epoch": 0.1697941413485992, + "grad_norm": 0.22939099371433258, + "learning_rate": 1.1975e-05, + "loss": 2.2987, + "step": 1521 + }, + { + "epoch": 0.1699057745776252, + "grad_norm": 0.22851888835430145, + "learning_rate": 1.195e-05, + "loss": 2.2962, + "step": 1522 + }, + { + "epoch": 0.17001740780665126, + "grad_norm": 0.23316575586795807, + "learning_rate": 1.1925e-05, + "loss": 2.2429, + "step": 1523 + }, + { + "epoch": 0.17012904103567728, + "grad_norm": 0.23692427575588226, + "learning_rate": 1.19e-05, + "loss": 2.2889, + "step": 1524 + }, + { + "epoch": 0.17024067426470332, + "grad_norm": 0.2287677824497223, + "learning_rate": 1.1875e-05, + "loss": 2.3004, + "step": 1525 + }, + { + "epoch": 0.17035230749372934, + "grad_norm": 0.23153142631053925, + "learning_rate": 1.185e-05, + "loss": 2.3339, + "step": 1526 + }, + { + "epoch": 0.1704639407227554, + "grad_norm": 0.23223859071731567, + "learning_rate": 1.1825e-05, + "loss": 2.3562, + "step": 1527 + }, + { + "epoch": 0.1705755739517814, + "grad_norm": 0.2501643896102905, + "learning_rate": 1.18e-05, + "loss": 2.3524, + "step": 1528 + }, + { + "epoch": 0.17068720718080746, + "grad_norm": 0.227057546377182, + "learning_rate": 1.1775e-05, + "loss": 2.3482, + "step": 1529 + }, + { + "epoch": 0.17079884040983348, + "grad_norm": 0.23041006922721863, + "learning_rate": 1.175e-05, + "loss": 2.2926, + "step": 1530 + }, + { + "epoch": 0.17091047363885953, + "grad_norm": 0.23476162552833557, + "learning_rate": 1.1725e-05, + "loss": 2.2397, + "step": 1531 + }, + { + "epoch": 0.17102210686788555, + "grad_norm": 0.23537229001522064, + "learning_rate": 1.1700000000000001e-05, + "loss": 2.321, + "step": 1532 + }, + { + "epoch": 0.1711337400969116, + "grad_norm": 0.22815583646297455, + "learning_rate": 1.1675000000000001e-05, + "loss": 2.3891, + "step": 1533 + }, + { + "epoch": 0.17124537332593764, + "grad_norm": 0.23210576176643372, + "learning_rate": 1.1650000000000002e-05, + "loss": 2.2983, + "step": 1534 + }, + { + "epoch": 0.17135700655496366, + "grad_norm": 0.2980906665325165, + "learning_rate": 1.1625000000000001e-05, + "loss": 2.3548, + "step": 1535 + }, + { + "epoch": 0.1714686397839897, + "grad_norm": 0.2320065051317215, + "learning_rate": 1.16e-05, + "loss": 2.3705, + "step": 1536 + }, + { + "epoch": 0.17158027301301573, + "grad_norm": 0.5941006541252136, + "learning_rate": 1.1575000000000002e-05, + "loss": 2.303, + "step": 1537 + }, + { + "epoch": 0.17169190624204178, + "grad_norm": 0.23008465766906738, + "learning_rate": 1.1550000000000001e-05, + "loss": 2.2684, + "step": 1538 + }, + { + "epoch": 0.1718035394710678, + "grad_norm": 0.2237699329853058, + "learning_rate": 1.1525e-05, + "loss": 2.3254, + "step": 1539 + }, + { + "epoch": 0.17191517270009385, + "grad_norm": 0.2662789225578308, + "learning_rate": 1.1500000000000002e-05, + "loss": 2.3753, + "step": 1540 + }, + { + "epoch": 0.17202680592911987, + "grad_norm": 0.4782852232456207, + "learning_rate": 1.1475000000000001e-05, + "loss": 2.2175, + "step": 1541 + }, + { + "epoch": 0.17213843915814592, + "grad_norm": 0.23662321269512177, + "learning_rate": 1.145e-05, + "loss": 2.3435, + "step": 1542 + }, + { + "epoch": 0.17225007238717194, + "grad_norm": 0.25570255517959595, + "learning_rate": 1.1425000000000002e-05, + "loss": 2.4225, + "step": 1543 + }, + { + "epoch": 0.17236170561619799, + "grad_norm": 1.1005254983901978, + "learning_rate": 1.1400000000000001e-05, + "loss": 2.339, + "step": 1544 + }, + { + "epoch": 0.172473338845224, + "grad_norm": 0.23017080128192902, + "learning_rate": 1.1375e-05, + "loss": 2.3182, + "step": 1545 + }, + { + "epoch": 0.17258497207425005, + "grad_norm": 0.2425118386745453, + "learning_rate": 1.1350000000000001e-05, + "loss": 2.2469, + "step": 1546 + }, + { + "epoch": 0.17269660530327607, + "grad_norm": 0.31308507919311523, + "learning_rate": 1.1325e-05, + "loss": 2.4381, + "step": 1547 + }, + { + "epoch": 0.17280823853230212, + "grad_norm": 0.22850260138511658, + "learning_rate": 1.13e-05, + "loss": 2.401, + "step": 1548 + }, + { + "epoch": 0.17291987176132817, + "grad_norm": 0.23523540794849396, + "learning_rate": 1.1275000000000001e-05, + "loss": 2.2428, + "step": 1549 + }, + { + "epoch": 0.1730315049903542, + "grad_norm": 0.2316836565732956, + "learning_rate": 1.125e-05, + "loss": 2.2699, + "step": 1550 + }, + { + "epoch": 0.17314313821938024, + "grad_norm": 0.23121745884418488, + "learning_rate": 1.1225e-05, + "loss": 2.3717, + "step": 1551 + }, + { + "epoch": 0.17325477144840626, + "grad_norm": 0.2281220257282257, + "learning_rate": 1.1200000000000001e-05, + "loss": 2.3741, + "step": 1552 + }, + { + "epoch": 0.1733664046774323, + "grad_norm": 0.2314029186964035, + "learning_rate": 1.1175e-05, + "loss": 2.3432, + "step": 1553 + }, + { + "epoch": 0.17347803790645833, + "grad_norm": 0.229678675532341, + "learning_rate": 1.115e-05, + "loss": 2.4274, + "step": 1554 + }, + { + "epoch": 0.17358967113548437, + "grad_norm": 0.23651176691055298, + "learning_rate": 1.1125000000000001e-05, + "loss": 2.47, + "step": 1555 + }, + { + "epoch": 0.1737013043645104, + "grad_norm": 0.25404486060142517, + "learning_rate": 1.11e-05, + "loss": 2.2866, + "step": 1556 + }, + { + "epoch": 0.17381293759353644, + "grad_norm": 0.2315002977848053, + "learning_rate": 1.1075e-05, + "loss": 2.323, + "step": 1557 + }, + { + "epoch": 0.17392457082256246, + "grad_norm": 0.22683122754096985, + "learning_rate": 1.1050000000000001e-05, + "loss": 2.3863, + "step": 1558 + }, + { + "epoch": 0.1740362040515885, + "grad_norm": 0.22364884614944458, + "learning_rate": 1.1025e-05, + "loss": 2.4092, + "step": 1559 + }, + { + "epoch": 0.17414783728061453, + "grad_norm": 0.22706075012683868, + "learning_rate": 1.1000000000000001e-05, + "loss": 2.2995, + "step": 1560 + }, + { + "epoch": 0.17425947050964058, + "grad_norm": 0.22615200281143188, + "learning_rate": 1.0975e-05, + "loss": 2.274, + "step": 1561 + }, + { + "epoch": 0.1743711037386666, + "grad_norm": 0.23156067728996277, + "learning_rate": 1.095e-05, + "loss": 2.3346, + "step": 1562 + }, + { + "epoch": 0.17448273696769265, + "grad_norm": 0.22800372540950775, + "learning_rate": 1.0925000000000001e-05, + "loss": 2.3311, + "step": 1563 + }, + { + "epoch": 0.17459437019671867, + "grad_norm": 0.2314596325159073, + "learning_rate": 1.09e-05, + "loss": 2.4173, + "step": 1564 + }, + { + "epoch": 0.17470600342574472, + "grad_norm": 0.22724230587482452, + "learning_rate": 1.0875e-05, + "loss": 2.2718, + "step": 1565 + }, + { + "epoch": 0.17481763665477076, + "grad_norm": 0.22000856697559357, + "learning_rate": 1.0850000000000001e-05, + "loss": 2.2426, + "step": 1566 + }, + { + "epoch": 0.17492926988379678, + "grad_norm": 0.22086068987846375, + "learning_rate": 1.0825e-05, + "loss": 2.343, + "step": 1567 + }, + { + "epoch": 0.17504090311282283, + "grad_norm": 0.23039323091506958, + "learning_rate": 1.08e-05, + "loss": 2.3363, + "step": 1568 + }, + { + "epoch": 0.17515253634184885, + "grad_norm": 0.23125073313713074, + "learning_rate": 1.0775000000000001e-05, + "loss": 2.3756, + "step": 1569 + }, + { + "epoch": 0.1752641695708749, + "grad_norm": 0.2302304059267044, + "learning_rate": 1.075e-05, + "loss": 2.292, + "step": 1570 + }, + { + "epoch": 0.17537580279990092, + "grad_norm": 0.2361575961112976, + "learning_rate": 1.0725e-05, + "loss": 2.3275, + "step": 1571 + }, + { + "epoch": 0.17548743602892697, + "grad_norm": 0.2350195348262787, + "learning_rate": 1.0700000000000001e-05, + "loss": 2.3313, + "step": 1572 + }, + { + "epoch": 0.175599069257953, + "grad_norm": 0.2335433065891266, + "learning_rate": 1.0675e-05, + "loss": 2.4255, + "step": 1573 + }, + { + "epoch": 0.17571070248697904, + "grad_norm": 0.27532052993774414, + "learning_rate": 1.065e-05, + "loss": 2.4282, + "step": 1574 + }, + { + "epoch": 0.17582233571600506, + "grad_norm": 0.23674359917640686, + "learning_rate": 1.0625e-05, + "loss": 2.3127, + "step": 1575 + }, + { + "epoch": 0.1759339689450311, + "grad_norm": 0.22703874111175537, + "learning_rate": 1.06e-05, + "loss": 2.4281, + "step": 1576 + }, + { + "epoch": 0.17604560217405713, + "grad_norm": 0.2311123013496399, + "learning_rate": 1.0575e-05, + "loss": 2.3588, + "step": 1577 + }, + { + "epoch": 0.17615723540308317, + "grad_norm": 0.22461971640586853, + "learning_rate": 1.055e-05, + "loss": 2.3376, + "step": 1578 + }, + { + "epoch": 0.1762688686321092, + "grad_norm": 0.2341393530368805, + "learning_rate": 1.0525e-05, + "loss": 2.3273, + "step": 1579 + }, + { + "epoch": 0.17638050186113524, + "grad_norm": 0.24818100035190582, + "learning_rate": 1.05e-05, + "loss": 2.4611, + "step": 1580 + }, + { + "epoch": 0.17649213509016126, + "grad_norm": 0.23257790505886078, + "learning_rate": 1.0475e-05, + "loss": 2.2885, + "step": 1581 + }, + { + "epoch": 0.1766037683191873, + "grad_norm": 0.23357973992824554, + "learning_rate": 1.045e-05, + "loss": 2.2287, + "step": 1582 + }, + { + "epoch": 0.17671540154821336, + "grad_norm": 0.22697918117046356, + "learning_rate": 1.0425e-05, + "loss": 2.2872, + "step": 1583 + }, + { + "epoch": 0.17682703477723938, + "grad_norm": 0.2292308658361435, + "learning_rate": 1.04e-05, + "loss": 2.332, + "step": 1584 + }, + { + "epoch": 0.17693866800626543, + "grad_norm": 0.235504150390625, + "learning_rate": 1.0375e-05, + "loss": 2.3714, + "step": 1585 + }, + { + "epoch": 0.17705030123529145, + "grad_norm": 0.22994711995124817, + "learning_rate": 1.035e-05, + "loss": 2.3468, + "step": 1586 + }, + { + "epoch": 0.1771619344643175, + "grad_norm": 0.23801718652248383, + "learning_rate": 1.0325e-05, + "loss": 2.3851, + "step": 1587 + }, + { + "epoch": 0.17727356769334351, + "grad_norm": 0.22894729673862457, + "learning_rate": 1.03e-05, + "loss": 2.4219, + "step": 1588 + }, + { + "epoch": 0.17738520092236956, + "grad_norm": 0.2282966822385788, + "learning_rate": 1.0275e-05, + "loss": 2.3184, + "step": 1589 + }, + { + "epoch": 0.17749683415139558, + "grad_norm": 0.22595593333244324, + "learning_rate": 1.025e-05, + "loss": 2.3154, + "step": 1590 + }, + { + "epoch": 0.17760846738042163, + "grad_norm": 0.2264798879623413, + "learning_rate": 1.0225e-05, + "loss": 2.2779, + "step": 1591 + }, + { + "epoch": 0.17772010060944765, + "grad_norm": 0.22668862342834473, + "learning_rate": 1.02e-05, + "loss": 2.4036, + "step": 1592 + }, + { + "epoch": 0.1778317338384737, + "grad_norm": 0.22386161983013153, + "learning_rate": 1.0175e-05, + "loss": 2.2603, + "step": 1593 + }, + { + "epoch": 0.17794336706749972, + "grad_norm": 0.23508307337760925, + "learning_rate": 1.0150000000000001e-05, + "loss": 2.3921, + "step": 1594 + }, + { + "epoch": 0.17805500029652577, + "grad_norm": 0.2235022783279419, + "learning_rate": 1.0125e-05, + "loss": 2.3779, + "step": 1595 + }, + { + "epoch": 0.1781666335255518, + "grad_norm": 0.2393585741519928, + "learning_rate": 1.0100000000000002e-05, + "loss": 2.3471, + "step": 1596 + }, + { + "epoch": 0.17827826675457784, + "grad_norm": 0.22901956737041473, + "learning_rate": 1.0075000000000001e-05, + "loss": 2.2271, + "step": 1597 + }, + { + "epoch": 0.17838989998360386, + "grad_norm": 0.22379013895988464, + "learning_rate": 1.005e-05, + "loss": 2.3179, + "step": 1598 + }, + { + "epoch": 0.1785015332126299, + "grad_norm": 0.23061637580394745, + "learning_rate": 1.0025000000000001e-05, + "loss": 2.4403, + "step": 1599 + }, + { + "epoch": 0.17861316644165595, + "grad_norm": 0.22944380342960358, + "learning_rate": 1e-05, + "loss": 2.3587, + "step": 1600 + }, + { + "epoch": 0.17872479967068197, + "grad_norm": 0.22462834417819977, + "learning_rate": 9.975e-06, + "loss": 2.366, + "step": 1601 + }, + { + "epoch": 0.17883643289970802, + "grad_norm": 0.22976066172122955, + "learning_rate": 9.950000000000001e-06, + "loss": 2.3952, + "step": 1602 + }, + { + "epoch": 0.17894806612873404, + "grad_norm": 0.22779542207717896, + "learning_rate": 9.925e-06, + "loss": 2.3513, + "step": 1603 + }, + { + "epoch": 0.1790596993577601, + "grad_norm": 0.22328883409500122, + "learning_rate": 9.900000000000002e-06, + "loss": 2.3977, + "step": 1604 + }, + { + "epoch": 0.1791713325867861, + "grad_norm": 0.2311943769454956, + "learning_rate": 9.875000000000001e-06, + "loss": 2.2658, + "step": 1605 + }, + { + "epoch": 0.17928296581581216, + "grad_norm": 0.2562137246131897, + "learning_rate": 9.85e-06, + "loss": 2.3592, + "step": 1606 + }, + { + "epoch": 0.17939459904483818, + "grad_norm": 0.2313629388809204, + "learning_rate": 9.825000000000002e-06, + "loss": 2.2974, + "step": 1607 + }, + { + "epoch": 0.17950623227386422, + "grad_norm": 0.23150859773159027, + "learning_rate": 9.800000000000001e-06, + "loss": 2.428, + "step": 1608 + }, + { + "epoch": 0.17961786550289024, + "grad_norm": 0.23653681576251984, + "learning_rate": 9.775e-06, + "loss": 2.2849, + "step": 1609 + }, + { + "epoch": 0.1797294987319163, + "grad_norm": 0.23083879053592682, + "learning_rate": 9.750000000000002e-06, + "loss": 2.3614, + "step": 1610 + }, + { + "epoch": 0.1798411319609423, + "grad_norm": 0.23122479021549225, + "learning_rate": 9.725000000000001e-06, + "loss": 2.3737, + "step": 1611 + }, + { + "epoch": 0.17995276518996836, + "grad_norm": 0.22781018912792206, + "learning_rate": 9.7e-06, + "loss": 2.3419, + "step": 1612 + }, + { + "epoch": 0.18006439841899438, + "grad_norm": 0.2270815223455429, + "learning_rate": 9.675000000000001e-06, + "loss": 2.3468, + "step": 1613 + }, + { + "epoch": 0.18017603164802043, + "grad_norm": 0.2297106683254242, + "learning_rate": 9.65e-06, + "loss": 2.3164, + "step": 1614 + }, + { + "epoch": 0.18028766487704648, + "grad_norm": 0.22484375536441803, + "learning_rate": 9.625e-06, + "loss": 2.5492, + "step": 1615 + }, + { + "epoch": 0.1803992981060725, + "grad_norm": 0.230937659740448, + "learning_rate": 9.600000000000001e-06, + "loss": 2.3003, + "step": 1616 + }, + { + "epoch": 0.18051093133509855, + "grad_norm": 0.23577038943767548, + "learning_rate": 9.575e-06, + "loss": 2.2976, + "step": 1617 + }, + { + "epoch": 0.18062256456412457, + "grad_norm": 0.23040169477462769, + "learning_rate": 9.55e-06, + "loss": 2.3458, + "step": 1618 + }, + { + "epoch": 0.1807341977931506, + "grad_norm": 0.23320811986923218, + "learning_rate": 9.525000000000001e-06, + "loss": 2.3712, + "step": 1619 + }, + { + "epoch": 0.18084583102217663, + "grad_norm": 0.2216322273015976, + "learning_rate": 9.5e-06, + "loss": 2.3903, + "step": 1620 + }, + { + "epoch": 0.18095746425120268, + "grad_norm": 0.22143380343914032, + "learning_rate": 9.475e-06, + "loss": 2.3709, + "step": 1621 + }, + { + "epoch": 0.1810690974802287, + "grad_norm": 0.22373448312282562, + "learning_rate": 9.450000000000001e-06, + "loss": 2.3535, + "step": 1622 + }, + { + "epoch": 0.18118073070925475, + "grad_norm": 0.22735032439231873, + "learning_rate": 9.425e-06, + "loss": 2.3133, + "step": 1623 + }, + { + "epoch": 0.18129236393828077, + "grad_norm": 0.2304399609565735, + "learning_rate": 9.4e-06, + "loss": 2.3577, + "step": 1624 + }, + { + "epoch": 0.18140399716730682, + "grad_norm": 0.22027041018009186, + "learning_rate": 9.375000000000001e-06, + "loss": 2.352, + "step": 1625 + }, + { + "epoch": 0.18151563039633284, + "grad_norm": 0.23104159533977509, + "learning_rate": 9.35e-06, + "loss": 2.3603, + "step": 1626 + }, + { + "epoch": 0.1816272636253589, + "grad_norm": 0.26686009764671326, + "learning_rate": 9.325e-06, + "loss": 2.328, + "step": 1627 + }, + { + "epoch": 0.1817388968543849, + "grad_norm": 0.22924385964870453, + "learning_rate": 9.3e-06, + "loss": 2.3588, + "step": 1628 + }, + { + "epoch": 0.18185053008341096, + "grad_norm": 0.22922426462173462, + "learning_rate": 9.275e-06, + "loss": 2.3412, + "step": 1629 + }, + { + "epoch": 0.18196216331243698, + "grad_norm": 0.22747567296028137, + "learning_rate": 9.25e-06, + "loss": 2.3429, + "step": 1630 + }, + { + "epoch": 0.18207379654146302, + "grad_norm": 0.2208503782749176, + "learning_rate": 9.225e-06, + "loss": 2.2673, + "step": 1631 + }, + { + "epoch": 0.18218542977048907, + "grad_norm": 0.24815091490745544, + "learning_rate": 9.2e-06, + "loss": 2.4107, + "step": 1632 + }, + { + "epoch": 0.1822970629995151, + "grad_norm": 0.23134523630142212, + "learning_rate": 9.175000000000001e-06, + "loss": 2.3822, + "step": 1633 + }, + { + "epoch": 0.18240869622854114, + "grad_norm": 0.2306479662656784, + "learning_rate": 9.15e-06, + "loss": 2.3913, + "step": 1634 + }, + { + "epoch": 0.18252032945756716, + "grad_norm": 0.22402219474315643, + "learning_rate": 9.125e-06, + "loss": 2.3625, + "step": 1635 + }, + { + "epoch": 0.1826319626865932, + "grad_norm": 0.2309318333864212, + "learning_rate": 9.100000000000001e-06, + "loss": 2.4505, + "step": 1636 + }, + { + "epoch": 0.18274359591561923, + "grad_norm": 0.22473448514938354, + "learning_rate": 9.075e-06, + "loss": 2.3079, + "step": 1637 + }, + { + "epoch": 0.18285522914464528, + "grad_norm": 0.22978295385837555, + "learning_rate": 9.05e-06, + "loss": 2.3589, + "step": 1638 + }, + { + "epoch": 0.1829668623736713, + "grad_norm": 0.22440096735954285, + "learning_rate": 9.025e-06, + "loss": 2.4401, + "step": 1639 + }, + { + "epoch": 0.18307849560269734, + "grad_norm": 0.23170286417007446, + "learning_rate": 9e-06, + "loss": 2.3291, + "step": 1640 + }, + { + "epoch": 0.18319012883172336, + "grad_norm": 0.24245621263980865, + "learning_rate": 8.975e-06, + "loss": 2.3008, + "step": 1641 + }, + { + "epoch": 0.1833017620607494, + "grad_norm": 0.23115895688533783, + "learning_rate": 8.95e-06, + "loss": 2.3127, + "step": 1642 + }, + { + "epoch": 0.18341339528977543, + "grad_norm": 0.2309594601392746, + "learning_rate": 8.925e-06, + "loss": 2.3205, + "step": 1643 + }, + { + "epoch": 0.18352502851880148, + "grad_norm": 0.2410878688097, + "learning_rate": 8.9e-06, + "loss": 2.4235, + "step": 1644 + }, + { + "epoch": 0.1836366617478275, + "grad_norm": 0.2556111216545105, + "learning_rate": 8.875e-06, + "loss": 2.4193, + "step": 1645 + }, + { + "epoch": 0.18374829497685355, + "grad_norm": 0.23009131848812103, + "learning_rate": 8.85e-06, + "loss": 2.3439, + "step": 1646 + }, + { + "epoch": 0.18385992820587957, + "grad_norm": 0.23371157050132751, + "learning_rate": 8.825e-06, + "loss": 2.3897, + "step": 1647 + }, + { + "epoch": 0.18397156143490562, + "grad_norm": 0.23305875062942505, + "learning_rate": 8.8e-06, + "loss": 2.2757, + "step": 1648 + }, + { + "epoch": 0.18408319466393167, + "grad_norm": 0.22758939862251282, + "learning_rate": 8.775e-06, + "loss": 2.3526, + "step": 1649 + }, + { + "epoch": 0.18419482789295769, + "grad_norm": 0.28928419947624207, + "learning_rate": 8.75e-06, + "loss": 2.2889, + "step": 1650 + }, + { + "epoch": 0.18430646112198373, + "grad_norm": 0.23369638621807098, + "learning_rate": 8.725e-06, + "loss": 2.3379, + "step": 1651 + }, + { + "epoch": 0.18441809435100975, + "grad_norm": 0.23393464088439941, + "learning_rate": 8.7e-06, + "loss": 2.3827, + "step": 1652 + }, + { + "epoch": 0.1845297275800358, + "grad_norm": 0.22967039048671722, + "learning_rate": 8.674999999999999e-06, + "loss": 2.4529, + "step": 1653 + }, + { + "epoch": 0.18464136080906182, + "grad_norm": 0.311108261346817, + "learning_rate": 8.65e-06, + "loss": 2.2626, + "step": 1654 + }, + { + "epoch": 0.18475299403808787, + "grad_norm": 0.23404546082019806, + "learning_rate": 8.625e-06, + "loss": 2.2972, + "step": 1655 + }, + { + "epoch": 0.1848646272671139, + "grad_norm": 0.2330547720193863, + "learning_rate": 8.599999999999999e-06, + "loss": 2.3406, + "step": 1656 + }, + { + "epoch": 0.18497626049613994, + "grad_norm": 0.22166459262371063, + "learning_rate": 8.575000000000002e-06, + "loss": 2.2893, + "step": 1657 + }, + { + "epoch": 0.18508789372516596, + "grad_norm": 0.2245807945728302, + "learning_rate": 8.550000000000001e-06, + "loss": 2.1937, + "step": 1658 + }, + { + "epoch": 0.185199526954192, + "grad_norm": 0.22609424591064453, + "learning_rate": 8.525e-06, + "loss": 2.3882, + "step": 1659 + }, + { + "epoch": 0.18531116018321803, + "grad_norm": 0.22625020146369934, + "learning_rate": 8.500000000000002e-06, + "loss": 2.3893, + "step": 1660 + }, + { + "epoch": 0.18542279341224407, + "grad_norm": 0.25080451369285583, + "learning_rate": 8.475000000000001e-06, + "loss": 2.2867, + "step": 1661 + }, + { + "epoch": 0.1855344266412701, + "grad_norm": 0.23033380508422852, + "learning_rate": 8.45e-06, + "loss": 2.3523, + "step": 1662 + }, + { + "epoch": 0.18564605987029614, + "grad_norm": 0.23714114725589752, + "learning_rate": 8.425000000000001e-06, + "loss": 2.3399, + "step": 1663 + }, + { + "epoch": 0.18575769309932216, + "grad_norm": 0.23813962936401367, + "learning_rate": 8.400000000000001e-06, + "loss": 2.3729, + "step": 1664 + }, + { + "epoch": 0.1858693263283482, + "grad_norm": 0.22549176216125488, + "learning_rate": 8.375e-06, + "loss": 2.3674, + "step": 1665 + }, + { + "epoch": 0.18598095955737426, + "grad_norm": 0.2233189046382904, + "learning_rate": 8.350000000000001e-06, + "loss": 2.3293, + "step": 1666 + }, + { + "epoch": 0.18609259278640028, + "grad_norm": 0.22096222639083862, + "learning_rate": 8.325e-06, + "loss": 2.3471, + "step": 1667 + }, + { + "epoch": 0.18620422601542633, + "grad_norm": 0.2315039187669754, + "learning_rate": 8.3e-06, + "loss": 2.3451, + "step": 1668 + }, + { + "epoch": 0.18631585924445235, + "grad_norm": 0.23510505259037018, + "learning_rate": 8.275000000000001e-06, + "loss": 2.3604, + "step": 1669 + }, + { + "epoch": 0.1864274924734784, + "grad_norm": 0.23275664448738098, + "learning_rate": 8.25e-06, + "loss": 2.3314, + "step": 1670 + }, + { + "epoch": 0.18653912570250442, + "grad_norm": 0.22906853258609772, + "learning_rate": 8.225e-06, + "loss": 2.3489, + "step": 1671 + }, + { + "epoch": 0.18665075893153046, + "grad_norm": 0.223612442612648, + "learning_rate": 8.200000000000001e-06, + "loss": 2.4229, + "step": 1672 + }, + { + "epoch": 0.18676239216055648, + "grad_norm": 0.22447071969509125, + "learning_rate": 8.175e-06, + "loss": 2.4356, + "step": 1673 + }, + { + "epoch": 0.18687402538958253, + "grad_norm": 0.2319689691066742, + "learning_rate": 8.15e-06, + "loss": 2.3806, + "step": 1674 + }, + { + "epoch": 0.18698565861860855, + "grad_norm": 0.22558631002902985, + "learning_rate": 8.125000000000001e-06, + "loss": 2.3246, + "step": 1675 + }, + { + "epoch": 0.1870972918476346, + "grad_norm": 0.23677797615528107, + "learning_rate": 8.1e-06, + "loss": 2.3814, + "step": 1676 + }, + { + "epoch": 0.18720892507666062, + "grad_norm": 0.2355109453201294, + "learning_rate": 8.075000000000001e-06, + "loss": 2.3477, + "step": 1677 + }, + { + "epoch": 0.18732055830568667, + "grad_norm": 0.22588761150836945, + "learning_rate": 8.050000000000001e-06, + "loss": 2.2773, + "step": 1678 + }, + { + "epoch": 0.1874321915347127, + "grad_norm": 0.22162844240665436, + "learning_rate": 8.025e-06, + "loss": 2.3412, + "step": 1679 + }, + { + "epoch": 0.18754382476373874, + "grad_norm": 0.23201774060726166, + "learning_rate": 8.000000000000001e-06, + "loss": 2.3573, + "step": 1680 + }, + { + "epoch": 0.18765545799276478, + "grad_norm": 0.22092294692993164, + "learning_rate": 7.975e-06, + "loss": 2.3214, + "step": 1681 + }, + { + "epoch": 0.1877670912217908, + "grad_norm": 0.2279776781797409, + "learning_rate": 7.95e-06, + "loss": 2.3555, + "step": 1682 + }, + { + "epoch": 0.18787872445081685, + "grad_norm": 0.23055800795555115, + "learning_rate": 7.925000000000001e-06, + "loss": 2.2783, + "step": 1683 + }, + { + "epoch": 0.18799035767984287, + "grad_norm": 0.22860021889209747, + "learning_rate": 7.9e-06, + "loss": 2.3331, + "step": 1684 + }, + { + "epoch": 0.18810199090886892, + "grad_norm": 0.22126097977161407, + "learning_rate": 7.875e-06, + "loss": 2.3946, + "step": 1685 + }, + { + "epoch": 0.18821362413789494, + "grad_norm": 0.22869837284088135, + "learning_rate": 7.850000000000001e-06, + "loss": 2.2846, + "step": 1686 + }, + { + "epoch": 0.188325257366921, + "grad_norm": 0.23146143555641174, + "learning_rate": 7.825e-06, + "loss": 2.3452, + "step": 1687 + }, + { + "epoch": 0.188436890595947, + "grad_norm": 0.24097077548503876, + "learning_rate": 7.8e-06, + "loss": 2.2416, + "step": 1688 + }, + { + "epoch": 0.18854852382497306, + "grad_norm": 0.23362663388252258, + "learning_rate": 7.775000000000001e-06, + "loss": 2.3445, + "step": 1689 + }, + { + "epoch": 0.18866015705399908, + "grad_norm": 0.23741386830806732, + "learning_rate": 7.75e-06, + "loss": 2.4122, + "step": 1690 + }, + { + "epoch": 0.18877179028302513, + "grad_norm": 0.2494947612285614, + "learning_rate": 7.725e-06, + "loss": 2.2177, + "step": 1691 + }, + { + "epoch": 0.18888342351205115, + "grad_norm": 0.23337315022945404, + "learning_rate": 7.7e-06, + "loss": 2.3019, + "step": 1692 + }, + { + "epoch": 0.1889950567410772, + "grad_norm": 0.2309933602809906, + "learning_rate": 7.675e-06, + "loss": 2.3468, + "step": 1693 + }, + { + "epoch": 0.18910668997010321, + "grad_norm": 0.22482022643089294, + "learning_rate": 7.65e-06, + "loss": 2.2141, + "step": 1694 + }, + { + "epoch": 0.18921832319912926, + "grad_norm": 0.24350865185260773, + "learning_rate": 7.625e-06, + "loss": 2.3257, + "step": 1695 + }, + { + "epoch": 0.18932995642815528, + "grad_norm": 0.2380324900150299, + "learning_rate": 7.6e-06, + "loss": 2.3769, + "step": 1696 + }, + { + "epoch": 0.18944158965718133, + "grad_norm": 0.23215071856975555, + "learning_rate": 7.575e-06, + "loss": 2.3028, + "step": 1697 + }, + { + "epoch": 0.18955322288620738, + "grad_norm": 0.24625477194786072, + "learning_rate": 7.55e-06, + "loss": 2.3829, + "step": 1698 + }, + { + "epoch": 0.1896648561152334, + "grad_norm": 0.22713284194469452, + "learning_rate": 7.525e-06, + "loss": 2.3282, + "step": 1699 + }, + { + "epoch": 0.18977648934425945, + "grad_norm": 0.221211776137352, + "learning_rate": 7.5e-06, + "loss": 2.325, + "step": 1700 + }, + { + "epoch": 0.18988812257328547, + "grad_norm": 0.23911140859127045, + "learning_rate": 7.4750000000000004e-06, + "loss": 2.2369, + "step": 1701 + }, + { + "epoch": 0.18999975580231152, + "grad_norm": 0.22453325986862183, + "learning_rate": 7.45e-06, + "loss": 2.3757, + "step": 1702 + }, + { + "epoch": 0.19011138903133754, + "grad_norm": 0.2336825579404831, + "learning_rate": 7.425e-06, + "loss": 2.2279, + "step": 1703 + }, + { + "epoch": 0.19022302226036358, + "grad_norm": 0.2163713276386261, + "learning_rate": 7.4e-06, + "loss": 2.3206, + "step": 1704 + }, + { + "epoch": 0.1903346554893896, + "grad_norm": 0.23673447966575623, + "learning_rate": 7.375e-06, + "loss": 2.3235, + "step": 1705 + }, + { + "epoch": 0.19044628871841565, + "grad_norm": 0.23599569499492645, + "learning_rate": 7.35e-06, + "loss": 2.4519, + "step": 1706 + }, + { + "epoch": 0.19055792194744167, + "grad_norm": 0.23098653554916382, + "learning_rate": 7.325e-06, + "loss": 2.344, + "step": 1707 + }, + { + "epoch": 0.19066955517646772, + "grad_norm": 0.2260526716709137, + "learning_rate": 7.2999999999999996e-06, + "loss": 2.2734, + "step": 1708 + }, + { + "epoch": 0.19078118840549374, + "grad_norm": 1.934260606765747, + "learning_rate": 7.275e-06, + "loss": 2.3161, + "step": 1709 + }, + { + "epoch": 0.1908928216345198, + "grad_norm": 0.22405433654785156, + "learning_rate": 7.25e-06, + "loss": 2.3647, + "step": 1710 + }, + { + "epoch": 0.1910044548635458, + "grad_norm": 0.22353483736515045, + "learning_rate": 7.2249999999999994e-06, + "loss": 2.3636, + "step": 1711 + }, + { + "epoch": 0.19111608809257186, + "grad_norm": 0.2400137484073639, + "learning_rate": 7.2e-06, + "loss": 2.4212, + "step": 1712 + }, + { + "epoch": 0.19122772132159788, + "grad_norm": 0.22043661773204803, + "learning_rate": 7.175e-06, + "loss": 2.33, + "step": 1713 + }, + { + "epoch": 0.19133935455062392, + "grad_norm": 0.23072798550128937, + "learning_rate": 7.15e-06, + "loss": 2.3748, + "step": 1714 + }, + { + "epoch": 0.19145098777964997, + "grad_norm": 0.22338813543319702, + "learning_rate": 7.1249999999999995e-06, + "loss": 2.3183, + "step": 1715 + }, + { + "epoch": 0.191562621008676, + "grad_norm": 0.23001807928085327, + "learning_rate": 7.1e-06, + "loss": 2.31, + "step": 1716 + }, + { + "epoch": 0.19167425423770204, + "grad_norm": 0.23304982483386993, + "learning_rate": 7.075e-06, + "loss": 2.3071, + "step": 1717 + }, + { + "epoch": 0.19178588746672806, + "grad_norm": 0.2388128638267517, + "learning_rate": 7.049999999999999e-06, + "loss": 2.3441, + "step": 1718 + }, + { + "epoch": 0.1918975206957541, + "grad_norm": 0.22486941516399384, + "learning_rate": 7.025000000000001e-06, + "loss": 2.3495, + "step": 1719 + }, + { + "epoch": 0.19200915392478013, + "grad_norm": 0.23193509876728058, + "learning_rate": 7.000000000000001e-06, + "loss": 2.2735, + "step": 1720 + }, + { + "epoch": 0.19212078715380618, + "grad_norm": 0.23028187453746796, + "learning_rate": 6.975000000000001e-06, + "loss": 2.2935, + "step": 1721 + }, + { + "epoch": 0.1922324203828322, + "grad_norm": 0.22155210375785828, + "learning_rate": 6.950000000000001e-06, + "loss": 2.3499, + "step": 1722 + }, + { + "epoch": 0.19234405361185825, + "grad_norm": 0.23238146305084229, + "learning_rate": 6.925000000000001e-06, + "loss": 2.3038, + "step": 1723 + }, + { + "epoch": 0.19245568684088427, + "grad_norm": 0.2326563447713852, + "learning_rate": 6.900000000000001e-06, + "loss": 2.3849, + "step": 1724 + }, + { + "epoch": 0.1925673200699103, + "grad_norm": 0.23055623471736908, + "learning_rate": 6.875000000000001e-06, + "loss": 2.294, + "step": 1725 + }, + { + "epoch": 0.19267895329893633, + "grad_norm": 0.22514155507087708, + "learning_rate": 6.8500000000000005e-06, + "loss": 2.4518, + "step": 1726 + }, + { + "epoch": 0.19279058652796238, + "grad_norm": 0.22970163822174072, + "learning_rate": 6.825000000000001e-06, + "loss": 2.3659, + "step": 1727 + }, + { + "epoch": 0.1929022197569884, + "grad_norm": 0.22430215775966644, + "learning_rate": 6.800000000000001e-06, + "loss": 2.4064, + "step": 1728 + }, + { + "epoch": 0.19301385298601445, + "grad_norm": 0.2254297137260437, + "learning_rate": 6.775000000000001e-06, + "loss": 2.2301, + "step": 1729 + }, + { + "epoch": 0.1931254862150405, + "grad_norm": 0.23394359648227692, + "learning_rate": 6.750000000000001e-06, + "loss": 2.311, + "step": 1730 + }, + { + "epoch": 0.19323711944406652, + "grad_norm": 0.23774048686027527, + "learning_rate": 6.725000000000001e-06, + "loss": 2.3392, + "step": 1731 + }, + { + "epoch": 0.19334875267309257, + "grad_norm": 0.23314325511455536, + "learning_rate": 6.700000000000001e-06, + "loss": 2.2952, + "step": 1732 + }, + { + "epoch": 0.1934603859021186, + "grad_norm": 0.22659903764724731, + "learning_rate": 6.6750000000000005e-06, + "loss": 2.3357, + "step": 1733 + }, + { + "epoch": 0.19357201913114463, + "grad_norm": 0.2298395037651062, + "learning_rate": 6.650000000000001e-06, + "loss": 2.3717, + "step": 1734 + }, + { + "epoch": 0.19368365236017066, + "grad_norm": 0.25462180376052856, + "learning_rate": 6.625000000000001e-06, + "loss": 2.3877, + "step": 1735 + }, + { + "epoch": 0.1937952855891967, + "grad_norm": 0.22525596618652344, + "learning_rate": 6.6e-06, + "loss": 2.2066, + "step": 1736 + }, + { + "epoch": 0.19390691881822272, + "grad_norm": 0.2275206297636032, + "learning_rate": 6.5750000000000006e-06, + "loss": 2.3948, + "step": 1737 + }, + { + "epoch": 0.19401855204724877, + "grad_norm": 0.23473072052001953, + "learning_rate": 6.550000000000001e-06, + "loss": 2.3358, + "step": 1738 + }, + { + "epoch": 0.1941301852762748, + "grad_norm": 0.2280065417289734, + "learning_rate": 6.525e-06, + "loss": 2.2785, + "step": 1739 + }, + { + "epoch": 0.19424181850530084, + "grad_norm": 0.2484257072210312, + "learning_rate": 6.5000000000000004e-06, + "loss": 2.4323, + "step": 1740 + }, + { + "epoch": 0.19435345173432686, + "grad_norm": 0.245782271027565, + "learning_rate": 6.475000000000001e-06, + "loss": 2.3596, + "step": 1741 + }, + { + "epoch": 0.1944650849633529, + "grad_norm": 0.2271096259355545, + "learning_rate": 6.45e-06, + "loss": 2.3226, + "step": 1742 + }, + { + "epoch": 0.19457671819237893, + "grad_norm": 0.23324838280677795, + "learning_rate": 6.425e-06, + "loss": 2.4395, + "step": 1743 + }, + { + "epoch": 0.19468835142140498, + "grad_norm": 0.23519247770309448, + "learning_rate": 6.4000000000000006e-06, + "loss": 2.2311, + "step": 1744 + }, + { + "epoch": 0.194799984650431, + "grad_norm": 0.22893019020557404, + "learning_rate": 6.375000000000001e-06, + "loss": 2.3042, + "step": 1745 + }, + { + "epoch": 0.19491161787945704, + "grad_norm": 0.23204751312732697, + "learning_rate": 6.35e-06, + "loss": 2.3882, + "step": 1746 + }, + { + "epoch": 0.1950232511084831, + "grad_norm": 0.23840852081775665, + "learning_rate": 6.3250000000000004e-06, + "loss": 2.3279, + "step": 1747 + }, + { + "epoch": 0.1951348843375091, + "grad_norm": 0.2263709455728531, + "learning_rate": 6.300000000000001e-06, + "loss": 2.3376, + "step": 1748 + }, + { + "epoch": 0.19524651756653516, + "grad_norm": 0.22863459587097168, + "learning_rate": 6.275e-06, + "loss": 2.3901, + "step": 1749 + }, + { + "epoch": 0.19535815079556118, + "grad_norm": 0.3761008381843567, + "learning_rate": 6.25e-06, + "loss": 2.3816, + "step": 1750 + }, + { + "epoch": 0.19546978402458723, + "grad_norm": 0.22513067722320557, + "learning_rate": 6.2250000000000005e-06, + "loss": 2.2964, + "step": 1751 + }, + { + "epoch": 0.19558141725361325, + "grad_norm": 0.22629620134830475, + "learning_rate": 6.2e-06, + "loss": 2.4423, + "step": 1752 + }, + { + "epoch": 0.1956930504826393, + "grad_norm": 0.24019905924797058, + "learning_rate": 6.175e-06, + "loss": 2.2727, + "step": 1753 + }, + { + "epoch": 0.19580468371166532, + "grad_norm": 0.23448988795280457, + "learning_rate": 6.15e-06, + "loss": 2.3269, + "step": 1754 + }, + { + "epoch": 0.19591631694069137, + "grad_norm": 0.22556640207767487, + "learning_rate": 6.125e-06, + "loss": 2.1874, + "step": 1755 + }, + { + "epoch": 0.19602795016971739, + "grad_norm": 0.3827759027481079, + "learning_rate": 6.1e-06, + "loss": 2.3076, + "step": 1756 + }, + { + "epoch": 0.19613958339874343, + "grad_norm": 0.22821791470050812, + "learning_rate": 6.075e-06, + "loss": 2.3521, + "step": 1757 + }, + { + "epoch": 0.19625121662776945, + "grad_norm": 0.22949843108654022, + "learning_rate": 6.0500000000000005e-06, + "loss": 2.3864, + "step": 1758 + }, + { + "epoch": 0.1963628498567955, + "grad_norm": 0.22942288219928741, + "learning_rate": 6.025e-06, + "loss": 2.3699, + "step": 1759 + }, + { + "epoch": 0.19647448308582152, + "grad_norm": 0.2209300398826599, + "learning_rate": 6e-06, + "loss": 2.3237, + "step": 1760 + }, + { + "epoch": 0.19658611631484757, + "grad_norm": 0.22622691094875336, + "learning_rate": 5.975e-06, + "loss": 2.3339, + "step": 1761 + }, + { + "epoch": 0.1966977495438736, + "grad_norm": 0.23727689683437347, + "learning_rate": 5.95e-06, + "loss": 2.2707, + "step": 1762 + }, + { + "epoch": 0.19680938277289964, + "grad_norm": 0.2961234450340271, + "learning_rate": 5.925e-06, + "loss": 2.2837, + "step": 1763 + }, + { + "epoch": 0.19692101600192569, + "grad_norm": 0.24365472793579102, + "learning_rate": 5.9e-06, + "loss": 2.3899, + "step": 1764 + }, + { + "epoch": 0.1970326492309517, + "grad_norm": 0.23506611585617065, + "learning_rate": 5.875e-06, + "loss": 2.3549, + "step": 1765 + }, + { + "epoch": 0.19714428245997775, + "grad_norm": 0.22377490997314453, + "learning_rate": 5.850000000000001e-06, + "loss": 2.2019, + "step": 1766 + }, + { + "epoch": 0.19725591568900377, + "grad_norm": 0.2170470505952835, + "learning_rate": 5.825000000000001e-06, + "loss": 2.3304, + "step": 1767 + }, + { + "epoch": 0.19736754891802982, + "grad_norm": 0.23338063061237335, + "learning_rate": 5.8e-06, + "loss": 2.3452, + "step": 1768 + }, + { + "epoch": 0.19747918214705584, + "grad_norm": 0.22321484982967377, + "learning_rate": 5.775000000000001e-06, + "loss": 2.321, + "step": 1769 + }, + { + "epoch": 0.1975908153760819, + "grad_norm": 0.23838651180267334, + "learning_rate": 5.750000000000001e-06, + "loss": 2.3604, + "step": 1770 + }, + { + "epoch": 0.1977024486051079, + "grad_norm": 0.23211868107318878, + "learning_rate": 5.725e-06, + "loss": 2.3697, + "step": 1771 + }, + { + "epoch": 0.19781408183413396, + "grad_norm": 0.23045557737350464, + "learning_rate": 5.7000000000000005e-06, + "loss": 2.2953, + "step": 1772 + }, + { + "epoch": 0.19792571506315998, + "grad_norm": 0.23346678912639618, + "learning_rate": 5.675000000000001e-06, + "loss": 2.3694, + "step": 1773 + }, + { + "epoch": 0.19803734829218603, + "grad_norm": 0.23384539783000946, + "learning_rate": 5.65e-06, + "loss": 2.3231, + "step": 1774 + }, + { + "epoch": 0.19814898152121205, + "grad_norm": 0.22029776871204376, + "learning_rate": 5.625e-06, + "loss": 2.2113, + "step": 1775 + }, + { + "epoch": 0.1982606147502381, + "grad_norm": 0.24596329033374786, + "learning_rate": 5.600000000000001e-06, + "loss": 2.2872, + "step": 1776 + }, + { + "epoch": 0.19837224797926412, + "grad_norm": 0.2208588421344757, + "learning_rate": 5.575e-06, + "loss": 2.2893, + "step": 1777 + }, + { + "epoch": 0.19848388120829016, + "grad_norm": 0.27724799513816833, + "learning_rate": 5.55e-06, + "loss": 2.3499, + "step": 1778 + }, + { + "epoch": 0.19859551443731618, + "grad_norm": 0.2821560502052307, + "learning_rate": 5.5250000000000005e-06, + "loss": 2.2631, + "step": 1779 + }, + { + "epoch": 0.19870714766634223, + "grad_norm": 0.2291664034128189, + "learning_rate": 5.500000000000001e-06, + "loss": 2.4097, + "step": 1780 + }, + { + "epoch": 0.19881878089536828, + "grad_norm": 0.2263956367969513, + "learning_rate": 5.475e-06, + "loss": 2.4372, + "step": 1781 + }, + { + "epoch": 0.1989304141243943, + "grad_norm": 0.22560614347457886, + "learning_rate": 5.45e-06, + "loss": 2.3875, + "step": 1782 + }, + { + "epoch": 0.19904204735342035, + "grad_norm": 0.3105284571647644, + "learning_rate": 5.4250000000000006e-06, + "loss": 2.201, + "step": 1783 + }, + { + "epoch": 0.19915368058244637, + "grad_norm": 0.23622126877307892, + "learning_rate": 5.4e-06, + "loss": 2.3385, + "step": 1784 + }, + { + "epoch": 0.19926531381147242, + "grad_norm": 0.23378053307533264, + "learning_rate": 5.375e-06, + "loss": 2.3863, + "step": 1785 + }, + { + "epoch": 0.19937694704049844, + "grad_norm": 0.2226409614086151, + "learning_rate": 5.3500000000000004e-06, + "loss": 2.2851, + "step": 1786 + }, + { + "epoch": 0.19948858026952448, + "grad_norm": 0.2296893298625946, + "learning_rate": 5.325e-06, + "loss": 2.3351, + "step": 1787 + }, + { + "epoch": 0.1996002134985505, + "grad_norm": 0.22921861708164215, + "learning_rate": 5.3e-06, + "loss": 2.2895, + "step": 1788 + }, + { + "epoch": 0.19971184672757655, + "grad_norm": 0.23278868198394775, + "learning_rate": 5.275e-06, + "loss": 2.2488, + "step": 1789 + }, + { + "epoch": 0.19982347995660257, + "grad_norm": 0.2284834384918213, + "learning_rate": 5.25e-06, + "loss": 2.3363, + "step": 1790 + }, + { + "epoch": 0.19993511318562862, + "grad_norm": 0.23466360569000244, + "learning_rate": 5.225e-06, + "loss": 2.2327, + "step": 1791 + }, + { + "epoch": 0.20004674641465464, + "grad_norm": 0.23456375300884247, + "learning_rate": 5.2e-06, + "loss": 2.3441, + "step": 1792 + }, + { + "epoch": 0.2001583796436807, + "grad_norm": 0.7794348001480103, + "learning_rate": 5.175e-06, + "loss": 2.2631, + "step": 1793 + }, + { + "epoch": 0.2002700128727067, + "grad_norm": 0.22322069108486176, + "learning_rate": 5.15e-06, + "loss": 2.209, + "step": 1794 + }, + { + "epoch": 0.20038164610173276, + "grad_norm": 0.2890242636203766, + "learning_rate": 5.125e-06, + "loss": 2.4408, + "step": 1795 + }, + { + "epoch": 0.2004932793307588, + "grad_norm": 0.23017571866512299, + "learning_rate": 5.1e-06, + "loss": 2.3729, + "step": 1796 + }, + { + "epoch": 0.20060491255978483, + "grad_norm": 0.24546416103839874, + "learning_rate": 5.0750000000000005e-06, + "loss": 2.3414, + "step": 1797 + }, + { + "epoch": 0.20071654578881087, + "grad_norm": 0.6677102446556091, + "learning_rate": 5.050000000000001e-06, + "loss": 2.267, + "step": 1798 + }, + { + "epoch": 0.2008281790178369, + "grad_norm": 0.22715935111045837, + "learning_rate": 5.025e-06, + "loss": 2.2485, + "step": 1799 + }, + { + "epoch": 0.20093981224686294, + "grad_norm": 0.24433374404907227, + "learning_rate": 5e-06, + "loss": 2.2448, + "step": 1800 + }, + { + "epoch": 0.20105144547588896, + "grad_norm": 0.2298104465007782, + "learning_rate": 4.975000000000001e-06, + "loss": 2.3481, + "step": 1801 + }, + { + "epoch": 0.201163078704915, + "grad_norm": 0.2273014336824417, + "learning_rate": 4.950000000000001e-06, + "loss": 2.2275, + "step": 1802 + }, + { + "epoch": 0.20127471193394103, + "grad_norm": 0.2342415452003479, + "learning_rate": 4.925e-06, + "loss": 2.3233, + "step": 1803 + }, + { + "epoch": 0.20138634516296708, + "grad_norm": 0.23677963018417358, + "learning_rate": 4.9000000000000005e-06, + "loss": 2.34, + "step": 1804 + }, + { + "epoch": 0.2014979783919931, + "grad_norm": 0.2304297685623169, + "learning_rate": 4.875000000000001e-06, + "loss": 2.4005, + "step": 1805 + }, + { + "epoch": 0.20160961162101915, + "grad_norm": 0.22330938279628754, + "learning_rate": 4.85e-06, + "loss": 2.31, + "step": 1806 + }, + { + "epoch": 0.20172124485004517, + "grad_norm": 0.2279394567012787, + "learning_rate": 4.825e-06, + "loss": 2.2854, + "step": 1807 + }, + { + "epoch": 0.20183287807907122, + "grad_norm": 0.22466933727264404, + "learning_rate": 4.800000000000001e-06, + "loss": 2.3226, + "step": 1808 + }, + { + "epoch": 0.20194451130809724, + "grad_norm": 0.2288239598274231, + "learning_rate": 4.775e-06, + "loss": 2.2858, + "step": 1809 + }, + { + "epoch": 0.20205614453712328, + "grad_norm": 0.22833405435085297, + "learning_rate": 4.75e-06, + "loss": 2.3368, + "step": 1810 + }, + { + "epoch": 0.2021677777661493, + "grad_norm": 0.23820479214191437, + "learning_rate": 4.7250000000000005e-06, + "loss": 2.3351, + "step": 1811 + }, + { + "epoch": 0.20227941099517535, + "grad_norm": 0.22864072024822235, + "learning_rate": 4.7e-06, + "loss": 2.3744, + "step": 1812 + }, + { + "epoch": 0.2023910442242014, + "grad_norm": 0.22456790506839752, + "learning_rate": 4.675e-06, + "loss": 2.2479, + "step": 1813 + }, + { + "epoch": 0.20250267745322742, + "grad_norm": 0.22676309943199158, + "learning_rate": 4.65e-06, + "loss": 2.4175, + "step": 1814 + }, + { + "epoch": 0.20261431068225347, + "grad_norm": 0.2293044477701187, + "learning_rate": 4.625e-06, + "loss": 2.3798, + "step": 1815 + }, + { + "epoch": 0.2027259439112795, + "grad_norm": 0.23791825771331787, + "learning_rate": 4.6e-06, + "loss": 2.2226, + "step": 1816 + }, + { + "epoch": 0.20283757714030554, + "grad_norm": 0.22495396435260773, + "learning_rate": 4.575e-06, + "loss": 2.3658, + "step": 1817 + }, + { + "epoch": 0.20294921036933156, + "grad_norm": 0.22882862389087677, + "learning_rate": 4.5500000000000005e-06, + "loss": 2.3152, + "step": 1818 + }, + { + "epoch": 0.2030608435983576, + "grad_norm": 0.24084174633026123, + "learning_rate": 4.525e-06, + "loss": 2.3789, + "step": 1819 + }, + { + "epoch": 0.20317247682738362, + "grad_norm": 0.23165123164653778, + "learning_rate": 4.5e-06, + "loss": 2.3144, + "step": 1820 + }, + { + "epoch": 0.20328411005640967, + "grad_norm": 0.22660130262374878, + "learning_rate": 4.475e-06, + "loss": 2.2471, + "step": 1821 + }, + { + "epoch": 0.2033957432854357, + "grad_norm": 0.23442165553569794, + "learning_rate": 4.45e-06, + "loss": 2.2553, + "step": 1822 + }, + { + "epoch": 0.20350737651446174, + "grad_norm": 0.2298109382390976, + "learning_rate": 4.425e-06, + "loss": 2.3108, + "step": 1823 + }, + { + "epoch": 0.20361900974348776, + "grad_norm": 0.23466314375400543, + "learning_rate": 4.4e-06, + "loss": 2.2141, + "step": 1824 + }, + { + "epoch": 0.2037306429725138, + "grad_norm": 0.2319631427526474, + "learning_rate": 4.375e-06, + "loss": 2.2492, + "step": 1825 + }, + { + "epoch": 0.20384227620153983, + "grad_norm": 0.23453742265701294, + "learning_rate": 4.35e-06, + "loss": 2.2947, + "step": 1826 + }, + { + "epoch": 0.20395390943056588, + "grad_norm": 0.23079949617385864, + "learning_rate": 4.325e-06, + "loss": 2.3735, + "step": 1827 + }, + { + "epoch": 0.2040655426595919, + "grad_norm": 0.2320202887058258, + "learning_rate": 4.2999999999999995e-06, + "loss": 2.2555, + "step": 1828 + }, + { + "epoch": 0.20417717588861795, + "grad_norm": 0.23471979796886444, + "learning_rate": 4.2750000000000006e-06, + "loss": 2.274, + "step": 1829 + }, + { + "epoch": 0.204288809117644, + "grad_norm": 0.23321862518787384, + "learning_rate": 4.250000000000001e-06, + "loss": 2.2911, + "step": 1830 + }, + { + "epoch": 0.20440044234667, + "grad_norm": 0.44062545895576477, + "learning_rate": 4.225e-06, + "loss": 2.3128, + "step": 1831 + }, + { + "epoch": 0.20451207557569606, + "grad_norm": 0.29020875692367554, + "learning_rate": 4.2000000000000004e-06, + "loss": 2.3114, + "step": 1832 + }, + { + "epoch": 0.20462370880472208, + "grad_norm": 0.2328851819038391, + "learning_rate": 4.175000000000001e-06, + "loss": 2.3953, + "step": 1833 + }, + { + "epoch": 0.20473534203374813, + "grad_norm": 0.23879390954971313, + "learning_rate": 4.15e-06, + "loss": 2.2864, + "step": 1834 + }, + { + "epoch": 0.20484697526277415, + "grad_norm": 0.2300870269536972, + "learning_rate": 4.125e-06, + "loss": 2.2513, + "step": 1835 + }, + { + "epoch": 0.2049586084918002, + "grad_norm": 0.23073944449424744, + "learning_rate": 4.1000000000000006e-06, + "loss": 2.3797, + "step": 1836 + }, + { + "epoch": 0.20507024172082622, + "grad_norm": 0.2415030598640442, + "learning_rate": 4.075e-06, + "loss": 2.2337, + "step": 1837 + }, + { + "epoch": 0.20518187494985227, + "grad_norm": 0.22740036249160767, + "learning_rate": 4.05e-06, + "loss": 2.4097, + "step": 1838 + }, + { + "epoch": 0.2052935081788783, + "grad_norm": 0.22621197998523712, + "learning_rate": 4.0250000000000004e-06, + "loss": 2.3186, + "step": 1839 + }, + { + "epoch": 0.20540514140790433, + "grad_norm": 0.22574348747730255, + "learning_rate": 4.000000000000001e-06, + "loss": 2.268, + "step": 1840 + }, + { + "epoch": 0.20551677463693035, + "grad_norm": 0.2359447032213211, + "learning_rate": 3.975e-06, + "loss": 2.3712, + "step": 1841 + }, + { + "epoch": 0.2056284078659564, + "grad_norm": 0.2573161721229553, + "learning_rate": 3.95e-06, + "loss": 2.3214, + "step": 1842 + }, + { + "epoch": 0.20574004109498242, + "grad_norm": 0.2339893877506256, + "learning_rate": 3.9250000000000005e-06, + "loss": 2.3017, + "step": 1843 + }, + { + "epoch": 0.20585167432400847, + "grad_norm": 0.26078861951828003, + "learning_rate": 3.9e-06, + "loss": 2.3372, + "step": 1844 + }, + { + "epoch": 0.2059633075530345, + "grad_norm": 0.23878271877765656, + "learning_rate": 3.875e-06, + "loss": 2.3236, + "step": 1845 + }, + { + "epoch": 0.20607494078206054, + "grad_norm": 0.2323731631040573, + "learning_rate": 3.85e-06, + "loss": 2.2754, + "step": 1846 + }, + { + "epoch": 0.2061865740110866, + "grad_norm": 0.23449857532978058, + "learning_rate": 3.825e-06, + "loss": 2.3331, + "step": 1847 + }, + { + "epoch": 0.2062982072401126, + "grad_norm": 0.2303149700164795, + "learning_rate": 3.8e-06, + "loss": 2.2017, + "step": 1848 + }, + { + "epoch": 0.20640984046913866, + "grad_norm": 0.21848297119140625, + "learning_rate": 3.775e-06, + "loss": 2.383, + "step": 1849 + }, + { + "epoch": 0.20652147369816468, + "grad_norm": 0.22456365823745728, + "learning_rate": 3.75e-06, + "loss": 2.2835, + "step": 1850 + }, + { + "epoch": 0.20663310692719072, + "grad_norm": 0.23150286078453064, + "learning_rate": 3.725e-06, + "loss": 2.3369, + "step": 1851 + }, + { + "epoch": 0.20674474015621674, + "grad_norm": 0.2399853765964508, + "learning_rate": 3.7e-06, + "loss": 2.3235, + "step": 1852 + }, + { + "epoch": 0.2068563733852428, + "grad_norm": 0.22788886725902557, + "learning_rate": 3.675e-06, + "loss": 2.3365, + "step": 1853 + }, + { + "epoch": 0.2069680066142688, + "grad_norm": 0.22432076930999756, + "learning_rate": 3.6499999999999998e-06, + "loss": 2.2445, + "step": 1854 + }, + { + "epoch": 0.20707963984329486, + "grad_norm": 0.230990469455719, + "learning_rate": 3.625e-06, + "loss": 2.3849, + "step": 1855 + }, + { + "epoch": 0.20719127307232088, + "grad_norm": 0.22127412259578705, + "learning_rate": 3.6e-06, + "loss": 2.3069, + "step": 1856 + }, + { + "epoch": 0.20730290630134693, + "grad_norm": 0.22421187162399292, + "learning_rate": 3.575e-06, + "loss": 2.3637, + "step": 1857 + }, + { + "epoch": 0.20741453953037295, + "grad_norm": 0.22667351365089417, + "learning_rate": 3.55e-06, + "loss": 2.226, + "step": 1858 + }, + { + "epoch": 0.207526172759399, + "grad_norm": 0.24504856765270233, + "learning_rate": 3.5249999999999997e-06, + "loss": 2.2467, + "step": 1859 + }, + { + "epoch": 0.20763780598842502, + "grad_norm": 0.21391868591308594, + "learning_rate": 3.5000000000000004e-06, + "loss": 2.2939, + "step": 1860 + }, + { + "epoch": 0.20774943921745107, + "grad_norm": 0.2345769703388214, + "learning_rate": 3.4750000000000006e-06, + "loss": 2.3104, + "step": 1861 + }, + { + "epoch": 0.2078610724464771, + "grad_norm": 0.2319745272397995, + "learning_rate": 3.4500000000000004e-06, + "loss": 2.2999, + "step": 1862 + }, + { + "epoch": 0.20797270567550313, + "grad_norm": 0.2287844866514206, + "learning_rate": 3.4250000000000002e-06, + "loss": 2.2385, + "step": 1863 + }, + { + "epoch": 0.20808433890452918, + "grad_norm": 0.22918793559074402, + "learning_rate": 3.4000000000000005e-06, + "loss": 2.4109, + "step": 1864 + }, + { + "epoch": 0.2081959721335552, + "grad_norm": 0.23404212296009064, + "learning_rate": 3.3750000000000003e-06, + "loss": 2.3283, + "step": 1865 + }, + { + "epoch": 0.20830760536258125, + "grad_norm": 0.22943396866321564, + "learning_rate": 3.3500000000000005e-06, + "loss": 2.366, + "step": 1866 + }, + { + "epoch": 0.20841923859160727, + "grad_norm": 0.2246488332748413, + "learning_rate": 3.3250000000000004e-06, + "loss": 2.3395, + "step": 1867 + }, + { + "epoch": 0.20853087182063332, + "grad_norm": 0.24497540295124054, + "learning_rate": 3.3e-06, + "loss": 2.3761, + "step": 1868 + }, + { + "epoch": 0.20864250504965934, + "grad_norm": 0.22545543313026428, + "learning_rate": 3.2750000000000004e-06, + "loss": 2.1653, + "step": 1869 + }, + { + "epoch": 0.20875413827868539, + "grad_norm": 0.22694288194179535, + "learning_rate": 3.2500000000000002e-06, + "loss": 2.3608, + "step": 1870 + }, + { + "epoch": 0.2088657715077114, + "grad_norm": 0.23120814561843872, + "learning_rate": 3.225e-06, + "loss": 2.1423, + "step": 1871 + }, + { + "epoch": 0.20897740473673745, + "grad_norm": 0.22243161499500275, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.3699, + "step": 1872 + }, + { + "epoch": 0.20908903796576347, + "grad_norm": 0.233660027384758, + "learning_rate": 3.175e-06, + "loss": 2.213, + "step": 1873 + }, + { + "epoch": 0.20920067119478952, + "grad_norm": 0.2548787593841553, + "learning_rate": 3.1500000000000003e-06, + "loss": 2.3905, + "step": 1874 + }, + { + "epoch": 0.20931230442381554, + "grad_norm": 0.22625088691711426, + "learning_rate": 3.125e-06, + "loss": 2.3932, + "step": 1875 + }, + { + "epoch": 0.2094239376528416, + "grad_norm": 0.214766263961792, + "learning_rate": 3.1e-06, + "loss": 2.2858, + "step": 1876 + }, + { + "epoch": 0.2095355708818676, + "grad_norm": 0.22801010310649872, + "learning_rate": 3.075e-06, + "loss": 2.3836, + "step": 1877 + }, + { + "epoch": 0.20964720411089366, + "grad_norm": 0.21751223504543304, + "learning_rate": 3.05e-06, + "loss": 2.3788, + "step": 1878 + }, + { + "epoch": 0.2097588373399197, + "grad_norm": 0.23438245058059692, + "learning_rate": 3.0250000000000003e-06, + "loss": 2.3414, + "step": 1879 + }, + { + "epoch": 0.20987047056894573, + "grad_norm": 0.23169319331645966, + "learning_rate": 3e-06, + "loss": 2.386, + "step": 1880 + }, + { + "epoch": 0.20998210379797178, + "grad_norm": 0.46034878492355347, + "learning_rate": 2.975e-06, + "loss": 2.3816, + "step": 1881 + }, + { + "epoch": 0.2100937370269978, + "grad_norm": 0.2378452867269516, + "learning_rate": 2.95e-06, + "loss": 2.3081, + "step": 1882 + }, + { + "epoch": 0.21020537025602384, + "grad_norm": 0.23349280655384064, + "learning_rate": 2.9250000000000004e-06, + "loss": 2.3548, + "step": 1883 + }, + { + "epoch": 0.21031700348504986, + "grad_norm": 0.2473306655883789, + "learning_rate": 2.9e-06, + "loss": 2.3103, + "step": 1884 + }, + { + "epoch": 0.2104286367140759, + "grad_norm": 0.22535374760627747, + "learning_rate": 2.8750000000000004e-06, + "loss": 2.3863, + "step": 1885 + }, + { + "epoch": 0.21054026994310193, + "grad_norm": 0.8349050283432007, + "learning_rate": 2.8500000000000002e-06, + "loss": 2.3809, + "step": 1886 + }, + { + "epoch": 0.21065190317212798, + "grad_norm": 0.22305621206760406, + "learning_rate": 2.825e-06, + "loss": 2.3529, + "step": 1887 + }, + { + "epoch": 0.210763536401154, + "grad_norm": 0.23024660348892212, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.4181, + "step": 1888 + }, + { + "epoch": 0.21087516963018005, + "grad_norm": 0.22768965363502502, + "learning_rate": 2.775e-06, + "loss": 2.2139, + "step": 1889 + }, + { + "epoch": 0.21098680285920607, + "grad_norm": 0.22666402161121368, + "learning_rate": 2.7500000000000004e-06, + "loss": 2.3946, + "step": 1890 + }, + { + "epoch": 0.21109843608823212, + "grad_norm": 0.24159802496433258, + "learning_rate": 2.725e-06, + "loss": 2.2891, + "step": 1891 + }, + { + "epoch": 0.21121006931725814, + "grad_norm": 0.21914929151535034, + "learning_rate": 2.7e-06, + "loss": 2.4024, + "step": 1892 + }, + { + "epoch": 0.21132170254628418, + "grad_norm": 0.2383839637041092, + "learning_rate": 2.6750000000000002e-06, + "loss": 2.3293, + "step": 1893 + }, + { + "epoch": 0.2114333357753102, + "grad_norm": 0.24502025544643402, + "learning_rate": 2.65e-06, + "loss": 2.3402, + "step": 1894 + }, + { + "epoch": 0.21154496900433625, + "grad_norm": 0.23778797686100006, + "learning_rate": 2.625e-06, + "loss": 2.4104, + "step": 1895 + }, + { + "epoch": 0.2116566022333623, + "grad_norm": 0.2225978523492813, + "learning_rate": 2.6e-06, + "loss": 2.3451, + "step": 1896 + }, + { + "epoch": 0.21176823546238832, + "grad_norm": 0.2651173174381256, + "learning_rate": 2.575e-06, + "loss": 2.3004, + "step": 1897 + }, + { + "epoch": 0.21187986869141437, + "grad_norm": 0.21840894222259521, + "learning_rate": 2.55e-06, + "loss": 2.3576, + "step": 1898 + }, + { + "epoch": 0.2119915019204404, + "grad_norm": 0.2707866132259369, + "learning_rate": 2.5250000000000004e-06, + "loss": 2.3301, + "step": 1899 + }, + { + "epoch": 0.21210313514946644, + "grad_norm": 0.2320091277360916, + "learning_rate": 2.5e-06, + "loss": 2.4186, + "step": 1900 + }, + { + "epoch": 0.21221476837849246, + "grad_norm": 0.22834016382694244, + "learning_rate": 2.4750000000000004e-06, + "loss": 2.3018, + "step": 1901 + }, + { + "epoch": 0.2123264016075185, + "grad_norm": 0.2285088747739792, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.3471, + "step": 1902 + }, + { + "epoch": 0.21243803483654453, + "grad_norm": 0.2231675386428833, + "learning_rate": 2.425e-06, + "loss": 2.2663, + "step": 1903 + }, + { + "epoch": 0.21254966806557057, + "grad_norm": 0.23813290894031525, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2843, + "step": 1904 + }, + { + "epoch": 0.2126613012945966, + "grad_norm": 0.23623359203338623, + "learning_rate": 2.375e-06, + "loss": 2.3532, + "step": 1905 + }, + { + "epoch": 0.21277293452362264, + "grad_norm": 0.24262453615665436, + "learning_rate": 2.35e-06, + "loss": 2.2889, + "step": 1906 + }, + { + "epoch": 0.21288456775264866, + "grad_norm": 0.32345688343048096, + "learning_rate": 2.325e-06, + "loss": 2.3914, + "step": 1907 + }, + { + "epoch": 0.2129962009816747, + "grad_norm": 0.23502293229103088, + "learning_rate": 2.3e-06, + "loss": 2.4402, + "step": 1908 + }, + { + "epoch": 0.21310783421070073, + "grad_norm": 0.23672327399253845, + "learning_rate": 2.2750000000000002e-06, + "loss": 2.2715, + "step": 1909 + }, + { + "epoch": 0.21321946743972678, + "grad_norm": 0.2367829978466034, + "learning_rate": 2.25e-06, + "loss": 2.264, + "step": 1910 + }, + { + "epoch": 0.2133311006687528, + "grad_norm": 0.2541966140270233, + "learning_rate": 2.225e-06, + "loss": 2.3563, + "step": 1911 + }, + { + "epoch": 0.21344273389777885, + "grad_norm": 0.238334059715271, + "learning_rate": 2.2e-06, + "loss": 2.4153, + "step": 1912 + }, + { + "epoch": 0.2135543671268049, + "grad_norm": 0.2234756350517273, + "learning_rate": 2.175e-06, + "loss": 2.256, + "step": 1913 + }, + { + "epoch": 0.21366600035583092, + "grad_norm": 0.24005568027496338, + "learning_rate": 2.1499999999999997e-06, + "loss": 2.23, + "step": 1914 + }, + { + "epoch": 0.21377763358485696, + "grad_norm": 0.23683962225914001, + "learning_rate": 2.1250000000000004e-06, + "loss": 2.301, + "step": 1915 + }, + { + "epoch": 0.21388926681388298, + "grad_norm": 0.23090413212776184, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.3411, + "step": 1916 + }, + { + "epoch": 0.21400090004290903, + "grad_norm": 0.3235081434249878, + "learning_rate": 2.075e-06, + "loss": 2.3339, + "step": 1917 + }, + { + "epoch": 0.21411253327193505, + "grad_norm": 0.284463495016098, + "learning_rate": 2.0500000000000003e-06, + "loss": 2.3223, + "step": 1918 + }, + { + "epoch": 0.2142241665009611, + "grad_norm": 0.24401739239692688, + "learning_rate": 2.025e-06, + "loss": 2.252, + "step": 1919 + }, + { + "epoch": 0.21433579972998712, + "grad_norm": 0.22095558047294617, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.1969, + "step": 1920 + }, + { + "epoch": 0.21444743295901317, + "grad_norm": 0.2635052502155304, + "learning_rate": 1.975e-06, + "loss": 2.2948, + "step": 1921 + }, + { + "epoch": 0.2145590661880392, + "grad_norm": 0.227258563041687, + "learning_rate": 1.95e-06, + "loss": 2.3566, + "step": 1922 + }, + { + "epoch": 0.21467069941706524, + "grad_norm": 0.2326432466506958, + "learning_rate": 1.925e-06, + "loss": 2.2968, + "step": 1923 + }, + { + "epoch": 0.21478233264609126, + "grad_norm": 0.23958462476730347, + "learning_rate": 1.9e-06, + "loss": 2.1632, + "step": 1924 + }, + { + "epoch": 0.2148939658751173, + "grad_norm": 0.23388223350048065, + "learning_rate": 1.875e-06, + "loss": 2.363, + "step": 1925 + }, + { + "epoch": 0.21500559910414332, + "grad_norm": 0.22733284533023834, + "learning_rate": 1.85e-06, + "loss": 2.3312, + "step": 1926 + }, + { + "epoch": 0.21511723233316937, + "grad_norm": 0.24033506214618683, + "learning_rate": 1.8249999999999999e-06, + "loss": 2.2992, + "step": 1927 + }, + { + "epoch": 0.21522886556219542, + "grad_norm": 0.23118910193443298, + "learning_rate": 1.8e-06, + "loss": 2.3916, + "step": 1928 + }, + { + "epoch": 0.21534049879122144, + "grad_norm": 0.23018263280391693, + "learning_rate": 1.775e-06, + "loss": 2.2243, + "step": 1929 + }, + { + "epoch": 0.2154521320202475, + "grad_norm": 0.2350279539823532, + "learning_rate": 1.7500000000000002e-06, + "loss": 2.3228, + "step": 1930 + }, + { + "epoch": 0.2155637652492735, + "grad_norm": 0.23205231130123138, + "learning_rate": 1.7250000000000002e-06, + "loss": 2.3709, + "step": 1931 + }, + { + "epoch": 0.21567539847829956, + "grad_norm": 0.23173397779464722, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.3956, + "step": 1932 + }, + { + "epoch": 0.21578703170732558, + "grad_norm": 0.23224657773971558, + "learning_rate": 1.6750000000000003e-06, + "loss": 2.3998, + "step": 1933 + }, + { + "epoch": 0.21589866493635163, + "grad_norm": 0.23145684599876404, + "learning_rate": 1.65e-06, + "loss": 2.3027, + "step": 1934 + }, + { + "epoch": 0.21601029816537765, + "grad_norm": 0.22632840275764465, + "learning_rate": 1.6250000000000001e-06, + "loss": 2.3666, + "step": 1935 + }, + { + "epoch": 0.2161219313944037, + "grad_norm": 0.2313094288110733, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.2143, + "step": 1936 + }, + { + "epoch": 0.2162335646234297, + "grad_norm": 0.24646909534931183, + "learning_rate": 1.5750000000000002e-06, + "loss": 2.2228, + "step": 1937 + }, + { + "epoch": 0.21634519785245576, + "grad_norm": 0.2413366138935089, + "learning_rate": 1.55e-06, + "loss": 2.4665, + "step": 1938 + }, + { + "epoch": 0.21645683108148178, + "grad_norm": 0.22983328998088837, + "learning_rate": 1.525e-06, + "loss": 2.3433, + "step": 1939 + }, + { + "epoch": 0.21656846431050783, + "grad_norm": 0.23430456221103668, + "learning_rate": 1.5e-06, + "loss": 2.2489, + "step": 1940 + }, + { + "epoch": 0.21668009753953385, + "grad_norm": 0.2480006217956543, + "learning_rate": 1.475e-06, + "loss": 2.3227, + "step": 1941 + }, + { + "epoch": 0.2167917307685599, + "grad_norm": 0.2268451601266861, + "learning_rate": 1.45e-06, + "loss": 2.4032, + "step": 1942 + }, + { + "epoch": 0.21690336399758592, + "grad_norm": 0.30644339323043823, + "learning_rate": 1.4250000000000001e-06, + "loss": 2.3085, + "step": 1943 + }, + { + "epoch": 0.21701499722661197, + "grad_norm": 0.2343900352716446, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.291, + "step": 1944 + }, + { + "epoch": 0.21712663045563801, + "grad_norm": 0.22589880228042603, + "learning_rate": 1.3750000000000002e-06, + "loss": 2.3656, + "step": 1945 + }, + { + "epoch": 0.21723826368466403, + "grad_norm": 0.2340187281370163, + "learning_rate": 1.35e-06, + "loss": 2.4029, + "step": 1946 + }, + { + "epoch": 0.21734989691369008, + "grad_norm": 0.23875071108341217, + "learning_rate": 1.325e-06, + "loss": 2.3215, + "step": 1947 + }, + { + "epoch": 0.2174615301427161, + "grad_norm": 0.22057262063026428, + "learning_rate": 1.3e-06, + "loss": 2.4121, + "step": 1948 + }, + { + "epoch": 0.21757316337174215, + "grad_norm": 0.2507198750972748, + "learning_rate": 1.275e-06, + "loss": 2.4401, + "step": 1949 + }, + { + "epoch": 0.21768479660076817, + "grad_norm": 0.22903326153755188, + "learning_rate": 1.25e-06, + "loss": 2.4032, + "step": 1950 + }, + { + "epoch": 0.21779642982979422, + "grad_norm": 0.22481852769851685, + "learning_rate": 1.2250000000000001e-06, + "loss": 2.4677, + "step": 1951 + }, + { + "epoch": 0.21790806305882024, + "grad_norm": 0.23029015958309174, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.2855, + "step": 1952 + }, + { + "epoch": 0.2180196962878463, + "grad_norm": 0.2441205233335495, + "learning_rate": 1.175e-06, + "loss": 2.3884, + "step": 1953 + }, + { + "epoch": 0.2181313295168723, + "grad_norm": 0.22523878514766693, + "learning_rate": 1.15e-06, + "loss": 2.341, + "step": 1954 + }, + { + "epoch": 0.21824296274589836, + "grad_norm": 0.23141464591026306, + "learning_rate": 1.125e-06, + "loss": 2.3801, + "step": 1955 + }, + { + "epoch": 0.21835459597492438, + "grad_norm": 0.23657923936843872, + "learning_rate": 1.1e-06, + "loss": 2.4242, + "step": 1956 + }, + { + "epoch": 0.21846622920395042, + "grad_norm": 0.2270117551088333, + "learning_rate": 1.0749999999999999e-06, + "loss": 2.4428, + "step": 1957 + }, + { + "epoch": 0.21857786243297644, + "grad_norm": 0.23563359677791595, + "learning_rate": 1.0500000000000001e-06, + "loss": 2.3945, + "step": 1958 + }, + { + "epoch": 0.2186894956620025, + "grad_norm": 0.2353026121854782, + "learning_rate": 1.0250000000000001e-06, + "loss": 2.3557, + "step": 1959 + }, + { + "epoch": 0.2188011288910285, + "grad_norm": 0.2238246351480484, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.3597, + "step": 1960 + }, + { + "epoch": 0.21891276212005456, + "grad_norm": 0.2265477329492569, + "learning_rate": 9.75e-07, + "loss": 2.3447, + "step": 1961 + }, + { + "epoch": 0.2190243953490806, + "grad_norm": 0.3757992684841156, + "learning_rate": 9.5e-07, + "loss": 2.3917, + "step": 1962 + }, + { + "epoch": 0.21913602857810663, + "grad_norm": 0.22358457744121552, + "learning_rate": 9.25e-07, + "loss": 2.4566, + "step": 1963 + }, + { + "epoch": 0.21924766180713268, + "grad_norm": 0.2505660355091095, + "learning_rate": 9e-07, + "loss": 2.2797, + "step": 1964 + }, + { + "epoch": 0.2193592950361587, + "grad_norm": 0.2312246561050415, + "learning_rate": 8.750000000000001e-07, + "loss": 2.3255, + "step": 1965 + }, + { + "epoch": 0.21947092826518474, + "grad_norm": 0.23009879887104034, + "learning_rate": 8.500000000000001e-07, + "loss": 2.1822, + "step": 1966 + }, + { + "epoch": 0.21958256149421077, + "grad_norm": 0.2438948005437851, + "learning_rate": 8.25e-07, + "loss": 2.3193, + "step": 1967 + }, + { + "epoch": 0.2196941947232368, + "grad_norm": 0.23041820526123047, + "learning_rate": 8.000000000000001e-07, + "loss": 2.308, + "step": 1968 + }, + { + "epoch": 0.21980582795226283, + "grad_norm": 0.22963666915893555, + "learning_rate": 7.75e-07, + "loss": 2.3703, + "step": 1969 + }, + { + "epoch": 0.21991746118128888, + "grad_norm": 0.2525392174720764, + "learning_rate": 7.5e-07, + "loss": 2.3571, + "step": 1970 + }, + { + "epoch": 0.2200290944103149, + "grad_norm": 0.2333141565322876, + "learning_rate": 7.25e-07, + "loss": 2.3727, + "step": 1971 + }, + { + "epoch": 0.22014072763934095, + "grad_norm": 0.2308010309934616, + "learning_rate": 7.000000000000001e-07, + "loss": 2.2563, + "step": 1972 + }, + { + "epoch": 0.22025236086836697, + "grad_norm": 0.2338629961013794, + "learning_rate": 6.75e-07, + "loss": 2.3202, + "step": 1973 + }, + { + "epoch": 0.22036399409739302, + "grad_norm": 0.23211447894573212, + "learning_rate": 6.5e-07, + "loss": 2.4489, + "step": 1974 + }, + { + "epoch": 0.22047562732641904, + "grad_norm": 0.2359444946050644, + "learning_rate": 6.25e-07, + "loss": 2.3855, + "step": 1975 + }, + { + "epoch": 0.22058726055544509, + "grad_norm": 0.2295287847518921, + "learning_rate": 6.000000000000001e-07, + "loss": 2.2819, + "step": 1976 + }, + { + "epoch": 0.2206988937844711, + "grad_norm": 0.2276148796081543, + "learning_rate": 5.75e-07, + "loss": 2.2884, + "step": 1977 + }, + { + "epoch": 0.22081052701349715, + "grad_norm": 0.2364000827074051, + "learning_rate": 5.5e-07, + "loss": 2.3854, + "step": 1978 + }, + { + "epoch": 0.2209221602425232, + "grad_norm": 0.22925642132759094, + "learning_rate": 5.250000000000001e-07, + "loss": 2.3348, + "step": 1979 + }, + { + "epoch": 0.22103379347154922, + "grad_norm": 0.41490915417671204, + "learning_rate": 5.000000000000001e-07, + "loss": 2.3006, + "step": 1980 + }, + { + "epoch": 0.22114542670057527, + "grad_norm": 0.23148655891418457, + "learning_rate": 4.75e-07, + "loss": 2.2691, + "step": 1981 + }, + { + "epoch": 0.2212570599296013, + "grad_norm": 0.22861695289611816, + "learning_rate": 4.5e-07, + "loss": 2.3781, + "step": 1982 + }, + { + "epoch": 0.22136869315862734, + "grad_norm": 0.23868124186992645, + "learning_rate": 4.2500000000000006e-07, + "loss": 2.3108, + "step": 1983 + }, + { + "epoch": 0.22148032638765336, + "grad_norm": 0.2650975286960602, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.3865, + "step": 1984 + }, + { + "epoch": 0.2215919596166794, + "grad_norm": 0.230629101395607, + "learning_rate": 3.75e-07, + "loss": 2.3458, + "step": 1985 + }, + { + "epoch": 0.22170359284570543, + "grad_norm": 0.2260117530822754, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.3316, + "step": 1986 + }, + { + "epoch": 0.22181522607473148, + "grad_norm": 0.2325662523508072, + "learning_rate": 3.25e-07, + "loss": 2.3991, + "step": 1987 + }, + { + "epoch": 0.2219268593037575, + "grad_norm": 0.2188185751438141, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.3246, + "step": 1988 + }, + { + "epoch": 0.22203849253278354, + "grad_norm": 0.23791977763175964, + "learning_rate": 2.75e-07, + "loss": 2.3504, + "step": 1989 + }, + { + "epoch": 0.22215012576180956, + "grad_norm": 0.3440607190132141, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.3414, + "step": 1990 + }, + { + "epoch": 0.2222617589908356, + "grad_norm": 0.21880482137203217, + "learning_rate": 2.25e-07, + "loss": 2.3769, + "step": 1991 + }, + { + "epoch": 0.22237339221986163, + "grad_norm": 0.23022335767745972, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.4113, + "step": 1992 + }, + { + "epoch": 0.22248502544888768, + "grad_norm": 0.23188486695289612, + "learning_rate": 1.7500000000000002e-07, + "loss": 2.4116, + "step": 1993 + }, + { + "epoch": 0.22259665867791373, + "grad_norm": 0.23768189549446106, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.2946, + "step": 1994 + }, + { + "epoch": 0.22270829190693975, + "grad_norm": 0.2431269735097885, + "learning_rate": 1.2500000000000002e-07, + "loss": 2.3861, + "step": 1995 + }, + { + "epoch": 0.2228199251359658, + "grad_norm": 0.2846803665161133, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.4809, + "step": 1996 + }, + { + "epoch": 0.22293155836499182, + "grad_norm": 0.22916531562805176, + "learning_rate": 7.500000000000001e-08, + "loss": 2.3044, + "step": 1997 + }, + { + "epoch": 0.22304319159401786, + "grad_norm": 0.25623413920402527, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.3833, + "step": 1998 + }, + { + "epoch": 0.22315482482304388, + "grad_norm": 0.22197332978248596, + "learning_rate": 2.5000000000000002e-08, + "loss": 2.3048, + "step": 1999 + }, + { + "epoch": 0.22326645805206993, + "grad_norm": 0.2170470952987671, + "learning_rate": 0.0, + "loss": 2.3603, + "step": 2000 + }, + { + "epoch": 0.22326645805206993, + "step": 2000, + "total_flos": 1.04520375926784e+18, + "train_loss": 2.3526821104288103, + "train_runtime": 66080.1443, + "train_samples_per_second": 0.969, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 1.0, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.04520375926784e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}