{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991617770326907, "eval_steps": 500, "global_step": 298, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003352891869237217, "grad_norm": 19.94731903076172, "learning_rate": 1.0000000000000002e-06, "loss": 5.9036, "step": 1 }, { "epoch": 0.006705783738474434, "grad_norm": 19.311952590942383, "learning_rate": 2.0000000000000003e-06, "loss": 5.2027, "step": 2 }, { "epoch": 0.010058675607711651, "grad_norm": 20.031103134155273, "learning_rate": 3e-06, "loss": 5.0706, "step": 3 }, { "epoch": 0.013411567476948869, "grad_norm": 20.00600814819336, "learning_rate": 4.000000000000001e-06, "loss": 5.4878, "step": 4 }, { "epoch": 0.016764459346186086, "grad_norm": 21.179712295532227, "learning_rate": 5e-06, "loss": 5.7332, "step": 5 }, { "epoch": 0.020117351215423303, "grad_norm": 20.510826110839844, "learning_rate": 6e-06, "loss": 5.4379, "step": 6 }, { "epoch": 0.02347024308466052, "grad_norm": 24.237993240356445, "learning_rate": 7.000000000000001e-06, "loss": 6.201, "step": 7 }, { "epoch": 0.026823134953897737, "grad_norm": 24.435766220092773, "learning_rate": 8.000000000000001e-06, "loss": 7.488, "step": 8 }, { "epoch": 0.030176026823134954, "grad_norm": 25.343833923339844, "learning_rate": 9e-06, "loss": 7.7776, "step": 9 }, { "epoch": 0.03352891869237217, "grad_norm": 26.141292572021484, "learning_rate": 1e-05, "loss": 6.3217, "step": 10 }, { "epoch": 0.036881810561609385, "grad_norm": 28.506261825561523, "learning_rate": 1.1000000000000001e-05, "loss": 6.2981, "step": 11 }, { "epoch": 0.040234702430846606, "grad_norm": 24.986642837524414, "learning_rate": 1.2e-05, "loss": 6.9347, "step": 12 }, { "epoch": 0.04358759430008382, "grad_norm": 24.410480499267578, "learning_rate": 1.3000000000000001e-05, "loss": 7.4913, "step": 13 }, { "epoch": 0.04694048616932104, "grad_norm": 27.457660675048828, "learning_rate": 1.4000000000000001e-05, "loss": 6.6574, "step": 14 }, { "epoch": 0.050293378038558254, "grad_norm": 25.729991912841797, "learning_rate": 1.5e-05, "loss": 6.4998, "step": 15 }, { "epoch": 0.053646269907795474, "grad_norm": 24.635997772216797, "learning_rate": 1.6000000000000003e-05, "loss": 6.8279, "step": 16 }, { "epoch": 0.05699916177703269, "grad_norm": 27.968725204467773, "learning_rate": 1.7000000000000003e-05, "loss": 5.303, "step": 17 }, { "epoch": 0.06035205364626991, "grad_norm": 25.632286071777344, "learning_rate": 1.8e-05, "loss": 6.2003, "step": 18 }, { "epoch": 0.06370494551550712, "grad_norm": 25.91347885131836, "learning_rate": 1.9e-05, "loss": 5.4608, "step": 19 }, { "epoch": 0.06705783738474434, "grad_norm": 26.693031311035156, "learning_rate": 2e-05, "loss": 6.2744, "step": 20 }, { "epoch": 0.07041072925398156, "grad_norm": 25.103519439697266, "learning_rate": 2.1e-05, "loss": 5.6415, "step": 21 }, { "epoch": 0.07376362112321877, "grad_norm": 23.946666717529297, "learning_rate": 2.2000000000000003e-05, "loss": 5.9613, "step": 22 }, { "epoch": 0.07711651299245599, "grad_norm": 23.824304580688477, "learning_rate": 2.3000000000000003e-05, "loss": 5.3184, "step": 23 }, { "epoch": 0.08046940486169321, "grad_norm": 22.486934661865234, "learning_rate": 2.4e-05, "loss": 4.6439, "step": 24 }, { "epoch": 0.08382229673093043, "grad_norm": 20.246803283691406, "learning_rate": 2.5e-05, "loss": 5.0911, "step": 25 }, { "epoch": 0.08717518860016764, "grad_norm": 21.117258071899414, "learning_rate": 2.6000000000000002e-05, "loss": 4.2909, "step": 26 }, { "epoch": 0.09052808046940486, "grad_norm": 20.302732467651367, "learning_rate": 2.7000000000000002e-05, "loss": 4.0127, "step": 27 }, { "epoch": 0.09388097233864208, "grad_norm": 17.60736656188965, "learning_rate": 2.8000000000000003e-05, "loss": 4.3916, "step": 28 }, { "epoch": 0.0972338642078793, "grad_norm": 21.172948837280273, "learning_rate": 2.9e-05, "loss": 4.2311, "step": 29 }, { "epoch": 0.10058675607711651, "grad_norm": 43.94004440307617, "learning_rate": 3e-05, "loss": 6.7274, "step": 30 }, { "epoch": 0.10393964794635373, "grad_norm": 32.47929000854492, "learning_rate": 3.1e-05, "loss": 7.0579, "step": 31 }, { "epoch": 0.10729253981559095, "grad_norm": 29.528139114379883, "learning_rate": 3.2000000000000005e-05, "loss": 5.5148, "step": 32 }, { "epoch": 0.11064543168482817, "grad_norm": 29.758312225341797, "learning_rate": 3.3e-05, "loss": 4.8453, "step": 33 }, { "epoch": 0.11399832355406538, "grad_norm": 30.82133674621582, "learning_rate": 3.4000000000000007e-05, "loss": 3.8677, "step": 34 }, { "epoch": 0.1173512154233026, "grad_norm": 31.497970581054688, "learning_rate": 3.5e-05, "loss": 3.1699, "step": 35 }, { "epoch": 0.12070410729253982, "grad_norm": 26.460988998413086, "learning_rate": 3.6e-05, "loss": 2.6621, "step": 36 }, { "epoch": 0.12405699916177704, "grad_norm": 25.016969680786133, "learning_rate": 3.7e-05, "loss": 2.168, "step": 37 }, { "epoch": 0.12740989103101424, "grad_norm": 19.797771453857422, "learning_rate": 3.8e-05, "loss": 1.6021, "step": 38 }, { "epoch": 0.13076278290025148, "grad_norm": 21.162372589111328, "learning_rate": 3.9000000000000006e-05, "loss": 1.9339, "step": 39 }, { "epoch": 0.13411567476948869, "grad_norm": 22.99364471435547, "learning_rate": 4e-05, "loss": 1.4826, "step": 40 }, { "epoch": 0.1374685666387259, "grad_norm": 17.153324127197266, "learning_rate": 4.1e-05, "loss": 0.9375, "step": 41 }, { "epoch": 0.14082145850796313, "grad_norm": 17.41748046875, "learning_rate": 4.2e-05, "loss": 1.0149, "step": 42 }, { "epoch": 0.14417435037720033, "grad_norm": 16.24651527404785, "learning_rate": 4.3e-05, "loss": 0.9972, "step": 43 }, { "epoch": 0.14752724224643754, "grad_norm": 12.924613952636719, "learning_rate": 4.4000000000000006e-05, "loss": 0.7106, "step": 44 }, { "epoch": 0.15088013411567477, "grad_norm": 13.725838661193848, "learning_rate": 4.5e-05, "loss": 0.506, "step": 45 }, { "epoch": 0.15423302598491198, "grad_norm": 13.44334888458252, "learning_rate": 4.600000000000001e-05, "loss": 0.5244, "step": 46 }, { "epoch": 0.15758591785414922, "grad_norm": 10.480420112609863, "learning_rate": 4.7e-05, "loss": 0.4254, "step": 47 }, { "epoch": 0.16093880972338642, "grad_norm": 9.2076997756958, "learning_rate": 4.8e-05, "loss": 0.3862, "step": 48 }, { "epoch": 0.16429170159262363, "grad_norm": 17.21129608154297, "learning_rate": 4.9e-05, "loss": 1.2896, "step": 49 }, { "epoch": 0.16764459346186086, "grad_norm": 25.2115535736084, "learning_rate": 5e-05, "loss": 2.0117, "step": 50 }, { "epoch": 0.17099748533109807, "grad_norm": 55.361873626708984, "learning_rate": 5.1000000000000006e-05, "loss": 4.8735, "step": 51 }, { "epoch": 0.17435037720033528, "grad_norm": 41.75767517089844, "learning_rate": 5.2000000000000004e-05, "loss": 3.4571, "step": 52 }, { "epoch": 0.1777032690695725, "grad_norm": 28.46742057800293, "learning_rate": 5.300000000000001e-05, "loss": 2.0218, "step": 53 }, { "epoch": 0.18105616093880972, "grad_norm": 15.05508804321289, "learning_rate": 5.4000000000000005e-05, "loss": 1.3842, "step": 54 }, { "epoch": 0.18440905280804695, "grad_norm": 13.411344528198242, "learning_rate": 5.500000000000001e-05, "loss": 1.0848, "step": 55 }, { "epoch": 0.18776194467728416, "grad_norm": 7.541948318481445, "learning_rate": 5.6000000000000006e-05, "loss": 0.5558, "step": 56 }, { "epoch": 0.19111483654652137, "grad_norm": 11.816786766052246, "learning_rate": 5.6999999999999996e-05, "loss": 0.8748, "step": 57 }, { "epoch": 0.1944677284157586, "grad_norm": 13.176451683044434, "learning_rate": 5.8e-05, "loss": 0.9711, "step": 58 }, { "epoch": 0.1978206202849958, "grad_norm": 13.811378479003906, "learning_rate": 5.9e-05, "loss": 0.8243, "step": 59 }, { "epoch": 0.20117351215423301, "grad_norm": 12.062150001525879, "learning_rate": 6e-05, "loss": 0.6277, "step": 60 }, { "epoch": 0.20452640402347025, "grad_norm": 10.790270805358887, "learning_rate": 6.1e-05, "loss": 0.6866, "step": 61 }, { "epoch": 0.20787929589270746, "grad_norm": 11.196799278259277, "learning_rate": 6.2e-05, "loss": 0.556, "step": 62 }, { "epoch": 0.2112321877619447, "grad_norm": 11.80176067352295, "learning_rate": 6.3e-05, "loss": 0.4936, "step": 63 }, { "epoch": 0.2145850796311819, "grad_norm": 5.4536614418029785, "learning_rate": 6.400000000000001e-05, "loss": 0.4018, "step": 64 }, { "epoch": 0.2179379715004191, "grad_norm": 8.171553611755371, "learning_rate": 6.500000000000001e-05, "loss": 0.6092, "step": 65 }, { "epoch": 0.22129086336965634, "grad_norm": 4.759002208709717, "learning_rate": 6.6e-05, "loss": 0.2932, "step": 66 }, { "epoch": 0.22464375523889354, "grad_norm": 8.26754093170166, "learning_rate": 6.7e-05, "loss": 0.552, "step": 67 }, { "epoch": 0.22799664710813075, "grad_norm": 5.972261428833008, "learning_rate": 6.800000000000001e-05, "loss": 0.4562, "step": 68 }, { "epoch": 0.23134953897736799, "grad_norm": 6.14136266708374, "learning_rate": 6.9e-05, "loss": 0.4001, "step": 69 }, { "epoch": 0.2347024308466052, "grad_norm": 4.529941082000732, "learning_rate": 7e-05, "loss": 0.434, "step": 70 }, { "epoch": 0.23805532271584243, "grad_norm": 6.710460186004639, "learning_rate": 7.1e-05, "loss": 0.3288, "step": 71 }, { "epoch": 0.24140821458507963, "grad_norm": 4.07722282409668, "learning_rate": 7.2e-05, "loss": 0.2644, "step": 72 }, { "epoch": 0.24476110645431684, "grad_norm": 5.167060852050781, "learning_rate": 7.3e-05, "loss": 0.2852, "step": 73 }, { "epoch": 0.24811399832355407, "grad_norm": 4.403326988220215, "learning_rate": 7.4e-05, "loss": 0.3129, "step": 74 }, { "epoch": 0.2514668901927913, "grad_norm": 3.444688558578491, "learning_rate": 7.500000000000001e-05, "loss": 0.2299, "step": 75 }, { "epoch": 0.2548197820620285, "grad_norm": 4.974164009094238, "learning_rate": 7.6e-05, "loss": 0.3063, "step": 76 }, { "epoch": 0.2581726739312657, "grad_norm": 4.73006010055542, "learning_rate": 7.7e-05, "loss": 0.2648, "step": 77 }, { "epoch": 0.26152556580050296, "grad_norm": 4.712124824523926, "learning_rate": 7.800000000000001e-05, "loss": 0.2816, "step": 78 }, { "epoch": 0.26487845766974016, "grad_norm": 6.925064563751221, "learning_rate": 7.900000000000001e-05, "loss": 0.1847, "step": 79 }, { "epoch": 0.26823134953897737, "grad_norm": 21.28022575378418, "learning_rate": 8e-05, "loss": 1.6031, "step": 80 }, { "epoch": 0.2715842414082146, "grad_norm": 24.482501983642578, "learning_rate": 8.1e-05, "loss": 1.5822, "step": 81 }, { "epoch": 0.2749371332774518, "grad_norm": 28.196971893310547, "learning_rate": 8.2e-05, "loss": 1.3814, "step": 82 }, { "epoch": 0.27829002514668905, "grad_norm": 26.377918243408203, "learning_rate": 8.3e-05, "loss": 1.2232, "step": 83 }, { "epoch": 0.28164291701592625, "grad_norm": 32.93387985229492, "learning_rate": 8.4e-05, "loss": 1.0546, "step": 84 }, { "epoch": 0.28499580888516346, "grad_norm": 20.566518783569336, "learning_rate": 8.5e-05, "loss": 0.8103, "step": 85 }, { "epoch": 0.28834870075440067, "grad_norm": 14.28470516204834, "learning_rate": 8.6e-05, "loss": 0.7405, "step": 86 }, { "epoch": 0.2917015926236379, "grad_norm": 7.578380584716797, "learning_rate": 8.7e-05, "loss": 0.359, "step": 87 }, { "epoch": 0.2950544844928751, "grad_norm": 14.096921920776367, "learning_rate": 8.800000000000001e-05, "loss": 0.5263, "step": 88 }, { "epoch": 0.29840737636211234, "grad_norm": 9.082076072692871, "learning_rate": 8.900000000000001e-05, "loss": 0.349, "step": 89 }, { "epoch": 0.30176026823134955, "grad_norm": 9.854619026184082, "learning_rate": 9e-05, "loss": 0.3619, "step": 90 }, { "epoch": 0.30511316010058676, "grad_norm": 6.256463050842285, "learning_rate": 9.1e-05, "loss": 0.3233, "step": 91 }, { "epoch": 0.30846605196982396, "grad_norm": 21.08913230895996, "learning_rate": 9.200000000000001e-05, "loss": 0.2958, "step": 92 }, { "epoch": 0.31181894383906117, "grad_norm": 15.940088272094727, "learning_rate": 9.300000000000001e-05, "loss": 0.381, "step": 93 }, { "epoch": 0.31517183570829843, "grad_norm": 13.932638168334961, "learning_rate": 9.4e-05, "loss": 0.3273, "step": 94 }, { "epoch": 0.31852472757753564, "grad_norm": 4.884693145751953, "learning_rate": 9.5e-05, "loss": 0.3036, "step": 95 }, { "epoch": 0.32187761944677284, "grad_norm": 4.182151794433594, "learning_rate": 9.6e-05, "loss": 0.1351, "step": 96 }, { "epoch": 0.32523051131601005, "grad_norm": 9.422307014465332, "learning_rate": 9.7e-05, "loss": 0.4685, "step": 97 }, { "epoch": 0.32858340318524726, "grad_norm": 18.013978958129883, "learning_rate": 9.8e-05, "loss": 1.235, "step": 98 }, { "epoch": 0.3319362950544845, "grad_norm": 13.848796844482422, "learning_rate": 9.900000000000001e-05, "loss": 0.9726, "step": 99 }, { "epoch": 0.3352891869237217, "grad_norm": 18.036916732788086, "learning_rate": 0.0001, "loss": 1.2473, "step": 100 }, { "epoch": 0.33864207879295893, "grad_norm": 15.80588150024414, "learning_rate": 9.999370638369377e-05, "loss": 1.3471, "step": 101 }, { "epoch": 0.34199497066219614, "grad_norm": 16.674541473388672, "learning_rate": 9.997482711915927e-05, "loss": 1.1381, "step": 102 }, { "epoch": 0.34534786253143335, "grad_norm": 47.35570526123047, "learning_rate": 9.99433669591504e-05, "loss": 1.1454, "step": 103 }, { "epoch": 0.34870075440067055, "grad_norm": 10.346620559692383, "learning_rate": 9.989933382359422e-05, "loss": 0.7687, "step": 104 }, { "epoch": 0.3520536462699078, "grad_norm": 8.792768478393555, "learning_rate": 9.984273879759713e-05, "loss": 0.5802, "step": 105 }, { "epoch": 0.355406538139145, "grad_norm": 5.656138896942139, "learning_rate": 9.977359612865423e-05, "loss": 0.4869, "step": 106 }, { "epoch": 0.35875943000838223, "grad_norm": 12.679141998291016, "learning_rate": 9.969192322306271e-05, "loss": 0.5927, "step": 107 }, { "epoch": 0.36211232187761944, "grad_norm": 9.823601722717285, "learning_rate": 9.959774064153977e-05, "loss": 0.5512, "step": 108 }, { "epoch": 0.36546521374685664, "grad_norm": 11.15606689453125, "learning_rate": 9.949107209404665e-05, "loss": 0.603, "step": 109 }, { "epoch": 0.3688181056160939, "grad_norm": 10.26724910736084, "learning_rate": 9.937194443381972e-05, "loss": 0.4432, "step": 110 }, { "epoch": 0.3721709974853311, "grad_norm": 8.792684555053711, "learning_rate": 9.924038765061042e-05, "loss": 0.5643, "step": 111 }, { "epoch": 0.3755238893545683, "grad_norm": 5.435505390167236, "learning_rate": 9.909643486313533e-05, "loss": 0.4126, "step": 112 }, { "epoch": 0.3788767812238055, "grad_norm": 5.39192533493042, "learning_rate": 9.894012231073894e-05, "loss": 0.4081, "step": 113 }, { "epoch": 0.38222967309304273, "grad_norm": 10.250500679016113, "learning_rate": 9.877148934427037e-05, "loss": 0.4552, "step": 114 }, { "epoch": 0.38558256496227994, "grad_norm": 9.414995193481445, "learning_rate": 9.859057841617709e-05, "loss": 0.4112, "step": 115 }, { "epoch": 0.3889354568315172, "grad_norm": 6.214295387268066, "learning_rate": 9.839743506981782e-05, "loss": 0.3666, "step": 116 }, { "epoch": 0.3922883487007544, "grad_norm": 4.788174629211426, "learning_rate": 9.819210792799712e-05, "loss": 0.273, "step": 117 }, { "epoch": 0.3956412405699916, "grad_norm": 6.027410507202148, "learning_rate": 9.797464868072488e-05, "loss": 0.2291, "step": 118 }, { "epoch": 0.3989941324392288, "grad_norm": 6.023176670074463, "learning_rate": 9.77451120722037e-05, "loss": 0.2822, "step": 119 }, { "epoch": 0.40234702430846603, "grad_norm": 4.367912769317627, "learning_rate": 9.750355588704727e-05, "loss": 0.4041, "step": 120 }, { "epoch": 0.4056999161777033, "grad_norm": 6.125609397888184, "learning_rate": 9.725004093573342e-05, "loss": 0.2858, "step": 121 }, { "epoch": 0.4090528080469405, "grad_norm": 5.797603607177734, "learning_rate": 9.698463103929542e-05, "loss": 0.2407, "step": 122 }, { "epoch": 0.4124056999161777, "grad_norm": 8.285858154296875, "learning_rate": 9.670739301325534e-05, "loss": 0.2792, "step": 123 }, { "epoch": 0.4157585917854149, "grad_norm": 4.349048614501953, "learning_rate": 9.641839665080363e-05, "loss": 0.1981, "step": 124 }, { "epoch": 0.4191114836546521, "grad_norm": 3.2639927864074707, "learning_rate": 9.611771470522908e-05, "loss": 0.2172, "step": 125 }, { "epoch": 0.4224643755238894, "grad_norm": 4.961655616760254, "learning_rate": 9.580542287160348e-05, "loss": 0.1867, "step": 126 }, { "epoch": 0.4258172673931266, "grad_norm": 4.474092483520508, "learning_rate": 9.548159976772592e-05, "loss": 0.2191, "step": 127 }, { "epoch": 0.4291701592623638, "grad_norm": 4.9893927574157715, "learning_rate": 9.514632691433107e-05, "loss": 0.1895, "step": 128 }, { "epoch": 0.432523051131601, "grad_norm": 28.090858459472656, "learning_rate": 9.479968871456679e-05, "loss": 1.7745, "step": 129 }, { "epoch": 0.4358759430008382, "grad_norm": 29.179738998413086, "learning_rate": 9.444177243274618e-05, "loss": 1.6506, "step": 130 }, { "epoch": 0.4392288348700754, "grad_norm": 26.920635223388672, "learning_rate": 9.407266817237911e-05, "loss": 1.1685, "step": 131 }, { "epoch": 0.4425817267393127, "grad_norm": 25.46989631652832, "learning_rate": 9.369246885348926e-05, "loss": 0.9895, "step": 132 }, { "epoch": 0.4459346186085499, "grad_norm": 19.810270309448242, "learning_rate": 9.330127018922194e-05, "loss": 0.4968, "step": 133 }, { "epoch": 0.4492875104777871, "grad_norm": 9.30676555633545, "learning_rate": 9.289917066174886e-05, "loss": 0.5492, "step": 134 }, { "epoch": 0.4526404023470243, "grad_norm": 16.38142204284668, "learning_rate": 9.248627149747573e-05, "loss": 0.6716, "step": 135 }, { "epoch": 0.4559932942162615, "grad_norm": 15.360873222351074, "learning_rate": 9.206267664155907e-05, "loss": 0.588, "step": 136 }, { "epoch": 0.45934618608549876, "grad_norm": 11.33460521697998, "learning_rate": 9.162849273173857e-05, "loss": 0.3989, "step": 137 }, { "epoch": 0.46269907795473597, "grad_norm": 7.337402820587158, "learning_rate": 9.118382907149165e-05, "loss": 0.3023, "step": 138 }, { "epoch": 0.4660519698239732, "grad_norm": 9.812034606933594, "learning_rate": 9.072879760251679e-05, "loss": 0.2999, "step": 139 }, { "epoch": 0.4694048616932104, "grad_norm": 15.64886474609375, "learning_rate": 9.026351287655294e-05, "loss": 0.4581, "step": 140 }, { "epoch": 0.4727577535624476, "grad_norm": 24.609207153320312, "learning_rate": 8.978809202654162e-05, "loss": 0.4414, "step": 141 }, { "epoch": 0.47611064543168485, "grad_norm": 13.780351638793945, "learning_rate": 8.930265473713938e-05, "loss": 0.314, "step": 142 }, { "epoch": 0.47946353730092206, "grad_norm": 4.577981472015381, "learning_rate": 8.880732321458784e-05, "loss": 0.2216, "step": 143 }, { "epoch": 0.48281642917015927, "grad_norm": 6.896910667419434, "learning_rate": 8.83022221559489e-05, "loss": 0.2431, "step": 144 }, { "epoch": 0.4861693210393965, "grad_norm": 10.73836898803711, "learning_rate": 8.778747871771292e-05, "loss": 0.2993, "step": 145 }, { "epoch": 0.4895222129086337, "grad_norm": 5.567845344543457, "learning_rate": 8.726322248378775e-05, "loss": 0.2575, "step": 146 }, { "epoch": 0.4928751047778709, "grad_norm": 9.290807723999023, "learning_rate": 8.672958543287666e-05, "loss": 0.3084, "step": 147 }, { "epoch": 0.49622799664710815, "grad_norm": 13.643046379089355, "learning_rate": 8.618670190525352e-05, "loss": 0.5823, "step": 148 }, { "epoch": 0.49958088851634536, "grad_norm": 11.43275260925293, "learning_rate": 8.563470856894316e-05, "loss": 0.5169, "step": 149 }, { "epoch": 0.5029337803855826, "grad_norm": 16.253604888916016, "learning_rate": 8.507374438531607e-05, "loss": 0.6451, "step": 150 }, { "epoch": 0.5062866722548198, "grad_norm": 14.23026180267334, "learning_rate": 8.450395057410561e-05, "loss": 1.2755, "step": 151 }, { "epoch": 0.509639564124057, "grad_norm": 12.833418846130371, "learning_rate": 8.392547057785661e-05, "loss": 1.0593, "step": 152 }, { "epoch": 0.5129924559932942, "grad_norm": 10.777283668518066, "learning_rate": 8.333845002581458e-05, "loss": 0.7966, "step": 153 }, { "epoch": 0.5163453478625314, "grad_norm": 9.72930908203125, "learning_rate": 8.274303669726426e-05, "loss": 0.5648, "step": 154 }, { "epoch": 0.5196982397317687, "grad_norm": 6.614412784576416, "learning_rate": 8.213938048432697e-05, "loss": 0.5082, "step": 155 }, { "epoch": 0.5230511316010059, "grad_norm": 6.6831889152526855, "learning_rate": 8.152763335422613e-05, "loss": 0.4376, "step": 156 }, { "epoch": 0.5264040234702431, "grad_norm": 5.255478858947754, "learning_rate": 8.090794931103026e-05, "loss": 0.3736, "step": 157 }, { "epoch": 0.5297569153394803, "grad_norm": 8.692307472229004, "learning_rate": 8.028048435688333e-05, "loss": 0.4133, "step": 158 }, { "epoch": 0.5331098072087175, "grad_norm": 7.771063804626465, "learning_rate": 7.964539645273204e-05, "loss": 0.4996, "step": 159 }, { "epoch": 0.5364626990779547, "grad_norm": 7.538997650146484, "learning_rate": 7.900284547855991e-05, "loss": 0.4573, "step": 160 }, { "epoch": 0.539815590947192, "grad_norm": 8.275318145751953, "learning_rate": 7.835299319313853e-05, "loss": 0.3567, "step": 161 }, { "epoch": 0.5431684828164292, "grad_norm": 9.14857292175293, "learning_rate": 7.769600319330552e-05, "loss": 0.3545, "step": 162 }, { "epoch": 0.5465213746856664, "grad_norm": 7.5321784019470215, "learning_rate": 7.703204087277988e-05, "loss": 0.3356, "step": 163 }, { "epoch": 0.5498742665549036, "grad_norm": 5.943683624267578, "learning_rate": 7.636127338052512e-05, "loss": 0.3377, "step": 164 }, { "epoch": 0.5532271584241408, "grad_norm": 4.5078125, "learning_rate": 7.568386957867033e-05, "loss": 0.383, "step": 165 }, { "epoch": 0.5565800502933781, "grad_norm": 5.633090496063232, "learning_rate": 7.500000000000001e-05, "loss": 0.3359, "step": 166 }, { "epoch": 0.5599329421626152, "grad_norm": 4.150753974914551, "learning_rate": 7.430983680502344e-05, "loss": 0.2535, "step": 167 }, { "epoch": 0.5632858340318525, "grad_norm": 7.417738437652588, "learning_rate": 7.361355373863414e-05, "loss": 0.4739, "step": 168 }, { "epoch": 0.5666387259010897, "grad_norm": 4.674182891845703, "learning_rate": 7.291132608637052e-05, "loss": 0.3103, "step": 169 }, { "epoch": 0.5699916177703269, "grad_norm": 2.7935192584991455, "learning_rate": 7.220333063028872e-05, "loss": 0.2336, "step": 170 }, { "epoch": 0.5733445096395641, "grad_norm": 9.859716415405273, "learning_rate": 7.148974560445859e-05, "loss": 0.4477, "step": 171 }, { "epoch": 0.5766974015088013, "grad_norm": 5.132834434509277, "learning_rate": 7.077075065009433e-05, "loss": 0.2547, "step": 172 }, { "epoch": 0.5800502933780386, "grad_norm": 5.565759181976318, "learning_rate": 7.004652677033068e-05, "loss": 0.4095, "step": 173 }, { "epoch": 0.5834031852472757, "grad_norm": 7.700199604034424, "learning_rate": 6.931725628465643e-05, "loss": 0.2234, "step": 174 }, { "epoch": 0.586756077116513, "grad_norm": 3.140568733215332, "learning_rate": 6.858312278301637e-05, "loss": 0.1842, "step": 175 }, { "epoch": 0.5901089689857502, "grad_norm": 7.881420612335205, "learning_rate": 6.784431107959359e-05, "loss": 0.2952, "step": 176 }, { "epoch": 0.5934618608549874, "grad_norm": 4.895866870880127, "learning_rate": 6.710100716628344e-05, "loss": 0.1627, "step": 177 }, { "epoch": 0.5968147527242247, "grad_norm": 6.7675251960754395, "learning_rate": 6.635339816587109e-05, "loss": 0.1709, "step": 178 }, { "epoch": 0.6001676445934618, "grad_norm": 12.553466796875, "learning_rate": 6.560167228492436e-05, "loss": 0.9033, "step": 179 }, { "epoch": 0.6035205364626991, "grad_norm": 25.9533748626709, "learning_rate": 6.484601876641375e-05, "loss": 1.2262, "step": 180 }, { "epoch": 0.6068734283319362, "grad_norm": 25.939783096313477, "learning_rate": 6.408662784207149e-05, "loss": 1.5251, "step": 181 }, { "epoch": 0.6102263202011735, "grad_norm": 31.702119827270508, "learning_rate": 6.332369068450174e-05, "loss": 1.2017, "step": 182 }, { "epoch": 0.6135792120704108, "grad_norm": 24.445140838623047, "learning_rate": 6.255739935905396e-05, "loss": 1.1614, "step": 183 }, { "epoch": 0.6169321039396479, "grad_norm": 26.491666793823242, "learning_rate": 6.178794677547137e-05, "loss": 1.184, "step": 184 }, { "epoch": 0.6202849958088852, "grad_norm": 20.919872283935547, "learning_rate": 6.1015526639327035e-05, "loss": 0.7381, "step": 185 }, { "epoch": 0.6236378876781223, "grad_norm": 77.90556335449219, "learning_rate": 6.024033340325954e-05, "loss": 1.3539, "step": 186 }, { "epoch": 0.6269907795473596, "grad_norm": 21.881759643554688, "learning_rate": 5.946256221802051e-05, "loss": 0.7108, "step": 187 }, { "epoch": 0.6303436714165969, "grad_norm": 15.38471508026123, "learning_rate": 5.868240888334653e-05, "loss": 0.4868, "step": 188 }, { "epoch": 0.633696563285834, "grad_norm": 14.163756370544434, "learning_rate": 5.79000697986675e-05, "loss": 0.6412, "step": 189 }, { "epoch": 0.6370494551550713, "grad_norm": 11.611079216003418, "learning_rate": 5.7115741913664264e-05, "loss": 0.2968, "step": 190 }, { "epoch": 0.6404023470243084, "grad_norm": 13.02933120727539, "learning_rate": 5.6329622678687463e-05, "loss": 0.4897, "step": 191 }, { "epoch": 0.6437552388935457, "grad_norm": 11.208833694458008, "learning_rate": 5.5541909995050554e-05, "loss": 0.5136, "step": 192 }, { "epoch": 0.647108130762783, "grad_norm": 6.404201984405518, "learning_rate": 5.475280216520913e-05, "loss": 0.2015, "step": 193 }, { "epoch": 0.6504610226320201, "grad_norm": 15.304976463317871, "learning_rate": 5.396249784283942e-05, "loss": 0.5075, "step": 194 }, { "epoch": 0.6538139145012574, "grad_norm": 5.351606369018555, "learning_rate": 5.317119598282823e-05, "loss": 0.175, "step": 195 }, { "epoch": 0.6571668063704945, "grad_norm": 6.819107532501221, "learning_rate": 5.2379095791187124e-05, "loss": 0.2753, "step": 196 }, { "epoch": 0.6605196982397318, "grad_norm": 14.04761028289795, "learning_rate": 5.158639667490339e-05, "loss": 0.5383, "step": 197 }, { "epoch": 0.663872590108969, "grad_norm": 6.505035400390625, "learning_rate": 5.0793298191740404e-05, "loss": 0.1789, "step": 198 }, { "epoch": 0.6672254819782062, "grad_norm": 13.04714298248291, "learning_rate": 5e-05, "loss": 0.6869, "step": 199 }, { "epoch": 0.6705783738474435, "grad_norm": 11.471817970275879, "learning_rate": 4.92067018082596e-05, "loss": 0.5482, "step": 200 }, { "epoch": 0.6739312657166806, "grad_norm": 14.056415557861328, "learning_rate": 4.841360332509663e-05, "loss": 0.7455, "step": 201 }, { "epoch": 0.6772841575859179, "grad_norm": 12.744611740112305, "learning_rate": 4.762090420881289e-05, "loss": 0.6835, "step": 202 }, { "epoch": 0.680637049455155, "grad_norm": 13.389442443847656, "learning_rate": 4.6828804017171776e-05, "loss": 0.5843, "step": 203 }, { "epoch": 0.6839899413243923, "grad_norm": 12.951894760131836, "learning_rate": 4.603750215716057e-05, "loss": 0.5312, "step": 204 }, { "epoch": 0.6873428331936295, "grad_norm": 11.330641746520996, "learning_rate": 4.5247197834790876e-05, "loss": 0.6459, "step": 205 }, { "epoch": 0.6906957250628667, "grad_norm": 13.452632904052734, "learning_rate": 4.445809000494946e-05, "loss": 0.8902, "step": 206 }, { "epoch": 0.694048616932104, "grad_norm": 14.852524757385254, "learning_rate": 4.3670377321312535e-05, "loss": 0.8621, "step": 207 }, { "epoch": 0.6974015088013411, "grad_norm": 19.261449813842773, "learning_rate": 4.288425808633575e-05, "loss": 0.8369, "step": 208 }, { "epoch": 0.7007544006705784, "grad_norm": 12.483002662658691, "learning_rate": 4.20999302013325e-05, "loss": 0.6665, "step": 209 }, { "epoch": 0.7041072925398156, "grad_norm": 12.300580024719238, "learning_rate": 4.131759111665349e-05, "loss": 0.6114, "step": 210 }, { "epoch": 0.7074601844090528, "grad_norm": 7.492820739746094, "learning_rate": 4.0537437781979506e-05, "loss": 0.4368, "step": 211 }, { "epoch": 0.71081307627829, "grad_norm": 9.560514450073242, "learning_rate": 3.9759666596740476e-05, "loss": 0.4916, "step": 212 }, { "epoch": 0.7141659681475272, "grad_norm": 9.900005340576172, "learning_rate": 3.898447336067297e-05, "loss": 0.3931, "step": 213 }, { "epoch": 0.7175188600167645, "grad_norm": 9.310670852661133, "learning_rate": 3.821205322452863e-05, "loss": 0.3617, "step": 214 }, { "epoch": 0.7208717518860017, "grad_norm": 9.20274829864502, "learning_rate": 3.744260064094604e-05, "loss": 0.4006, "step": 215 }, { "epoch": 0.7242246437552389, "grad_norm": 9.266610145568848, "learning_rate": 3.6676309315498256e-05, "loss": 0.2972, "step": 216 }, { "epoch": 0.7275775356244761, "grad_norm": 6.18265438079834, "learning_rate": 3.591337215792852e-05, "loss": 0.3088, "step": 217 }, { "epoch": 0.7309304274937133, "grad_norm": 8.273286819458008, "learning_rate": 3.515398123358627e-05, "loss": 0.3283, "step": 218 }, { "epoch": 0.7342833193629505, "grad_norm": 5.254649639129639, "learning_rate": 3.439832771507565e-05, "loss": 0.2671, "step": 219 }, { "epoch": 0.7376362112321878, "grad_norm": 6.091966152191162, "learning_rate": 3.364660183412892e-05, "loss": 0.2778, "step": 220 }, { "epoch": 0.740989103101425, "grad_norm": 4.136501312255859, "learning_rate": 3.289899283371657e-05, "loss": 0.2668, "step": 221 }, { "epoch": 0.7443419949706622, "grad_norm": 6.066188812255859, "learning_rate": 3.215568892040641e-05, "loss": 0.2929, "step": 222 }, { "epoch": 0.7476948868398994, "grad_norm": 5.012242794036865, "learning_rate": 3.141687721698363e-05, "loss": 0.1956, "step": 223 }, { "epoch": 0.7510477787091366, "grad_norm": 5.167850494384766, "learning_rate": 3.0682743715343564e-05, "loss": 0.2545, "step": 224 }, { "epoch": 0.7544006705783739, "grad_norm": 4.93275785446167, "learning_rate": 2.9953473229669328e-05, "loss": 0.2146, "step": 225 }, { "epoch": 0.757753562447611, "grad_norm": 4.2212629318237305, "learning_rate": 2.9229249349905684e-05, "loss": 0.2268, "step": 226 }, { "epoch": 0.7611064543168483, "grad_norm": 2.142444372177124, "learning_rate": 2.851025439554142e-05, "loss": 0.1114, "step": 227 }, { "epoch": 0.7644593461860855, "grad_norm": 6.13271951675415, "learning_rate": 2.7796669369711294e-05, "loss": 0.2117, "step": 228 }, { "epoch": 0.7678122380553227, "grad_norm": 25.86508560180664, "learning_rate": 2.708867391362948e-05, "loss": 0.9176, "step": 229 }, { "epoch": 0.7711651299245599, "grad_norm": 23.312772750854492, "learning_rate": 2.638644626136587e-05, "loss": 0.9364, "step": 230 }, { "epoch": 0.7745180217937971, "grad_norm": 24.960554122924805, "learning_rate": 2.5690163194976575e-05, "loss": 0.9327, "step": 231 }, { "epoch": 0.7778709136630344, "grad_norm": 20.75816535949707, "learning_rate": 2.500000000000001e-05, "loss": 0.8042, "step": 232 }, { "epoch": 0.7812238055322716, "grad_norm": 21.9549560546875, "learning_rate": 2.4316130421329697e-05, "loss": 1.0805, "step": 233 }, { "epoch": 0.7845766974015088, "grad_norm": 20.77252960205078, "learning_rate": 2.363872661947488e-05, "loss": 0.529, "step": 234 }, { "epoch": 0.787929589270746, "grad_norm": 18.2496337890625, "learning_rate": 2.296795912722014e-05, "loss": 0.6804, "step": 235 }, { "epoch": 0.7912824811399832, "grad_norm": 19.881755828857422, "learning_rate": 2.2303996806694488e-05, "loss": 0.7514, "step": 236 }, { "epoch": 0.7946353730092205, "grad_norm": 17.686979293823242, "learning_rate": 2.164700680686147e-05, "loss": 0.8834, "step": 237 }, { "epoch": 0.7979882648784576, "grad_norm": 19.303966522216797, "learning_rate": 2.09971545214401e-05, "loss": 0.8204, "step": 238 }, { "epoch": 0.8013411567476949, "grad_norm": 13.04858684539795, "learning_rate": 2.0354603547267985e-05, "loss": 0.4991, "step": 239 }, { "epoch": 0.8046940486169321, "grad_norm": 15.609128952026367, "learning_rate": 1.9719515643116674e-05, "loss": 0.885, "step": 240 }, { "epoch": 0.8080469404861693, "grad_norm": 14.140607833862305, "learning_rate": 1.9092050688969738e-05, "loss": 0.6477, "step": 241 }, { "epoch": 0.8113998323554066, "grad_norm": 8.173004150390625, "learning_rate": 1.847236664577389e-05, "loss": 0.3195, "step": 242 }, { "epoch": 0.8147527242246437, "grad_norm": 12.263628959655762, "learning_rate": 1.7860619515673033e-05, "loss": 0.3825, "step": 243 }, { "epoch": 0.818105616093881, "grad_norm": 9.158538818359375, "learning_rate": 1.725696330273575e-05, "loss": 0.5394, "step": 244 }, { "epoch": 0.8214585079631181, "grad_norm": 11.503657341003418, "learning_rate": 1.6661549974185424e-05, "loss": 0.5017, "step": 245 }, { "epoch": 0.8248113998323554, "grad_norm": 11.45801830291748, "learning_rate": 1.60745294221434e-05, "loss": 0.5327, "step": 246 }, { "epoch": 0.8281642917015927, "grad_norm": 14.59035873413086, "learning_rate": 1.549604942589441e-05, "loss": 0.6128, "step": 247 }, { "epoch": 0.8315171835708298, "grad_norm": 11.337044715881348, "learning_rate": 1.4926255614683932e-05, "loss": 0.4564, "step": 248 }, { "epoch": 0.8348700754400671, "grad_norm": 9.969059944152832, "learning_rate": 1.4365291431056871e-05, "loss": 0.7156, "step": 249 }, { "epoch": 0.8382229673093042, "grad_norm": 10.448427200317383, "learning_rate": 1.3813298094746491e-05, "loss": 0.5685, "step": 250 }, { "epoch": 0.8415758591785415, "grad_norm": 11.464609146118164, "learning_rate": 1.327041456712334e-05, "loss": 0.6357, "step": 251 }, { "epoch": 0.8449287510477788, "grad_norm": 12.060856819152832, "learning_rate": 1.2736777516212266e-05, "loss": 0.7539, "step": 252 }, { "epoch": 0.8482816429170159, "grad_norm": 12.243000984191895, "learning_rate": 1.2212521282287092e-05, "loss": 0.7645, "step": 253 }, { "epoch": 0.8516345347862532, "grad_norm": 11.656434059143066, "learning_rate": 1.1697777844051105e-05, "loss": 0.708, "step": 254 }, { "epoch": 0.8549874266554903, "grad_norm": 12.392927169799805, "learning_rate": 1.1192676785412154e-05, "loss": 0.7513, "step": 255 }, { "epoch": 0.8583403185247276, "grad_norm": 9.757222175598145, "learning_rate": 1.0697345262860636e-05, "loss": 0.6079, "step": 256 }, { "epoch": 0.8616932103939648, "grad_norm": 8.159704208374023, "learning_rate": 1.021190797345839e-05, "loss": 0.2301, "step": 257 }, { "epoch": 0.865046102263202, "grad_norm": 7.381707668304443, "learning_rate": 9.73648712344707e-06, "loss": 0.337, "step": 258 }, { "epoch": 0.8683989941324393, "grad_norm": 9.870096206665039, "learning_rate": 9.271202397483215e-06, "loss": 0.2997, "step": 259 }, { "epoch": 0.8717518860016764, "grad_norm": 8.603653907775879, "learning_rate": 8.816170928508365e-06, "loss": 0.2499, "step": 260 }, { "epoch": 0.8751047778709137, "grad_norm": 6.676860809326172, "learning_rate": 8.371507268261437e-06, "loss": 0.3058, "step": 261 }, { "epoch": 0.8784576697401508, "grad_norm": 8.418916702270508, "learning_rate": 7.937323358440935e-06, "loss": 0.2895, "step": 262 }, { "epoch": 0.8818105616093881, "grad_norm": 10.64456558227539, "learning_rate": 7.513728502524286e-06, "loss": 0.3863, "step": 263 }, { "epoch": 0.8851634534786254, "grad_norm": 7.481652736663818, "learning_rate": 7.100829338251147e-06, "loss": 0.2319, "step": 264 }, { "epoch": 0.8885163453478625, "grad_norm": 7.484499454498291, "learning_rate": 6.698729810778065e-06, "loss": 0.1861, "step": 265 }, { "epoch": 0.8918692372170998, "grad_norm": 6.579418659210205, "learning_rate": 6.3075311465107535e-06, "loss": 0.2557, "step": 266 }, { "epoch": 0.8952221290863369, "grad_norm": 6.663677215576172, "learning_rate": 5.927331827620903e-06, "loss": 0.3708, "step": 267 }, { "epoch": 0.8985750209555742, "grad_norm": 9.156440734863281, "learning_rate": 5.558227567253832e-06, "loss": 0.3865, "step": 268 }, { "epoch": 0.9019279128248114, "grad_norm": 5.36343240737915, "learning_rate": 5.200311285433213e-06, "loss": 0.2776, "step": 269 }, { "epoch": 0.9052808046940486, "grad_norm": 7.757360935211182, "learning_rate": 4.853673085668947e-06, "loss": 0.335, "step": 270 }, { "epoch": 0.9086336965632859, "grad_norm": 12.774531364440918, "learning_rate": 4.5184002322740785e-06, "loss": 0.3653, "step": 271 }, { "epoch": 0.911986588432523, "grad_norm": 8.137080192565918, "learning_rate": 4.19457712839652e-06, "loss": 0.392, "step": 272 }, { "epoch": 0.9153394803017603, "grad_norm": 9.441322326660156, "learning_rate": 3.8822852947709375e-06, "loss": 0.3059, "step": 273 }, { "epoch": 0.9186923721709975, "grad_norm": 5.269786834716797, "learning_rate": 3.581603349196372e-06, "loss": 0.2275, "step": 274 }, { "epoch": 0.9220452640402347, "grad_norm": 9.277483940124512, "learning_rate": 3.2926069867446675e-06, "loss": 0.335, "step": 275 }, { "epoch": 0.9253981559094719, "grad_norm": 9.3627290725708, "learning_rate": 3.0153689607045845e-06, "loss": 0.4954, "step": 276 }, { "epoch": 0.9287510477787091, "grad_norm": 7.629518508911133, "learning_rate": 2.7499590642665774e-06, "loss": 0.2234, "step": 277 }, { "epoch": 0.9321039396479464, "grad_norm": 5.571048736572266, "learning_rate": 2.496444112952734e-06, "loss": 0.3485, "step": 278 }, { "epoch": 0.9354568315171836, "grad_norm": 18.15276336669922, "learning_rate": 2.2548879277963064e-06, "loss": 0.3434, "step": 279 }, { "epoch": 0.9388097233864208, "grad_norm": 14.836206436157227, "learning_rate": 2.0253513192751373e-06, "loss": 0.3263, "step": 280 }, { "epoch": 0.942162615255658, "grad_norm": 13.923564910888672, "learning_rate": 1.807892072002898e-06, "loss": 0.4678, "step": 281 }, { "epoch": 0.9455155071248952, "grad_norm": 15.387225151062012, "learning_rate": 1.6025649301821876e-06, "loss": 0.4785, "step": 282 }, { "epoch": 0.9488683989941324, "grad_norm": 13.055351257324219, "learning_rate": 1.4094215838229176e-06, "loss": 0.563, "step": 283 }, { "epoch": 0.9522212908633697, "grad_norm": 17.376829147338867, "learning_rate": 1.2285106557296477e-06, "loss": 0.3891, "step": 284 }, { "epoch": 0.9555741827326069, "grad_norm": 17.277982711791992, "learning_rate": 1.0598776892610685e-06, "loss": 0.5198, "step": 285 }, { "epoch": 0.9589270746018441, "grad_norm": 11.45787239074707, "learning_rate": 9.035651368646648e-07, "loss": 0.4912, "step": 286 }, { "epoch": 0.9622799664710813, "grad_norm": 12.024084091186523, "learning_rate": 7.596123493895991e-07, "loss": 0.4365, "step": 287 }, { "epoch": 0.9656328583403185, "grad_norm": 15.519173622131348, "learning_rate": 6.280555661802856e-07, "loss": 0.6457, "step": 288 }, { "epoch": 0.9689857502095558, "grad_norm": 15.936901092529297, "learning_rate": 5.089279059533658e-07, "loss": 0.7402, "step": 289 }, { "epoch": 0.972338642078793, "grad_norm": 14.855793952941895, "learning_rate": 4.02259358460233e-07, "loss": 0.5426, "step": 290 }, { "epoch": 0.9756915339480302, "grad_norm": 17.52886199951172, "learning_rate": 3.080767769372939e-07, "loss": 0.5786, "step": 291 }, { "epoch": 0.9790444258172674, "grad_norm": 14.614376068115234, "learning_rate": 2.2640387134577058e-07, "loss": 0.4134, "step": 292 }, { "epoch": 0.9823973176865046, "grad_norm": 12.858384132385254, "learning_rate": 1.5726120240288634e-07, "loss": 0.9686, "step": 293 }, { "epoch": 0.9857502095557418, "grad_norm": 14.443262100219727, "learning_rate": 1.0066617640578368e-07, "loss": 0.5455, "step": 294 }, { "epoch": 0.989103101424979, "grad_norm": 14.196720123291016, "learning_rate": 5.663304084960186e-08, "loss": 0.4852, "step": 295 }, { "epoch": 0.9924559932942163, "grad_norm": 13.345819473266602, "learning_rate": 2.5172880840745873e-08, "loss": 0.4879, "step": 296 }, { "epoch": 0.9958088851634534, "grad_norm": 4.6811842918396, "learning_rate": 6.293616306246586e-09, "loss": 0.2391, "step": 297 }, { "epoch": 0.9991617770326907, "grad_norm": 7.340142250061035, "learning_rate": 0.0, "loss": 0.4579, "step": 298 } ], "logging_steps": 1, "max_steps": 298, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.998363176038564e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }