|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9991617770326907, |
|
"eval_steps": 500, |
|
"global_step": 298, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003352891869237217, |
|
"grad_norm": 19.94731903076172, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 5.9036, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006705783738474434, |
|
"grad_norm": 19.311952590942383, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 5.2027, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.010058675607711651, |
|
"grad_norm": 20.031103134155273, |
|
"learning_rate": 3e-06, |
|
"loss": 5.0706, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.013411567476948869, |
|
"grad_norm": 20.00600814819336, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 5.4878, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.016764459346186086, |
|
"grad_norm": 21.179712295532227, |
|
"learning_rate": 5e-06, |
|
"loss": 5.7332, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.020117351215423303, |
|
"grad_norm": 20.510826110839844, |
|
"learning_rate": 6e-06, |
|
"loss": 5.4379, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02347024308466052, |
|
"grad_norm": 24.237993240356445, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 6.201, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.026823134953897737, |
|
"grad_norm": 24.435766220092773, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 7.488, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.030176026823134954, |
|
"grad_norm": 25.343833923339844, |
|
"learning_rate": 9e-06, |
|
"loss": 7.7776, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03352891869237217, |
|
"grad_norm": 26.141292572021484, |
|
"learning_rate": 1e-05, |
|
"loss": 6.3217, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.036881810561609385, |
|
"grad_norm": 28.506261825561523, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 6.2981, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.040234702430846606, |
|
"grad_norm": 24.986642837524414, |
|
"learning_rate": 1.2e-05, |
|
"loss": 6.9347, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04358759430008382, |
|
"grad_norm": 24.410480499267578, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 7.4913, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04694048616932104, |
|
"grad_norm": 27.457660675048828, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 6.6574, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.050293378038558254, |
|
"grad_norm": 25.729991912841797, |
|
"learning_rate": 1.5e-05, |
|
"loss": 6.4998, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.053646269907795474, |
|
"grad_norm": 24.635997772216797, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 6.8279, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.05699916177703269, |
|
"grad_norm": 27.968725204467773, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 5.303, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.06035205364626991, |
|
"grad_norm": 25.632286071777344, |
|
"learning_rate": 1.8e-05, |
|
"loss": 6.2003, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06370494551550712, |
|
"grad_norm": 25.91347885131836, |
|
"learning_rate": 1.9e-05, |
|
"loss": 5.4608, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.06705783738474434, |
|
"grad_norm": 26.693031311035156, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2744, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07041072925398156, |
|
"grad_norm": 25.103519439697266, |
|
"learning_rate": 2.1e-05, |
|
"loss": 5.6415, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.07376362112321877, |
|
"grad_norm": 23.946666717529297, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 5.9613, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.07711651299245599, |
|
"grad_norm": 23.824304580688477, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 5.3184, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08046940486169321, |
|
"grad_norm": 22.486934661865234, |
|
"learning_rate": 2.4e-05, |
|
"loss": 4.6439, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.08382229673093043, |
|
"grad_norm": 20.246803283691406, |
|
"learning_rate": 2.5e-05, |
|
"loss": 5.0911, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08717518860016764, |
|
"grad_norm": 21.117258071899414, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 4.2909, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.09052808046940486, |
|
"grad_norm": 20.302732467651367, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 4.0127, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.09388097233864208, |
|
"grad_norm": 17.60736656188965, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 4.3916, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0972338642078793, |
|
"grad_norm": 21.172948837280273, |
|
"learning_rate": 2.9e-05, |
|
"loss": 4.2311, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.10058675607711651, |
|
"grad_norm": 43.94004440307617, |
|
"learning_rate": 3e-05, |
|
"loss": 6.7274, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10393964794635373, |
|
"grad_norm": 32.47929000854492, |
|
"learning_rate": 3.1e-05, |
|
"loss": 7.0579, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.10729253981559095, |
|
"grad_norm": 29.528139114379883, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 5.5148, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.11064543168482817, |
|
"grad_norm": 29.758312225341797, |
|
"learning_rate": 3.3e-05, |
|
"loss": 4.8453, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.11399832355406538, |
|
"grad_norm": 30.82133674621582, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 3.8677, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.1173512154233026, |
|
"grad_norm": 31.497970581054688, |
|
"learning_rate": 3.5e-05, |
|
"loss": 3.1699, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.12070410729253982, |
|
"grad_norm": 26.460988998413086, |
|
"learning_rate": 3.6e-05, |
|
"loss": 2.6621, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.12405699916177704, |
|
"grad_norm": 25.016969680786133, |
|
"learning_rate": 3.7e-05, |
|
"loss": 2.168, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.12740989103101424, |
|
"grad_norm": 19.797771453857422, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.6021, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.13076278290025148, |
|
"grad_norm": 21.162372589111328, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 1.9339, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.13411567476948869, |
|
"grad_norm": 22.99364471435547, |
|
"learning_rate": 4e-05, |
|
"loss": 1.4826, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1374685666387259, |
|
"grad_norm": 17.153324127197266, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.9375, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.14082145850796313, |
|
"grad_norm": 17.41748046875, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.0149, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.14417435037720033, |
|
"grad_norm": 16.24651527404785, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.9972, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.14752724224643754, |
|
"grad_norm": 12.924613952636719, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.7106, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.15088013411567477, |
|
"grad_norm": 13.725838661193848, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.506, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.15423302598491198, |
|
"grad_norm": 13.44334888458252, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.5244, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.15758591785414922, |
|
"grad_norm": 10.480420112609863, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.4254, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.16093880972338642, |
|
"grad_norm": 9.2076997756958, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.3862, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.16429170159262363, |
|
"grad_norm": 17.21129608154297, |
|
"learning_rate": 4.9e-05, |
|
"loss": 1.2896, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.16764459346186086, |
|
"grad_norm": 25.2115535736084, |
|
"learning_rate": 5e-05, |
|
"loss": 2.0117, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.17099748533109807, |
|
"grad_norm": 55.361873626708984, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 4.8735, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.17435037720033528, |
|
"grad_norm": 41.75767517089844, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 3.4571, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1777032690695725, |
|
"grad_norm": 28.46742057800293, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 2.0218, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.18105616093880972, |
|
"grad_norm": 15.05508804321289, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.3842, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.18440905280804695, |
|
"grad_norm": 13.411344528198242, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 1.0848, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.18776194467728416, |
|
"grad_norm": 7.541948318481445, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.5558, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.19111483654652137, |
|
"grad_norm": 11.816786766052246, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.8748, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1944677284157586, |
|
"grad_norm": 13.176451683044434, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.9711, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1978206202849958, |
|
"grad_norm": 13.811378479003906, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.8243, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.20117351215423301, |
|
"grad_norm": 12.062150001525879, |
|
"learning_rate": 6e-05, |
|
"loss": 0.6277, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20452640402347025, |
|
"grad_norm": 10.790270805358887, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.6866, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.20787929589270746, |
|
"grad_norm": 11.196799278259277, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.556, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2112321877619447, |
|
"grad_norm": 11.80176067352295, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.4936, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.2145850796311819, |
|
"grad_norm": 5.4536614418029785, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.4018, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.2179379715004191, |
|
"grad_norm": 8.171553611755371, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.6092, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.22129086336965634, |
|
"grad_norm": 4.759002208709717, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.2932, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.22464375523889354, |
|
"grad_norm": 8.26754093170166, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.552, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.22799664710813075, |
|
"grad_norm": 5.972261428833008, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.4562, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.23134953897736799, |
|
"grad_norm": 6.14136266708374, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.4001, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2347024308466052, |
|
"grad_norm": 4.529941082000732, |
|
"learning_rate": 7e-05, |
|
"loss": 0.434, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23805532271584243, |
|
"grad_norm": 6.710460186004639, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.3288, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.24140821458507963, |
|
"grad_norm": 4.07722282409668, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.2644, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.24476110645431684, |
|
"grad_norm": 5.167060852050781, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.2852, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.24811399832355407, |
|
"grad_norm": 4.403326988220215, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.3129, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2514668901927913, |
|
"grad_norm": 3.444688558578491, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.2299, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2548197820620285, |
|
"grad_norm": 4.974164009094238, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.3063, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.2581726739312657, |
|
"grad_norm": 4.73006010055542, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.2648, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.26152556580050296, |
|
"grad_norm": 4.712124824523926, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.2816, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.26487845766974016, |
|
"grad_norm": 6.925064563751221, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.1847, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.26823134953897737, |
|
"grad_norm": 21.28022575378418, |
|
"learning_rate": 8e-05, |
|
"loss": 1.6031, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2715842414082146, |
|
"grad_norm": 24.482501983642578, |
|
"learning_rate": 8.1e-05, |
|
"loss": 1.5822, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2749371332774518, |
|
"grad_norm": 28.196971893310547, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.3814, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.27829002514668905, |
|
"grad_norm": 26.377918243408203, |
|
"learning_rate": 8.3e-05, |
|
"loss": 1.2232, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.28164291701592625, |
|
"grad_norm": 32.93387985229492, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.0546, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.28499580888516346, |
|
"grad_norm": 20.566518783569336, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.8103, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.28834870075440067, |
|
"grad_norm": 14.28470516204834, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.7405, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2917015926236379, |
|
"grad_norm": 7.578380584716797, |
|
"learning_rate": 8.7e-05, |
|
"loss": 0.359, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2950544844928751, |
|
"grad_norm": 14.096921920776367, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.5263, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.29840737636211234, |
|
"grad_norm": 9.082076072692871, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.349, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.30176026823134955, |
|
"grad_norm": 9.854619026184082, |
|
"learning_rate": 9e-05, |
|
"loss": 0.3619, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.30511316010058676, |
|
"grad_norm": 6.256463050842285, |
|
"learning_rate": 9.1e-05, |
|
"loss": 0.3233, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.30846605196982396, |
|
"grad_norm": 21.08913230895996, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.2958, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.31181894383906117, |
|
"grad_norm": 15.940088272094727, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.381, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.31517183570829843, |
|
"grad_norm": 13.932638168334961, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.3273, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.31852472757753564, |
|
"grad_norm": 4.884693145751953, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.3036, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.32187761944677284, |
|
"grad_norm": 4.182151794433594, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.1351, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.32523051131601005, |
|
"grad_norm": 9.422307014465332, |
|
"learning_rate": 9.7e-05, |
|
"loss": 0.4685, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.32858340318524726, |
|
"grad_norm": 18.013978958129883, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.235, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3319362950544845, |
|
"grad_norm": 13.848796844482422, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 0.9726, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3352891869237217, |
|
"grad_norm": 18.036916732788086, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2473, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.33864207879295893, |
|
"grad_norm": 15.80588150024414, |
|
"learning_rate": 9.999370638369377e-05, |
|
"loss": 1.3471, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.34199497066219614, |
|
"grad_norm": 16.674541473388672, |
|
"learning_rate": 9.997482711915927e-05, |
|
"loss": 1.1381, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.34534786253143335, |
|
"grad_norm": 47.35570526123047, |
|
"learning_rate": 9.99433669591504e-05, |
|
"loss": 1.1454, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.34870075440067055, |
|
"grad_norm": 10.346620559692383, |
|
"learning_rate": 9.989933382359422e-05, |
|
"loss": 0.7687, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.3520536462699078, |
|
"grad_norm": 8.792768478393555, |
|
"learning_rate": 9.984273879759713e-05, |
|
"loss": 0.5802, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.355406538139145, |
|
"grad_norm": 5.656138896942139, |
|
"learning_rate": 9.977359612865423e-05, |
|
"loss": 0.4869, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.35875943000838223, |
|
"grad_norm": 12.679141998291016, |
|
"learning_rate": 9.969192322306271e-05, |
|
"loss": 0.5927, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.36211232187761944, |
|
"grad_norm": 9.823601722717285, |
|
"learning_rate": 9.959774064153977e-05, |
|
"loss": 0.5512, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.36546521374685664, |
|
"grad_norm": 11.15606689453125, |
|
"learning_rate": 9.949107209404665e-05, |
|
"loss": 0.603, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3688181056160939, |
|
"grad_norm": 10.26724910736084, |
|
"learning_rate": 9.937194443381972e-05, |
|
"loss": 0.4432, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3721709974853311, |
|
"grad_norm": 8.792684555053711, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.5643, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3755238893545683, |
|
"grad_norm": 5.435505390167236, |
|
"learning_rate": 9.909643486313533e-05, |
|
"loss": 0.4126, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.3788767812238055, |
|
"grad_norm": 5.39192533493042, |
|
"learning_rate": 9.894012231073894e-05, |
|
"loss": 0.4081, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.38222967309304273, |
|
"grad_norm": 10.250500679016113, |
|
"learning_rate": 9.877148934427037e-05, |
|
"loss": 0.4552, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.38558256496227994, |
|
"grad_norm": 9.414995193481445, |
|
"learning_rate": 9.859057841617709e-05, |
|
"loss": 0.4112, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3889354568315172, |
|
"grad_norm": 6.214295387268066, |
|
"learning_rate": 9.839743506981782e-05, |
|
"loss": 0.3666, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3922883487007544, |
|
"grad_norm": 4.788174629211426, |
|
"learning_rate": 9.819210792799712e-05, |
|
"loss": 0.273, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3956412405699916, |
|
"grad_norm": 6.027410507202148, |
|
"learning_rate": 9.797464868072488e-05, |
|
"loss": 0.2291, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3989941324392288, |
|
"grad_norm": 6.023176670074463, |
|
"learning_rate": 9.77451120722037e-05, |
|
"loss": 0.2822, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.40234702430846603, |
|
"grad_norm": 4.367912769317627, |
|
"learning_rate": 9.750355588704727e-05, |
|
"loss": 0.4041, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4056999161777033, |
|
"grad_norm": 6.125609397888184, |
|
"learning_rate": 9.725004093573342e-05, |
|
"loss": 0.2858, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.4090528080469405, |
|
"grad_norm": 5.797603607177734, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.2407, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.4124056999161777, |
|
"grad_norm": 8.285858154296875, |
|
"learning_rate": 9.670739301325534e-05, |
|
"loss": 0.2792, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4157585917854149, |
|
"grad_norm": 4.349048614501953, |
|
"learning_rate": 9.641839665080363e-05, |
|
"loss": 0.1981, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.4191114836546521, |
|
"grad_norm": 3.2639927864074707, |
|
"learning_rate": 9.611771470522908e-05, |
|
"loss": 0.2172, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4224643755238894, |
|
"grad_norm": 4.961655616760254, |
|
"learning_rate": 9.580542287160348e-05, |
|
"loss": 0.1867, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4258172673931266, |
|
"grad_norm": 4.474092483520508, |
|
"learning_rate": 9.548159976772592e-05, |
|
"loss": 0.2191, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.4291701592623638, |
|
"grad_norm": 4.9893927574157715, |
|
"learning_rate": 9.514632691433107e-05, |
|
"loss": 0.1895, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.432523051131601, |
|
"grad_norm": 28.090858459472656, |
|
"learning_rate": 9.479968871456679e-05, |
|
"loss": 1.7745, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4358759430008382, |
|
"grad_norm": 29.179738998413086, |
|
"learning_rate": 9.444177243274618e-05, |
|
"loss": 1.6506, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4392288348700754, |
|
"grad_norm": 26.920635223388672, |
|
"learning_rate": 9.407266817237911e-05, |
|
"loss": 1.1685, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.4425817267393127, |
|
"grad_norm": 25.46989631652832, |
|
"learning_rate": 9.369246885348926e-05, |
|
"loss": 0.9895, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4459346186085499, |
|
"grad_norm": 19.810270309448242, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.4968, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4492875104777871, |
|
"grad_norm": 9.30676555633545, |
|
"learning_rate": 9.289917066174886e-05, |
|
"loss": 0.5492, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.4526404023470243, |
|
"grad_norm": 16.38142204284668, |
|
"learning_rate": 9.248627149747573e-05, |
|
"loss": 0.6716, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4559932942162615, |
|
"grad_norm": 15.360873222351074, |
|
"learning_rate": 9.206267664155907e-05, |
|
"loss": 0.588, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.45934618608549876, |
|
"grad_norm": 11.33460521697998, |
|
"learning_rate": 9.162849273173857e-05, |
|
"loss": 0.3989, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.46269907795473597, |
|
"grad_norm": 7.337402820587158, |
|
"learning_rate": 9.118382907149165e-05, |
|
"loss": 0.3023, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.4660519698239732, |
|
"grad_norm": 9.812034606933594, |
|
"learning_rate": 9.072879760251679e-05, |
|
"loss": 0.2999, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.4694048616932104, |
|
"grad_norm": 15.64886474609375, |
|
"learning_rate": 9.026351287655294e-05, |
|
"loss": 0.4581, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4727577535624476, |
|
"grad_norm": 24.609207153320312, |
|
"learning_rate": 8.978809202654162e-05, |
|
"loss": 0.4414, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.47611064543168485, |
|
"grad_norm": 13.780351638793945, |
|
"learning_rate": 8.930265473713938e-05, |
|
"loss": 0.314, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.47946353730092206, |
|
"grad_norm": 4.577981472015381, |
|
"learning_rate": 8.880732321458784e-05, |
|
"loss": 0.2216, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.48281642917015927, |
|
"grad_norm": 6.896910667419434, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.2431, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.4861693210393965, |
|
"grad_norm": 10.73836898803711, |
|
"learning_rate": 8.778747871771292e-05, |
|
"loss": 0.2993, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4895222129086337, |
|
"grad_norm": 5.567845344543457, |
|
"learning_rate": 8.726322248378775e-05, |
|
"loss": 0.2575, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.4928751047778709, |
|
"grad_norm": 9.290807723999023, |
|
"learning_rate": 8.672958543287666e-05, |
|
"loss": 0.3084, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.49622799664710815, |
|
"grad_norm": 13.643046379089355, |
|
"learning_rate": 8.618670190525352e-05, |
|
"loss": 0.5823, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.49958088851634536, |
|
"grad_norm": 11.43275260925293, |
|
"learning_rate": 8.563470856894316e-05, |
|
"loss": 0.5169, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5029337803855826, |
|
"grad_norm": 16.253604888916016, |
|
"learning_rate": 8.507374438531607e-05, |
|
"loss": 0.6451, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5062866722548198, |
|
"grad_norm": 14.23026180267334, |
|
"learning_rate": 8.450395057410561e-05, |
|
"loss": 1.2755, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.509639564124057, |
|
"grad_norm": 12.833418846130371, |
|
"learning_rate": 8.392547057785661e-05, |
|
"loss": 1.0593, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.5129924559932942, |
|
"grad_norm": 10.777283668518066, |
|
"learning_rate": 8.333845002581458e-05, |
|
"loss": 0.7966, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5163453478625314, |
|
"grad_norm": 9.72930908203125, |
|
"learning_rate": 8.274303669726426e-05, |
|
"loss": 0.5648, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5196982397317687, |
|
"grad_norm": 6.614412784576416, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 0.5082, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5230511316010059, |
|
"grad_norm": 6.6831889152526855, |
|
"learning_rate": 8.152763335422613e-05, |
|
"loss": 0.4376, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.5264040234702431, |
|
"grad_norm": 5.255478858947754, |
|
"learning_rate": 8.090794931103026e-05, |
|
"loss": 0.3736, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.5297569153394803, |
|
"grad_norm": 8.692307472229004, |
|
"learning_rate": 8.028048435688333e-05, |
|
"loss": 0.4133, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5331098072087175, |
|
"grad_norm": 7.771063804626465, |
|
"learning_rate": 7.964539645273204e-05, |
|
"loss": 0.4996, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5364626990779547, |
|
"grad_norm": 7.538997650146484, |
|
"learning_rate": 7.900284547855991e-05, |
|
"loss": 0.4573, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.539815590947192, |
|
"grad_norm": 8.275318145751953, |
|
"learning_rate": 7.835299319313853e-05, |
|
"loss": 0.3567, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5431684828164292, |
|
"grad_norm": 9.14857292175293, |
|
"learning_rate": 7.769600319330552e-05, |
|
"loss": 0.3545, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5465213746856664, |
|
"grad_norm": 7.5321784019470215, |
|
"learning_rate": 7.703204087277988e-05, |
|
"loss": 0.3356, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5498742665549036, |
|
"grad_norm": 5.943683624267578, |
|
"learning_rate": 7.636127338052512e-05, |
|
"loss": 0.3377, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.5532271584241408, |
|
"grad_norm": 4.5078125, |
|
"learning_rate": 7.568386957867033e-05, |
|
"loss": 0.383, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5565800502933781, |
|
"grad_norm": 5.633090496063232, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.3359, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.5599329421626152, |
|
"grad_norm": 4.150753974914551, |
|
"learning_rate": 7.430983680502344e-05, |
|
"loss": 0.2535, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.5632858340318525, |
|
"grad_norm": 7.417738437652588, |
|
"learning_rate": 7.361355373863414e-05, |
|
"loss": 0.4739, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.5666387259010897, |
|
"grad_norm": 4.674182891845703, |
|
"learning_rate": 7.291132608637052e-05, |
|
"loss": 0.3103, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.5699916177703269, |
|
"grad_norm": 2.7935192584991455, |
|
"learning_rate": 7.220333063028872e-05, |
|
"loss": 0.2336, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5733445096395641, |
|
"grad_norm": 9.859716415405273, |
|
"learning_rate": 7.148974560445859e-05, |
|
"loss": 0.4477, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.5766974015088013, |
|
"grad_norm": 5.132834434509277, |
|
"learning_rate": 7.077075065009433e-05, |
|
"loss": 0.2547, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5800502933780386, |
|
"grad_norm": 5.565759181976318, |
|
"learning_rate": 7.004652677033068e-05, |
|
"loss": 0.4095, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5834031852472757, |
|
"grad_norm": 7.700199604034424, |
|
"learning_rate": 6.931725628465643e-05, |
|
"loss": 0.2234, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.586756077116513, |
|
"grad_norm": 3.140568733215332, |
|
"learning_rate": 6.858312278301637e-05, |
|
"loss": 0.1842, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5901089689857502, |
|
"grad_norm": 7.881420612335205, |
|
"learning_rate": 6.784431107959359e-05, |
|
"loss": 0.2952, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5934618608549874, |
|
"grad_norm": 4.895866870880127, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.1627, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.5968147527242247, |
|
"grad_norm": 6.7675251960754395, |
|
"learning_rate": 6.635339816587109e-05, |
|
"loss": 0.1709, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.6001676445934618, |
|
"grad_norm": 12.553466796875, |
|
"learning_rate": 6.560167228492436e-05, |
|
"loss": 0.9033, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.6035205364626991, |
|
"grad_norm": 25.9533748626709, |
|
"learning_rate": 6.484601876641375e-05, |
|
"loss": 1.2262, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6068734283319362, |
|
"grad_norm": 25.939783096313477, |
|
"learning_rate": 6.408662784207149e-05, |
|
"loss": 1.5251, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.6102263202011735, |
|
"grad_norm": 31.702119827270508, |
|
"learning_rate": 6.332369068450174e-05, |
|
"loss": 1.2017, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.6135792120704108, |
|
"grad_norm": 24.445140838623047, |
|
"learning_rate": 6.255739935905396e-05, |
|
"loss": 1.1614, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6169321039396479, |
|
"grad_norm": 26.491666793823242, |
|
"learning_rate": 6.178794677547137e-05, |
|
"loss": 1.184, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.6202849958088852, |
|
"grad_norm": 20.919872283935547, |
|
"learning_rate": 6.1015526639327035e-05, |
|
"loss": 0.7381, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6236378876781223, |
|
"grad_norm": 77.90556335449219, |
|
"learning_rate": 6.024033340325954e-05, |
|
"loss": 1.3539, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6269907795473596, |
|
"grad_norm": 21.881759643554688, |
|
"learning_rate": 5.946256221802051e-05, |
|
"loss": 0.7108, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.6303436714165969, |
|
"grad_norm": 15.38471508026123, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.4868, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.633696563285834, |
|
"grad_norm": 14.163756370544434, |
|
"learning_rate": 5.79000697986675e-05, |
|
"loss": 0.6412, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6370494551550713, |
|
"grad_norm": 11.611079216003418, |
|
"learning_rate": 5.7115741913664264e-05, |
|
"loss": 0.2968, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6404023470243084, |
|
"grad_norm": 13.02933120727539, |
|
"learning_rate": 5.6329622678687463e-05, |
|
"loss": 0.4897, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.6437552388935457, |
|
"grad_norm": 11.208833694458008, |
|
"learning_rate": 5.5541909995050554e-05, |
|
"loss": 0.5136, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.647108130762783, |
|
"grad_norm": 6.404201984405518, |
|
"learning_rate": 5.475280216520913e-05, |
|
"loss": 0.2015, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.6504610226320201, |
|
"grad_norm": 15.304976463317871, |
|
"learning_rate": 5.396249784283942e-05, |
|
"loss": 0.5075, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.6538139145012574, |
|
"grad_norm": 5.351606369018555, |
|
"learning_rate": 5.317119598282823e-05, |
|
"loss": 0.175, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6571668063704945, |
|
"grad_norm": 6.819107532501221, |
|
"learning_rate": 5.2379095791187124e-05, |
|
"loss": 0.2753, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.6605196982397318, |
|
"grad_norm": 14.04761028289795, |
|
"learning_rate": 5.158639667490339e-05, |
|
"loss": 0.5383, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.663872590108969, |
|
"grad_norm": 6.505035400390625, |
|
"learning_rate": 5.0793298191740404e-05, |
|
"loss": 0.1789, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.6672254819782062, |
|
"grad_norm": 13.04714298248291, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6869, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.6705783738474435, |
|
"grad_norm": 11.471817970275879, |
|
"learning_rate": 4.92067018082596e-05, |
|
"loss": 0.5482, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6739312657166806, |
|
"grad_norm": 14.056415557861328, |
|
"learning_rate": 4.841360332509663e-05, |
|
"loss": 0.7455, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.6772841575859179, |
|
"grad_norm": 12.744611740112305, |
|
"learning_rate": 4.762090420881289e-05, |
|
"loss": 0.6835, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.680637049455155, |
|
"grad_norm": 13.389442443847656, |
|
"learning_rate": 4.6828804017171776e-05, |
|
"loss": 0.5843, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.6839899413243923, |
|
"grad_norm": 12.951894760131836, |
|
"learning_rate": 4.603750215716057e-05, |
|
"loss": 0.5312, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.6873428331936295, |
|
"grad_norm": 11.330641746520996, |
|
"learning_rate": 4.5247197834790876e-05, |
|
"loss": 0.6459, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6906957250628667, |
|
"grad_norm": 13.452632904052734, |
|
"learning_rate": 4.445809000494946e-05, |
|
"loss": 0.8902, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.694048616932104, |
|
"grad_norm": 14.852524757385254, |
|
"learning_rate": 4.3670377321312535e-05, |
|
"loss": 0.8621, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.6974015088013411, |
|
"grad_norm": 19.261449813842773, |
|
"learning_rate": 4.288425808633575e-05, |
|
"loss": 0.8369, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.7007544006705784, |
|
"grad_norm": 12.483002662658691, |
|
"learning_rate": 4.20999302013325e-05, |
|
"loss": 0.6665, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.7041072925398156, |
|
"grad_norm": 12.300580024719238, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.6114, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7074601844090528, |
|
"grad_norm": 7.492820739746094, |
|
"learning_rate": 4.0537437781979506e-05, |
|
"loss": 0.4368, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.71081307627829, |
|
"grad_norm": 9.560514450073242, |
|
"learning_rate": 3.9759666596740476e-05, |
|
"loss": 0.4916, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.7141659681475272, |
|
"grad_norm": 9.900005340576172, |
|
"learning_rate": 3.898447336067297e-05, |
|
"loss": 0.3931, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.7175188600167645, |
|
"grad_norm": 9.310670852661133, |
|
"learning_rate": 3.821205322452863e-05, |
|
"loss": 0.3617, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.7208717518860017, |
|
"grad_norm": 9.20274829864502, |
|
"learning_rate": 3.744260064094604e-05, |
|
"loss": 0.4006, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7242246437552389, |
|
"grad_norm": 9.266610145568848, |
|
"learning_rate": 3.6676309315498256e-05, |
|
"loss": 0.2972, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.7275775356244761, |
|
"grad_norm": 6.18265438079834, |
|
"learning_rate": 3.591337215792852e-05, |
|
"loss": 0.3088, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.7309304274937133, |
|
"grad_norm": 8.273286819458008, |
|
"learning_rate": 3.515398123358627e-05, |
|
"loss": 0.3283, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.7342833193629505, |
|
"grad_norm": 5.254649639129639, |
|
"learning_rate": 3.439832771507565e-05, |
|
"loss": 0.2671, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.7376362112321878, |
|
"grad_norm": 6.091966152191162, |
|
"learning_rate": 3.364660183412892e-05, |
|
"loss": 0.2778, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.740989103101425, |
|
"grad_norm": 4.136501312255859, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.2668, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.7443419949706622, |
|
"grad_norm": 6.066188812255859, |
|
"learning_rate": 3.215568892040641e-05, |
|
"loss": 0.2929, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.7476948868398994, |
|
"grad_norm": 5.012242794036865, |
|
"learning_rate": 3.141687721698363e-05, |
|
"loss": 0.1956, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.7510477787091366, |
|
"grad_norm": 5.167850494384766, |
|
"learning_rate": 3.0682743715343564e-05, |
|
"loss": 0.2545, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.7544006705783739, |
|
"grad_norm": 4.93275785446167, |
|
"learning_rate": 2.9953473229669328e-05, |
|
"loss": 0.2146, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.757753562447611, |
|
"grad_norm": 4.2212629318237305, |
|
"learning_rate": 2.9229249349905684e-05, |
|
"loss": 0.2268, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.7611064543168483, |
|
"grad_norm": 2.142444372177124, |
|
"learning_rate": 2.851025439554142e-05, |
|
"loss": 0.1114, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.7644593461860855, |
|
"grad_norm": 6.13271951675415, |
|
"learning_rate": 2.7796669369711294e-05, |
|
"loss": 0.2117, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.7678122380553227, |
|
"grad_norm": 25.86508560180664, |
|
"learning_rate": 2.708867391362948e-05, |
|
"loss": 0.9176, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.7711651299245599, |
|
"grad_norm": 23.312772750854492, |
|
"learning_rate": 2.638644626136587e-05, |
|
"loss": 0.9364, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7745180217937971, |
|
"grad_norm": 24.960554122924805, |
|
"learning_rate": 2.5690163194976575e-05, |
|
"loss": 0.9327, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.7778709136630344, |
|
"grad_norm": 20.75816535949707, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.8042, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.7812238055322716, |
|
"grad_norm": 21.9549560546875, |
|
"learning_rate": 2.4316130421329697e-05, |
|
"loss": 1.0805, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.7845766974015088, |
|
"grad_norm": 20.77252960205078, |
|
"learning_rate": 2.363872661947488e-05, |
|
"loss": 0.529, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.787929589270746, |
|
"grad_norm": 18.2496337890625, |
|
"learning_rate": 2.296795912722014e-05, |
|
"loss": 0.6804, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7912824811399832, |
|
"grad_norm": 19.881755828857422, |
|
"learning_rate": 2.2303996806694488e-05, |
|
"loss": 0.7514, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.7946353730092205, |
|
"grad_norm": 17.686979293823242, |
|
"learning_rate": 2.164700680686147e-05, |
|
"loss": 0.8834, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.7979882648784576, |
|
"grad_norm": 19.303966522216797, |
|
"learning_rate": 2.09971545214401e-05, |
|
"loss": 0.8204, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.8013411567476949, |
|
"grad_norm": 13.04858684539795, |
|
"learning_rate": 2.0354603547267985e-05, |
|
"loss": 0.4991, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.8046940486169321, |
|
"grad_norm": 15.609128952026367, |
|
"learning_rate": 1.9719515643116674e-05, |
|
"loss": 0.885, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8080469404861693, |
|
"grad_norm": 14.140607833862305, |
|
"learning_rate": 1.9092050688969738e-05, |
|
"loss": 0.6477, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.8113998323554066, |
|
"grad_norm": 8.173004150390625, |
|
"learning_rate": 1.847236664577389e-05, |
|
"loss": 0.3195, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.8147527242246437, |
|
"grad_norm": 12.263628959655762, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.3825, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.818105616093881, |
|
"grad_norm": 9.158538818359375, |
|
"learning_rate": 1.725696330273575e-05, |
|
"loss": 0.5394, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.8214585079631181, |
|
"grad_norm": 11.503657341003418, |
|
"learning_rate": 1.6661549974185424e-05, |
|
"loss": 0.5017, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8248113998323554, |
|
"grad_norm": 11.45801830291748, |
|
"learning_rate": 1.60745294221434e-05, |
|
"loss": 0.5327, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.8281642917015927, |
|
"grad_norm": 14.59035873413086, |
|
"learning_rate": 1.549604942589441e-05, |
|
"loss": 0.6128, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.8315171835708298, |
|
"grad_norm": 11.337044715881348, |
|
"learning_rate": 1.4926255614683932e-05, |
|
"loss": 0.4564, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.8348700754400671, |
|
"grad_norm": 9.969059944152832, |
|
"learning_rate": 1.4365291431056871e-05, |
|
"loss": 0.7156, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.8382229673093042, |
|
"grad_norm": 10.448427200317383, |
|
"learning_rate": 1.3813298094746491e-05, |
|
"loss": 0.5685, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8415758591785415, |
|
"grad_norm": 11.464609146118164, |
|
"learning_rate": 1.327041456712334e-05, |
|
"loss": 0.6357, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.8449287510477788, |
|
"grad_norm": 12.060856819152832, |
|
"learning_rate": 1.2736777516212266e-05, |
|
"loss": 0.7539, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.8482816429170159, |
|
"grad_norm": 12.243000984191895, |
|
"learning_rate": 1.2212521282287092e-05, |
|
"loss": 0.7645, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.8516345347862532, |
|
"grad_norm": 11.656434059143066, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.708, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.8549874266554903, |
|
"grad_norm": 12.392927169799805, |
|
"learning_rate": 1.1192676785412154e-05, |
|
"loss": 0.7513, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8583403185247276, |
|
"grad_norm": 9.757222175598145, |
|
"learning_rate": 1.0697345262860636e-05, |
|
"loss": 0.6079, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.8616932103939648, |
|
"grad_norm": 8.159704208374023, |
|
"learning_rate": 1.021190797345839e-05, |
|
"loss": 0.2301, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.865046102263202, |
|
"grad_norm": 7.381707668304443, |
|
"learning_rate": 9.73648712344707e-06, |
|
"loss": 0.337, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.8683989941324393, |
|
"grad_norm": 9.870096206665039, |
|
"learning_rate": 9.271202397483215e-06, |
|
"loss": 0.2997, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.8717518860016764, |
|
"grad_norm": 8.603653907775879, |
|
"learning_rate": 8.816170928508365e-06, |
|
"loss": 0.2499, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8751047778709137, |
|
"grad_norm": 6.676860809326172, |
|
"learning_rate": 8.371507268261437e-06, |
|
"loss": 0.3058, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.8784576697401508, |
|
"grad_norm": 8.418916702270508, |
|
"learning_rate": 7.937323358440935e-06, |
|
"loss": 0.2895, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.8818105616093881, |
|
"grad_norm": 10.64456558227539, |
|
"learning_rate": 7.513728502524286e-06, |
|
"loss": 0.3863, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.8851634534786254, |
|
"grad_norm": 7.481652736663818, |
|
"learning_rate": 7.100829338251147e-06, |
|
"loss": 0.2319, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.8885163453478625, |
|
"grad_norm": 7.484499454498291, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 0.1861, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8918692372170998, |
|
"grad_norm": 6.579418659210205, |
|
"learning_rate": 6.3075311465107535e-06, |
|
"loss": 0.2557, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.8952221290863369, |
|
"grad_norm": 6.663677215576172, |
|
"learning_rate": 5.927331827620903e-06, |
|
"loss": 0.3708, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.8985750209555742, |
|
"grad_norm": 9.156440734863281, |
|
"learning_rate": 5.558227567253832e-06, |
|
"loss": 0.3865, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.9019279128248114, |
|
"grad_norm": 5.36343240737915, |
|
"learning_rate": 5.200311285433213e-06, |
|
"loss": 0.2776, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.9052808046940486, |
|
"grad_norm": 7.757360935211182, |
|
"learning_rate": 4.853673085668947e-06, |
|
"loss": 0.335, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9086336965632859, |
|
"grad_norm": 12.774531364440918, |
|
"learning_rate": 4.5184002322740785e-06, |
|
"loss": 0.3653, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.911986588432523, |
|
"grad_norm": 8.137080192565918, |
|
"learning_rate": 4.19457712839652e-06, |
|
"loss": 0.392, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.9153394803017603, |
|
"grad_norm": 9.441322326660156, |
|
"learning_rate": 3.8822852947709375e-06, |
|
"loss": 0.3059, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.9186923721709975, |
|
"grad_norm": 5.269786834716797, |
|
"learning_rate": 3.581603349196372e-06, |
|
"loss": 0.2275, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.9220452640402347, |
|
"grad_norm": 9.277483940124512, |
|
"learning_rate": 3.2926069867446675e-06, |
|
"loss": 0.335, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9253981559094719, |
|
"grad_norm": 9.3627290725708, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.4954, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.9287510477787091, |
|
"grad_norm": 7.629518508911133, |
|
"learning_rate": 2.7499590642665774e-06, |
|
"loss": 0.2234, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.9321039396479464, |
|
"grad_norm": 5.571048736572266, |
|
"learning_rate": 2.496444112952734e-06, |
|
"loss": 0.3485, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.9354568315171836, |
|
"grad_norm": 18.15276336669922, |
|
"learning_rate": 2.2548879277963064e-06, |
|
"loss": 0.3434, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.9388097233864208, |
|
"grad_norm": 14.836206436157227, |
|
"learning_rate": 2.0253513192751373e-06, |
|
"loss": 0.3263, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.942162615255658, |
|
"grad_norm": 13.923564910888672, |
|
"learning_rate": 1.807892072002898e-06, |
|
"loss": 0.4678, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.9455155071248952, |
|
"grad_norm": 15.387225151062012, |
|
"learning_rate": 1.6025649301821876e-06, |
|
"loss": 0.4785, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.9488683989941324, |
|
"grad_norm": 13.055351257324219, |
|
"learning_rate": 1.4094215838229176e-06, |
|
"loss": 0.563, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.9522212908633697, |
|
"grad_norm": 17.376829147338867, |
|
"learning_rate": 1.2285106557296477e-06, |
|
"loss": 0.3891, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.9555741827326069, |
|
"grad_norm": 17.277982711791992, |
|
"learning_rate": 1.0598776892610685e-06, |
|
"loss": 0.5198, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9589270746018441, |
|
"grad_norm": 11.45787239074707, |
|
"learning_rate": 9.035651368646648e-07, |
|
"loss": 0.4912, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.9622799664710813, |
|
"grad_norm": 12.024084091186523, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 0.4365, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.9656328583403185, |
|
"grad_norm": 15.519173622131348, |
|
"learning_rate": 6.280555661802856e-07, |
|
"loss": 0.6457, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.9689857502095558, |
|
"grad_norm": 15.936901092529297, |
|
"learning_rate": 5.089279059533658e-07, |
|
"loss": 0.7402, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.972338642078793, |
|
"grad_norm": 14.855793952941895, |
|
"learning_rate": 4.02259358460233e-07, |
|
"loss": 0.5426, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9756915339480302, |
|
"grad_norm": 17.52886199951172, |
|
"learning_rate": 3.080767769372939e-07, |
|
"loss": 0.5786, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.9790444258172674, |
|
"grad_norm": 14.614376068115234, |
|
"learning_rate": 2.2640387134577058e-07, |
|
"loss": 0.4134, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.9823973176865046, |
|
"grad_norm": 12.858384132385254, |
|
"learning_rate": 1.5726120240288634e-07, |
|
"loss": 0.9686, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.9857502095557418, |
|
"grad_norm": 14.443262100219727, |
|
"learning_rate": 1.0066617640578368e-07, |
|
"loss": 0.5455, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.989103101424979, |
|
"grad_norm": 14.196720123291016, |
|
"learning_rate": 5.663304084960186e-08, |
|
"loss": 0.4852, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.9924559932942163, |
|
"grad_norm": 13.345819473266602, |
|
"learning_rate": 2.5172880840745873e-08, |
|
"loss": 0.4879, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.9958088851634534, |
|
"grad_norm": 4.6811842918396, |
|
"learning_rate": 6.293616306246586e-09, |
|
"loss": 0.2391, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.9991617770326907, |
|
"grad_norm": 7.340142250061035, |
|
"learning_rate": 0.0, |
|
"loss": 0.4579, |
|
"step": 298 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 298, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 239, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.998363176038564e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|