{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9361702127659575, "eval_steps": 24, "global_step": 282, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010638297872340425, "grad_norm": 0.10508890450000763, "learning_rate": 3.0000000000000004e-08, "loss": 1.2333, "step": 1 }, { "epoch": 0.010638297872340425, "eval_loss": 1.5895557403564453, "eval_runtime": 2.9603, "eval_samples_per_second": 35.469, "eval_steps_per_second": 17.903, "step": 1 }, { "epoch": 0.02127659574468085, "grad_norm": 0.12071573734283447, "learning_rate": 6.000000000000001e-08, "loss": 1.3986, "step": 2 }, { "epoch": 0.031914893617021274, "grad_norm": 0.10401547700166702, "learning_rate": 9e-08, "loss": 1.2058, "step": 3 }, { "epoch": 0.0425531914893617, "grad_norm": 0.12298297882080078, "learning_rate": 1.2000000000000002e-07, "loss": 1.3298, "step": 4 }, { "epoch": 0.05319148936170213, "grad_norm": 0.1327030211687088, "learning_rate": 1.5000000000000002e-07, "loss": 1.4031, "step": 5 }, { "epoch": 0.06382978723404255, "grad_norm": 0.12036772817373276, "learning_rate": 1.8e-07, "loss": 1.5335, "step": 6 }, { "epoch": 0.07446808510638298, "grad_norm": 0.14979703724384308, "learning_rate": 2.1000000000000003e-07, "loss": 1.3224, "step": 7 }, { "epoch": 0.0851063829787234, "grad_norm": 0.13082227110862732, "learning_rate": 2.4000000000000003e-07, "loss": 1.4035, "step": 8 }, { "epoch": 0.09574468085106383, "grad_norm": 0.09265327453613281, "learning_rate": 2.7e-07, "loss": 1.1546, "step": 9 }, { "epoch": 0.10638297872340426, "grad_norm": 0.10080434381961823, "learning_rate": 3.0000000000000004e-07, "loss": 1.2652, "step": 10 }, { "epoch": 0.11702127659574468, "grad_norm": 0.10915568470954895, "learning_rate": 3.3e-07, "loss": 1.1438, "step": 11 }, { "epoch": 0.1276595744680851, "grad_norm": 0.1307348906993866, "learning_rate": 3.6e-07, "loss": 1.2821, "step": 12 }, { "epoch": 0.13829787234042554, "grad_norm": 0.09814529865980148, "learning_rate": 3.9e-07, "loss": 1.3905, "step": 13 }, { "epoch": 0.14893617021276595, "grad_norm": 0.11842218786478043, "learning_rate": 4.2000000000000006e-07, "loss": 1.3085, "step": 14 }, { "epoch": 0.1595744680851064, "grad_norm": 0.10207124054431915, "learning_rate": 4.5e-07, "loss": 1.19, "step": 15 }, { "epoch": 0.1702127659574468, "grad_norm": 0.1150127425789833, "learning_rate": 4.800000000000001e-07, "loss": 1.386, "step": 16 }, { "epoch": 0.18085106382978725, "grad_norm": 0.11641352623701096, "learning_rate": 5.100000000000001e-07, "loss": 1.4357, "step": 17 }, { "epoch": 0.19148936170212766, "grad_norm": 0.15035435557365417, "learning_rate": 5.4e-07, "loss": 1.2549, "step": 18 }, { "epoch": 0.20212765957446807, "grad_norm": 0.0984087809920311, "learning_rate": 5.7e-07, "loss": 0.9723, "step": 19 }, { "epoch": 0.2127659574468085, "grad_norm": 0.11582670360803604, "learning_rate": 6.000000000000001e-07, "loss": 1.1402, "step": 20 }, { "epoch": 0.22340425531914893, "grad_norm": 0.10151291638612747, "learning_rate": 6.3e-07, "loss": 1.1533, "step": 21 }, { "epoch": 0.23404255319148937, "grad_norm": 0.09040653705596924, "learning_rate": 6.6e-07, "loss": 1.0438, "step": 22 }, { "epoch": 0.24468085106382978, "grad_norm": 0.10666865110397339, "learning_rate": 6.900000000000001e-07, "loss": 1.3195, "step": 23 }, { "epoch": 0.2553191489361702, "grad_norm": 0.12293969094753265, "learning_rate": 7.2e-07, "loss": 1.7286, "step": 24 }, { "epoch": 0.2553191489361702, "eval_loss": 1.5890721082687378, "eval_runtime": 2.9908, "eval_samples_per_second": 35.108, "eval_steps_per_second": 17.721, "step": 24 }, { "epoch": 0.26595744680851063, "grad_norm": 0.11052387952804565, "learning_rate": 7.5e-07, "loss": 1.1413, "step": 25 }, { "epoch": 0.2765957446808511, "grad_norm": 0.09323304146528244, "learning_rate": 7.8e-07, "loss": 1.2906, "step": 26 }, { "epoch": 0.2872340425531915, "grad_norm": 0.12542971968650818, "learning_rate": 8.100000000000001e-07, "loss": 1.2809, "step": 27 }, { "epoch": 0.2978723404255319, "grad_norm": 0.1080215722322464, "learning_rate": 8.400000000000001e-07, "loss": 1.2735, "step": 28 }, { "epoch": 0.30851063829787234, "grad_norm": 0.11304887384176254, "learning_rate": 8.699999999999999e-07, "loss": 1.3767, "step": 29 }, { "epoch": 0.3191489361702128, "grad_norm": 0.15650290250778198, "learning_rate": 9e-07, "loss": 1.4713, "step": 30 }, { "epoch": 0.32978723404255317, "grad_norm": 0.12139321118593216, "learning_rate": 9.3e-07, "loss": 1.3479, "step": 31 }, { "epoch": 0.3404255319148936, "grad_norm": 0.10141867399215698, "learning_rate": 9.600000000000001e-07, "loss": 1.1512, "step": 32 }, { "epoch": 0.35106382978723405, "grad_norm": 0.15045498311519623, "learning_rate": 9.9e-07, "loss": 1.5059, "step": 33 }, { "epoch": 0.3617021276595745, "grad_norm": 0.12956956028938293, "learning_rate": 1.0200000000000002e-06, "loss": 1.2849, "step": 34 }, { "epoch": 0.3723404255319149, "grad_norm": 0.12963886559009552, "learning_rate": 1.05e-06, "loss": 1.5025, "step": 35 }, { "epoch": 0.3829787234042553, "grad_norm": 0.11268144845962524, "learning_rate": 1.08e-06, "loss": 1.2987, "step": 36 }, { "epoch": 0.39361702127659576, "grad_norm": 0.12941108644008636, "learning_rate": 1.11e-06, "loss": 1.3432, "step": 37 }, { "epoch": 0.40425531914893614, "grad_norm": 0.11319927126169205, "learning_rate": 1.14e-06, "loss": 1.1665, "step": 38 }, { "epoch": 0.4148936170212766, "grad_norm": 0.11748912930488586, "learning_rate": 1.17e-06, "loss": 1.423, "step": 39 }, { "epoch": 0.425531914893617, "grad_norm": 0.11666171997785568, "learning_rate": 1.2000000000000002e-06, "loss": 1.4391, "step": 40 }, { "epoch": 0.43617021276595747, "grad_norm": 0.12274409830570221, "learning_rate": 1.2299999999999999e-06, "loss": 1.4865, "step": 41 }, { "epoch": 0.44680851063829785, "grad_norm": 0.09922561049461365, "learning_rate": 1.26e-06, "loss": 1.2193, "step": 42 }, { "epoch": 0.4574468085106383, "grad_norm": 0.12003930658102036, "learning_rate": 1.29e-06, "loss": 1.3024, "step": 43 }, { "epoch": 0.46808510638297873, "grad_norm": 0.12094161659479141, "learning_rate": 1.32e-06, "loss": 1.4654, "step": 44 }, { "epoch": 0.4787234042553192, "grad_norm": 0.12934774160385132, "learning_rate": 1.35e-06, "loss": 1.3713, "step": 45 }, { "epoch": 0.48936170212765956, "grad_norm": 0.09754550457000732, "learning_rate": 1.3800000000000001e-06, "loss": 1.1991, "step": 46 }, { "epoch": 0.5, "grad_norm": 0.11549004167318344, "learning_rate": 1.41e-06, "loss": 1.4859, "step": 47 }, { "epoch": 0.5106382978723404, "grad_norm": 0.12035688012838364, "learning_rate": 1.44e-06, "loss": 1.2823, "step": 48 }, { "epoch": 0.5106382978723404, "eval_loss": 1.5874611139297485, "eval_runtime": 2.9787, "eval_samples_per_second": 35.251, "eval_steps_per_second": 17.793, "step": 48 }, { "epoch": 0.5212765957446809, "grad_norm": 0.12258938699960709, "learning_rate": 1.4700000000000001e-06, "loss": 1.2966, "step": 49 }, { "epoch": 0.5319148936170213, "grad_norm": 0.12217017263174057, "learning_rate": 1.5e-06, "loss": 1.2988, "step": 50 }, { "epoch": 0.5425531914893617, "grad_norm": 0.12793436646461487, "learning_rate": 1.53e-06, "loss": 1.4233, "step": 51 }, { "epoch": 0.5531914893617021, "grad_norm": 0.11145548522472382, "learning_rate": 1.56e-06, "loss": 1.2792, "step": 52 }, { "epoch": 0.5638297872340425, "grad_norm": 0.13195408880710602, "learning_rate": 1.59e-06, "loss": 1.4481, "step": 53 }, { "epoch": 0.574468085106383, "grad_norm": 0.10663347691297531, "learning_rate": 1.6200000000000002e-06, "loss": 1.2331, "step": 54 }, { "epoch": 0.5851063829787234, "grad_norm": 0.10975392907857895, "learning_rate": 1.65e-06, "loss": 1.1839, "step": 55 }, { "epoch": 0.5957446808510638, "grad_norm": 0.13139477372169495, "learning_rate": 1.6800000000000002e-06, "loss": 1.436, "step": 56 }, { "epoch": 0.6063829787234043, "grad_norm": 0.0924743041396141, "learning_rate": 1.71e-06, "loss": 1.1291, "step": 57 }, { "epoch": 0.6170212765957447, "grad_norm": 0.12475109100341797, "learning_rate": 1.7399999999999999e-06, "loss": 1.3444, "step": 58 }, { "epoch": 0.6276595744680851, "grad_norm": 0.08960220962762833, "learning_rate": 1.77e-06, "loss": 1.2511, "step": 59 }, { "epoch": 0.6382978723404256, "grad_norm": 0.09909304976463318, "learning_rate": 1.8e-06, "loss": 1.1281, "step": 60 }, { "epoch": 0.648936170212766, "grad_norm": 0.11598234623670578, "learning_rate": 1.83e-06, "loss": 1.486, "step": 61 }, { "epoch": 0.6595744680851063, "grad_norm": 0.1404409557580948, "learning_rate": 1.86e-06, "loss": 1.4557, "step": 62 }, { "epoch": 0.6702127659574468, "grad_norm": 0.11349129676818848, "learning_rate": 1.8900000000000001e-06, "loss": 1.3969, "step": 63 }, { "epoch": 0.6808510638297872, "grad_norm": 0.10858353972434998, "learning_rate": 1.9200000000000003e-06, "loss": 1.3515, "step": 64 }, { "epoch": 0.6914893617021277, "grad_norm": 0.11054569482803345, "learning_rate": 1.95e-06, "loss": 1.248, "step": 65 }, { "epoch": 0.7021276595744681, "grad_norm": 0.11826737225055695, "learning_rate": 1.98e-06, "loss": 1.3439, "step": 66 }, { "epoch": 0.7127659574468085, "grad_norm": 0.12291310727596283, "learning_rate": 2.0100000000000002e-06, "loss": 1.4226, "step": 67 }, { "epoch": 0.723404255319149, "grad_norm": 0.1383126825094223, "learning_rate": 2.0400000000000004e-06, "loss": 1.194, "step": 68 }, { "epoch": 0.7340425531914894, "grad_norm": 0.10981890559196472, "learning_rate": 2.07e-06, "loss": 1.2502, "step": 69 }, { "epoch": 0.7446808510638298, "grad_norm": 0.10639657825231552, "learning_rate": 2.1e-06, "loss": 1.2262, "step": 70 }, { "epoch": 0.7553191489361702, "grad_norm": 0.46177396178245544, "learning_rate": 2.13e-06, "loss": 5.063, "step": 71 }, { "epoch": 0.7659574468085106, "grad_norm": 0.11061427742242813, "learning_rate": 2.16e-06, "loss": 1.3856, "step": 72 }, { "epoch": 0.7659574468085106, "eval_loss": 1.5843762159347534, "eval_runtime": 2.9943, "eval_samples_per_second": 35.066, "eval_steps_per_second": 17.7, "step": 72 }, { "epoch": 0.776595744680851, "grad_norm": 0.13233324885368347, "learning_rate": 2.19e-06, "loss": 1.526, "step": 73 }, { "epoch": 0.7872340425531915, "grad_norm": 0.14500053226947784, "learning_rate": 2.22e-06, "loss": 1.2554, "step": 74 }, { "epoch": 0.7978723404255319, "grad_norm": 0.10629269480705261, "learning_rate": 2.25e-06, "loss": 1.2917, "step": 75 }, { "epoch": 0.8085106382978723, "grad_norm": 0.09674028307199478, "learning_rate": 2.28e-06, "loss": 1.2169, "step": 76 }, { "epoch": 0.8191489361702128, "grad_norm": 0.11965947598218918, "learning_rate": 2.31e-06, "loss": 1.2445, "step": 77 }, { "epoch": 0.8297872340425532, "grad_norm": 0.10441229492425919, "learning_rate": 2.34e-06, "loss": 1.2392, "step": 78 }, { "epoch": 0.8404255319148937, "grad_norm": 0.13795869052410126, "learning_rate": 2.37e-06, "loss": 1.5335, "step": 79 }, { "epoch": 0.851063829787234, "grad_norm": 0.10789927840232849, "learning_rate": 2.4000000000000003e-06, "loss": 1.2309, "step": 80 }, { "epoch": 0.8617021276595744, "grad_norm": 0.10697130113840103, "learning_rate": 2.43e-06, "loss": 1.276, "step": 81 }, { "epoch": 0.8723404255319149, "grad_norm": 0.11484125256538391, "learning_rate": 2.4599999999999997e-06, "loss": 1.2163, "step": 82 }, { "epoch": 0.8829787234042553, "grad_norm": 0.09692610800266266, "learning_rate": 2.49e-06, "loss": 1.1554, "step": 83 }, { "epoch": 0.8936170212765957, "grad_norm": 0.10697747021913528, "learning_rate": 2.52e-06, "loss": 1.211, "step": 84 }, { "epoch": 0.9042553191489362, "grad_norm": 0.10578318685293198, "learning_rate": 2.55e-06, "loss": 1.2634, "step": 85 }, { "epoch": 0.9148936170212766, "grad_norm": 0.10587752610445023, "learning_rate": 2.58e-06, "loss": 1.369, "step": 86 }, { "epoch": 0.925531914893617, "grad_norm": 0.11430489271879196, "learning_rate": 2.61e-06, "loss": 1.3048, "step": 87 }, { "epoch": 0.9361702127659575, "grad_norm": 0.11116154491901398, "learning_rate": 2.64e-06, "loss": 1.1883, "step": 88 }, { "epoch": 0.9468085106382979, "grad_norm": 0.12686476111412048, "learning_rate": 2.6700000000000003e-06, "loss": 1.2605, "step": 89 }, { "epoch": 0.9574468085106383, "grad_norm": 0.10976041853427887, "learning_rate": 2.7e-06, "loss": 1.2345, "step": 90 }, { "epoch": 0.9680851063829787, "grad_norm": 0.12391550838947296, "learning_rate": 2.73e-06, "loss": 1.2233, "step": 91 }, { "epoch": 0.9787234042553191, "grad_norm": 0.1277073174715042, "learning_rate": 2.7600000000000003e-06, "loss": 1.2437, "step": 92 }, { "epoch": 0.9893617021276596, "grad_norm": 0.10421616584062576, "learning_rate": 2.7900000000000004e-06, "loss": 1.1432, "step": 93 }, { "epoch": 1.0, "grad_norm": 0.12696842849254608, "learning_rate": 2.82e-06, "loss": 1.3558, "step": 94 }, { "epoch": 1.0106382978723405, "grad_norm": 0.10387945920228958, "learning_rate": 2.85e-06, "loss": 1.1902, "step": 95 }, { "epoch": 1.0212765957446808, "grad_norm": 0.14543002843856812, "learning_rate": 2.88e-06, "loss": 1.244, "step": 96 }, { "epoch": 1.0212765957446808, "eval_loss": 1.5803985595703125, "eval_runtime": 3.0087, "eval_samples_per_second": 34.899, "eval_steps_per_second": 17.616, "step": 96 }, { "epoch": 1.0319148936170213, "grad_norm": 0.17504040896892548, "learning_rate": 2.91e-06, "loss": 1.3115, "step": 97 }, { "epoch": 1.0106382978723405, "grad_norm": 0.14017756283283234, "learning_rate": 2.9400000000000002e-06, "loss": 1.356, "step": 98 }, { "epoch": 1.0212765957446808, "grad_norm": 0.11604318767786026, "learning_rate": 2.97e-06, "loss": 1.2806, "step": 99 }, { "epoch": 1.0319148936170213, "grad_norm": 0.15116065740585327, "learning_rate": 3e-06, "loss": 1.3279, "step": 100 }, { "epoch": 1.0425531914893618, "grad_norm": 0.10773392021656036, "learning_rate": 2.9999028286680832e-06, "loss": 1.3871, "step": 101 }, { "epoch": 1.053191489361702, "grad_norm": 0.09363637119531631, "learning_rate": 2.999611327262024e-06, "loss": 1.1334, "step": 102 }, { "epoch": 1.0638297872340425, "grad_norm": 0.09164439886808395, "learning_rate": 2.999125533549261e-06, "loss": 1.4026, "step": 103 }, { "epoch": 1.074468085106383, "grad_norm": 0.12113964557647705, "learning_rate": 2.9984455104700915e-06, "loss": 1.2097, "step": 104 }, { "epoch": 1.0851063829787233, "grad_norm": 0.10569895058870316, "learning_rate": 2.9975713461295125e-06, "loss": 1.3077, "step": 105 }, { "epoch": 1.0957446808510638, "grad_norm": 0.11219862103462219, "learning_rate": 2.996503153785809e-06, "loss": 1.1923, "step": 106 }, { "epoch": 1.1063829787234043, "grad_norm": 0.13474853336811066, "learning_rate": 2.9952410718358777e-06, "loss": 1.2729, "step": 107 }, { "epoch": 1.1170212765957448, "grad_norm": 0.10451857000589371, "learning_rate": 2.993785263797297e-06, "loss": 1.2904, "step": 108 }, { "epoch": 1.127659574468085, "grad_norm": 0.12593719363212585, "learning_rate": 2.9921359182871422e-06, "loss": 1.4021, "step": 109 }, { "epoch": 1.1382978723404256, "grad_norm": 0.12547315657138824, "learning_rate": 2.990293248997547e-06, "loss": 1.2093, "step": 110 }, { "epoch": 1.148936170212766, "grad_norm": 0.10086555033922195, "learning_rate": 2.988257494668016e-06, "loss": 1.3098, "step": 111 }, { "epoch": 1.1595744680851063, "grad_norm": 0.10801573097705841, "learning_rate": 2.9860289190544963e-06, "loss": 1.2752, "step": 112 }, { "epoch": 1.1702127659574468, "grad_norm": 0.11851054430007935, "learning_rate": 2.9836078108952015e-06, "loss": 1.3066, "step": 113 }, { "epoch": 1.1808510638297873, "grad_norm": 0.1265188604593277, "learning_rate": 2.9809944838732045e-06, "loss": 1.1946, "step": 114 }, { "epoch": 1.1914893617021276, "grad_norm": 0.10702725499868393, "learning_rate": 2.9781892765757954e-06, "loss": 1.2182, "step": 115 }, { "epoch": 1.202127659574468, "grad_norm": 0.7757766842842102, "learning_rate": 2.9751925524506135e-06, "loss": 5.6082, "step": 116 }, { "epoch": 1.2127659574468086, "grad_norm": 0.10182739049196243, "learning_rate": 2.9720046997585577e-06, "loss": 1.309, "step": 117 }, { "epoch": 1.2234042553191489, "grad_norm": 0.14062073826789856, "learning_rate": 2.9686261315234844e-06, "loss": 1.3668, "step": 118 }, { "epoch": 1.2340425531914894, "grad_norm": 0.1063840240240097, "learning_rate": 2.965057285478694e-06, "loss": 1.2247, "step": 119 }, { "epoch": 1.2446808510638299, "grad_norm": 0.10701325535774231, "learning_rate": 2.961298624010219e-06, "loss": 1.2499, "step": 120 }, { "epoch": 1.2446808510638299, "eval_loss": 1.575344204902649, "eval_runtime": 8.2617, "eval_samples_per_second": 12.709, "eval_steps_per_second": 6.415, "step": 120 }, { "epoch": 1.2553191489361701, "grad_norm": 0.11413375288248062, "learning_rate": 2.957350634096912e-06, "loss": 1.3593, "step": 121 }, { "epoch": 1.2659574468085106, "grad_norm": 0.10826950520277023, "learning_rate": 2.9532138272473597e-06, "loss": 1.232, "step": 122 }, { "epoch": 1.2765957446808511, "grad_norm": 0.10749218612909317, "learning_rate": 2.9488887394336023e-06, "loss": 1.401, "step": 123 }, { "epoch": 1.2872340425531914, "grad_norm": 0.1175396591424942, "learning_rate": 2.944375931021699e-06, "loss": 1.2867, "step": 124 }, { "epoch": 1.297872340425532, "grad_norm": 0.10688498616218567, "learning_rate": 2.9396759866991214e-06, "loss": 1.3689, "step": 125 }, { "epoch": 1.3085106382978724, "grad_norm": 0.11630786955356598, "learning_rate": 2.934789515399002e-06, "loss": 1.2328, "step": 126 }, { "epoch": 1.3191489361702127, "grad_norm": 0.11495409905910492, "learning_rate": 2.9297171502212414e-06, "loss": 1.1738, "step": 127 }, { "epoch": 1.3297872340425532, "grad_norm": 0.10823408514261246, "learning_rate": 2.924459548350479e-06, "loss": 1.2318, "step": 128 }, { "epoch": 1.3404255319148937, "grad_norm": 0.10759969800710678, "learning_rate": 2.9190173909709506e-06, "loss": 1.319, "step": 129 }, { "epoch": 1.351063829787234, "grad_norm": 0.12790299952030182, "learning_rate": 2.9133913831782307e-06, "loss": 1.4282, "step": 130 }, { "epoch": 1.3617021276595744, "grad_norm": 0.10899878293275833, "learning_rate": 2.9075822538878805e-06, "loss": 1.2455, "step": 131 }, { "epoch": 1.372340425531915, "grad_norm": 0.12732432782649994, "learning_rate": 2.9015907557410068e-06, "loss": 1.2227, "step": 132 }, { "epoch": 1.3829787234042552, "grad_norm": 0.10140018165111542, "learning_rate": 2.8954176650067494e-06, "loss": 1.0596, "step": 133 }, { "epoch": 1.3936170212765957, "grad_norm": 0.12063546478748322, "learning_rate": 2.889063781481708e-06, "loss": 1.3321, "step": 134 }, { "epoch": 1.4042553191489362, "grad_norm": 0.11323118209838867, "learning_rate": 2.8825299283863144e-06, "loss": 1.3366, "step": 135 }, { "epoch": 1.4148936170212765, "grad_norm": 0.12391477078199387, "learning_rate": 2.8758169522581796e-06, "loss": 1.2917, "step": 136 }, { "epoch": 1.425531914893617, "grad_norm": 0.12329094856977463, "learning_rate": 2.8689257228424125e-06, "loss": 1.1365, "step": 137 }, { "epoch": 1.4361702127659575, "grad_norm": 0.11298198997974396, "learning_rate": 2.8618571329789333e-06, "loss": 1.4996, "step": 138 }, { "epoch": 1.4468085106382977, "grad_norm": 0.10263165086507797, "learning_rate": 2.8546120984867994e-06, "loss": 1.199, "step": 139 }, { "epoch": 1.4574468085106382, "grad_norm": 0.13330323994159698, "learning_rate": 2.8471915580455442e-06, "loss": 1.3746, "step": 140 }, { "epoch": 1.4680851063829787, "grad_norm": 0.10694881528615952, "learning_rate": 2.8395964730735666e-06, "loss": 1.3798, "step": 141 }, { "epoch": 1.4787234042553192, "grad_norm": 0.10929003357887268, "learning_rate": 2.8318278276035626e-06, "loss": 1.2918, "step": 142 }, { "epoch": 1.4893617021276595, "grad_norm": 0.10969982296228409, "learning_rate": 2.8238866281550366e-06, "loss": 1.0444, "step": 143 }, { "epoch": 1.5, "grad_norm": 0.10668367147445679, "learning_rate": 2.815773903603892e-06, "loss": 1.1656, "step": 144 }, { "epoch": 1.5, "eval_loss": 1.5705612897872925, "eval_runtime": 5.6853, "eval_samples_per_second": 18.469, "eval_steps_per_second": 9.322, "step": 144 }, { "epoch": 1.5106382978723403, "grad_norm": 0.12057002633810043, "learning_rate": 2.807490705049127e-06, "loss": 1.2475, "step": 145 }, { "epoch": 1.521276595744681, "grad_norm": 0.10394139587879181, "learning_rate": 2.7990381056766585e-06, "loss": 1.3376, "step": 146 }, { "epoch": 1.5319148936170213, "grad_norm": 0.12907464802265167, "learning_rate": 2.7904172006202705e-06, "loss": 1.3509, "step": 147 }, { "epoch": 1.5425531914893615, "grad_norm": 0.1252606213092804, "learning_rate": 2.7816291068197328e-06, "loss": 1.1744, "step": 148 }, { "epoch": 1.5531914893617023, "grad_norm": 0.1514441967010498, "learning_rate": 2.772674962876085e-06, "loss": 1.9324, "step": 149 }, { "epoch": 1.5638297872340425, "grad_norm": 0.12377320230007172, "learning_rate": 2.7635559289041174e-06, "loss": 1.4511, "step": 150 }, { "epoch": 1.574468085106383, "grad_norm": 0.10629550367593765, "learning_rate": 2.7542731863820665e-06, "loss": 1.257, "step": 151 }, { "epoch": 1.5851063829787235, "grad_norm": 0.10322704166173935, "learning_rate": 2.744827937998538e-06, "loss": 1.1796, "step": 152 }, { "epoch": 1.5957446808510638, "grad_norm": 0.12690168619155884, "learning_rate": 2.735221407496686e-06, "loss": 1.3325, "step": 153 }, { "epoch": 1.6063829787234043, "grad_norm": 0.0982392430305481, "learning_rate": 2.725454839515663e-06, "loss": 1.2169, "step": 154 }, { "epoch": 1.6170212765957448, "grad_norm": 0.11184275150299072, "learning_rate": 2.7155294994293606e-06, "loss": 1.295, "step": 155 }, { "epoch": 1.627659574468085, "grad_norm": 0.10721098631620407, "learning_rate": 2.7054466731824676e-06, "loss": 1.1945, "step": 156 }, { "epoch": 1.6382978723404256, "grad_norm": 0.1254221498966217, "learning_rate": 2.695207667123857e-06, "loss": 1.2693, "step": 157 }, { "epoch": 1.648936170212766, "grad_norm": 0.1157686784863472, "learning_rate": 2.684813807837338e-06, "loss": 1.1992, "step": 158 }, { "epoch": 1.6595744680851063, "grad_norm": 0.10201229155063629, "learning_rate": 2.674266441969778e-06, "loss": 1.2401, "step": 159 }, { "epoch": 1.6702127659574468, "grad_norm": 0.10953360050916672, "learning_rate": 2.6635669360566298e-06, "loss": 1.1627, "step": 160 }, { "epoch": 1.6808510638297873, "grad_norm": 0.1248813271522522, "learning_rate": 2.652716676344881e-06, "loss": 1.3179, "step": 161 }, { "epoch": 1.6914893617021276, "grad_norm": 0.09584470838308334, "learning_rate": 2.6417170686134472e-06, "loss": 1.2128, "step": 162 }, { "epoch": 1.702127659574468, "grad_norm": 0.12864486873149872, "learning_rate": 2.630569537991042e-06, "loss": 1.1889, "step": 163 }, { "epoch": 1.7127659574468086, "grad_norm": 0.11820186674594879, "learning_rate": 2.6192755287715282e-06, "loss": 1.2848, "step": 164 }, { "epoch": 1.7234042553191489, "grad_norm": 0.13811764121055603, "learning_rate": 2.6078365042267987e-06, "loss": 1.2091, "step": 165 }, { "epoch": 1.7340425531914894, "grad_norm": 0.12099539488554001, "learning_rate": 2.5962539464171862e-06, "loss": 1.2853, "step": 166 }, { "epoch": 1.7446808510638299, "grad_norm": 0.14544783532619476, "learning_rate": 2.5845293559994513e-06, "loss": 1.1574, "step": 167 }, { "epoch": 1.7553191489361701, "grad_norm": 0.11159494519233704, "learning_rate": 2.5726642520323493e-06, "loss": 1.1928, "step": 168 }, { "epoch": 1.7553191489361701, "eval_loss": 1.5656005144119263, "eval_runtime": 3.0568, "eval_samples_per_second": 34.349, "eval_steps_per_second": 17.338, "step": 168 }, { "epoch": 1.7659574468085106, "grad_norm": 0.1385321319103241, "learning_rate": 2.5606601717798212e-06, "loss": 1.365, "step": 169 }, { "epoch": 1.7765957446808511, "grad_norm": 0.12905962765216827, "learning_rate": 2.548518670511823e-06, "loss": 1.3664, "step": 170 }, { "epoch": 1.7872340425531914, "grad_norm": 0.09589093923568726, "learning_rate": 2.536241321302821e-06, "loss": 1.1508, "step": 171 }, { "epoch": 1.797872340425532, "grad_norm": 0.12533985078334808, "learning_rate": 2.5238297148279814e-06, "loss": 1.2892, "step": 172 }, { "epoch": 1.8085106382978724, "grad_norm": 0.11337278038263321, "learning_rate": 2.5112854591570804e-06, "loss": 1.2772, "step": 173 }, { "epoch": 1.8191489361702127, "grad_norm": 0.1345362514257431, "learning_rate": 2.4986101795461608e-06, "loss": 1.3583, "step": 174 }, { "epoch": 1.8297872340425532, "grad_norm": 0.121876560151577, "learning_rate": 2.485805518226959e-06, "loss": 1.2065, "step": 175 }, { "epoch": 1.8404255319148937, "grad_norm": 0.10669907927513123, "learning_rate": 2.4728731341941343e-06, "loss": 1.2339, "step": 176 }, { "epoch": 1.851063829787234, "grad_norm": 0.12710174918174744, "learning_rate": 2.4598147029903284e-06, "loss": 1.4597, "step": 177 }, { "epoch": 1.8617021276595744, "grad_norm": 0.11735282093286514, "learning_rate": 2.4466319164890795e-06, "loss": 1.2273, "step": 178 }, { "epoch": 1.872340425531915, "grad_norm": 0.12083889544010162, "learning_rate": 2.4333264826756165e-06, "loss": 1.3051, "step": 179 }, { "epoch": 1.8829787234042552, "grad_norm": 0.10338792204856873, "learning_rate": 2.4199001254255746e-06, "loss": 1.3827, "step": 180 }, { "epoch": 1.8936170212765957, "grad_norm": 0.10191334784030914, "learning_rate": 2.406354584281642e-06, "loss": 1.4067, "step": 181 }, { "epoch": 1.9042553191489362, "grad_norm": 0.10776273161172867, "learning_rate": 2.3926916142281846e-06, "loss": 1.2344, "step": 182 }, { "epoch": 1.9148936170212765, "grad_norm": 0.1253603845834732, "learning_rate": 2.378912985463867e-06, "loss": 1.2179, "step": 183 }, { "epoch": 1.925531914893617, "grad_norm": 0.09942696243524551, "learning_rate": 2.365020483172301e-06, "loss": 1.0613, "step": 184 }, { "epoch": 1.9361702127659575, "grad_norm": 0.10441701859235764, "learning_rate": 2.351015907290755e-06, "loss": 1.3886, "step": 185 }, { "epoch": 1.9468085106382977, "grad_norm": 0.10571073740720749, "learning_rate": 2.3369010722769502e-06, "loss": 1.2023, "step": 186 }, { "epoch": 1.9574468085106385, "grad_norm": 0.12477201223373413, "learning_rate": 2.3226778068739783e-06, "loss": 1.361, "step": 187 }, { "epoch": 1.9680851063829787, "grad_norm": 0.12106247246265411, "learning_rate": 2.3083479538733636e-06, "loss": 1.4153, "step": 188 }, { "epoch": 1.978723404255319, "grad_norm": 0.17261436581611633, "learning_rate": 2.2939133698763084e-06, "loss": 1.45, "step": 189 }, { "epoch": 1.9893617021276597, "grad_norm": 0.12172568589448929, "learning_rate": 2.2793759250531504e-06, "loss": 1.1764, "step": 190 }, { "epoch": 2.0, "grad_norm": 0.11111535131931305, "learning_rate": 2.2647375029010575e-06, "loss": 1.1347, "step": 191 }, { "epoch": 2.0106382978723403, "grad_norm": 0.10596352070569992, "learning_rate": 2.25e-06, "loss": 1.1623, "step": 192 }, { "epoch": 2.0106382978723403, "eval_loss": 1.560809850692749, "eval_runtime": 4.8693, "eval_samples_per_second": 21.564, "eval_steps_per_second": 10.885, "step": 192 }, { "epoch": 2.0212765957446805, "grad_norm": 0.08630727976560593, "learning_rate": 2.235165325767026e-06, "loss": 1.1733, "step": 193 }, { "epoch": 2.0319148936170213, "grad_norm": 0.3629175126552582, "learning_rate": 2.2202354022088735e-06, "loss": 1.1114, "step": 194 }, { "epoch": 2.0106382978723403, "grad_norm": 0.12061420828104019, "learning_rate": 2.2052121636729527e-06, "loss": 1.1813, "step": 195 }, { "epoch": 2.021276595744681, "grad_norm": 0.12194402515888214, "learning_rate": 2.1900975565967284e-06, "loss": 1.4466, "step": 196 }, { "epoch": 2.0319148936170213, "grad_norm": 0.09581239521503448, "learning_rate": 2.1748935392555347e-06, "loss": 1.1863, "step": 197 }, { "epoch": 2.0425531914893615, "grad_norm": 0.12056192755699158, "learning_rate": 2.1596020815088587e-06, "loss": 1.5781, "step": 198 }, { "epoch": 2.0531914893617023, "grad_norm": 0.10989479720592499, "learning_rate": 2.1442251645451233e-06, "loss": 1.2638, "step": 199 }, { "epoch": 2.0638297872340425, "grad_norm": 0.1273481398820877, "learning_rate": 2.1287647806249964e-06, "loss": 1.1346, "step": 200 }, { "epoch": 2.074468085106383, "grad_norm": 0.11547158658504486, "learning_rate": 2.1132229328232755e-06, "loss": 1.124, "step": 201 }, { "epoch": 2.0851063829787235, "grad_norm": 0.12033279985189438, "learning_rate": 2.0976016347693624e-06, "loss": 1.2695, "step": 202 }, { "epoch": 2.095744680851064, "grad_norm": 0.11929858475923538, "learning_rate": 2.0819029103863746e-06, "loss": 1.1885, "step": 203 }, { "epoch": 2.106382978723404, "grad_norm": 0.10733456164598465, "learning_rate": 2.0661287936289213e-06, "loss": 1.3742, "step": 204 }, { "epoch": 2.117021276595745, "grad_norm": 0.12037491053342819, "learning_rate": 2.050281328219585e-06, "loss": 1.1911, "step": 205 }, { "epoch": 2.127659574468085, "grad_norm": 0.12621213495731354, "learning_rate": 2.0343625673841254e-06, "loss": 1.1241, "step": 206 }, { "epoch": 2.1382978723404253, "grad_norm": 0.15265725553035736, "learning_rate": 2.018374573585467e-06, "loss": 1.4419, "step": 207 }, { "epoch": 2.148936170212766, "grad_norm": 0.10528834164142609, "learning_rate": 2.0023194182564793e-06, "loss": 1.2211, "step": 208 }, { "epoch": 2.1595744680851063, "grad_norm": 0.11791864037513733, "learning_rate": 1.986199181531599e-06, "loss": 1.2159, "step": 209 }, { "epoch": 2.1702127659574466, "grad_norm": 0.1230137050151825, "learning_rate": 1.9700159519773233e-06, "loss": 1.2687, "step": 210 }, { "epoch": 2.1808510638297873, "grad_norm": 0.11243241280317307, "learning_rate": 1.9537718263216137e-06, "loss": 1.2008, "step": 211 }, { "epoch": 2.1914893617021276, "grad_norm": 0.13040384650230408, "learning_rate": 1.9374689091822377e-06, "loss": 1.2937, "step": 212 }, { "epoch": 2.202127659574468, "grad_norm": 0.096621572971344, "learning_rate": 1.9211093127940917e-06, "loss": 1.1315, "step": 213 }, { "epoch": 2.2127659574468086, "grad_norm": 0.10366329550743103, "learning_rate": 1.9046951567355363e-06, "loss": 1.1998, "step": 214 }, { "epoch": 2.223404255319149, "grad_norm": 0.13266436755657196, "learning_rate": 1.888228567653781e-06, "loss": 1.2169, "step": 215 }, { "epoch": 2.2340425531914896, "grad_norm": 0.12041480094194412, "learning_rate": 1.87171167898935e-06, "loss": 1.2679, "step": 216 }, { "epoch": 2.2340425531914896, "eval_loss": 1.557094931602478, "eval_runtime": 3.8276, "eval_samples_per_second": 27.432, "eval_steps_per_second": 13.847, "step": 216 }, { "epoch": 2.24468085106383, "grad_norm": 0.11926256865262985, "learning_rate": 1.8551466306996702e-06, "loss": 1.2496, "step": 217 }, { "epoch": 2.25531914893617, "grad_norm": 0.11768142879009247, "learning_rate": 1.838535568981816e-06, "loss": 1.2793, "step": 218 }, { "epoch": 2.2659574468085104, "grad_norm": 0.09850093722343445, "learning_rate": 1.821880645994443e-06, "loss": 1.1712, "step": 219 }, { "epoch": 2.276595744680851, "grad_norm": 0.11399635672569275, "learning_rate": 1.8051840195789509e-06, "loss": 1.3151, "step": 220 }, { "epoch": 2.2872340425531914, "grad_norm": 0.10654745250940323, "learning_rate": 1.788447852979909e-06, "loss": 1.1726, "step": 221 }, { "epoch": 2.297872340425532, "grad_norm": 0.12983720004558563, "learning_rate": 1.7716743145647837e-06, "loss": 1.2158, "step": 222 }, { "epoch": 2.3085106382978724, "grad_norm": 0.08331640809774399, "learning_rate": 1.754865577543e-06, "loss": 0.8472, "step": 223 }, { "epoch": 2.3191489361702127, "grad_norm": 0.11662207543849945, "learning_rate": 1.738023819684377e-06, "loss": 1.2938, "step": 224 }, { "epoch": 2.329787234042553, "grad_norm": 0.13265132904052734, "learning_rate": 1.7211512230369716e-06, "loss": 1.3288, "step": 225 }, { "epoch": 2.3404255319148937, "grad_norm": 0.10342156141996384, "learning_rate": 1.7042499736443702e-06, "loss": 1.2352, "step": 226 }, { "epoch": 2.351063829787234, "grad_norm": 0.10495639592409134, "learning_rate": 1.6873222612624575e-06, "loss": 1.1748, "step": 227 }, { "epoch": 2.3617021276595747, "grad_norm": 0.1564377248287201, "learning_rate": 1.6703702790757123e-06, "loss": 1.4071, "step": 228 }, { "epoch": 2.372340425531915, "grad_norm": 0.11000990867614746, "learning_rate": 1.6533962234130513e-06, "loss": 1.2102, "step": 229 }, { "epoch": 2.382978723404255, "grad_norm": 0.12565018236637115, "learning_rate": 1.6364022934632706e-06, "loss": 1.4047, "step": 230 }, { "epoch": 2.393617021276596, "grad_norm": 0.11519981920719147, "learning_rate": 1.6193906909901133e-06, "loss": 1.3905, "step": 231 }, { "epoch": 2.404255319148936, "grad_norm": 0.12704767286777496, "learning_rate": 1.6023636200470066e-06, "loss": 1.0632, "step": 232 }, { "epoch": 2.4148936170212765, "grad_norm": 0.11226076632738113, "learning_rate": 1.5853232866915004e-06, "loss": 1.2448, "step": 233 }, { "epoch": 2.425531914893617, "grad_norm": 0.11282170563936234, "learning_rate": 1.5682718986994457e-06, "loss": 1.5068, "step": 234 }, { "epoch": 2.4361702127659575, "grad_norm": 0.09763903170824051, "learning_rate": 1.5512116652789518e-06, "loss": 1.2398, "step": 235 }, { "epoch": 2.4468085106382977, "grad_norm": 0.11640322953462601, "learning_rate": 1.5341447967841585e-06, "loss": 1.3176, "step": 236 }, { "epoch": 2.4574468085106385, "grad_norm": 0.12292848527431488, "learning_rate": 1.5170735044288565e-06, "loss": 1.8823, "step": 237 }, { "epoch": 2.4680851063829787, "grad_norm": 0.12308197468519211, "learning_rate": 1.5e-06, "loss": 1.14, "step": 238 }, { "epoch": 2.478723404255319, "grad_norm": 0.11871509999036789, "learning_rate": 1.4829264955711436e-06, "loss": 1.2753, "step": 239 }, { "epoch": 2.4893617021276597, "grad_norm": 0.14034831523895264, "learning_rate": 1.4658552032158414e-06, "loss": 1.2845, "step": 240 }, { "epoch": 2.4893617021276597, "eval_loss": 1.5537203550338745, "eval_runtime": 5.1489, "eval_samples_per_second": 20.393, "eval_steps_per_second": 10.294, "step": 240 }, { "epoch": 2.5, "grad_norm": 0.12317660450935364, "learning_rate": 1.4487883347210483e-06, "loss": 1.2742, "step": 241 }, { "epoch": 2.5106382978723403, "grad_norm": 0.11180029809474945, "learning_rate": 1.431728101300555e-06, "loss": 1.3743, "step": 242 }, { "epoch": 2.521276595744681, "grad_norm": 0.12544095516204834, "learning_rate": 1.4146767133085e-06, "loss": 1.2935, "step": 243 }, { "epoch": 2.5319148936170213, "grad_norm": 0.12283220142126083, "learning_rate": 1.3976363799529937e-06, "loss": 1.2529, "step": 244 }, { "epoch": 2.5425531914893615, "grad_norm": 0.1375051885843277, "learning_rate": 1.3806093090098872e-06, "loss": 1.2907, "step": 245 }, { "epoch": 2.5531914893617023, "grad_norm": 0.10762558877468109, "learning_rate": 1.3635977065367295e-06, "loss": 1.1548, "step": 246 }, { "epoch": 2.5638297872340425, "grad_norm": 0.12075044959783554, "learning_rate": 1.3466037765869486e-06, "loss": 1.2208, "step": 247 }, { "epoch": 2.574468085106383, "grad_norm": 0.11484728753566742, "learning_rate": 1.3296297209242874e-06, "loss": 1.1182, "step": 248 }, { "epoch": 2.5851063829787235, "grad_norm": 0.11546777188777924, "learning_rate": 1.312677738737543e-06, "loss": 1.4478, "step": 249 }, { "epoch": 2.595744680851064, "grad_norm": 0.12128724902868271, "learning_rate": 1.2957500263556303e-06, "loss": 1.3132, "step": 250 }, { "epoch": 2.6063829787234045, "grad_norm": 0.09233469516038895, "learning_rate": 1.2788487769630284e-06, "loss": 1.0863, "step": 251 }, { "epoch": 2.617021276595745, "grad_norm": 0.12085112929344177, "learning_rate": 1.261976180315623e-06, "loss": 1.2855, "step": 252 }, { "epoch": 2.627659574468085, "grad_norm": 0.12655006349086761, "learning_rate": 1.2451344224570002e-06, "loss": 1.2691, "step": 253 }, { "epoch": 2.6382978723404253, "grad_norm": 0.12646865844726562, "learning_rate": 1.2283256854352162e-06, "loss": 1.2194, "step": 254 }, { "epoch": 2.648936170212766, "grad_norm": 0.1081186905503273, "learning_rate": 1.2115521470200911e-06, "loss": 1.2551, "step": 255 }, { "epoch": 2.6595744680851063, "grad_norm": 0.12278001755475998, "learning_rate": 1.1948159804210496e-06, "loss": 1.3049, "step": 256 }, { "epoch": 2.670212765957447, "grad_norm": 0.09346724301576614, "learning_rate": 1.1781193540055573e-06, "loss": 1.1507, "step": 257 }, { "epoch": 2.6808510638297873, "grad_norm": 0.10679617524147034, "learning_rate": 1.1614644310181842e-06, "loss": 1.2971, "step": 258 }, { "epoch": 2.6914893617021276, "grad_norm": 0.11394444108009338, "learning_rate": 1.14485336930033e-06, "loss": 1.2133, "step": 259 }, { "epoch": 2.702127659574468, "grad_norm": 0.1416221708059311, "learning_rate": 1.1282883210106503e-06, "loss": 1.2132, "step": 260 }, { "epoch": 2.7127659574468086, "grad_norm": 0.09133229404687881, "learning_rate": 1.1117714323462188e-06, "loss": 1.17, "step": 261 }, { "epoch": 2.723404255319149, "grad_norm": 0.10997346043586731, "learning_rate": 1.0953048432644634e-06, "loss": 1.1443, "step": 262 }, { "epoch": 2.7340425531914896, "grad_norm": 0.11233331263065338, "learning_rate": 1.0788906872059088e-06, "loss": 1.3383, "step": 263 }, { "epoch": 2.74468085106383, "grad_norm": 0.11798594146966934, "learning_rate": 1.0625310908177626e-06, "loss": 1.1226, "step": 264 }, { "epoch": 2.74468085106383, "eval_loss": 1.5516207218170166, "eval_runtime": 4.7557, "eval_samples_per_second": 22.079, "eval_steps_per_second": 11.145, "step": 264 }, { "epoch": 2.75531914893617, "grad_norm": 0.12637464702129364, "learning_rate": 1.0462281736783864e-06, "loss": 1.4182, "step": 265 }, { "epoch": 2.7659574468085104, "grad_norm": 0.11374552547931671, "learning_rate": 1.0299840480226768e-06, "loss": 1.2926, "step": 266 }, { "epoch": 2.776595744680851, "grad_norm": 0.13765357434749603, "learning_rate": 1.0138008184684011e-06, "loss": 1.3036, "step": 267 }, { "epoch": 2.7872340425531914, "grad_norm": 0.1168639063835144, "learning_rate": 9.976805817435208e-07, "loss": 1.2854, "step": 268 }, { "epoch": 2.797872340425532, "grad_norm": 0.11609431356191635, "learning_rate": 9.816254264145328e-07, "loss": 1.2832, "step": 269 }, { "epoch": 2.8085106382978724, "grad_norm": 0.11090860515832901, "learning_rate": 9.656374326158753e-07, "loss": 1.4098, "step": 270 }, { "epoch": 2.8191489361702127, "grad_norm": 0.12376827001571655, "learning_rate": 9.497186717804155e-07, "loss": 1.2335, "step": 271 }, { "epoch": 2.829787234042553, "grad_norm": 0.11905582994222641, "learning_rate": 9.338712063710786e-07, "loss": 1.1991, "step": 272 }, { "epoch": 2.8404255319148937, "grad_norm": 0.09383586794137955, "learning_rate": 9.180970896136258e-07, "loss": 1.1715, "step": 273 }, { "epoch": 2.851063829787234, "grad_norm": 0.14080052077770233, "learning_rate": 9.023983652306379e-07, "loss": 1.3227, "step": 274 }, { "epoch": 2.8617021276595747, "grad_norm": 0.10943106561899185, "learning_rate": 8.867770671767246e-07, "loss": 0.9774, "step": 275 }, { "epoch": 2.872340425531915, "grad_norm": 0.093683160841465, "learning_rate": 8.712352193750043e-07, "loss": 1.053, "step": 276 }, { "epoch": 2.882978723404255, "grad_norm": 0.1352389007806778, "learning_rate": 8.557748354548773e-07, "loss": 1.2713, "step": 277 }, { "epoch": 2.8936170212765955, "grad_norm": 0.09906943887472153, "learning_rate": 8.403979184911411e-07, "loss": 1.3128, "step": 278 }, { "epoch": 2.904255319148936, "grad_norm": 0.13467717170715332, "learning_rate": 8.251064607444658e-07, "loss": 1.4274, "step": 279 }, { "epoch": 2.9148936170212765, "grad_norm": 0.11490917205810547, "learning_rate": 8.099024434032718e-07, "loss": 1.307, "step": 280 }, { "epoch": 2.925531914893617, "grad_norm": 0.10982628166675568, "learning_rate": 7.947878363270472e-07, "loss": 1.0661, "step": 281 }, { "epoch": 2.9361702127659575, "grad_norm": 0.13426214456558228, "learning_rate": 7.797645977911268e-07, "loss": 1.3296, "step": 282 } ], "logging_steps": 1, "max_steps": 376, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 94, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9461272877727744.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }