{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.643835616438356, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010958904109589041, "grad_norm": 5.003701210021973, "learning_rate": 0.0002, "loss": 1.3947, "step": 1 }, { "epoch": 0.021917808219178082, "grad_norm": 10.884932518005371, "learning_rate": 0.0002, "loss": 0.657, "step": 2 }, { "epoch": 0.03287671232876712, "grad_norm": 1.7771106958389282, "learning_rate": 0.0002, "loss": 1.1432, "step": 3 }, { "epoch": 0.043835616438356165, "grad_norm": 2.750317096710205, "learning_rate": 0.0002, "loss": 0.6564, "step": 4 }, { "epoch": 0.0547945205479452, "grad_norm": 1.633827567100525, "learning_rate": 0.0002, "loss": 0.9156, "step": 5 }, { "epoch": 0.06575342465753424, "grad_norm": 1.747514009475708, "learning_rate": 0.0002, "loss": 0.5935, "step": 6 }, { "epoch": 0.07671232876712329, "grad_norm": 1.3404035568237305, "learning_rate": 0.0002, "loss": 0.5328, "step": 7 }, { "epoch": 0.08767123287671233, "grad_norm": 7.1872968673706055, "learning_rate": 0.0002, "loss": 1.0489, "step": 8 }, { "epoch": 0.09863013698630137, "grad_norm": 1.8788518905639648, "learning_rate": 0.0002, "loss": 0.8711, "step": 9 }, { "epoch": 0.1095890410958904, "grad_norm": 1.554734706878662, "learning_rate": 0.0002, "loss": 0.6025, "step": 10 }, { "epoch": 0.12054794520547946, "grad_norm": 1.5181900262832642, "learning_rate": 0.0002, "loss": 0.9178, "step": 11 }, { "epoch": 0.13150684931506848, "grad_norm": 1.74048912525177, "learning_rate": 0.0002, "loss": 0.8413, "step": 12 }, { "epoch": 0.14246575342465753, "grad_norm": 1.5069341659545898, "learning_rate": 0.0002, "loss": 0.7297, "step": 13 }, { "epoch": 0.15342465753424658, "grad_norm": 1.9904654026031494, "learning_rate": 0.0002, "loss": 0.909, "step": 14 }, { "epoch": 0.1643835616438356, "grad_norm": 1.329214334487915, "learning_rate": 0.0002, "loss": 0.6475, "step": 15 }, { "epoch": 0.17534246575342466, "grad_norm": 3.0678200721740723, "learning_rate": 0.0002, "loss": 0.8995, "step": 16 }, { "epoch": 0.1863013698630137, "grad_norm": 1.5679476261138916, "learning_rate": 0.0002, "loss": 0.6728, "step": 17 }, { "epoch": 0.19726027397260273, "grad_norm": 1.196855068206787, "learning_rate": 0.0002, "loss": 0.4871, "step": 18 }, { "epoch": 0.20821917808219179, "grad_norm": 1.8244729042053223, "learning_rate": 0.0002, "loss": 0.5748, "step": 19 }, { "epoch": 0.2191780821917808, "grad_norm": 1.1432626247406006, "learning_rate": 0.0002, "loss": 0.5982, "step": 20 }, { "epoch": 0.23013698630136986, "grad_norm": 1.672303557395935, "learning_rate": 0.0002, "loss": 0.5639, "step": 21 }, { "epoch": 0.2410958904109589, "grad_norm": 1.5940009355545044, "learning_rate": 0.0002, "loss": 0.6198, "step": 22 }, { "epoch": 0.25205479452054796, "grad_norm": 1.710236668586731, "learning_rate": 0.0002, "loss": 0.8638, "step": 23 }, { "epoch": 0.26301369863013696, "grad_norm": 1.7786955833435059, "learning_rate": 0.0002, "loss": 0.7169, "step": 24 }, { "epoch": 0.273972602739726, "grad_norm": 1.2856895923614502, "learning_rate": 0.0002, "loss": 0.5522, "step": 25 }, { "epoch": 0.28493150684931506, "grad_norm": 1.3859294652938843, "learning_rate": 0.0002, "loss": 0.6687, "step": 26 }, { "epoch": 0.2958904109589041, "grad_norm": 1.3184758424758911, "learning_rate": 0.0002, "loss": 0.6227, "step": 27 }, { "epoch": 0.30684931506849317, "grad_norm": 1.3072282075881958, "learning_rate": 0.0002, "loss": 0.4417, "step": 28 }, { "epoch": 0.3178082191780822, "grad_norm": 1.7866010665893555, "learning_rate": 0.0002, "loss": 0.7648, "step": 29 }, { "epoch": 0.3287671232876712, "grad_norm": 1.0981870889663696, "learning_rate": 0.0002, "loss": 0.6496, "step": 30 }, { "epoch": 0.33972602739726027, "grad_norm": 1.586014747619629, "learning_rate": 0.0002, "loss": 0.7389, "step": 31 }, { "epoch": 0.3506849315068493, "grad_norm": 1.4227101802825928, "learning_rate": 0.0002, "loss": 0.7764, "step": 32 }, { "epoch": 0.36164383561643837, "grad_norm": 1.0654901266098022, "learning_rate": 0.0002, "loss": 0.7015, "step": 33 }, { "epoch": 0.3726027397260274, "grad_norm": 1.22892427444458, "learning_rate": 0.0002, "loss": 0.6085, "step": 34 }, { "epoch": 0.3835616438356164, "grad_norm": 1.1209129095077515, "learning_rate": 0.0002, "loss": 0.334, "step": 35 }, { "epoch": 0.39452054794520547, "grad_norm": 1.3106253147125244, "learning_rate": 0.0002, "loss": 0.6377, "step": 36 }, { "epoch": 0.4054794520547945, "grad_norm": 1.1807057857513428, "learning_rate": 0.0002, "loss": 0.4686, "step": 37 }, { "epoch": 0.41643835616438357, "grad_norm": 1.1512413024902344, "learning_rate": 0.0002, "loss": 0.5934, "step": 38 }, { "epoch": 0.4273972602739726, "grad_norm": 1.2861087322235107, "learning_rate": 0.0002, "loss": 0.4494, "step": 39 }, { "epoch": 0.4383561643835616, "grad_norm": 1.6408014297485352, "learning_rate": 0.0002, "loss": 0.8894, "step": 40 }, { "epoch": 0.44931506849315067, "grad_norm": 1.1644452810287476, "learning_rate": 0.0002, "loss": 0.9024, "step": 41 }, { "epoch": 0.4602739726027397, "grad_norm": 0.989621639251709, "learning_rate": 0.0002, "loss": 0.4873, "step": 42 }, { "epoch": 0.4712328767123288, "grad_norm": 1.2218654155731201, "learning_rate": 0.0002, "loss": 0.7415, "step": 43 }, { "epoch": 0.4821917808219178, "grad_norm": 1.2144018411636353, "learning_rate": 0.0002, "loss": 0.6073, "step": 44 }, { "epoch": 0.4931506849315068, "grad_norm": 1.5843247175216675, "learning_rate": 0.0002, "loss": 0.8353, "step": 45 }, { "epoch": 0.5041095890410959, "grad_norm": 1.3587316274642944, "learning_rate": 0.0002, "loss": 0.5154, "step": 46 }, { "epoch": 0.5150684931506849, "grad_norm": 1.173448085784912, "learning_rate": 0.0002, "loss": 0.616, "step": 47 }, { "epoch": 0.5260273972602739, "grad_norm": 1.6074247360229492, "learning_rate": 0.0002, "loss": 0.8318, "step": 48 }, { "epoch": 0.536986301369863, "grad_norm": 1.0739307403564453, "learning_rate": 0.0002, "loss": 0.5982, "step": 49 }, { "epoch": 0.547945205479452, "grad_norm": 1.330855131149292, "learning_rate": 0.0002, "loss": 0.5309, "step": 50 }, { "epoch": 0.5589041095890411, "grad_norm": 1.5128343105316162, "learning_rate": 0.0002, "loss": 0.5063, "step": 51 }, { "epoch": 0.5698630136986301, "grad_norm": 1.5110679864883423, "learning_rate": 0.0002, "loss": 0.5551, "step": 52 }, { "epoch": 0.5808219178082191, "grad_norm": 2.263357639312744, "learning_rate": 0.0002, "loss": 0.6387, "step": 53 }, { "epoch": 0.5917808219178082, "grad_norm": 1.3241772651672363, "learning_rate": 0.0002, "loss": 0.8482, "step": 54 }, { "epoch": 0.6027397260273972, "grad_norm": 1.246489405632019, "learning_rate": 0.0002, "loss": 0.7622, "step": 55 }, { "epoch": 0.6136986301369863, "grad_norm": 1.2963398694992065, "learning_rate": 0.0002, "loss": 0.6943, "step": 56 }, { "epoch": 0.6246575342465753, "grad_norm": 1.116220474243164, "learning_rate": 0.0002, "loss": 0.6305, "step": 57 }, { "epoch": 0.6356164383561644, "grad_norm": 1.4782965183258057, "learning_rate": 0.0002, "loss": 0.7089, "step": 58 }, { "epoch": 0.6465753424657534, "grad_norm": 1.207879662513733, "learning_rate": 0.0002, "loss": 0.8837, "step": 59 }, { "epoch": 0.6575342465753424, "grad_norm": 1.0886225700378418, "learning_rate": 0.0002, "loss": 0.7521, "step": 60 }, { "epoch": 0.6684931506849315, "grad_norm": 1.1209737062454224, "learning_rate": 0.0002, "loss": 0.6905, "step": 61 }, { "epoch": 0.6794520547945205, "grad_norm": 1.732853889465332, "learning_rate": 0.0002, "loss": 0.6397, "step": 62 }, { "epoch": 0.6904109589041096, "grad_norm": 1.2688523530960083, "learning_rate": 0.0002, "loss": 0.647, "step": 63 }, { "epoch": 0.7013698630136986, "grad_norm": 1.3005374670028687, "learning_rate": 0.0002, "loss": 0.6742, "step": 64 }, { "epoch": 0.7123287671232876, "grad_norm": 1.3675568103790283, "learning_rate": 0.0002, "loss": 0.8946, "step": 65 }, { "epoch": 0.7232876712328767, "grad_norm": 1.3661890029907227, "learning_rate": 0.0002, "loss": 0.6946, "step": 66 }, { "epoch": 0.7342465753424657, "grad_norm": 1.4970860481262207, "learning_rate": 0.0002, "loss": 0.6293, "step": 67 }, { "epoch": 0.7452054794520548, "grad_norm": 1.445917010307312, "learning_rate": 0.0002, "loss": 0.8058, "step": 68 }, { "epoch": 0.7561643835616438, "grad_norm": 1.6117463111877441, "learning_rate": 0.0002, "loss": 0.7998, "step": 69 }, { "epoch": 0.7671232876712328, "grad_norm": 1.6023530960083008, "learning_rate": 0.0002, "loss": 0.5355, "step": 70 }, { "epoch": 0.7780821917808219, "grad_norm": 1.4635958671569824, "learning_rate": 0.0002, "loss": 0.6999, "step": 71 }, { "epoch": 0.7890410958904109, "grad_norm": 1.4061299562454224, "learning_rate": 0.0002, "loss": 0.6554, "step": 72 }, { "epoch": 0.8, "grad_norm": 1.4091109037399292, "learning_rate": 0.0002, "loss": 0.6972, "step": 73 }, { "epoch": 0.810958904109589, "grad_norm": 1.3066381216049194, "learning_rate": 0.0002, "loss": 0.742, "step": 74 }, { "epoch": 0.821917808219178, "grad_norm": 0.9933669567108154, "learning_rate": 0.0002, "loss": 0.6989, "step": 75 }, { "epoch": 0.8328767123287671, "grad_norm": 1.2205321788787842, "learning_rate": 0.0002, "loss": 0.6398, "step": 76 }, { "epoch": 0.8438356164383561, "grad_norm": 1.3536911010742188, "learning_rate": 0.0002, "loss": 0.5861, "step": 77 }, { "epoch": 0.8547945205479452, "grad_norm": 1.5119093656539917, "learning_rate": 0.0002, "loss": 0.9953, "step": 78 }, { "epoch": 0.8657534246575342, "grad_norm": 1.0627142190933228, "learning_rate": 0.0002, "loss": 0.4492, "step": 79 }, { "epoch": 0.8767123287671232, "grad_norm": 1.2815035581588745, "learning_rate": 0.0002, "loss": 0.7471, "step": 80 }, { "epoch": 0.8876712328767123, "grad_norm": 1.376985788345337, "learning_rate": 0.0002, "loss": 0.8526, "step": 81 }, { "epoch": 0.8986301369863013, "grad_norm": 1.3588144779205322, "learning_rate": 0.0002, "loss": 0.7122, "step": 82 }, { "epoch": 0.9095890410958904, "grad_norm": 1.378824234008789, "learning_rate": 0.0002, "loss": 0.8444, "step": 83 }, { "epoch": 0.9205479452054794, "grad_norm": 1.5447663068771362, "learning_rate": 0.0002, "loss": 0.6788, "step": 84 }, { "epoch": 0.9315068493150684, "grad_norm": 1.4500224590301514, "learning_rate": 0.0002, "loss": 0.5721, "step": 85 }, { "epoch": 0.9424657534246575, "grad_norm": 1.0830070972442627, "learning_rate": 0.0002, "loss": 0.657, "step": 86 }, { "epoch": 0.9534246575342465, "grad_norm": 1.3003672361373901, "learning_rate": 0.0002, "loss": 0.4806, "step": 87 }, { "epoch": 0.9643835616438357, "grad_norm": 1.1137444972991943, "learning_rate": 0.0002, "loss": 0.7444, "step": 88 }, { "epoch": 0.9753424657534246, "grad_norm": 1.2204691171646118, "learning_rate": 0.0002, "loss": 0.7924, "step": 89 }, { "epoch": 0.9863013698630136, "grad_norm": 1.3225165605545044, "learning_rate": 0.0002, "loss": 0.7357, "step": 90 }, { "epoch": 0.9972602739726028, "grad_norm": 1.2743207216262817, "learning_rate": 0.0002, "loss": 0.6903, "step": 91 }, { "epoch": 1.0082191780821919, "grad_norm": 1.2072831392288208, "learning_rate": 0.0002, "loss": 0.4617, "step": 92 }, { "epoch": 1.0191780821917809, "grad_norm": 1.0190479755401611, "learning_rate": 0.0002, "loss": 0.4412, "step": 93 }, { "epoch": 1.0301369863013699, "grad_norm": 0.8685715198516846, "learning_rate": 0.0002, "loss": 0.4422, "step": 94 }, { "epoch": 1.0410958904109588, "grad_norm": 0.6671916246414185, "learning_rate": 0.0002, "loss": 0.2872, "step": 95 }, { "epoch": 1.0520547945205478, "grad_norm": 0.8552739024162292, "learning_rate": 0.0002, "loss": 0.2837, "step": 96 }, { "epoch": 1.063013698630137, "grad_norm": 0.8662064075469971, "learning_rate": 0.0002, "loss": 0.2549, "step": 97 }, { "epoch": 1.073972602739726, "grad_norm": 1.6159878969192505, "learning_rate": 0.0002, "loss": 0.4545, "step": 98 }, { "epoch": 1.084931506849315, "grad_norm": 1.0922621488571167, "learning_rate": 0.0002, "loss": 0.2703, "step": 99 }, { "epoch": 1.095890410958904, "grad_norm": 0.9011418223381042, "learning_rate": 0.0002, "loss": 0.1948, "step": 100 }, { "epoch": 1.106849315068493, "grad_norm": 1.094281554222107, "learning_rate": 0.0002, "loss": 0.2967, "step": 101 }, { "epoch": 1.1178082191780823, "grad_norm": 0.9296566843986511, "learning_rate": 0.0002, "loss": 0.2372, "step": 102 }, { "epoch": 1.1287671232876713, "grad_norm": 1.2015409469604492, "learning_rate": 0.0002, "loss": 0.2724, "step": 103 }, { "epoch": 1.1397260273972603, "grad_norm": 1.0707019567489624, "learning_rate": 0.0002, "loss": 0.242, "step": 104 }, { "epoch": 1.1506849315068493, "grad_norm": 1.381605863571167, "learning_rate": 0.0002, "loss": 0.4983, "step": 105 }, { "epoch": 1.1616438356164385, "grad_norm": 1.3150050640106201, "learning_rate": 0.0002, "loss": 0.3801, "step": 106 }, { "epoch": 1.1726027397260275, "grad_norm": 1.2527716159820557, "learning_rate": 0.0002, "loss": 0.3798, "step": 107 }, { "epoch": 1.1835616438356165, "grad_norm": 1.2365212440490723, "learning_rate": 0.0002, "loss": 0.2736, "step": 108 }, { "epoch": 1.1945205479452055, "grad_norm": 1.1183747053146362, "learning_rate": 0.0002, "loss": 0.4753, "step": 109 }, { "epoch": 1.2054794520547945, "grad_norm": 0.8566204905509949, "learning_rate": 0.0002, "loss": 0.2531, "step": 110 }, { "epoch": 1.2164383561643834, "grad_norm": 1.0663121938705444, "learning_rate": 0.0002, "loss": 0.1986, "step": 111 }, { "epoch": 1.2273972602739727, "grad_norm": 1.4607142210006714, "learning_rate": 0.0002, "loss": 0.3589, "step": 112 }, { "epoch": 1.2383561643835617, "grad_norm": 0.7903380990028381, "learning_rate": 0.0002, "loss": 0.1885, "step": 113 }, { "epoch": 1.2493150684931507, "grad_norm": 1.3529448509216309, "learning_rate": 0.0002, "loss": 0.2417, "step": 114 }, { "epoch": 1.2602739726027397, "grad_norm": 1.0445804595947266, "learning_rate": 0.0002, "loss": 0.2208, "step": 115 }, { "epoch": 1.2712328767123289, "grad_norm": 1.0864062309265137, "learning_rate": 0.0002, "loss": 0.2603, "step": 116 }, { "epoch": 1.2821917808219179, "grad_norm": 1.0503292083740234, "learning_rate": 0.0002, "loss": 0.1478, "step": 117 }, { "epoch": 1.2931506849315069, "grad_norm": 1.4396042823791504, "learning_rate": 0.0002, "loss": 0.2482, "step": 118 }, { "epoch": 1.3041095890410959, "grad_norm": 1.7265571355819702, "learning_rate": 0.0002, "loss": 0.3195, "step": 119 }, { "epoch": 1.3150684931506849, "grad_norm": 1.2890552282333374, "learning_rate": 0.0002, "loss": 0.1891, "step": 120 }, { "epoch": 1.3260273972602739, "grad_norm": 1.25291109085083, "learning_rate": 0.0002, "loss": 0.272, "step": 121 }, { "epoch": 1.336986301369863, "grad_norm": 1.3044368028640747, "learning_rate": 0.0002, "loss": 0.3068, "step": 122 }, { "epoch": 1.347945205479452, "grad_norm": 1.7130950689315796, "learning_rate": 0.0002, "loss": 0.5022, "step": 123 }, { "epoch": 1.358904109589041, "grad_norm": 2.3856253623962402, "learning_rate": 0.0002, "loss": 0.2692, "step": 124 }, { "epoch": 1.36986301369863, "grad_norm": 1.2418773174285889, "learning_rate": 0.0002, "loss": 0.3586, "step": 125 }, { "epoch": 1.3808219178082193, "grad_norm": 1.4788987636566162, "learning_rate": 0.0002, "loss": 0.3331, "step": 126 }, { "epoch": 1.3917808219178083, "grad_norm": 0.8837617635726929, "learning_rate": 0.0002, "loss": 0.2599, "step": 127 }, { "epoch": 1.4027397260273973, "grad_norm": 1.1440480947494507, "learning_rate": 0.0002, "loss": 0.4741, "step": 128 }, { "epoch": 1.4136986301369863, "grad_norm": 0.924139142036438, "learning_rate": 0.0002, "loss": 0.3046, "step": 129 }, { "epoch": 1.4246575342465753, "grad_norm": 1.0871144533157349, "learning_rate": 0.0002, "loss": 0.2887, "step": 130 }, { "epoch": 1.4356164383561643, "grad_norm": 0.9994255304336548, "learning_rate": 0.0002, "loss": 0.2292, "step": 131 }, { "epoch": 1.4465753424657535, "grad_norm": 1.2388752698898315, "learning_rate": 0.0002, "loss": 0.2912, "step": 132 }, { "epoch": 1.4575342465753425, "grad_norm": 1.0453673601150513, "learning_rate": 0.0002, "loss": 0.2324, "step": 133 }, { "epoch": 1.4684931506849315, "grad_norm": 1.558586597442627, "learning_rate": 0.0002, "loss": 0.3854, "step": 134 }, { "epoch": 1.4794520547945205, "grad_norm": 1.2428361177444458, "learning_rate": 0.0002, "loss": 0.2, "step": 135 }, { "epoch": 1.4904109589041097, "grad_norm": 1.2706862688064575, "learning_rate": 0.0002, "loss": 0.1539, "step": 136 }, { "epoch": 1.5013698630136987, "grad_norm": 1.4815326929092407, "learning_rate": 0.0002, "loss": 0.4198, "step": 137 }, { "epoch": 1.5123287671232877, "grad_norm": 1.3065235614776611, "learning_rate": 0.0002, "loss": 0.2996, "step": 138 }, { "epoch": 1.5232876712328767, "grad_norm": 1.1650217771530151, "learning_rate": 0.0002, "loss": 0.2343, "step": 139 }, { "epoch": 1.5342465753424657, "grad_norm": 2.339799165725708, "learning_rate": 0.0002, "loss": 0.3193, "step": 140 }, { "epoch": 1.5452054794520547, "grad_norm": 1.2828121185302734, "learning_rate": 0.0002, "loss": 0.3996, "step": 141 }, { "epoch": 1.5561643835616439, "grad_norm": 1.0856819152832031, "learning_rate": 0.0002, "loss": 0.3242, "step": 142 }, { "epoch": 1.5671232876712329, "grad_norm": 1.0250024795532227, "learning_rate": 0.0002, "loss": 0.23, "step": 143 }, { "epoch": 1.5780821917808219, "grad_norm": 0.9548241496086121, "learning_rate": 0.0002, "loss": 0.1995, "step": 144 }, { "epoch": 1.589041095890411, "grad_norm": 0.966123104095459, "learning_rate": 0.0002, "loss": 0.3443, "step": 145 }, { "epoch": 1.6, "grad_norm": 1.8860892057418823, "learning_rate": 0.0002, "loss": 0.3481, "step": 146 }, { "epoch": 1.610958904109589, "grad_norm": 1.1538076400756836, "learning_rate": 0.0002, "loss": 0.2511, "step": 147 }, { "epoch": 1.621917808219178, "grad_norm": 1.4117934703826904, "learning_rate": 0.0002, "loss": 0.3807, "step": 148 }, { "epoch": 1.632876712328767, "grad_norm": 1.4486627578735352, "learning_rate": 0.0002, "loss": 0.2264, "step": 149 }, { "epoch": 1.643835616438356, "grad_norm": 0.643312931060791, "learning_rate": 0.0002, "loss": 0.0966, "step": 150 } ], "logging_steps": 1, "max_steps": 364, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.76287234956329e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }