{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 291, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003436426116838488, "grad_norm": 0.4921875, "learning_rate": 9.965635738831616e-06, "loss": 1.7833, "step": 1 }, { "epoch": 0.006872852233676976, "grad_norm": 0.47265625, "learning_rate": 9.931271477663231e-06, "loss": 1.6757, "step": 2 }, { "epoch": 0.010309278350515464, "grad_norm": 0.48828125, "learning_rate": 9.896907216494846e-06, "loss": 1.7604, "step": 3 }, { "epoch": 0.013745704467353952, "grad_norm": 0.43359375, "learning_rate": 9.862542955326461e-06, "loss": 1.6662, "step": 4 }, { "epoch": 0.01718213058419244, "grad_norm": 0.43359375, "learning_rate": 9.828178694158076e-06, "loss": 1.7293, "step": 5 }, { "epoch": 0.020618556701030927, "grad_norm": 0.380859375, "learning_rate": 9.793814432989691e-06, "loss": 1.6688, "step": 6 }, { "epoch": 0.024054982817869417, "grad_norm": 0.34375, "learning_rate": 9.759450171821306e-06, "loss": 1.6821, "step": 7 }, { "epoch": 0.027491408934707903, "grad_norm": 0.314453125, "learning_rate": 9.725085910652921e-06, "loss": 1.6133, "step": 8 }, { "epoch": 0.030927835051546393, "grad_norm": 0.298828125, "learning_rate": 9.690721649484536e-06, "loss": 1.6739, "step": 9 }, { "epoch": 0.03436426116838488, "grad_norm": 0.26171875, "learning_rate": 9.656357388316153e-06, "loss": 1.5586, "step": 10 }, { "epoch": 0.037800687285223365, "grad_norm": 0.2578125, "learning_rate": 9.621993127147768e-06, "loss": 1.5729, "step": 11 }, { "epoch": 0.041237113402061855, "grad_norm": 0.2333984375, "learning_rate": 9.587628865979383e-06, "loss": 1.5494, "step": 12 }, { "epoch": 0.044673539518900345, "grad_norm": 0.244140625, "learning_rate": 9.553264604810998e-06, "loss": 1.5882, "step": 13 }, { "epoch": 0.048109965635738834, "grad_norm": 0.23828125, "learning_rate": 9.518900343642611e-06, "loss": 1.5447, "step": 14 }, { "epoch": 0.05154639175257732, "grad_norm": 0.232421875, "learning_rate": 9.484536082474226e-06, "loss": 1.5998, "step": 15 }, { "epoch": 0.054982817869415807, "grad_norm": 0.2431640625, "learning_rate": 9.450171821305843e-06, "loss": 1.5377, "step": 16 }, { "epoch": 0.058419243986254296, "grad_norm": 0.2314453125, "learning_rate": 9.415807560137458e-06, "loss": 1.5192, "step": 17 }, { "epoch": 0.061855670103092786, "grad_norm": 0.234375, "learning_rate": 9.381443298969073e-06, "loss": 1.5355, "step": 18 }, { "epoch": 0.06529209621993128, "grad_norm": 0.2265625, "learning_rate": 9.347079037800688e-06, "loss": 1.5234, "step": 19 }, { "epoch": 0.06872852233676977, "grad_norm": 0.220703125, "learning_rate": 9.312714776632303e-06, "loss": 1.499, "step": 20 }, { "epoch": 0.07216494845360824, "grad_norm": 0.2060546875, "learning_rate": 9.278350515463918e-06, "loss": 1.5096, "step": 21 }, { "epoch": 0.07560137457044673, "grad_norm": 0.193359375, "learning_rate": 9.243986254295533e-06, "loss": 1.5011, "step": 22 }, { "epoch": 0.07903780068728522, "grad_norm": 0.197265625, "learning_rate": 9.209621993127148e-06, "loss": 1.4814, "step": 23 }, { "epoch": 0.08247422680412371, "grad_norm": 0.18359375, "learning_rate": 9.175257731958764e-06, "loss": 1.4306, "step": 24 }, { "epoch": 0.0859106529209622, "grad_norm": 0.1826171875, "learning_rate": 9.140893470790379e-06, "loss": 1.4095, "step": 25 }, { "epoch": 0.08934707903780069, "grad_norm": 0.1767578125, "learning_rate": 9.106529209621994e-06, "loss": 1.5332, "step": 26 }, { "epoch": 0.09278350515463918, "grad_norm": 0.171875, "learning_rate": 9.072164948453609e-06, "loss": 1.3951, "step": 27 }, { "epoch": 0.09621993127147767, "grad_norm": 0.1650390625, "learning_rate": 9.037800687285224e-06, "loss": 1.4271, "step": 28 }, { "epoch": 0.09965635738831616, "grad_norm": 0.17578125, "learning_rate": 9.003436426116839e-06, "loss": 1.5286, "step": 29 }, { "epoch": 0.10309278350515463, "grad_norm": 0.1572265625, "learning_rate": 8.969072164948455e-06, "loss": 1.4261, "step": 30 }, { "epoch": 0.10652920962199312, "grad_norm": 0.1689453125, "learning_rate": 8.93470790378007e-06, "loss": 1.4047, "step": 31 }, { "epoch": 0.10996563573883161, "grad_norm": 0.1611328125, "learning_rate": 8.900343642611684e-06, "loss": 1.3345, "step": 32 }, { "epoch": 0.1134020618556701, "grad_norm": 0.177734375, "learning_rate": 8.865979381443299e-06, "loss": 1.3815, "step": 33 }, { "epoch": 0.11683848797250859, "grad_norm": 0.1708984375, "learning_rate": 8.831615120274914e-06, "loss": 1.419, "step": 34 }, { "epoch": 0.12027491408934708, "grad_norm": 0.1630859375, "learning_rate": 8.797250859106529e-06, "loss": 1.4125, "step": 35 }, { "epoch": 0.12371134020618557, "grad_norm": 0.16015625, "learning_rate": 8.762886597938146e-06, "loss": 1.3938, "step": 36 }, { "epoch": 0.12714776632302405, "grad_norm": 0.1513671875, "learning_rate": 8.72852233676976e-06, "loss": 1.3676, "step": 37 }, { "epoch": 0.13058419243986255, "grad_norm": 0.1767578125, "learning_rate": 8.694158075601376e-06, "loss": 1.355, "step": 38 }, { "epoch": 0.13402061855670103, "grad_norm": 0.1435546875, "learning_rate": 8.65979381443299e-06, "loss": 1.2992, "step": 39 }, { "epoch": 0.13745704467353953, "grad_norm": 0.1494140625, "learning_rate": 8.625429553264606e-06, "loss": 1.2822, "step": 40 }, { "epoch": 0.140893470790378, "grad_norm": 0.1416015625, "learning_rate": 8.591065292096221e-06, "loss": 1.3814, "step": 41 }, { "epoch": 0.14432989690721648, "grad_norm": 0.1455078125, "learning_rate": 8.556701030927836e-06, "loss": 1.342, "step": 42 }, { "epoch": 0.14776632302405499, "grad_norm": 0.1552734375, "learning_rate": 8.522336769759451e-06, "loss": 1.3447, "step": 43 }, { "epoch": 0.15120274914089346, "grad_norm": 0.146484375, "learning_rate": 8.487972508591066e-06, "loss": 1.3496, "step": 44 }, { "epoch": 0.15463917525773196, "grad_norm": 0.142578125, "learning_rate": 8.453608247422681e-06, "loss": 1.361, "step": 45 }, { "epoch": 0.15807560137457044, "grad_norm": 0.1708984375, "learning_rate": 8.419243986254296e-06, "loss": 1.3466, "step": 46 }, { "epoch": 0.16151202749140894, "grad_norm": 0.1572265625, "learning_rate": 8.384879725085911e-06, "loss": 1.2897, "step": 47 }, { "epoch": 0.16494845360824742, "grad_norm": 0.1318359375, "learning_rate": 8.350515463917526e-06, "loss": 1.2955, "step": 48 }, { "epoch": 0.16838487972508592, "grad_norm": 0.26953125, "learning_rate": 8.316151202749141e-06, "loss": 1.2537, "step": 49 }, { "epoch": 0.1718213058419244, "grad_norm": 0.1259765625, "learning_rate": 8.281786941580758e-06, "loss": 1.2805, "step": 50 }, { "epoch": 0.17525773195876287, "grad_norm": 0.1171875, "learning_rate": 8.247422680412371e-06, "loss": 1.2535, "step": 51 }, { "epoch": 0.17869415807560138, "grad_norm": 0.11962890625, "learning_rate": 8.213058419243986e-06, "loss": 1.3243, "step": 52 }, { "epoch": 0.18213058419243985, "grad_norm": 0.12353515625, "learning_rate": 8.178694158075601e-06, "loss": 1.262, "step": 53 }, { "epoch": 0.18556701030927836, "grad_norm": 0.1259765625, "learning_rate": 8.144329896907216e-06, "loss": 1.2669, "step": 54 }, { "epoch": 0.18900343642611683, "grad_norm": 0.123046875, "learning_rate": 8.109965635738832e-06, "loss": 1.2936, "step": 55 }, { "epoch": 0.19243986254295534, "grad_norm": 0.1357421875, "learning_rate": 8.075601374570448e-06, "loss": 1.2678, "step": 56 }, { "epoch": 0.1958762886597938, "grad_norm": 0.1171875, "learning_rate": 8.041237113402063e-06, "loss": 1.3035, "step": 57 }, { "epoch": 0.19931271477663232, "grad_norm": 0.1357421875, "learning_rate": 8.006872852233678e-06, "loss": 1.3927, "step": 58 }, { "epoch": 0.2027491408934708, "grad_norm": 0.1103515625, "learning_rate": 7.972508591065293e-06, "loss": 1.2696, "step": 59 }, { "epoch": 0.20618556701030927, "grad_norm": 0.1181640625, "learning_rate": 7.938144329896907e-06, "loss": 1.2684, "step": 60 }, { "epoch": 0.20962199312714777, "grad_norm": 0.1474609375, "learning_rate": 7.903780068728523e-06, "loss": 1.3102, "step": 61 }, { "epoch": 0.21305841924398625, "grad_norm": 0.11181640625, "learning_rate": 7.869415807560138e-06, "loss": 1.2037, "step": 62 }, { "epoch": 0.21649484536082475, "grad_norm": 0.11376953125, "learning_rate": 7.835051546391754e-06, "loss": 1.2694, "step": 63 }, { "epoch": 0.21993127147766323, "grad_norm": 0.11474609375, "learning_rate": 7.800687285223369e-06, "loss": 1.2515, "step": 64 }, { "epoch": 0.22336769759450173, "grad_norm": 0.1142578125, "learning_rate": 7.766323024054984e-06, "loss": 1.2484, "step": 65 }, { "epoch": 0.2268041237113402, "grad_norm": 0.11767578125, "learning_rate": 7.731958762886599e-06, "loss": 1.342, "step": 66 }, { "epoch": 0.23024054982817868, "grad_norm": 0.12255859375, "learning_rate": 7.697594501718214e-06, "loss": 1.2629, "step": 67 }, { "epoch": 0.23367697594501718, "grad_norm": 0.1162109375, "learning_rate": 7.663230240549829e-06, "loss": 1.2718, "step": 68 }, { "epoch": 0.23711340206185566, "grad_norm": 0.109375, "learning_rate": 7.628865979381444e-06, "loss": 1.3001, "step": 69 }, { "epoch": 0.24054982817869416, "grad_norm": 0.10986328125, "learning_rate": 7.594501718213059e-06, "loss": 1.1548, "step": 70 }, { "epoch": 0.24398625429553264, "grad_norm": 0.109375, "learning_rate": 7.560137457044674e-06, "loss": 1.1643, "step": 71 }, { "epoch": 0.24742268041237114, "grad_norm": 0.13671875, "learning_rate": 7.525773195876289e-06, "loss": 1.2197, "step": 72 }, { "epoch": 0.2508591065292096, "grad_norm": 0.111328125, "learning_rate": 7.491408934707905e-06, "loss": 1.2366, "step": 73 }, { "epoch": 0.2542955326460481, "grad_norm": 0.11767578125, "learning_rate": 7.45704467353952e-06, "loss": 1.2623, "step": 74 }, { "epoch": 0.25773195876288657, "grad_norm": 0.10693359375, "learning_rate": 7.422680412371135e-06, "loss": 1.2439, "step": 75 }, { "epoch": 0.2611683848797251, "grad_norm": 0.109375, "learning_rate": 7.38831615120275e-06, "loss": 1.2451, "step": 76 }, { "epoch": 0.2646048109965636, "grad_norm": 0.11474609375, "learning_rate": 7.353951890034365e-06, "loss": 1.1966, "step": 77 }, { "epoch": 0.26804123711340205, "grad_norm": 0.11083984375, "learning_rate": 7.319587628865979e-06, "loss": 1.1983, "step": 78 }, { "epoch": 0.27147766323024053, "grad_norm": 0.1044921875, "learning_rate": 7.285223367697595e-06, "loss": 1.2115, "step": 79 }, { "epoch": 0.27491408934707906, "grad_norm": 0.1083984375, "learning_rate": 7.25085910652921e-06, "loss": 1.1679, "step": 80 }, { "epoch": 0.27835051546391754, "grad_norm": 0.1357421875, "learning_rate": 7.216494845360825e-06, "loss": 1.2571, "step": 81 }, { "epoch": 0.281786941580756, "grad_norm": 0.119140625, "learning_rate": 7.18213058419244e-06, "loss": 1.3151, "step": 82 }, { "epoch": 0.2852233676975945, "grad_norm": 0.1357421875, "learning_rate": 7.147766323024056e-06, "loss": 1.2378, "step": 83 }, { "epoch": 0.28865979381443296, "grad_norm": 0.107421875, "learning_rate": 7.113402061855671e-06, "loss": 1.2514, "step": 84 }, { "epoch": 0.2920962199312715, "grad_norm": 0.138671875, "learning_rate": 7.079037800687286e-06, "loss": 1.2457, "step": 85 }, { "epoch": 0.29553264604810997, "grad_norm": 0.1513671875, "learning_rate": 7.044673539518901e-06, "loss": 1.2397, "step": 86 }, { "epoch": 0.29896907216494845, "grad_norm": 0.11181640625, "learning_rate": 7.010309278350515e-06, "loss": 1.2586, "step": 87 }, { "epoch": 0.3024054982817869, "grad_norm": 0.115234375, "learning_rate": 6.9759450171821304e-06, "loss": 1.2845, "step": 88 }, { "epoch": 0.30584192439862545, "grad_norm": 0.1240234375, "learning_rate": 6.941580756013746e-06, "loss": 1.2553, "step": 89 }, { "epoch": 0.30927835051546393, "grad_norm": 0.2353515625, "learning_rate": 6.907216494845361e-06, "loss": 1.145, "step": 90 }, { "epoch": 0.3127147766323024, "grad_norm": 0.1513671875, "learning_rate": 6.872852233676976e-06, "loss": 1.2295, "step": 91 }, { "epoch": 0.3161512027491409, "grad_norm": 0.1435546875, "learning_rate": 6.8384879725085914e-06, "loss": 1.1771, "step": 92 }, { "epoch": 0.31958762886597936, "grad_norm": 0.11962890625, "learning_rate": 6.804123711340207e-06, "loss": 1.1458, "step": 93 }, { "epoch": 0.3230240549828179, "grad_norm": 0.158203125, "learning_rate": 6.769759450171822e-06, "loss": 1.2342, "step": 94 }, { "epoch": 0.32646048109965636, "grad_norm": 0.10888671875, "learning_rate": 6.735395189003437e-06, "loss": 1.2193, "step": 95 }, { "epoch": 0.32989690721649484, "grad_norm": 0.10986328125, "learning_rate": 6.701030927835052e-06, "loss": 1.2225, "step": 96 }, { "epoch": 0.3333333333333333, "grad_norm": 0.10498046875, "learning_rate": 6.666666666666667e-06, "loss": 1.2053, "step": 97 }, { "epoch": 0.33676975945017185, "grad_norm": 0.119140625, "learning_rate": 6.632302405498282e-06, "loss": 1.1893, "step": 98 }, { "epoch": 0.3402061855670103, "grad_norm": 0.10986328125, "learning_rate": 6.597938144329898e-06, "loss": 1.2301, "step": 99 }, { "epoch": 0.3436426116838488, "grad_norm": 0.11328125, "learning_rate": 6.563573883161513e-06, "loss": 1.1943, "step": 100 }, { "epoch": 0.3470790378006873, "grad_norm": 0.1552734375, "learning_rate": 6.529209621993128e-06, "loss": 1.2589, "step": 101 }, { "epoch": 0.35051546391752575, "grad_norm": 0.1025390625, "learning_rate": 6.494845360824743e-06, "loss": 1.1986, "step": 102 }, { "epoch": 0.3539518900343643, "grad_norm": 0.146484375, "learning_rate": 6.460481099656359e-06, "loss": 1.2073, "step": 103 }, { "epoch": 0.35738831615120276, "grad_norm": 0.111328125, "learning_rate": 6.426116838487974e-06, "loss": 1.2892, "step": 104 }, { "epoch": 0.36082474226804123, "grad_norm": 0.1064453125, "learning_rate": 6.391752577319588e-06, "loss": 1.2071, "step": 105 }, { "epoch": 0.3642611683848797, "grad_norm": 0.0986328125, "learning_rate": 6.357388316151203e-06, "loss": 1.1472, "step": 106 }, { "epoch": 0.36769759450171824, "grad_norm": 0.1376953125, "learning_rate": 6.323024054982818e-06, "loss": 1.1407, "step": 107 }, { "epoch": 0.3711340206185567, "grad_norm": 0.1328125, "learning_rate": 6.288659793814433e-06, "loss": 1.2531, "step": 108 }, { "epoch": 0.3745704467353952, "grad_norm": 0.10107421875, "learning_rate": 6.254295532646049e-06, "loss": 1.1916, "step": 109 }, { "epoch": 0.37800687285223367, "grad_norm": 0.109375, "learning_rate": 6.219931271477664e-06, "loss": 1.1996, "step": 110 }, { "epoch": 0.38144329896907214, "grad_norm": 0.10302734375, "learning_rate": 6.185567010309279e-06, "loss": 1.158, "step": 111 }, { "epoch": 0.3848797250859107, "grad_norm": 0.11669921875, "learning_rate": 6.151202749140894e-06, "loss": 1.2114, "step": 112 }, { "epoch": 0.38831615120274915, "grad_norm": 0.09912109375, "learning_rate": 6.11683848797251e-06, "loss": 1.2039, "step": 113 }, { "epoch": 0.3917525773195876, "grad_norm": 0.11572265625, "learning_rate": 6.082474226804124e-06, "loss": 1.2078, "step": 114 }, { "epoch": 0.3951890034364261, "grad_norm": 0.10205078125, "learning_rate": 6.048109965635739e-06, "loss": 1.18, "step": 115 }, { "epoch": 0.39862542955326463, "grad_norm": 0.1171875, "learning_rate": 6.013745704467354e-06, "loss": 1.2228, "step": 116 }, { "epoch": 0.4020618556701031, "grad_norm": 0.103515625, "learning_rate": 5.979381443298969e-06, "loss": 1.1975, "step": 117 }, { "epoch": 0.4054982817869416, "grad_norm": 0.1142578125, "learning_rate": 5.945017182130585e-06, "loss": 1.14, "step": 118 }, { "epoch": 0.40893470790378006, "grad_norm": 0.0986328125, "learning_rate": 5.9106529209622e-06, "loss": 1.1489, "step": 119 }, { "epoch": 0.41237113402061853, "grad_norm": 0.10205078125, "learning_rate": 5.876288659793815e-06, "loss": 1.22, "step": 120 }, { "epoch": 0.41580756013745707, "grad_norm": 0.10205078125, "learning_rate": 5.84192439862543e-06, "loss": 1.1821, "step": 121 }, { "epoch": 0.41924398625429554, "grad_norm": 0.12255859375, "learning_rate": 5.807560137457045e-06, "loss": 1.1964, "step": 122 }, { "epoch": 0.422680412371134, "grad_norm": 0.109375, "learning_rate": 5.7731958762886594e-06, "loss": 1.1879, "step": 123 }, { "epoch": 0.4261168384879725, "grad_norm": 0.1005859375, "learning_rate": 5.738831615120275e-06, "loss": 1.1497, "step": 124 }, { "epoch": 0.42955326460481097, "grad_norm": 0.10107421875, "learning_rate": 5.70446735395189e-06, "loss": 1.0871, "step": 125 }, { "epoch": 0.4329896907216495, "grad_norm": 0.11083984375, "learning_rate": 5.670103092783505e-06, "loss": 1.2444, "step": 126 }, { "epoch": 0.436426116838488, "grad_norm": 0.099609375, "learning_rate": 5.6357388316151204e-06, "loss": 1.1902, "step": 127 }, { "epoch": 0.43986254295532645, "grad_norm": 0.11083984375, "learning_rate": 5.601374570446736e-06, "loss": 1.2449, "step": 128 }, { "epoch": 0.44329896907216493, "grad_norm": 0.111328125, "learning_rate": 5.567010309278351e-06, "loss": 1.1841, "step": 129 }, { "epoch": 0.44673539518900346, "grad_norm": 0.10888671875, "learning_rate": 5.532646048109966e-06, "loss": 1.2824, "step": 130 }, { "epoch": 0.45017182130584193, "grad_norm": 0.1015625, "learning_rate": 5.4982817869415815e-06, "loss": 1.1912, "step": 131 }, { "epoch": 0.4536082474226804, "grad_norm": 0.09521484375, "learning_rate": 5.463917525773196e-06, "loss": 1.1778, "step": 132 }, { "epoch": 0.4570446735395189, "grad_norm": 0.10400390625, "learning_rate": 5.429553264604811e-06, "loss": 1.1849, "step": 133 }, { "epoch": 0.46048109965635736, "grad_norm": 0.10009765625, "learning_rate": 5.395189003436427e-06, "loss": 1.1319, "step": 134 }, { "epoch": 0.4639175257731959, "grad_norm": 0.09912109375, "learning_rate": 5.360824742268042e-06, "loss": 1.1162, "step": 135 }, { "epoch": 0.46735395189003437, "grad_norm": 0.09912109375, "learning_rate": 5.326460481099657e-06, "loss": 1.1951, "step": 136 }, { "epoch": 0.47079037800687284, "grad_norm": 0.0986328125, "learning_rate": 5.292096219931272e-06, "loss": 1.2168, "step": 137 }, { "epoch": 0.4742268041237113, "grad_norm": 0.10986328125, "learning_rate": 5.257731958762888e-06, "loss": 1.1578, "step": 138 }, { "epoch": 0.47766323024054985, "grad_norm": 0.1103515625, "learning_rate": 5.223367697594503e-06, "loss": 1.3295, "step": 139 }, { "epoch": 0.48109965635738833, "grad_norm": 0.10595703125, "learning_rate": 5.189003436426118e-06, "loss": 1.1521, "step": 140 }, { "epoch": 0.4845360824742268, "grad_norm": 0.1044921875, "learning_rate": 5.154639175257732e-06, "loss": 1.2219, "step": 141 }, { "epoch": 0.4879725085910653, "grad_norm": 0.09912109375, "learning_rate": 5.120274914089347e-06, "loss": 1.1866, "step": 142 }, { "epoch": 0.49140893470790376, "grad_norm": 0.10009765625, "learning_rate": 5.085910652920962e-06, "loss": 1.1922, "step": 143 }, { "epoch": 0.4948453608247423, "grad_norm": 0.1005859375, "learning_rate": 5.051546391752578e-06, "loss": 1.1784, "step": 144 }, { "epoch": 0.49828178694158076, "grad_norm": 0.10595703125, "learning_rate": 5.017182130584193e-06, "loss": 1.1468, "step": 145 }, { "epoch": 0.5017182130584192, "grad_norm": 0.103515625, "learning_rate": 4.982817869415808e-06, "loss": 1.2448, "step": 146 }, { "epoch": 0.5051546391752577, "grad_norm": 0.099609375, "learning_rate": 4.948453608247423e-06, "loss": 1.1422, "step": 147 }, { "epoch": 0.5085910652920962, "grad_norm": 0.1123046875, "learning_rate": 4.914089347079038e-06, "loss": 1.2039, "step": 148 }, { "epoch": 0.5120274914089347, "grad_norm": 0.09814453125, "learning_rate": 4.879725085910653e-06, "loss": 1.2287, "step": 149 }, { "epoch": 0.5154639175257731, "grad_norm": 0.10302734375, "learning_rate": 4.845360824742268e-06, "loss": 1.17, "step": 150 }, { "epoch": 0.5189003436426117, "grad_norm": 0.1064453125, "learning_rate": 4.810996563573884e-06, "loss": 1.0933, "step": 151 }, { "epoch": 0.5223367697594502, "grad_norm": 0.1005859375, "learning_rate": 4.776632302405499e-06, "loss": 1.2443, "step": 152 }, { "epoch": 0.5257731958762887, "grad_norm": 0.1220703125, "learning_rate": 4.742268041237113e-06, "loss": 1.2075, "step": 153 }, { "epoch": 0.5292096219931272, "grad_norm": 0.10888671875, "learning_rate": 4.707903780068729e-06, "loss": 1.1981, "step": 154 }, { "epoch": 0.5326460481099656, "grad_norm": 0.10107421875, "learning_rate": 4.673539518900344e-06, "loss": 1.1483, "step": 155 }, { "epoch": 0.5360824742268041, "grad_norm": 0.10205078125, "learning_rate": 4.639175257731959e-06, "loss": 1.1753, "step": 156 }, { "epoch": 0.5395189003436426, "grad_norm": 0.099609375, "learning_rate": 4.604810996563574e-06, "loss": 1.1578, "step": 157 }, { "epoch": 0.5429553264604811, "grad_norm": 0.10498046875, "learning_rate": 4.570446735395189e-06, "loss": 1.1467, "step": 158 }, { "epoch": 0.5463917525773195, "grad_norm": 0.1044921875, "learning_rate": 4.536082474226804e-06, "loss": 1.188, "step": 159 }, { "epoch": 0.5498281786941581, "grad_norm": 0.10595703125, "learning_rate": 4.501718213058419e-06, "loss": 1.1748, "step": 160 }, { "epoch": 0.5532646048109966, "grad_norm": 0.10107421875, "learning_rate": 4.467353951890035e-06, "loss": 1.1563, "step": 161 }, { "epoch": 0.5567010309278351, "grad_norm": 0.103515625, "learning_rate": 4.4329896907216494e-06, "loss": 1.2162, "step": 162 }, { "epoch": 0.5601374570446735, "grad_norm": 0.1708984375, "learning_rate": 4.3986254295532645e-06, "loss": 1.1612, "step": 163 }, { "epoch": 0.563573883161512, "grad_norm": 0.1015625, "learning_rate": 4.36426116838488e-06, "loss": 1.1738, "step": 164 }, { "epoch": 0.5670103092783505, "grad_norm": 0.11669921875, "learning_rate": 4.329896907216495e-06, "loss": 1.2428, "step": 165 }, { "epoch": 0.570446735395189, "grad_norm": 0.1767578125, "learning_rate": 4.2955326460481105e-06, "loss": 1.2499, "step": 166 }, { "epoch": 0.5738831615120275, "grad_norm": 0.15234375, "learning_rate": 4.2611683848797255e-06, "loss": 1.1381, "step": 167 }, { "epoch": 0.5773195876288659, "grad_norm": 0.107421875, "learning_rate": 4.2268041237113405e-06, "loss": 1.1218, "step": 168 }, { "epoch": 0.5807560137457045, "grad_norm": 0.103515625, "learning_rate": 4.192439862542956e-06, "loss": 1.1917, "step": 169 }, { "epoch": 0.584192439862543, "grad_norm": 0.1201171875, "learning_rate": 4.158075601374571e-06, "loss": 1.2737, "step": 170 }, { "epoch": 0.5876288659793815, "grad_norm": 0.1376953125, "learning_rate": 4.123711340206186e-06, "loss": 1.1914, "step": 171 }, { "epoch": 0.5910652920962199, "grad_norm": 0.10546875, "learning_rate": 4.089347079037801e-06, "loss": 1.1924, "step": 172 }, { "epoch": 0.5945017182130584, "grad_norm": 0.10595703125, "learning_rate": 4.054982817869416e-06, "loss": 1.186, "step": 173 }, { "epoch": 0.5979381443298969, "grad_norm": 0.1591796875, "learning_rate": 4.020618556701032e-06, "loss": 1.2316, "step": 174 }, { "epoch": 0.6013745704467354, "grad_norm": 0.1015625, "learning_rate": 3.986254295532647e-06, "loss": 1.1319, "step": 175 }, { "epoch": 0.6048109965635738, "grad_norm": 0.11083984375, "learning_rate": 3.951890034364262e-06, "loss": 1.1959, "step": 176 }, { "epoch": 0.6082474226804123, "grad_norm": 0.1103515625, "learning_rate": 3.917525773195877e-06, "loss": 1.192, "step": 177 }, { "epoch": 0.6116838487972509, "grad_norm": 0.10498046875, "learning_rate": 3.883161512027492e-06, "loss": 1.1374, "step": 178 }, { "epoch": 0.6151202749140894, "grad_norm": 0.11865234375, "learning_rate": 3.848797250859107e-06, "loss": 1.2014, "step": 179 }, { "epoch": 0.6185567010309279, "grad_norm": 0.12890625, "learning_rate": 3.814432989690722e-06, "loss": 1.2549, "step": 180 }, { "epoch": 0.6219931271477663, "grad_norm": 0.10107421875, "learning_rate": 3.780068728522337e-06, "loss": 1.2026, "step": 181 }, { "epoch": 0.6254295532646048, "grad_norm": 0.10693359375, "learning_rate": 3.7457044673539524e-06, "loss": 1.249, "step": 182 }, { "epoch": 0.6288659793814433, "grad_norm": 0.11865234375, "learning_rate": 3.7113402061855674e-06, "loss": 1.2328, "step": 183 }, { "epoch": 0.6323024054982818, "grad_norm": 0.10205078125, "learning_rate": 3.6769759450171825e-06, "loss": 1.1544, "step": 184 }, { "epoch": 0.6357388316151202, "grad_norm": 0.099609375, "learning_rate": 3.6426116838487975e-06, "loss": 1.175, "step": 185 }, { "epoch": 0.6391752577319587, "grad_norm": 0.1552734375, "learning_rate": 3.6082474226804126e-06, "loss": 1.122, "step": 186 }, { "epoch": 0.6426116838487973, "grad_norm": 0.123046875, "learning_rate": 3.573883161512028e-06, "loss": 1.1623, "step": 187 }, { "epoch": 0.6460481099656358, "grad_norm": 0.10302734375, "learning_rate": 3.539518900343643e-06, "loss": 1.1869, "step": 188 }, { "epoch": 0.6494845360824743, "grad_norm": 0.0966796875, "learning_rate": 3.5051546391752577e-06, "loss": 1.1359, "step": 189 }, { "epoch": 0.6529209621993127, "grad_norm": 0.1103515625, "learning_rate": 3.470790378006873e-06, "loss": 1.1882, "step": 190 }, { "epoch": 0.6563573883161512, "grad_norm": 0.109375, "learning_rate": 3.436426116838488e-06, "loss": 1.245, "step": 191 }, { "epoch": 0.6597938144329897, "grad_norm": 0.09912109375, "learning_rate": 3.4020618556701037e-06, "loss": 1.1544, "step": 192 }, { "epoch": 0.6632302405498282, "grad_norm": 0.11474609375, "learning_rate": 3.3676975945017187e-06, "loss": 1.1664, "step": 193 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1123046875, "learning_rate": 3.3333333333333333e-06, "loss": 1.1409, "step": 194 }, { "epoch": 0.6701030927835051, "grad_norm": 0.1044921875, "learning_rate": 3.298969072164949e-06, "loss": 1.144, "step": 195 }, { "epoch": 0.6735395189003437, "grad_norm": 0.111328125, "learning_rate": 3.264604810996564e-06, "loss": 1.2533, "step": 196 }, { "epoch": 0.6769759450171822, "grad_norm": 0.10693359375, "learning_rate": 3.2302405498281793e-06, "loss": 1.1903, "step": 197 }, { "epoch": 0.6804123711340206, "grad_norm": 0.12109375, "learning_rate": 3.195876288659794e-06, "loss": 1.1853, "step": 198 }, { "epoch": 0.6838487972508591, "grad_norm": 0.1083984375, "learning_rate": 3.161512027491409e-06, "loss": 1.1456, "step": 199 }, { "epoch": 0.6872852233676976, "grad_norm": 0.10498046875, "learning_rate": 3.1271477663230244e-06, "loss": 1.1724, "step": 200 }, { "epoch": 0.6907216494845361, "grad_norm": 0.11181640625, "learning_rate": 3.0927835051546395e-06, "loss": 1.2242, "step": 201 }, { "epoch": 0.6941580756013745, "grad_norm": 0.1494140625, "learning_rate": 3.058419243986255e-06, "loss": 1.2096, "step": 202 }, { "epoch": 0.697594501718213, "grad_norm": 0.1171875, "learning_rate": 3.0240549828178695e-06, "loss": 1.1972, "step": 203 }, { "epoch": 0.7010309278350515, "grad_norm": 0.1240234375, "learning_rate": 2.9896907216494846e-06, "loss": 1.2032, "step": 204 }, { "epoch": 0.7044673539518901, "grad_norm": 0.11279296875, "learning_rate": 2.9553264604811e-06, "loss": 1.1783, "step": 205 }, { "epoch": 0.7079037800687286, "grad_norm": 0.10986328125, "learning_rate": 2.920962199312715e-06, "loss": 1.1432, "step": 206 }, { "epoch": 0.711340206185567, "grad_norm": 0.1083984375, "learning_rate": 2.8865979381443297e-06, "loss": 1.1197, "step": 207 }, { "epoch": 0.7147766323024055, "grad_norm": 0.1064453125, "learning_rate": 2.852233676975945e-06, "loss": 1.1736, "step": 208 }, { "epoch": 0.718213058419244, "grad_norm": 0.126953125, "learning_rate": 2.8178694158075602e-06, "loss": 1.1505, "step": 209 }, { "epoch": 0.7216494845360825, "grad_norm": 0.1123046875, "learning_rate": 2.7835051546391757e-06, "loss": 1.2264, "step": 210 }, { "epoch": 0.7250859106529209, "grad_norm": 0.12255859375, "learning_rate": 2.7491408934707907e-06, "loss": 1.1537, "step": 211 }, { "epoch": 0.7285223367697594, "grad_norm": 0.1083984375, "learning_rate": 2.7147766323024053e-06, "loss": 1.2153, "step": 212 }, { "epoch": 0.7319587628865979, "grad_norm": 0.10498046875, "learning_rate": 2.680412371134021e-06, "loss": 1.1621, "step": 213 }, { "epoch": 0.7353951890034365, "grad_norm": 0.109375, "learning_rate": 2.646048109965636e-06, "loss": 1.1657, "step": 214 }, { "epoch": 0.738831615120275, "grad_norm": 0.11376953125, "learning_rate": 2.6116838487972513e-06, "loss": 1.1345, "step": 215 }, { "epoch": 0.7422680412371134, "grad_norm": 0.11328125, "learning_rate": 2.577319587628866e-06, "loss": 1.1935, "step": 216 }, { "epoch": 0.7457044673539519, "grad_norm": 0.10302734375, "learning_rate": 2.542955326460481e-06, "loss": 1.1348, "step": 217 }, { "epoch": 0.7491408934707904, "grad_norm": 0.11083984375, "learning_rate": 2.5085910652920964e-06, "loss": 1.1279, "step": 218 }, { "epoch": 0.7525773195876289, "grad_norm": 0.115234375, "learning_rate": 2.4742268041237115e-06, "loss": 1.1244, "step": 219 }, { "epoch": 0.7560137457044673, "grad_norm": 0.109375, "learning_rate": 2.4398625429553265e-06, "loss": 1.1416, "step": 220 }, { "epoch": 0.7594501718213058, "grad_norm": 0.1396484375, "learning_rate": 2.405498281786942e-06, "loss": 1.2763, "step": 221 }, { "epoch": 0.7628865979381443, "grad_norm": 0.10693359375, "learning_rate": 2.3711340206185566e-06, "loss": 1.112, "step": 222 }, { "epoch": 0.7663230240549829, "grad_norm": 0.11279296875, "learning_rate": 2.336769759450172e-06, "loss": 1.2058, "step": 223 }, { "epoch": 0.7697594501718213, "grad_norm": 0.125, "learning_rate": 2.302405498281787e-06, "loss": 1.217, "step": 224 }, { "epoch": 0.7731958762886598, "grad_norm": 0.11962890625, "learning_rate": 2.268041237113402e-06, "loss": 1.1224, "step": 225 }, { "epoch": 0.7766323024054983, "grad_norm": 0.1181640625, "learning_rate": 2.2336769759450176e-06, "loss": 1.2149, "step": 226 }, { "epoch": 0.7800687285223368, "grad_norm": 0.10693359375, "learning_rate": 2.1993127147766322e-06, "loss": 1.1129, "step": 227 }, { "epoch": 0.7835051546391752, "grad_norm": 0.11767578125, "learning_rate": 2.1649484536082477e-06, "loss": 1.1593, "step": 228 }, { "epoch": 0.7869415807560137, "grad_norm": 0.11279296875, "learning_rate": 2.1305841924398628e-06, "loss": 1.1298, "step": 229 }, { "epoch": 0.7903780068728522, "grad_norm": 0.1220703125, "learning_rate": 2.096219931271478e-06, "loss": 1.2249, "step": 230 }, { "epoch": 0.7938144329896907, "grad_norm": 0.1162109375, "learning_rate": 2.061855670103093e-06, "loss": 1.1265, "step": 231 }, { "epoch": 0.7972508591065293, "grad_norm": 0.10791015625, "learning_rate": 2.027491408934708e-06, "loss": 1.0844, "step": 232 }, { "epoch": 0.8006872852233677, "grad_norm": 0.1083984375, "learning_rate": 1.9931271477663233e-06, "loss": 1.1828, "step": 233 }, { "epoch": 0.8041237113402062, "grad_norm": 0.1572265625, "learning_rate": 1.9587628865979384e-06, "loss": 1.1879, "step": 234 }, { "epoch": 0.8075601374570447, "grad_norm": 0.1220703125, "learning_rate": 1.9243986254295534e-06, "loss": 1.2128, "step": 235 }, { "epoch": 0.8109965635738832, "grad_norm": 0.11474609375, "learning_rate": 1.8900343642611685e-06, "loss": 1.1451, "step": 236 }, { "epoch": 0.8144329896907216, "grad_norm": 0.11279296875, "learning_rate": 1.8556701030927837e-06, "loss": 1.1279, "step": 237 }, { "epoch": 0.8178694158075601, "grad_norm": 0.11083984375, "learning_rate": 1.8213058419243988e-06, "loss": 1.1636, "step": 238 }, { "epoch": 0.8213058419243986, "grad_norm": 0.11083984375, "learning_rate": 1.786941580756014e-06, "loss": 1.1323, "step": 239 }, { "epoch": 0.8247422680412371, "grad_norm": 0.1767578125, "learning_rate": 1.7525773195876288e-06, "loss": 1.1456, "step": 240 }, { "epoch": 0.8281786941580757, "grad_norm": 0.1181640625, "learning_rate": 1.718213058419244e-06, "loss": 1.1559, "step": 241 }, { "epoch": 0.8316151202749141, "grad_norm": 0.1591796875, "learning_rate": 1.6838487972508594e-06, "loss": 1.1903, "step": 242 }, { "epoch": 0.8350515463917526, "grad_norm": 0.1376953125, "learning_rate": 1.6494845360824744e-06, "loss": 1.1933, "step": 243 }, { "epoch": 0.8384879725085911, "grad_norm": 0.1328125, "learning_rate": 1.6151202749140896e-06, "loss": 1.2029, "step": 244 }, { "epoch": 0.8419243986254296, "grad_norm": 0.1201171875, "learning_rate": 1.5807560137457045e-06, "loss": 1.1551, "step": 245 }, { "epoch": 0.845360824742268, "grad_norm": 0.11279296875, "learning_rate": 1.5463917525773197e-06, "loss": 1.1213, "step": 246 }, { "epoch": 0.8487972508591065, "grad_norm": 0.11572265625, "learning_rate": 1.5120274914089348e-06, "loss": 1.1897, "step": 247 }, { "epoch": 0.852233676975945, "grad_norm": 0.1142578125, "learning_rate": 1.47766323024055e-06, "loss": 1.2245, "step": 248 }, { "epoch": 0.8556701030927835, "grad_norm": 0.1162109375, "learning_rate": 1.4432989690721649e-06, "loss": 1.1553, "step": 249 }, { "epoch": 0.8591065292096219, "grad_norm": 0.134765625, "learning_rate": 1.4089347079037801e-06, "loss": 1.12, "step": 250 }, { "epoch": 0.8625429553264605, "grad_norm": 0.10986328125, "learning_rate": 1.3745704467353954e-06, "loss": 1.0931, "step": 251 }, { "epoch": 0.865979381443299, "grad_norm": 0.208984375, "learning_rate": 1.3402061855670104e-06, "loss": 1.1994, "step": 252 }, { "epoch": 0.8694158075601375, "grad_norm": 0.10888671875, "learning_rate": 1.3058419243986257e-06, "loss": 1.1478, "step": 253 }, { "epoch": 0.872852233676976, "grad_norm": 0.1181640625, "learning_rate": 1.2714776632302405e-06, "loss": 1.1798, "step": 254 }, { "epoch": 0.8762886597938144, "grad_norm": 0.1318359375, "learning_rate": 1.2371134020618557e-06, "loss": 1.1266, "step": 255 }, { "epoch": 0.8797250859106529, "grad_norm": 0.11962890625, "learning_rate": 1.202749140893471e-06, "loss": 1.1522, "step": 256 }, { "epoch": 0.8831615120274914, "grad_norm": 0.11083984375, "learning_rate": 1.168384879725086e-06, "loss": 1.1133, "step": 257 }, { "epoch": 0.8865979381443299, "grad_norm": 0.11572265625, "learning_rate": 1.134020618556701e-06, "loss": 1.2044, "step": 258 }, { "epoch": 0.8900343642611683, "grad_norm": 0.1279296875, "learning_rate": 1.0996563573883161e-06, "loss": 1.2362, "step": 259 }, { "epoch": 0.8934707903780069, "grad_norm": 0.1728515625, "learning_rate": 1.0652920962199314e-06, "loss": 1.1711, "step": 260 }, { "epoch": 0.8969072164948454, "grad_norm": 0.125, "learning_rate": 1.0309278350515464e-06, "loss": 1.1429, "step": 261 }, { "epoch": 0.9003436426116839, "grad_norm": 0.14453125, "learning_rate": 9.965635738831617e-07, "loss": 1.2533, "step": 262 }, { "epoch": 0.9037800687285223, "grad_norm": 0.126953125, "learning_rate": 9.621993127147767e-07, "loss": 1.1734, "step": 263 }, { "epoch": 0.9072164948453608, "grad_norm": 0.11572265625, "learning_rate": 9.278350515463919e-07, "loss": 1.1519, "step": 264 }, { "epoch": 0.9106529209621993, "grad_norm": 0.11865234375, "learning_rate": 8.93470790378007e-07, "loss": 1.1965, "step": 265 }, { "epoch": 0.9140893470790378, "grad_norm": 0.123046875, "learning_rate": 8.59106529209622e-07, "loss": 1.1872, "step": 266 }, { "epoch": 0.9175257731958762, "grad_norm": 0.12890625, "learning_rate": 8.247422680412372e-07, "loss": 1.28, "step": 267 }, { "epoch": 0.9209621993127147, "grad_norm": 0.1416015625, "learning_rate": 7.903780068728522e-07, "loss": 1.0979, "step": 268 }, { "epoch": 0.9243986254295533, "grad_norm": 0.1484375, "learning_rate": 7.560137457044674e-07, "loss": 1.1781, "step": 269 }, { "epoch": 0.9278350515463918, "grad_norm": 0.115234375, "learning_rate": 7.216494845360824e-07, "loss": 1.1292, "step": 270 }, { "epoch": 0.9312714776632303, "grad_norm": 0.11376953125, "learning_rate": 6.872852233676977e-07, "loss": 1.1431, "step": 271 }, { "epoch": 0.9347079037800687, "grad_norm": 0.11669921875, "learning_rate": 6.529209621993128e-07, "loss": 1.1752, "step": 272 }, { "epoch": 0.9381443298969072, "grad_norm": 0.11376953125, "learning_rate": 6.185567010309279e-07, "loss": 1.167, "step": 273 }, { "epoch": 0.9415807560137457, "grad_norm": 0.11328125, "learning_rate": 5.84192439862543e-07, "loss": 1.1292, "step": 274 }, { "epoch": 0.9450171821305842, "grad_norm": 0.125, "learning_rate": 5.498281786941581e-07, "loss": 1.2543, "step": 275 }, { "epoch": 0.9484536082474226, "grad_norm": 0.12890625, "learning_rate": 5.154639175257732e-07, "loss": 1.2113, "step": 276 }, { "epoch": 0.9518900343642611, "grad_norm": 0.119140625, "learning_rate": 4.810996563573884e-07, "loss": 1.1205, "step": 277 }, { "epoch": 0.9553264604810997, "grad_norm": 0.12109375, "learning_rate": 4.467353951890035e-07, "loss": 1.1825, "step": 278 }, { "epoch": 0.9587628865979382, "grad_norm": 0.39453125, "learning_rate": 4.123711340206186e-07, "loss": 1.1801, "step": 279 }, { "epoch": 0.9621993127147767, "grad_norm": 0.16015625, "learning_rate": 3.780068728522337e-07, "loss": 1.1751, "step": 280 }, { "epoch": 0.9656357388316151, "grad_norm": 0.1259765625, "learning_rate": 3.4364261168384884e-07, "loss": 1.1759, "step": 281 }, { "epoch": 0.9690721649484536, "grad_norm": 0.1298828125, "learning_rate": 3.0927835051546394e-07, "loss": 1.1685, "step": 282 }, { "epoch": 0.9725085910652921, "grad_norm": 0.1796875, "learning_rate": 2.7491408934707903e-07, "loss": 1.1825, "step": 283 }, { "epoch": 0.9759450171821306, "grad_norm": 0.126953125, "learning_rate": 2.405498281786942e-07, "loss": 1.1749, "step": 284 }, { "epoch": 0.979381443298969, "grad_norm": 0.1337890625, "learning_rate": 2.061855670103093e-07, "loss": 1.1566, "step": 285 }, { "epoch": 0.9828178694158075, "grad_norm": 0.1201171875, "learning_rate": 1.7182130584192442e-07, "loss": 1.1753, "step": 286 }, { "epoch": 0.9862542955326461, "grad_norm": 0.185546875, "learning_rate": 1.3745704467353952e-07, "loss": 1.149, "step": 287 }, { "epoch": 0.9896907216494846, "grad_norm": 0.1171875, "learning_rate": 1.0309278350515465e-07, "loss": 1.114, "step": 288 }, { "epoch": 0.993127147766323, "grad_norm": 0.130859375, "learning_rate": 6.872852233676976e-08, "loss": 1.1794, "step": 289 }, { "epoch": 0.9965635738831615, "grad_norm": 0.14453125, "learning_rate": 3.436426116838488e-08, "loss": 1.2422, "step": 290 }, { "epoch": 1.0, "grad_norm": 0.125, "learning_rate": 0.0, "loss": 1.257, "step": 291 } ], "logging_steps": 1.0, "max_steps": 291, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.177843676804547e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }