{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 0,
  "global_step": 291,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003436426116838488,
      "grad_norm": 0.4921875,
      "learning_rate": 9.965635738831616e-06,
      "loss": 1.7833,
      "step": 1
    },
    {
      "epoch": 0.006872852233676976,
      "grad_norm": 0.47265625,
      "learning_rate": 9.931271477663231e-06,
      "loss": 1.6757,
      "step": 2
    },
    {
      "epoch": 0.010309278350515464,
      "grad_norm": 0.48828125,
      "learning_rate": 9.896907216494846e-06,
      "loss": 1.7604,
      "step": 3
    },
    {
      "epoch": 0.013745704467353952,
      "grad_norm": 0.43359375,
      "learning_rate": 9.862542955326461e-06,
      "loss": 1.6662,
      "step": 4
    },
    {
      "epoch": 0.01718213058419244,
      "grad_norm": 0.43359375,
      "learning_rate": 9.828178694158076e-06,
      "loss": 1.7293,
      "step": 5
    },
    {
      "epoch": 0.020618556701030927,
      "grad_norm": 0.380859375,
      "learning_rate": 9.793814432989691e-06,
      "loss": 1.6688,
      "step": 6
    },
    {
      "epoch": 0.024054982817869417,
      "grad_norm": 0.34375,
      "learning_rate": 9.759450171821306e-06,
      "loss": 1.6821,
      "step": 7
    },
    {
      "epoch": 0.027491408934707903,
      "grad_norm": 0.314453125,
      "learning_rate": 9.725085910652921e-06,
      "loss": 1.6133,
      "step": 8
    },
    {
      "epoch": 0.030927835051546393,
      "grad_norm": 0.298828125,
      "learning_rate": 9.690721649484536e-06,
      "loss": 1.6739,
      "step": 9
    },
    {
      "epoch": 0.03436426116838488,
      "grad_norm": 0.26171875,
      "learning_rate": 9.656357388316153e-06,
      "loss": 1.5586,
      "step": 10
    },
    {
      "epoch": 0.037800687285223365,
      "grad_norm": 0.2578125,
      "learning_rate": 9.621993127147768e-06,
      "loss": 1.5729,
      "step": 11
    },
    {
      "epoch": 0.041237113402061855,
      "grad_norm": 0.2333984375,
      "learning_rate": 9.587628865979383e-06,
      "loss": 1.5494,
      "step": 12
    },
    {
      "epoch": 0.044673539518900345,
      "grad_norm": 0.244140625,
      "learning_rate": 9.553264604810998e-06,
      "loss": 1.5882,
      "step": 13
    },
    {
      "epoch": 0.048109965635738834,
      "grad_norm": 0.23828125,
      "learning_rate": 9.518900343642611e-06,
      "loss": 1.5447,
      "step": 14
    },
    {
      "epoch": 0.05154639175257732,
      "grad_norm": 0.232421875,
      "learning_rate": 9.484536082474226e-06,
      "loss": 1.5998,
      "step": 15
    },
    {
      "epoch": 0.054982817869415807,
      "grad_norm": 0.2431640625,
      "learning_rate": 9.450171821305843e-06,
      "loss": 1.5377,
      "step": 16
    },
    {
      "epoch": 0.058419243986254296,
      "grad_norm": 0.2314453125,
      "learning_rate": 9.415807560137458e-06,
      "loss": 1.5192,
      "step": 17
    },
    {
      "epoch": 0.061855670103092786,
      "grad_norm": 0.234375,
      "learning_rate": 9.381443298969073e-06,
      "loss": 1.5355,
      "step": 18
    },
    {
      "epoch": 0.06529209621993128,
      "grad_norm": 0.2265625,
      "learning_rate": 9.347079037800688e-06,
      "loss": 1.5234,
      "step": 19
    },
    {
      "epoch": 0.06872852233676977,
      "grad_norm": 0.220703125,
      "learning_rate": 9.312714776632303e-06,
      "loss": 1.499,
      "step": 20
    },
    {
      "epoch": 0.07216494845360824,
      "grad_norm": 0.2060546875,
      "learning_rate": 9.278350515463918e-06,
      "loss": 1.5096,
      "step": 21
    },
    {
      "epoch": 0.07560137457044673,
      "grad_norm": 0.193359375,
      "learning_rate": 9.243986254295533e-06,
      "loss": 1.5011,
      "step": 22
    },
    {
      "epoch": 0.07903780068728522,
      "grad_norm": 0.197265625,
      "learning_rate": 9.209621993127148e-06,
      "loss": 1.4814,
      "step": 23
    },
    {
      "epoch": 0.08247422680412371,
      "grad_norm": 0.18359375,
      "learning_rate": 9.175257731958764e-06,
      "loss": 1.4306,
      "step": 24
    },
    {
      "epoch": 0.0859106529209622,
      "grad_norm": 0.1826171875,
      "learning_rate": 9.140893470790379e-06,
      "loss": 1.4095,
      "step": 25
    },
    {
      "epoch": 0.08934707903780069,
      "grad_norm": 0.1767578125,
      "learning_rate": 9.106529209621994e-06,
      "loss": 1.5332,
      "step": 26
    },
    {
      "epoch": 0.09278350515463918,
      "grad_norm": 0.171875,
      "learning_rate": 9.072164948453609e-06,
      "loss": 1.3951,
      "step": 27
    },
    {
      "epoch": 0.09621993127147767,
      "grad_norm": 0.1650390625,
      "learning_rate": 9.037800687285224e-06,
      "loss": 1.4271,
      "step": 28
    },
    {
      "epoch": 0.09965635738831616,
      "grad_norm": 0.17578125,
      "learning_rate": 9.003436426116839e-06,
      "loss": 1.5286,
      "step": 29
    },
    {
      "epoch": 0.10309278350515463,
      "grad_norm": 0.1572265625,
      "learning_rate": 8.969072164948455e-06,
      "loss": 1.4261,
      "step": 30
    },
    {
      "epoch": 0.10652920962199312,
      "grad_norm": 0.1689453125,
      "learning_rate": 8.93470790378007e-06,
      "loss": 1.4047,
      "step": 31
    },
    {
      "epoch": 0.10996563573883161,
      "grad_norm": 0.1611328125,
      "learning_rate": 8.900343642611684e-06,
      "loss": 1.3345,
      "step": 32
    },
    {
      "epoch": 0.1134020618556701,
      "grad_norm": 0.177734375,
      "learning_rate": 8.865979381443299e-06,
      "loss": 1.3815,
      "step": 33
    },
    {
      "epoch": 0.11683848797250859,
      "grad_norm": 0.1708984375,
      "learning_rate": 8.831615120274914e-06,
      "loss": 1.419,
      "step": 34
    },
    {
      "epoch": 0.12027491408934708,
      "grad_norm": 0.1630859375,
      "learning_rate": 8.797250859106529e-06,
      "loss": 1.4125,
      "step": 35
    },
    {
      "epoch": 0.12371134020618557,
      "grad_norm": 0.16015625,
      "learning_rate": 8.762886597938146e-06,
      "loss": 1.3938,
      "step": 36
    },
    {
      "epoch": 0.12714776632302405,
      "grad_norm": 0.1513671875,
      "learning_rate": 8.72852233676976e-06,
      "loss": 1.3676,
      "step": 37
    },
    {
      "epoch": 0.13058419243986255,
      "grad_norm": 0.1767578125,
      "learning_rate": 8.694158075601376e-06,
      "loss": 1.355,
      "step": 38
    },
    {
      "epoch": 0.13402061855670103,
      "grad_norm": 0.1435546875,
      "learning_rate": 8.65979381443299e-06,
      "loss": 1.2992,
      "step": 39
    },
    {
      "epoch": 0.13745704467353953,
      "grad_norm": 0.1494140625,
      "learning_rate": 8.625429553264606e-06,
      "loss": 1.2822,
      "step": 40
    },
    {
      "epoch": 0.140893470790378,
      "grad_norm": 0.1416015625,
      "learning_rate": 8.591065292096221e-06,
      "loss": 1.3814,
      "step": 41
    },
    {
      "epoch": 0.14432989690721648,
      "grad_norm": 0.1455078125,
      "learning_rate": 8.556701030927836e-06,
      "loss": 1.342,
      "step": 42
    },
    {
      "epoch": 0.14776632302405499,
      "grad_norm": 0.1552734375,
      "learning_rate": 8.522336769759451e-06,
      "loss": 1.3447,
      "step": 43
    },
    {
      "epoch": 0.15120274914089346,
      "grad_norm": 0.146484375,
      "learning_rate": 8.487972508591066e-06,
      "loss": 1.3496,
      "step": 44
    },
    {
      "epoch": 0.15463917525773196,
      "grad_norm": 0.142578125,
      "learning_rate": 8.453608247422681e-06,
      "loss": 1.361,
      "step": 45
    },
    {
      "epoch": 0.15807560137457044,
      "grad_norm": 0.1708984375,
      "learning_rate": 8.419243986254296e-06,
      "loss": 1.3466,
      "step": 46
    },
    {
      "epoch": 0.16151202749140894,
      "grad_norm": 0.1572265625,
      "learning_rate": 8.384879725085911e-06,
      "loss": 1.2897,
      "step": 47
    },
    {
      "epoch": 0.16494845360824742,
      "grad_norm": 0.1318359375,
      "learning_rate": 8.350515463917526e-06,
      "loss": 1.2955,
      "step": 48
    },
    {
      "epoch": 0.16838487972508592,
      "grad_norm": 0.26953125,
      "learning_rate": 8.316151202749141e-06,
      "loss": 1.2537,
      "step": 49
    },
    {
      "epoch": 0.1718213058419244,
      "grad_norm": 0.1259765625,
      "learning_rate": 8.281786941580758e-06,
      "loss": 1.2805,
      "step": 50
    },
    {
      "epoch": 0.17525773195876287,
      "grad_norm": 0.1171875,
      "learning_rate": 8.247422680412371e-06,
      "loss": 1.2535,
      "step": 51
    },
    {
      "epoch": 0.17869415807560138,
      "grad_norm": 0.11962890625,
      "learning_rate": 8.213058419243986e-06,
      "loss": 1.3243,
      "step": 52
    },
    {
      "epoch": 0.18213058419243985,
      "grad_norm": 0.12353515625,
      "learning_rate": 8.178694158075601e-06,
      "loss": 1.262,
      "step": 53
    },
    {
      "epoch": 0.18556701030927836,
      "grad_norm": 0.1259765625,
      "learning_rate": 8.144329896907216e-06,
      "loss": 1.2669,
      "step": 54
    },
    {
      "epoch": 0.18900343642611683,
      "grad_norm": 0.123046875,
      "learning_rate": 8.109965635738832e-06,
      "loss": 1.2936,
      "step": 55
    },
    {
      "epoch": 0.19243986254295534,
      "grad_norm": 0.1357421875,
      "learning_rate": 8.075601374570448e-06,
      "loss": 1.2678,
      "step": 56
    },
    {
      "epoch": 0.1958762886597938,
      "grad_norm": 0.1171875,
      "learning_rate": 8.041237113402063e-06,
      "loss": 1.3035,
      "step": 57
    },
    {
      "epoch": 0.19931271477663232,
      "grad_norm": 0.1357421875,
      "learning_rate": 8.006872852233678e-06,
      "loss": 1.3927,
      "step": 58
    },
    {
      "epoch": 0.2027491408934708,
      "grad_norm": 0.1103515625,
      "learning_rate": 7.972508591065293e-06,
      "loss": 1.2696,
      "step": 59
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 0.1181640625,
      "learning_rate": 7.938144329896907e-06,
      "loss": 1.2684,
      "step": 60
    },
    {
      "epoch": 0.20962199312714777,
      "grad_norm": 0.1474609375,
      "learning_rate": 7.903780068728523e-06,
      "loss": 1.3102,
      "step": 61
    },
    {
      "epoch": 0.21305841924398625,
      "grad_norm": 0.11181640625,
      "learning_rate": 7.869415807560138e-06,
      "loss": 1.2037,
      "step": 62
    },
    {
      "epoch": 0.21649484536082475,
      "grad_norm": 0.11376953125,
      "learning_rate": 7.835051546391754e-06,
      "loss": 1.2694,
      "step": 63
    },
    {
      "epoch": 0.21993127147766323,
      "grad_norm": 0.11474609375,
      "learning_rate": 7.800687285223369e-06,
      "loss": 1.2515,
      "step": 64
    },
    {
      "epoch": 0.22336769759450173,
      "grad_norm": 0.1142578125,
      "learning_rate": 7.766323024054984e-06,
      "loss": 1.2484,
      "step": 65
    },
    {
      "epoch": 0.2268041237113402,
      "grad_norm": 0.11767578125,
      "learning_rate": 7.731958762886599e-06,
      "loss": 1.342,
      "step": 66
    },
    {
      "epoch": 0.23024054982817868,
      "grad_norm": 0.12255859375,
      "learning_rate": 7.697594501718214e-06,
      "loss": 1.2629,
      "step": 67
    },
    {
      "epoch": 0.23367697594501718,
      "grad_norm": 0.1162109375,
      "learning_rate": 7.663230240549829e-06,
      "loss": 1.2718,
      "step": 68
    },
    {
      "epoch": 0.23711340206185566,
      "grad_norm": 0.109375,
      "learning_rate": 7.628865979381444e-06,
      "loss": 1.3001,
      "step": 69
    },
    {
      "epoch": 0.24054982817869416,
      "grad_norm": 0.10986328125,
      "learning_rate": 7.594501718213059e-06,
      "loss": 1.1548,
      "step": 70
    },
    {
      "epoch": 0.24398625429553264,
      "grad_norm": 0.109375,
      "learning_rate": 7.560137457044674e-06,
      "loss": 1.1643,
      "step": 71
    },
    {
      "epoch": 0.24742268041237114,
      "grad_norm": 0.13671875,
      "learning_rate": 7.525773195876289e-06,
      "loss": 1.2197,
      "step": 72
    },
    {
      "epoch": 0.2508591065292096,
      "grad_norm": 0.111328125,
      "learning_rate": 7.491408934707905e-06,
      "loss": 1.2366,
      "step": 73
    },
    {
      "epoch": 0.2542955326460481,
      "grad_norm": 0.11767578125,
      "learning_rate": 7.45704467353952e-06,
      "loss": 1.2623,
      "step": 74
    },
    {
      "epoch": 0.25773195876288657,
      "grad_norm": 0.10693359375,
      "learning_rate": 7.422680412371135e-06,
      "loss": 1.2439,
      "step": 75
    },
    {
      "epoch": 0.2611683848797251,
      "grad_norm": 0.109375,
      "learning_rate": 7.38831615120275e-06,
      "loss": 1.2451,
      "step": 76
    },
    {
      "epoch": 0.2646048109965636,
      "grad_norm": 0.11474609375,
      "learning_rate": 7.353951890034365e-06,
      "loss": 1.1966,
      "step": 77
    },
    {
      "epoch": 0.26804123711340205,
      "grad_norm": 0.11083984375,
      "learning_rate": 7.319587628865979e-06,
      "loss": 1.1983,
      "step": 78
    },
    {
      "epoch": 0.27147766323024053,
      "grad_norm": 0.1044921875,
      "learning_rate": 7.285223367697595e-06,
      "loss": 1.2115,
      "step": 79
    },
    {
      "epoch": 0.27491408934707906,
      "grad_norm": 0.1083984375,
      "learning_rate": 7.25085910652921e-06,
      "loss": 1.1679,
      "step": 80
    },
    {
      "epoch": 0.27835051546391754,
      "grad_norm": 0.1357421875,
      "learning_rate": 7.216494845360825e-06,
      "loss": 1.2571,
      "step": 81
    },
    {
      "epoch": 0.281786941580756,
      "grad_norm": 0.119140625,
      "learning_rate": 7.18213058419244e-06,
      "loss": 1.3151,
      "step": 82
    },
    {
      "epoch": 0.2852233676975945,
      "grad_norm": 0.1357421875,
      "learning_rate": 7.147766323024056e-06,
      "loss": 1.2378,
      "step": 83
    },
    {
      "epoch": 0.28865979381443296,
      "grad_norm": 0.107421875,
      "learning_rate": 7.113402061855671e-06,
      "loss": 1.2514,
      "step": 84
    },
    {
      "epoch": 0.2920962199312715,
      "grad_norm": 0.138671875,
      "learning_rate": 7.079037800687286e-06,
      "loss": 1.2457,
      "step": 85
    },
    {
      "epoch": 0.29553264604810997,
      "grad_norm": 0.1513671875,
      "learning_rate": 7.044673539518901e-06,
      "loss": 1.2397,
      "step": 86
    },
    {
      "epoch": 0.29896907216494845,
      "grad_norm": 0.11181640625,
      "learning_rate": 7.010309278350515e-06,
      "loss": 1.2586,
      "step": 87
    },
    {
      "epoch": 0.3024054982817869,
      "grad_norm": 0.115234375,
      "learning_rate": 6.9759450171821304e-06,
      "loss": 1.2845,
      "step": 88
    },
    {
      "epoch": 0.30584192439862545,
      "grad_norm": 0.1240234375,
      "learning_rate": 6.941580756013746e-06,
      "loss": 1.2553,
      "step": 89
    },
    {
      "epoch": 0.30927835051546393,
      "grad_norm": 0.2353515625,
      "learning_rate": 6.907216494845361e-06,
      "loss": 1.145,
      "step": 90
    },
    {
      "epoch": 0.3127147766323024,
      "grad_norm": 0.1513671875,
      "learning_rate": 6.872852233676976e-06,
      "loss": 1.2295,
      "step": 91
    },
    {
      "epoch": 0.3161512027491409,
      "grad_norm": 0.1435546875,
      "learning_rate": 6.8384879725085914e-06,
      "loss": 1.1771,
      "step": 92
    },
    {
      "epoch": 0.31958762886597936,
      "grad_norm": 0.11962890625,
      "learning_rate": 6.804123711340207e-06,
      "loss": 1.1458,
      "step": 93
    },
    {
      "epoch": 0.3230240549828179,
      "grad_norm": 0.158203125,
      "learning_rate": 6.769759450171822e-06,
      "loss": 1.2342,
      "step": 94
    },
    {
      "epoch": 0.32646048109965636,
      "grad_norm": 0.10888671875,
      "learning_rate": 6.735395189003437e-06,
      "loss": 1.2193,
      "step": 95
    },
    {
      "epoch": 0.32989690721649484,
      "grad_norm": 0.10986328125,
      "learning_rate": 6.701030927835052e-06,
      "loss": 1.2225,
      "step": 96
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.10498046875,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.2053,
      "step": 97
    },
    {
      "epoch": 0.33676975945017185,
      "grad_norm": 0.119140625,
      "learning_rate": 6.632302405498282e-06,
      "loss": 1.1893,
      "step": 98
    },
    {
      "epoch": 0.3402061855670103,
      "grad_norm": 0.10986328125,
      "learning_rate": 6.597938144329898e-06,
      "loss": 1.2301,
      "step": 99
    },
    {
      "epoch": 0.3436426116838488,
      "grad_norm": 0.11328125,
      "learning_rate": 6.563573883161513e-06,
      "loss": 1.1943,
      "step": 100
    },
    {
      "epoch": 0.3470790378006873,
      "grad_norm": 0.1552734375,
      "learning_rate": 6.529209621993128e-06,
      "loss": 1.2589,
      "step": 101
    },
    {
      "epoch": 0.35051546391752575,
      "grad_norm": 0.1025390625,
      "learning_rate": 6.494845360824743e-06,
      "loss": 1.1986,
      "step": 102
    },
    {
      "epoch": 0.3539518900343643,
      "grad_norm": 0.146484375,
      "learning_rate": 6.460481099656359e-06,
      "loss": 1.2073,
      "step": 103
    },
    {
      "epoch": 0.35738831615120276,
      "grad_norm": 0.111328125,
      "learning_rate": 6.426116838487974e-06,
      "loss": 1.2892,
      "step": 104
    },
    {
      "epoch": 0.36082474226804123,
      "grad_norm": 0.1064453125,
      "learning_rate": 6.391752577319588e-06,
      "loss": 1.2071,
      "step": 105
    },
    {
      "epoch": 0.3642611683848797,
      "grad_norm": 0.0986328125,
      "learning_rate": 6.357388316151203e-06,
      "loss": 1.1472,
      "step": 106
    },
    {
      "epoch": 0.36769759450171824,
      "grad_norm": 0.1376953125,
      "learning_rate": 6.323024054982818e-06,
      "loss": 1.1407,
      "step": 107
    },
    {
      "epoch": 0.3711340206185567,
      "grad_norm": 0.1328125,
      "learning_rate": 6.288659793814433e-06,
      "loss": 1.2531,
      "step": 108
    },
    {
      "epoch": 0.3745704467353952,
      "grad_norm": 0.10107421875,
      "learning_rate": 6.254295532646049e-06,
      "loss": 1.1916,
      "step": 109
    },
    {
      "epoch": 0.37800687285223367,
      "grad_norm": 0.109375,
      "learning_rate": 6.219931271477664e-06,
      "loss": 1.1996,
      "step": 110
    },
    {
      "epoch": 0.38144329896907214,
      "grad_norm": 0.10302734375,
      "learning_rate": 6.185567010309279e-06,
      "loss": 1.158,
      "step": 111
    },
    {
      "epoch": 0.3848797250859107,
      "grad_norm": 0.11669921875,
      "learning_rate": 6.151202749140894e-06,
      "loss": 1.2114,
      "step": 112
    },
    {
      "epoch": 0.38831615120274915,
      "grad_norm": 0.09912109375,
      "learning_rate": 6.11683848797251e-06,
      "loss": 1.2039,
      "step": 113
    },
    {
      "epoch": 0.3917525773195876,
      "grad_norm": 0.11572265625,
      "learning_rate": 6.082474226804124e-06,
      "loss": 1.2078,
      "step": 114
    },
    {
      "epoch": 0.3951890034364261,
      "grad_norm": 0.10205078125,
      "learning_rate": 6.048109965635739e-06,
      "loss": 1.18,
      "step": 115
    },
    {
      "epoch": 0.39862542955326463,
      "grad_norm": 0.1171875,
      "learning_rate": 6.013745704467354e-06,
      "loss": 1.2228,
      "step": 116
    },
    {
      "epoch": 0.4020618556701031,
      "grad_norm": 0.103515625,
      "learning_rate": 5.979381443298969e-06,
      "loss": 1.1975,
      "step": 117
    },
    {
      "epoch": 0.4054982817869416,
      "grad_norm": 0.1142578125,
      "learning_rate": 5.945017182130585e-06,
      "loss": 1.14,
      "step": 118
    },
    {
      "epoch": 0.40893470790378006,
      "grad_norm": 0.0986328125,
      "learning_rate": 5.9106529209622e-06,
      "loss": 1.1489,
      "step": 119
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 0.10205078125,
      "learning_rate": 5.876288659793815e-06,
      "loss": 1.22,
      "step": 120
    },
    {
      "epoch": 0.41580756013745707,
      "grad_norm": 0.10205078125,
      "learning_rate": 5.84192439862543e-06,
      "loss": 1.1821,
      "step": 121
    },
    {
      "epoch": 0.41924398625429554,
      "grad_norm": 0.12255859375,
      "learning_rate": 5.807560137457045e-06,
      "loss": 1.1964,
      "step": 122
    },
    {
      "epoch": 0.422680412371134,
      "grad_norm": 0.109375,
      "learning_rate": 5.7731958762886594e-06,
      "loss": 1.1879,
      "step": 123
    },
    {
      "epoch": 0.4261168384879725,
      "grad_norm": 0.1005859375,
      "learning_rate": 5.738831615120275e-06,
      "loss": 1.1497,
      "step": 124
    },
    {
      "epoch": 0.42955326460481097,
      "grad_norm": 0.10107421875,
      "learning_rate": 5.70446735395189e-06,
      "loss": 1.0871,
      "step": 125
    },
    {
      "epoch": 0.4329896907216495,
      "grad_norm": 0.11083984375,
      "learning_rate": 5.670103092783505e-06,
      "loss": 1.2444,
      "step": 126
    },
    {
      "epoch": 0.436426116838488,
      "grad_norm": 0.099609375,
      "learning_rate": 5.6357388316151204e-06,
      "loss": 1.1902,
      "step": 127
    },
    {
      "epoch": 0.43986254295532645,
      "grad_norm": 0.11083984375,
      "learning_rate": 5.601374570446736e-06,
      "loss": 1.2449,
      "step": 128
    },
    {
      "epoch": 0.44329896907216493,
      "grad_norm": 0.111328125,
      "learning_rate": 5.567010309278351e-06,
      "loss": 1.1841,
      "step": 129
    },
    {
      "epoch": 0.44673539518900346,
      "grad_norm": 0.10888671875,
      "learning_rate": 5.532646048109966e-06,
      "loss": 1.2824,
      "step": 130
    },
    {
      "epoch": 0.45017182130584193,
      "grad_norm": 0.1015625,
      "learning_rate": 5.4982817869415815e-06,
      "loss": 1.1912,
      "step": 131
    },
    {
      "epoch": 0.4536082474226804,
      "grad_norm": 0.09521484375,
      "learning_rate": 5.463917525773196e-06,
      "loss": 1.1778,
      "step": 132
    },
    {
      "epoch": 0.4570446735395189,
      "grad_norm": 0.10400390625,
      "learning_rate": 5.429553264604811e-06,
      "loss": 1.1849,
      "step": 133
    },
    {
      "epoch": 0.46048109965635736,
      "grad_norm": 0.10009765625,
      "learning_rate": 5.395189003436427e-06,
      "loss": 1.1319,
      "step": 134
    },
    {
      "epoch": 0.4639175257731959,
      "grad_norm": 0.09912109375,
      "learning_rate": 5.360824742268042e-06,
      "loss": 1.1162,
      "step": 135
    },
    {
      "epoch": 0.46735395189003437,
      "grad_norm": 0.09912109375,
      "learning_rate": 5.326460481099657e-06,
      "loss": 1.1951,
      "step": 136
    },
    {
      "epoch": 0.47079037800687284,
      "grad_norm": 0.0986328125,
      "learning_rate": 5.292096219931272e-06,
      "loss": 1.2168,
      "step": 137
    },
    {
      "epoch": 0.4742268041237113,
      "grad_norm": 0.10986328125,
      "learning_rate": 5.257731958762888e-06,
      "loss": 1.1578,
      "step": 138
    },
    {
      "epoch": 0.47766323024054985,
      "grad_norm": 0.1103515625,
      "learning_rate": 5.223367697594503e-06,
      "loss": 1.3295,
      "step": 139
    },
    {
      "epoch": 0.48109965635738833,
      "grad_norm": 0.10595703125,
      "learning_rate": 5.189003436426118e-06,
      "loss": 1.1521,
      "step": 140
    },
    {
      "epoch": 0.4845360824742268,
      "grad_norm": 0.1044921875,
      "learning_rate": 5.154639175257732e-06,
      "loss": 1.2219,
      "step": 141
    },
    {
      "epoch": 0.4879725085910653,
      "grad_norm": 0.09912109375,
      "learning_rate": 5.120274914089347e-06,
      "loss": 1.1866,
      "step": 142
    },
    {
      "epoch": 0.49140893470790376,
      "grad_norm": 0.10009765625,
      "learning_rate": 5.085910652920962e-06,
      "loss": 1.1922,
      "step": 143
    },
    {
      "epoch": 0.4948453608247423,
      "grad_norm": 0.1005859375,
      "learning_rate": 5.051546391752578e-06,
      "loss": 1.1784,
      "step": 144
    },
    {
      "epoch": 0.49828178694158076,
      "grad_norm": 0.10595703125,
      "learning_rate": 5.017182130584193e-06,
      "loss": 1.1468,
      "step": 145
    },
    {
      "epoch": 0.5017182130584192,
      "grad_norm": 0.103515625,
      "learning_rate": 4.982817869415808e-06,
      "loss": 1.2448,
      "step": 146
    },
    {
      "epoch": 0.5051546391752577,
      "grad_norm": 0.099609375,
      "learning_rate": 4.948453608247423e-06,
      "loss": 1.1422,
      "step": 147
    },
    {
      "epoch": 0.5085910652920962,
      "grad_norm": 0.1123046875,
      "learning_rate": 4.914089347079038e-06,
      "loss": 1.2039,
      "step": 148
    },
    {
      "epoch": 0.5120274914089347,
      "grad_norm": 0.09814453125,
      "learning_rate": 4.879725085910653e-06,
      "loss": 1.2287,
      "step": 149
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 0.10302734375,
      "learning_rate": 4.845360824742268e-06,
      "loss": 1.17,
      "step": 150
    },
    {
      "epoch": 0.5189003436426117,
      "grad_norm": 0.1064453125,
      "learning_rate": 4.810996563573884e-06,
      "loss": 1.0933,
      "step": 151
    },
    {
      "epoch": 0.5223367697594502,
      "grad_norm": 0.1005859375,
      "learning_rate": 4.776632302405499e-06,
      "loss": 1.2443,
      "step": 152
    },
    {
      "epoch": 0.5257731958762887,
      "grad_norm": 0.1220703125,
      "learning_rate": 4.742268041237113e-06,
      "loss": 1.2075,
      "step": 153
    },
    {
      "epoch": 0.5292096219931272,
      "grad_norm": 0.10888671875,
      "learning_rate": 4.707903780068729e-06,
      "loss": 1.1981,
      "step": 154
    },
    {
      "epoch": 0.5326460481099656,
      "grad_norm": 0.10107421875,
      "learning_rate": 4.673539518900344e-06,
      "loss": 1.1483,
      "step": 155
    },
    {
      "epoch": 0.5360824742268041,
      "grad_norm": 0.10205078125,
      "learning_rate": 4.639175257731959e-06,
      "loss": 1.1753,
      "step": 156
    },
    {
      "epoch": 0.5395189003436426,
      "grad_norm": 0.099609375,
      "learning_rate": 4.604810996563574e-06,
      "loss": 1.1578,
      "step": 157
    },
    {
      "epoch": 0.5429553264604811,
      "grad_norm": 0.10498046875,
      "learning_rate": 4.570446735395189e-06,
      "loss": 1.1467,
      "step": 158
    },
    {
      "epoch": 0.5463917525773195,
      "grad_norm": 0.1044921875,
      "learning_rate": 4.536082474226804e-06,
      "loss": 1.188,
      "step": 159
    },
    {
      "epoch": 0.5498281786941581,
      "grad_norm": 0.10595703125,
      "learning_rate": 4.501718213058419e-06,
      "loss": 1.1748,
      "step": 160
    },
    {
      "epoch": 0.5532646048109966,
      "grad_norm": 0.10107421875,
      "learning_rate": 4.467353951890035e-06,
      "loss": 1.1563,
      "step": 161
    },
    {
      "epoch": 0.5567010309278351,
      "grad_norm": 0.103515625,
      "learning_rate": 4.4329896907216494e-06,
      "loss": 1.2162,
      "step": 162
    },
    {
      "epoch": 0.5601374570446735,
      "grad_norm": 0.1708984375,
      "learning_rate": 4.3986254295532645e-06,
      "loss": 1.1612,
      "step": 163
    },
    {
      "epoch": 0.563573883161512,
      "grad_norm": 0.1015625,
      "learning_rate": 4.36426116838488e-06,
      "loss": 1.1738,
      "step": 164
    },
    {
      "epoch": 0.5670103092783505,
      "grad_norm": 0.11669921875,
      "learning_rate": 4.329896907216495e-06,
      "loss": 1.2428,
      "step": 165
    },
    {
      "epoch": 0.570446735395189,
      "grad_norm": 0.1767578125,
      "learning_rate": 4.2955326460481105e-06,
      "loss": 1.2499,
      "step": 166
    },
    {
      "epoch": 0.5738831615120275,
      "grad_norm": 0.15234375,
      "learning_rate": 4.2611683848797255e-06,
      "loss": 1.1381,
      "step": 167
    },
    {
      "epoch": 0.5773195876288659,
      "grad_norm": 0.107421875,
      "learning_rate": 4.2268041237113405e-06,
      "loss": 1.1218,
      "step": 168
    },
    {
      "epoch": 0.5807560137457045,
      "grad_norm": 0.103515625,
      "learning_rate": 4.192439862542956e-06,
      "loss": 1.1917,
      "step": 169
    },
    {
      "epoch": 0.584192439862543,
      "grad_norm": 0.1201171875,
      "learning_rate": 4.158075601374571e-06,
      "loss": 1.2737,
      "step": 170
    },
    {
      "epoch": 0.5876288659793815,
      "grad_norm": 0.1376953125,
      "learning_rate": 4.123711340206186e-06,
      "loss": 1.1914,
      "step": 171
    },
    {
      "epoch": 0.5910652920962199,
      "grad_norm": 0.10546875,
      "learning_rate": 4.089347079037801e-06,
      "loss": 1.1924,
      "step": 172
    },
    {
      "epoch": 0.5945017182130584,
      "grad_norm": 0.10595703125,
      "learning_rate": 4.054982817869416e-06,
      "loss": 1.186,
      "step": 173
    },
    {
      "epoch": 0.5979381443298969,
      "grad_norm": 0.1591796875,
      "learning_rate": 4.020618556701032e-06,
      "loss": 1.2316,
      "step": 174
    },
    {
      "epoch": 0.6013745704467354,
      "grad_norm": 0.1015625,
      "learning_rate": 3.986254295532647e-06,
      "loss": 1.1319,
      "step": 175
    },
    {
      "epoch": 0.6048109965635738,
      "grad_norm": 0.11083984375,
      "learning_rate": 3.951890034364262e-06,
      "loss": 1.1959,
      "step": 176
    },
    {
      "epoch": 0.6082474226804123,
      "grad_norm": 0.1103515625,
      "learning_rate": 3.917525773195877e-06,
      "loss": 1.192,
      "step": 177
    },
    {
      "epoch": 0.6116838487972509,
      "grad_norm": 0.10498046875,
      "learning_rate": 3.883161512027492e-06,
      "loss": 1.1374,
      "step": 178
    },
    {
      "epoch": 0.6151202749140894,
      "grad_norm": 0.11865234375,
      "learning_rate": 3.848797250859107e-06,
      "loss": 1.2014,
      "step": 179
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 0.12890625,
      "learning_rate": 3.814432989690722e-06,
      "loss": 1.2549,
      "step": 180
    },
    {
      "epoch": 0.6219931271477663,
      "grad_norm": 0.10107421875,
      "learning_rate": 3.780068728522337e-06,
      "loss": 1.2026,
      "step": 181
    },
    {
      "epoch": 0.6254295532646048,
      "grad_norm": 0.10693359375,
      "learning_rate": 3.7457044673539524e-06,
      "loss": 1.249,
      "step": 182
    },
    {
      "epoch": 0.6288659793814433,
      "grad_norm": 0.11865234375,
      "learning_rate": 3.7113402061855674e-06,
      "loss": 1.2328,
      "step": 183
    },
    {
      "epoch": 0.6323024054982818,
      "grad_norm": 0.10205078125,
      "learning_rate": 3.6769759450171825e-06,
      "loss": 1.1544,
      "step": 184
    },
    {
      "epoch": 0.6357388316151202,
      "grad_norm": 0.099609375,
      "learning_rate": 3.6426116838487975e-06,
      "loss": 1.175,
      "step": 185
    },
    {
      "epoch": 0.6391752577319587,
      "grad_norm": 0.1552734375,
      "learning_rate": 3.6082474226804126e-06,
      "loss": 1.122,
      "step": 186
    },
    {
      "epoch": 0.6426116838487973,
      "grad_norm": 0.123046875,
      "learning_rate": 3.573883161512028e-06,
      "loss": 1.1623,
      "step": 187
    },
    {
      "epoch": 0.6460481099656358,
      "grad_norm": 0.10302734375,
      "learning_rate": 3.539518900343643e-06,
      "loss": 1.1869,
      "step": 188
    },
    {
      "epoch": 0.6494845360824743,
      "grad_norm": 0.0966796875,
      "learning_rate": 3.5051546391752577e-06,
      "loss": 1.1359,
      "step": 189
    },
    {
      "epoch": 0.6529209621993127,
      "grad_norm": 0.1103515625,
      "learning_rate": 3.470790378006873e-06,
      "loss": 1.1882,
      "step": 190
    },
    {
      "epoch": 0.6563573883161512,
      "grad_norm": 0.109375,
      "learning_rate": 3.436426116838488e-06,
      "loss": 1.245,
      "step": 191
    },
    {
      "epoch": 0.6597938144329897,
      "grad_norm": 0.09912109375,
      "learning_rate": 3.4020618556701037e-06,
      "loss": 1.1544,
      "step": 192
    },
    {
      "epoch": 0.6632302405498282,
      "grad_norm": 0.11474609375,
      "learning_rate": 3.3676975945017187e-06,
      "loss": 1.1664,
      "step": 193
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.1123046875,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.1409,
      "step": 194
    },
    {
      "epoch": 0.6701030927835051,
      "grad_norm": 0.1044921875,
      "learning_rate": 3.298969072164949e-06,
      "loss": 1.144,
      "step": 195
    },
    {
      "epoch": 0.6735395189003437,
      "grad_norm": 0.111328125,
      "learning_rate": 3.264604810996564e-06,
      "loss": 1.2533,
      "step": 196
    },
    {
      "epoch": 0.6769759450171822,
      "grad_norm": 0.10693359375,
      "learning_rate": 3.2302405498281793e-06,
      "loss": 1.1903,
      "step": 197
    },
    {
      "epoch": 0.6804123711340206,
      "grad_norm": 0.12109375,
      "learning_rate": 3.195876288659794e-06,
      "loss": 1.1853,
      "step": 198
    },
    {
      "epoch": 0.6838487972508591,
      "grad_norm": 0.1083984375,
      "learning_rate": 3.161512027491409e-06,
      "loss": 1.1456,
      "step": 199
    },
    {
      "epoch": 0.6872852233676976,
      "grad_norm": 0.10498046875,
      "learning_rate": 3.1271477663230244e-06,
      "loss": 1.1724,
      "step": 200
    },
    {
      "epoch": 0.6907216494845361,
      "grad_norm": 0.11181640625,
      "learning_rate": 3.0927835051546395e-06,
      "loss": 1.2242,
      "step": 201
    },
    {
      "epoch": 0.6941580756013745,
      "grad_norm": 0.1494140625,
      "learning_rate": 3.058419243986255e-06,
      "loss": 1.2096,
      "step": 202
    },
    {
      "epoch": 0.697594501718213,
      "grad_norm": 0.1171875,
      "learning_rate": 3.0240549828178695e-06,
      "loss": 1.1972,
      "step": 203
    },
    {
      "epoch": 0.7010309278350515,
      "grad_norm": 0.1240234375,
      "learning_rate": 2.9896907216494846e-06,
      "loss": 1.2032,
      "step": 204
    },
    {
      "epoch": 0.7044673539518901,
      "grad_norm": 0.11279296875,
      "learning_rate": 2.9553264604811e-06,
      "loss": 1.1783,
      "step": 205
    },
    {
      "epoch": 0.7079037800687286,
      "grad_norm": 0.10986328125,
      "learning_rate": 2.920962199312715e-06,
      "loss": 1.1432,
      "step": 206
    },
    {
      "epoch": 0.711340206185567,
      "grad_norm": 0.1083984375,
      "learning_rate": 2.8865979381443297e-06,
      "loss": 1.1197,
      "step": 207
    },
    {
      "epoch": 0.7147766323024055,
      "grad_norm": 0.1064453125,
      "learning_rate": 2.852233676975945e-06,
      "loss": 1.1736,
      "step": 208
    },
    {
      "epoch": 0.718213058419244,
      "grad_norm": 0.126953125,
      "learning_rate": 2.8178694158075602e-06,
      "loss": 1.1505,
      "step": 209
    },
    {
      "epoch": 0.7216494845360825,
      "grad_norm": 0.1123046875,
      "learning_rate": 2.7835051546391757e-06,
      "loss": 1.2264,
      "step": 210
    },
    {
      "epoch": 0.7250859106529209,
      "grad_norm": 0.12255859375,
      "learning_rate": 2.7491408934707907e-06,
      "loss": 1.1537,
      "step": 211
    },
    {
      "epoch": 0.7285223367697594,
      "grad_norm": 0.1083984375,
      "learning_rate": 2.7147766323024053e-06,
      "loss": 1.2153,
      "step": 212
    },
    {
      "epoch": 0.7319587628865979,
      "grad_norm": 0.10498046875,
      "learning_rate": 2.680412371134021e-06,
      "loss": 1.1621,
      "step": 213
    },
    {
      "epoch": 0.7353951890034365,
      "grad_norm": 0.109375,
      "learning_rate": 2.646048109965636e-06,
      "loss": 1.1657,
      "step": 214
    },
    {
      "epoch": 0.738831615120275,
      "grad_norm": 0.11376953125,
      "learning_rate": 2.6116838487972513e-06,
      "loss": 1.1345,
      "step": 215
    },
    {
      "epoch": 0.7422680412371134,
      "grad_norm": 0.11328125,
      "learning_rate": 2.577319587628866e-06,
      "loss": 1.1935,
      "step": 216
    },
    {
      "epoch": 0.7457044673539519,
      "grad_norm": 0.10302734375,
      "learning_rate": 2.542955326460481e-06,
      "loss": 1.1348,
      "step": 217
    },
    {
      "epoch": 0.7491408934707904,
      "grad_norm": 0.11083984375,
      "learning_rate": 2.5085910652920964e-06,
      "loss": 1.1279,
      "step": 218
    },
    {
      "epoch": 0.7525773195876289,
      "grad_norm": 0.115234375,
      "learning_rate": 2.4742268041237115e-06,
      "loss": 1.1244,
      "step": 219
    },
    {
      "epoch": 0.7560137457044673,
      "grad_norm": 0.109375,
      "learning_rate": 2.4398625429553265e-06,
      "loss": 1.1416,
      "step": 220
    },
    {
      "epoch": 0.7594501718213058,
      "grad_norm": 0.1396484375,
      "learning_rate": 2.405498281786942e-06,
      "loss": 1.2763,
      "step": 221
    },
    {
      "epoch": 0.7628865979381443,
      "grad_norm": 0.10693359375,
      "learning_rate": 2.3711340206185566e-06,
      "loss": 1.112,
      "step": 222
    },
    {
      "epoch": 0.7663230240549829,
      "grad_norm": 0.11279296875,
      "learning_rate": 2.336769759450172e-06,
      "loss": 1.2058,
      "step": 223
    },
    {
      "epoch": 0.7697594501718213,
      "grad_norm": 0.125,
      "learning_rate": 2.302405498281787e-06,
      "loss": 1.217,
      "step": 224
    },
    {
      "epoch": 0.7731958762886598,
      "grad_norm": 0.11962890625,
      "learning_rate": 2.268041237113402e-06,
      "loss": 1.1224,
      "step": 225
    },
    {
      "epoch": 0.7766323024054983,
      "grad_norm": 0.1181640625,
      "learning_rate": 2.2336769759450176e-06,
      "loss": 1.2149,
      "step": 226
    },
    {
      "epoch": 0.7800687285223368,
      "grad_norm": 0.10693359375,
      "learning_rate": 2.1993127147766322e-06,
      "loss": 1.1129,
      "step": 227
    },
    {
      "epoch": 0.7835051546391752,
      "grad_norm": 0.11767578125,
      "learning_rate": 2.1649484536082477e-06,
      "loss": 1.1593,
      "step": 228
    },
    {
      "epoch": 0.7869415807560137,
      "grad_norm": 0.11279296875,
      "learning_rate": 2.1305841924398628e-06,
      "loss": 1.1298,
      "step": 229
    },
    {
      "epoch": 0.7903780068728522,
      "grad_norm": 0.1220703125,
      "learning_rate": 2.096219931271478e-06,
      "loss": 1.2249,
      "step": 230
    },
    {
      "epoch": 0.7938144329896907,
      "grad_norm": 0.1162109375,
      "learning_rate": 2.061855670103093e-06,
      "loss": 1.1265,
      "step": 231
    },
    {
      "epoch": 0.7972508591065293,
      "grad_norm": 0.10791015625,
      "learning_rate": 2.027491408934708e-06,
      "loss": 1.0844,
      "step": 232
    },
    {
      "epoch": 0.8006872852233677,
      "grad_norm": 0.1083984375,
      "learning_rate": 1.9931271477663233e-06,
      "loss": 1.1828,
      "step": 233
    },
    {
      "epoch": 0.8041237113402062,
      "grad_norm": 0.1572265625,
      "learning_rate": 1.9587628865979384e-06,
      "loss": 1.1879,
      "step": 234
    },
    {
      "epoch": 0.8075601374570447,
      "grad_norm": 0.1220703125,
      "learning_rate": 1.9243986254295534e-06,
      "loss": 1.2128,
      "step": 235
    },
    {
      "epoch": 0.8109965635738832,
      "grad_norm": 0.11474609375,
      "learning_rate": 1.8900343642611685e-06,
      "loss": 1.1451,
      "step": 236
    },
    {
      "epoch": 0.8144329896907216,
      "grad_norm": 0.11279296875,
      "learning_rate": 1.8556701030927837e-06,
      "loss": 1.1279,
      "step": 237
    },
    {
      "epoch": 0.8178694158075601,
      "grad_norm": 0.11083984375,
      "learning_rate": 1.8213058419243988e-06,
      "loss": 1.1636,
      "step": 238
    },
    {
      "epoch": 0.8213058419243986,
      "grad_norm": 0.11083984375,
      "learning_rate": 1.786941580756014e-06,
      "loss": 1.1323,
      "step": 239
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 0.1767578125,
      "learning_rate": 1.7525773195876288e-06,
      "loss": 1.1456,
      "step": 240
    },
    {
      "epoch": 0.8281786941580757,
      "grad_norm": 0.1181640625,
      "learning_rate": 1.718213058419244e-06,
      "loss": 1.1559,
      "step": 241
    },
    {
      "epoch": 0.8316151202749141,
      "grad_norm": 0.1591796875,
      "learning_rate": 1.6838487972508594e-06,
      "loss": 1.1903,
      "step": 242
    },
    {
      "epoch": 0.8350515463917526,
      "grad_norm": 0.1376953125,
      "learning_rate": 1.6494845360824744e-06,
      "loss": 1.1933,
      "step": 243
    },
    {
      "epoch": 0.8384879725085911,
      "grad_norm": 0.1328125,
      "learning_rate": 1.6151202749140896e-06,
      "loss": 1.2029,
      "step": 244
    },
    {
      "epoch": 0.8419243986254296,
      "grad_norm": 0.1201171875,
      "learning_rate": 1.5807560137457045e-06,
      "loss": 1.1551,
      "step": 245
    },
    {
      "epoch": 0.845360824742268,
      "grad_norm": 0.11279296875,
      "learning_rate": 1.5463917525773197e-06,
      "loss": 1.1213,
      "step": 246
    },
    {
      "epoch": 0.8487972508591065,
      "grad_norm": 0.11572265625,
      "learning_rate": 1.5120274914089348e-06,
      "loss": 1.1897,
      "step": 247
    },
    {
      "epoch": 0.852233676975945,
      "grad_norm": 0.1142578125,
      "learning_rate": 1.47766323024055e-06,
      "loss": 1.2245,
      "step": 248
    },
    {
      "epoch": 0.8556701030927835,
      "grad_norm": 0.1162109375,
      "learning_rate": 1.4432989690721649e-06,
      "loss": 1.1553,
      "step": 249
    },
    {
      "epoch": 0.8591065292096219,
      "grad_norm": 0.134765625,
      "learning_rate": 1.4089347079037801e-06,
      "loss": 1.12,
      "step": 250
    },
    {
      "epoch": 0.8625429553264605,
      "grad_norm": 0.10986328125,
      "learning_rate": 1.3745704467353954e-06,
      "loss": 1.0931,
      "step": 251
    },
    {
      "epoch": 0.865979381443299,
      "grad_norm": 0.208984375,
      "learning_rate": 1.3402061855670104e-06,
      "loss": 1.1994,
      "step": 252
    },
    {
      "epoch": 0.8694158075601375,
      "grad_norm": 0.10888671875,
      "learning_rate": 1.3058419243986257e-06,
      "loss": 1.1478,
      "step": 253
    },
    {
      "epoch": 0.872852233676976,
      "grad_norm": 0.1181640625,
      "learning_rate": 1.2714776632302405e-06,
      "loss": 1.1798,
      "step": 254
    },
    {
      "epoch": 0.8762886597938144,
      "grad_norm": 0.1318359375,
      "learning_rate": 1.2371134020618557e-06,
      "loss": 1.1266,
      "step": 255
    },
    {
      "epoch": 0.8797250859106529,
      "grad_norm": 0.11962890625,
      "learning_rate": 1.202749140893471e-06,
      "loss": 1.1522,
      "step": 256
    },
    {
      "epoch": 0.8831615120274914,
      "grad_norm": 0.11083984375,
      "learning_rate": 1.168384879725086e-06,
      "loss": 1.1133,
      "step": 257
    },
    {
      "epoch": 0.8865979381443299,
      "grad_norm": 0.11572265625,
      "learning_rate": 1.134020618556701e-06,
      "loss": 1.2044,
      "step": 258
    },
    {
      "epoch": 0.8900343642611683,
      "grad_norm": 0.1279296875,
      "learning_rate": 1.0996563573883161e-06,
      "loss": 1.2362,
      "step": 259
    },
    {
      "epoch": 0.8934707903780069,
      "grad_norm": 0.1728515625,
      "learning_rate": 1.0652920962199314e-06,
      "loss": 1.1711,
      "step": 260
    },
    {
      "epoch": 0.8969072164948454,
      "grad_norm": 0.125,
      "learning_rate": 1.0309278350515464e-06,
      "loss": 1.1429,
      "step": 261
    },
    {
      "epoch": 0.9003436426116839,
      "grad_norm": 0.14453125,
      "learning_rate": 9.965635738831617e-07,
      "loss": 1.2533,
      "step": 262
    },
    {
      "epoch": 0.9037800687285223,
      "grad_norm": 0.126953125,
      "learning_rate": 9.621993127147767e-07,
      "loss": 1.1734,
      "step": 263
    },
    {
      "epoch": 0.9072164948453608,
      "grad_norm": 0.11572265625,
      "learning_rate": 9.278350515463919e-07,
      "loss": 1.1519,
      "step": 264
    },
    {
      "epoch": 0.9106529209621993,
      "grad_norm": 0.11865234375,
      "learning_rate": 8.93470790378007e-07,
      "loss": 1.1965,
      "step": 265
    },
    {
      "epoch": 0.9140893470790378,
      "grad_norm": 0.123046875,
      "learning_rate": 8.59106529209622e-07,
      "loss": 1.1872,
      "step": 266
    },
    {
      "epoch": 0.9175257731958762,
      "grad_norm": 0.12890625,
      "learning_rate": 8.247422680412372e-07,
      "loss": 1.28,
      "step": 267
    },
    {
      "epoch": 0.9209621993127147,
      "grad_norm": 0.1416015625,
      "learning_rate": 7.903780068728522e-07,
      "loss": 1.0979,
      "step": 268
    },
    {
      "epoch": 0.9243986254295533,
      "grad_norm": 0.1484375,
      "learning_rate": 7.560137457044674e-07,
      "loss": 1.1781,
      "step": 269
    },
    {
      "epoch": 0.9278350515463918,
      "grad_norm": 0.115234375,
      "learning_rate": 7.216494845360824e-07,
      "loss": 1.1292,
      "step": 270
    },
    {
      "epoch": 0.9312714776632303,
      "grad_norm": 0.11376953125,
      "learning_rate": 6.872852233676977e-07,
      "loss": 1.1431,
      "step": 271
    },
    {
      "epoch": 0.9347079037800687,
      "grad_norm": 0.11669921875,
      "learning_rate": 6.529209621993128e-07,
      "loss": 1.1752,
      "step": 272
    },
    {
      "epoch": 0.9381443298969072,
      "grad_norm": 0.11376953125,
      "learning_rate": 6.185567010309279e-07,
      "loss": 1.167,
      "step": 273
    },
    {
      "epoch": 0.9415807560137457,
      "grad_norm": 0.11328125,
      "learning_rate": 5.84192439862543e-07,
      "loss": 1.1292,
      "step": 274
    },
    {
      "epoch": 0.9450171821305842,
      "grad_norm": 0.125,
      "learning_rate": 5.498281786941581e-07,
      "loss": 1.2543,
      "step": 275
    },
    {
      "epoch": 0.9484536082474226,
      "grad_norm": 0.12890625,
      "learning_rate": 5.154639175257732e-07,
      "loss": 1.2113,
      "step": 276
    },
    {
      "epoch": 0.9518900343642611,
      "grad_norm": 0.119140625,
      "learning_rate": 4.810996563573884e-07,
      "loss": 1.1205,
      "step": 277
    },
    {
      "epoch": 0.9553264604810997,
      "grad_norm": 0.12109375,
      "learning_rate": 4.467353951890035e-07,
      "loss": 1.1825,
      "step": 278
    },
    {
      "epoch": 0.9587628865979382,
      "grad_norm": 0.39453125,
      "learning_rate": 4.123711340206186e-07,
      "loss": 1.1801,
      "step": 279
    },
    {
      "epoch": 0.9621993127147767,
      "grad_norm": 0.16015625,
      "learning_rate": 3.780068728522337e-07,
      "loss": 1.1751,
      "step": 280
    },
    {
      "epoch": 0.9656357388316151,
      "grad_norm": 0.1259765625,
      "learning_rate": 3.4364261168384884e-07,
      "loss": 1.1759,
      "step": 281
    },
    {
      "epoch": 0.9690721649484536,
      "grad_norm": 0.1298828125,
      "learning_rate": 3.0927835051546394e-07,
      "loss": 1.1685,
      "step": 282
    },
    {
      "epoch": 0.9725085910652921,
      "grad_norm": 0.1796875,
      "learning_rate": 2.7491408934707903e-07,
      "loss": 1.1825,
      "step": 283
    },
    {
      "epoch": 0.9759450171821306,
      "grad_norm": 0.126953125,
      "learning_rate": 2.405498281786942e-07,
      "loss": 1.1749,
      "step": 284
    },
    {
      "epoch": 0.979381443298969,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.061855670103093e-07,
      "loss": 1.1566,
      "step": 285
    },
    {
      "epoch": 0.9828178694158075,
      "grad_norm": 0.1201171875,
      "learning_rate": 1.7182130584192442e-07,
      "loss": 1.1753,
      "step": 286
    },
    {
      "epoch": 0.9862542955326461,
      "grad_norm": 0.185546875,
      "learning_rate": 1.3745704467353952e-07,
      "loss": 1.149,
      "step": 287
    },
    {
      "epoch": 0.9896907216494846,
      "grad_norm": 0.1171875,
      "learning_rate": 1.0309278350515465e-07,
      "loss": 1.114,
      "step": 288
    },
    {
      "epoch": 0.993127147766323,
      "grad_norm": 0.130859375,
      "learning_rate": 6.872852233676976e-08,
      "loss": 1.1794,
      "step": 289
    },
    {
      "epoch": 0.9965635738831615,
      "grad_norm": 0.14453125,
      "learning_rate": 3.436426116838488e-08,
      "loss": 1.2422,
      "step": 290
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.125,
      "learning_rate": 0.0,
      "loss": 1.257,
      "step": 291
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 291,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.177843676804547e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}