{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 291,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003436426116838488,
"grad_norm": 0.4921875,
"learning_rate": 9.965635738831616e-06,
"loss": 1.7833,
"step": 1
},
{
"epoch": 0.006872852233676976,
"grad_norm": 0.47265625,
"learning_rate": 9.931271477663231e-06,
"loss": 1.6757,
"step": 2
},
{
"epoch": 0.010309278350515464,
"grad_norm": 0.48828125,
"learning_rate": 9.896907216494846e-06,
"loss": 1.7604,
"step": 3
},
{
"epoch": 0.013745704467353952,
"grad_norm": 0.43359375,
"learning_rate": 9.862542955326461e-06,
"loss": 1.6662,
"step": 4
},
{
"epoch": 0.01718213058419244,
"grad_norm": 0.43359375,
"learning_rate": 9.828178694158076e-06,
"loss": 1.7293,
"step": 5
},
{
"epoch": 0.020618556701030927,
"grad_norm": 0.380859375,
"learning_rate": 9.793814432989691e-06,
"loss": 1.6688,
"step": 6
},
{
"epoch": 0.024054982817869417,
"grad_norm": 0.34375,
"learning_rate": 9.759450171821306e-06,
"loss": 1.6821,
"step": 7
},
{
"epoch": 0.027491408934707903,
"grad_norm": 0.314453125,
"learning_rate": 9.725085910652921e-06,
"loss": 1.6133,
"step": 8
},
{
"epoch": 0.030927835051546393,
"grad_norm": 0.298828125,
"learning_rate": 9.690721649484536e-06,
"loss": 1.6739,
"step": 9
},
{
"epoch": 0.03436426116838488,
"grad_norm": 0.26171875,
"learning_rate": 9.656357388316153e-06,
"loss": 1.5586,
"step": 10
},
{
"epoch": 0.037800687285223365,
"grad_norm": 0.2578125,
"learning_rate": 9.621993127147768e-06,
"loss": 1.5729,
"step": 11
},
{
"epoch": 0.041237113402061855,
"grad_norm": 0.2333984375,
"learning_rate": 9.587628865979383e-06,
"loss": 1.5494,
"step": 12
},
{
"epoch": 0.044673539518900345,
"grad_norm": 0.244140625,
"learning_rate": 9.553264604810998e-06,
"loss": 1.5882,
"step": 13
},
{
"epoch": 0.048109965635738834,
"grad_norm": 0.23828125,
"learning_rate": 9.518900343642611e-06,
"loss": 1.5447,
"step": 14
},
{
"epoch": 0.05154639175257732,
"grad_norm": 0.232421875,
"learning_rate": 9.484536082474226e-06,
"loss": 1.5998,
"step": 15
},
{
"epoch": 0.054982817869415807,
"grad_norm": 0.2431640625,
"learning_rate": 9.450171821305843e-06,
"loss": 1.5377,
"step": 16
},
{
"epoch": 0.058419243986254296,
"grad_norm": 0.2314453125,
"learning_rate": 9.415807560137458e-06,
"loss": 1.5192,
"step": 17
},
{
"epoch": 0.061855670103092786,
"grad_norm": 0.234375,
"learning_rate": 9.381443298969073e-06,
"loss": 1.5355,
"step": 18
},
{
"epoch": 0.06529209621993128,
"grad_norm": 0.2265625,
"learning_rate": 9.347079037800688e-06,
"loss": 1.5234,
"step": 19
},
{
"epoch": 0.06872852233676977,
"grad_norm": 0.220703125,
"learning_rate": 9.312714776632303e-06,
"loss": 1.499,
"step": 20
},
{
"epoch": 0.07216494845360824,
"grad_norm": 0.2060546875,
"learning_rate": 9.278350515463918e-06,
"loss": 1.5096,
"step": 21
},
{
"epoch": 0.07560137457044673,
"grad_norm": 0.193359375,
"learning_rate": 9.243986254295533e-06,
"loss": 1.5011,
"step": 22
},
{
"epoch": 0.07903780068728522,
"grad_norm": 0.197265625,
"learning_rate": 9.209621993127148e-06,
"loss": 1.4814,
"step": 23
},
{
"epoch": 0.08247422680412371,
"grad_norm": 0.18359375,
"learning_rate": 9.175257731958764e-06,
"loss": 1.4306,
"step": 24
},
{
"epoch": 0.0859106529209622,
"grad_norm": 0.1826171875,
"learning_rate": 9.140893470790379e-06,
"loss": 1.4095,
"step": 25
},
{
"epoch": 0.08934707903780069,
"grad_norm": 0.1767578125,
"learning_rate": 9.106529209621994e-06,
"loss": 1.5332,
"step": 26
},
{
"epoch": 0.09278350515463918,
"grad_norm": 0.171875,
"learning_rate": 9.072164948453609e-06,
"loss": 1.3951,
"step": 27
},
{
"epoch": 0.09621993127147767,
"grad_norm": 0.1650390625,
"learning_rate": 9.037800687285224e-06,
"loss": 1.4271,
"step": 28
},
{
"epoch": 0.09965635738831616,
"grad_norm": 0.17578125,
"learning_rate": 9.003436426116839e-06,
"loss": 1.5286,
"step": 29
},
{
"epoch": 0.10309278350515463,
"grad_norm": 0.1572265625,
"learning_rate": 8.969072164948455e-06,
"loss": 1.4261,
"step": 30
},
{
"epoch": 0.10652920962199312,
"grad_norm": 0.1689453125,
"learning_rate": 8.93470790378007e-06,
"loss": 1.4047,
"step": 31
},
{
"epoch": 0.10996563573883161,
"grad_norm": 0.1611328125,
"learning_rate": 8.900343642611684e-06,
"loss": 1.3345,
"step": 32
},
{
"epoch": 0.1134020618556701,
"grad_norm": 0.177734375,
"learning_rate": 8.865979381443299e-06,
"loss": 1.3815,
"step": 33
},
{
"epoch": 0.11683848797250859,
"grad_norm": 0.1708984375,
"learning_rate": 8.831615120274914e-06,
"loss": 1.419,
"step": 34
},
{
"epoch": 0.12027491408934708,
"grad_norm": 0.1630859375,
"learning_rate": 8.797250859106529e-06,
"loss": 1.4125,
"step": 35
},
{
"epoch": 0.12371134020618557,
"grad_norm": 0.16015625,
"learning_rate": 8.762886597938146e-06,
"loss": 1.3938,
"step": 36
},
{
"epoch": 0.12714776632302405,
"grad_norm": 0.1513671875,
"learning_rate": 8.72852233676976e-06,
"loss": 1.3676,
"step": 37
},
{
"epoch": 0.13058419243986255,
"grad_norm": 0.1767578125,
"learning_rate": 8.694158075601376e-06,
"loss": 1.355,
"step": 38
},
{
"epoch": 0.13402061855670103,
"grad_norm": 0.1435546875,
"learning_rate": 8.65979381443299e-06,
"loss": 1.2992,
"step": 39
},
{
"epoch": 0.13745704467353953,
"grad_norm": 0.1494140625,
"learning_rate": 8.625429553264606e-06,
"loss": 1.2822,
"step": 40
},
{
"epoch": 0.140893470790378,
"grad_norm": 0.1416015625,
"learning_rate": 8.591065292096221e-06,
"loss": 1.3814,
"step": 41
},
{
"epoch": 0.14432989690721648,
"grad_norm": 0.1455078125,
"learning_rate": 8.556701030927836e-06,
"loss": 1.342,
"step": 42
},
{
"epoch": 0.14776632302405499,
"grad_norm": 0.1552734375,
"learning_rate": 8.522336769759451e-06,
"loss": 1.3447,
"step": 43
},
{
"epoch": 0.15120274914089346,
"grad_norm": 0.146484375,
"learning_rate": 8.487972508591066e-06,
"loss": 1.3496,
"step": 44
},
{
"epoch": 0.15463917525773196,
"grad_norm": 0.142578125,
"learning_rate": 8.453608247422681e-06,
"loss": 1.361,
"step": 45
},
{
"epoch": 0.15807560137457044,
"grad_norm": 0.1708984375,
"learning_rate": 8.419243986254296e-06,
"loss": 1.3466,
"step": 46
},
{
"epoch": 0.16151202749140894,
"grad_norm": 0.1572265625,
"learning_rate": 8.384879725085911e-06,
"loss": 1.2897,
"step": 47
},
{
"epoch": 0.16494845360824742,
"grad_norm": 0.1318359375,
"learning_rate": 8.350515463917526e-06,
"loss": 1.2955,
"step": 48
},
{
"epoch": 0.16838487972508592,
"grad_norm": 0.26953125,
"learning_rate": 8.316151202749141e-06,
"loss": 1.2537,
"step": 49
},
{
"epoch": 0.1718213058419244,
"grad_norm": 0.1259765625,
"learning_rate": 8.281786941580758e-06,
"loss": 1.2805,
"step": 50
},
{
"epoch": 0.17525773195876287,
"grad_norm": 0.1171875,
"learning_rate": 8.247422680412371e-06,
"loss": 1.2535,
"step": 51
},
{
"epoch": 0.17869415807560138,
"grad_norm": 0.11962890625,
"learning_rate": 8.213058419243986e-06,
"loss": 1.3243,
"step": 52
},
{
"epoch": 0.18213058419243985,
"grad_norm": 0.12353515625,
"learning_rate": 8.178694158075601e-06,
"loss": 1.262,
"step": 53
},
{
"epoch": 0.18556701030927836,
"grad_norm": 0.1259765625,
"learning_rate": 8.144329896907216e-06,
"loss": 1.2669,
"step": 54
},
{
"epoch": 0.18900343642611683,
"grad_norm": 0.123046875,
"learning_rate": 8.109965635738832e-06,
"loss": 1.2936,
"step": 55
},
{
"epoch": 0.19243986254295534,
"grad_norm": 0.1357421875,
"learning_rate": 8.075601374570448e-06,
"loss": 1.2678,
"step": 56
},
{
"epoch": 0.1958762886597938,
"grad_norm": 0.1171875,
"learning_rate": 8.041237113402063e-06,
"loss": 1.3035,
"step": 57
},
{
"epoch": 0.19931271477663232,
"grad_norm": 0.1357421875,
"learning_rate": 8.006872852233678e-06,
"loss": 1.3927,
"step": 58
},
{
"epoch": 0.2027491408934708,
"grad_norm": 0.1103515625,
"learning_rate": 7.972508591065293e-06,
"loss": 1.2696,
"step": 59
},
{
"epoch": 0.20618556701030927,
"grad_norm": 0.1181640625,
"learning_rate": 7.938144329896907e-06,
"loss": 1.2684,
"step": 60
},
{
"epoch": 0.20962199312714777,
"grad_norm": 0.1474609375,
"learning_rate": 7.903780068728523e-06,
"loss": 1.3102,
"step": 61
},
{
"epoch": 0.21305841924398625,
"grad_norm": 0.11181640625,
"learning_rate": 7.869415807560138e-06,
"loss": 1.2037,
"step": 62
},
{
"epoch": 0.21649484536082475,
"grad_norm": 0.11376953125,
"learning_rate": 7.835051546391754e-06,
"loss": 1.2694,
"step": 63
},
{
"epoch": 0.21993127147766323,
"grad_norm": 0.11474609375,
"learning_rate": 7.800687285223369e-06,
"loss": 1.2515,
"step": 64
},
{
"epoch": 0.22336769759450173,
"grad_norm": 0.1142578125,
"learning_rate": 7.766323024054984e-06,
"loss": 1.2484,
"step": 65
},
{
"epoch": 0.2268041237113402,
"grad_norm": 0.11767578125,
"learning_rate": 7.731958762886599e-06,
"loss": 1.342,
"step": 66
},
{
"epoch": 0.23024054982817868,
"grad_norm": 0.12255859375,
"learning_rate": 7.697594501718214e-06,
"loss": 1.2629,
"step": 67
},
{
"epoch": 0.23367697594501718,
"grad_norm": 0.1162109375,
"learning_rate": 7.663230240549829e-06,
"loss": 1.2718,
"step": 68
},
{
"epoch": 0.23711340206185566,
"grad_norm": 0.109375,
"learning_rate": 7.628865979381444e-06,
"loss": 1.3001,
"step": 69
},
{
"epoch": 0.24054982817869416,
"grad_norm": 0.10986328125,
"learning_rate": 7.594501718213059e-06,
"loss": 1.1548,
"step": 70
},
{
"epoch": 0.24398625429553264,
"grad_norm": 0.109375,
"learning_rate": 7.560137457044674e-06,
"loss": 1.1643,
"step": 71
},
{
"epoch": 0.24742268041237114,
"grad_norm": 0.13671875,
"learning_rate": 7.525773195876289e-06,
"loss": 1.2197,
"step": 72
},
{
"epoch": 0.2508591065292096,
"grad_norm": 0.111328125,
"learning_rate": 7.491408934707905e-06,
"loss": 1.2366,
"step": 73
},
{
"epoch": 0.2542955326460481,
"grad_norm": 0.11767578125,
"learning_rate": 7.45704467353952e-06,
"loss": 1.2623,
"step": 74
},
{
"epoch": 0.25773195876288657,
"grad_norm": 0.10693359375,
"learning_rate": 7.422680412371135e-06,
"loss": 1.2439,
"step": 75
},
{
"epoch": 0.2611683848797251,
"grad_norm": 0.109375,
"learning_rate": 7.38831615120275e-06,
"loss": 1.2451,
"step": 76
},
{
"epoch": 0.2646048109965636,
"grad_norm": 0.11474609375,
"learning_rate": 7.353951890034365e-06,
"loss": 1.1966,
"step": 77
},
{
"epoch": 0.26804123711340205,
"grad_norm": 0.11083984375,
"learning_rate": 7.319587628865979e-06,
"loss": 1.1983,
"step": 78
},
{
"epoch": 0.27147766323024053,
"grad_norm": 0.1044921875,
"learning_rate": 7.285223367697595e-06,
"loss": 1.2115,
"step": 79
},
{
"epoch": 0.27491408934707906,
"grad_norm": 0.1083984375,
"learning_rate": 7.25085910652921e-06,
"loss": 1.1679,
"step": 80
},
{
"epoch": 0.27835051546391754,
"grad_norm": 0.1357421875,
"learning_rate": 7.216494845360825e-06,
"loss": 1.2571,
"step": 81
},
{
"epoch": 0.281786941580756,
"grad_norm": 0.119140625,
"learning_rate": 7.18213058419244e-06,
"loss": 1.3151,
"step": 82
},
{
"epoch": 0.2852233676975945,
"grad_norm": 0.1357421875,
"learning_rate": 7.147766323024056e-06,
"loss": 1.2378,
"step": 83
},
{
"epoch": 0.28865979381443296,
"grad_norm": 0.107421875,
"learning_rate": 7.113402061855671e-06,
"loss": 1.2514,
"step": 84
},
{
"epoch": 0.2920962199312715,
"grad_norm": 0.138671875,
"learning_rate": 7.079037800687286e-06,
"loss": 1.2457,
"step": 85
},
{
"epoch": 0.29553264604810997,
"grad_norm": 0.1513671875,
"learning_rate": 7.044673539518901e-06,
"loss": 1.2397,
"step": 86
},
{
"epoch": 0.29896907216494845,
"grad_norm": 0.11181640625,
"learning_rate": 7.010309278350515e-06,
"loss": 1.2586,
"step": 87
},
{
"epoch": 0.3024054982817869,
"grad_norm": 0.115234375,
"learning_rate": 6.9759450171821304e-06,
"loss": 1.2845,
"step": 88
},
{
"epoch": 0.30584192439862545,
"grad_norm": 0.1240234375,
"learning_rate": 6.941580756013746e-06,
"loss": 1.2553,
"step": 89
},
{
"epoch": 0.30927835051546393,
"grad_norm": 0.2353515625,
"learning_rate": 6.907216494845361e-06,
"loss": 1.145,
"step": 90
},
{
"epoch": 0.3127147766323024,
"grad_norm": 0.1513671875,
"learning_rate": 6.872852233676976e-06,
"loss": 1.2295,
"step": 91
},
{
"epoch": 0.3161512027491409,
"grad_norm": 0.1435546875,
"learning_rate": 6.8384879725085914e-06,
"loss": 1.1771,
"step": 92
},
{
"epoch": 0.31958762886597936,
"grad_norm": 0.11962890625,
"learning_rate": 6.804123711340207e-06,
"loss": 1.1458,
"step": 93
},
{
"epoch": 0.3230240549828179,
"grad_norm": 0.158203125,
"learning_rate": 6.769759450171822e-06,
"loss": 1.2342,
"step": 94
},
{
"epoch": 0.32646048109965636,
"grad_norm": 0.10888671875,
"learning_rate": 6.735395189003437e-06,
"loss": 1.2193,
"step": 95
},
{
"epoch": 0.32989690721649484,
"grad_norm": 0.10986328125,
"learning_rate": 6.701030927835052e-06,
"loss": 1.2225,
"step": 96
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.10498046875,
"learning_rate": 6.666666666666667e-06,
"loss": 1.2053,
"step": 97
},
{
"epoch": 0.33676975945017185,
"grad_norm": 0.119140625,
"learning_rate": 6.632302405498282e-06,
"loss": 1.1893,
"step": 98
},
{
"epoch": 0.3402061855670103,
"grad_norm": 0.10986328125,
"learning_rate": 6.597938144329898e-06,
"loss": 1.2301,
"step": 99
},
{
"epoch": 0.3436426116838488,
"grad_norm": 0.11328125,
"learning_rate": 6.563573883161513e-06,
"loss": 1.1943,
"step": 100
},
{
"epoch": 0.3470790378006873,
"grad_norm": 0.1552734375,
"learning_rate": 6.529209621993128e-06,
"loss": 1.2589,
"step": 101
},
{
"epoch": 0.35051546391752575,
"grad_norm": 0.1025390625,
"learning_rate": 6.494845360824743e-06,
"loss": 1.1986,
"step": 102
},
{
"epoch": 0.3539518900343643,
"grad_norm": 0.146484375,
"learning_rate": 6.460481099656359e-06,
"loss": 1.2073,
"step": 103
},
{
"epoch": 0.35738831615120276,
"grad_norm": 0.111328125,
"learning_rate": 6.426116838487974e-06,
"loss": 1.2892,
"step": 104
},
{
"epoch": 0.36082474226804123,
"grad_norm": 0.1064453125,
"learning_rate": 6.391752577319588e-06,
"loss": 1.2071,
"step": 105
},
{
"epoch": 0.3642611683848797,
"grad_norm": 0.0986328125,
"learning_rate": 6.357388316151203e-06,
"loss": 1.1472,
"step": 106
},
{
"epoch": 0.36769759450171824,
"grad_norm": 0.1376953125,
"learning_rate": 6.323024054982818e-06,
"loss": 1.1407,
"step": 107
},
{
"epoch": 0.3711340206185567,
"grad_norm": 0.1328125,
"learning_rate": 6.288659793814433e-06,
"loss": 1.2531,
"step": 108
},
{
"epoch": 0.3745704467353952,
"grad_norm": 0.10107421875,
"learning_rate": 6.254295532646049e-06,
"loss": 1.1916,
"step": 109
},
{
"epoch": 0.37800687285223367,
"grad_norm": 0.109375,
"learning_rate": 6.219931271477664e-06,
"loss": 1.1996,
"step": 110
},
{
"epoch": 0.38144329896907214,
"grad_norm": 0.10302734375,
"learning_rate": 6.185567010309279e-06,
"loss": 1.158,
"step": 111
},
{
"epoch": 0.3848797250859107,
"grad_norm": 0.11669921875,
"learning_rate": 6.151202749140894e-06,
"loss": 1.2114,
"step": 112
},
{
"epoch": 0.38831615120274915,
"grad_norm": 0.09912109375,
"learning_rate": 6.11683848797251e-06,
"loss": 1.2039,
"step": 113
},
{
"epoch": 0.3917525773195876,
"grad_norm": 0.11572265625,
"learning_rate": 6.082474226804124e-06,
"loss": 1.2078,
"step": 114
},
{
"epoch": 0.3951890034364261,
"grad_norm": 0.10205078125,
"learning_rate": 6.048109965635739e-06,
"loss": 1.18,
"step": 115
},
{
"epoch": 0.39862542955326463,
"grad_norm": 0.1171875,
"learning_rate": 6.013745704467354e-06,
"loss": 1.2228,
"step": 116
},
{
"epoch": 0.4020618556701031,
"grad_norm": 0.103515625,
"learning_rate": 5.979381443298969e-06,
"loss": 1.1975,
"step": 117
},
{
"epoch": 0.4054982817869416,
"grad_norm": 0.1142578125,
"learning_rate": 5.945017182130585e-06,
"loss": 1.14,
"step": 118
},
{
"epoch": 0.40893470790378006,
"grad_norm": 0.0986328125,
"learning_rate": 5.9106529209622e-06,
"loss": 1.1489,
"step": 119
},
{
"epoch": 0.41237113402061853,
"grad_norm": 0.10205078125,
"learning_rate": 5.876288659793815e-06,
"loss": 1.22,
"step": 120
},
{
"epoch": 0.41580756013745707,
"grad_norm": 0.10205078125,
"learning_rate": 5.84192439862543e-06,
"loss": 1.1821,
"step": 121
},
{
"epoch": 0.41924398625429554,
"grad_norm": 0.12255859375,
"learning_rate": 5.807560137457045e-06,
"loss": 1.1964,
"step": 122
},
{
"epoch": 0.422680412371134,
"grad_norm": 0.109375,
"learning_rate": 5.7731958762886594e-06,
"loss": 1.1879,
"step": 123
},
{
"epoch": 0.4261168384879725,
"grad_norm": 0.1005859375,
"learning_rate": 5.738831615120275e-06,
"loss": 1.1497,
"step": 124
},
{
"epoch": 0.42955326460481097,
"grad_norm": 0.10107421875,
"learning_rate": 5.70446735395189e-06,
"loss": 1.0871,
"step": 125
},
{
"epoch": 0.4329896907216495,
"grad_norm": 0.11083984375,
"learning_rate": 5.670103092783505e-06,
"loss": 1.2444,
"step": 126
},
{
"epoch": 0.436426116838488,
"grad_norm": 0.099609375,
"learning_rate": 5.6357388316151204e-06,
"loss": 1.1902,
"step": 127
},
{
"epoch": 0.43986254295532645,
"grad_norm": 0.11083984375,
"learning_rate": 5.601374570446736e-06,
"loss": 1.2449,
"step": 128
},
{
"epoch": 0.44329896907216493,
"grad_norm": 0.111328125,
"learning_rate": 5.567010309278351e-06,
"loss": 1.1841,
"step": 129
},
{
"epoch": 0.44673539518900346,
"grad_norm": 0.10888671875,
"learning_rate": 5.532646048109966e-06,
"loss": 1.2824,
"step": 130
},
{
"epoch": 0.45017182130584193,
"grad_norm": 0.1015625,
"learning_rate": 5.4982817869415815e-06,
"loss": 1.1912,
"step": 131
},
{
"epoch": 0.4536082474226804,
"grad_norm": 0.09521484375,
"learning_rate": 5.463917525773196e-06,
"loss": 1.1778,
"step": 132
},
{
"epoch": 0.4570446735395189,
"grad_norm": 0.10400390625,
"learning_rate": 5.429553264604811e-06,
"loss": 1.1849,
"step": 133
},
{
"epoch": 0.46048109965635736,
"grad_norm": 0.10009765625,
"learning_rate": 5.395189003436427e-06,
"loss": 1.1319,
"step": 134
},
{
"epoch": 0.4639175257731959,
"grad_norm": 0.09912109375,
"learning_rate": 5.360824742268042e-06,
"loss": 1.1162,
"step": 135
},
{
"epoch": 0.46735395189003437,
"grad_norm": 0.09912109375,
"learning_rate": 5.326460481099657e-06,
"loss": 1.1951,
"step": 136
},
{
"epoch": 0.47079037800687284,
"grad_norm": 0.0986328125,
"learning_rate": 5.292096219931272e-06,
"loss": 1.2168,
"step": 137
},
{
"epoch": 0.4742268041237113,
"grad_norm": 0.10986328125,
"learning_rate": 5.257731958762888e-06,
"loss": 1.1578,
"step": 138
},
{
"epoch": 0.47766323024054985,
"grad_norm": 0.1103515625,
"learning_rate": 5.223367697594503e-06,
"loss": 1.3295,
"step": 139
},
{
"epoch": 0.48109965635738833,
"grad_norm": 0.10595703125,
"learning_rate": 5.189003436426118e-06,
"loss": 1.1521,
"step": 140
},
{
"epoch": 0.4845360824742268,
"grad_norm": 0.1044921875,
"learning_rate": 5.154639175257732e-06,
"loss": 1.2219,
"step": 141
},
{
"epoch": 0.4879725085910653,
"grad_norm": 0.09912109375,
"learning_rate": 5.120274914089347e-06,
"loss": 1.1866,
"step": 142
},
{
"epoch": 0.49140893470790376,
"grad_norm": 0.10009765625,
"learning_rate": 5.085910652920962e-06,
"loss": 1.1922,
"step": 143
},
{
"epoch": 0.4948453608247423,
"grad_norm": 0.1005859375,
"learning_rate": 5.051546391752578e-06,
"loss": 1.1784,
"step": 144
},
{
"epoch": 0.49828178694158076,
"grad_norm": 0.10595703125,
"learning_rate": 5.017182130584193e-06,
"loss": 1.1468,
"step": 145
},
{
"epoch": 0.5017182130584192,
"grad_norm": 0.103515625,
"learning_rate": 4.982817869415808e-06,
"loss": 1.2448,
"step": 146
},
{
"epoch": 0.5051546391752577,
"grad_norm": 0.099609375,
"learning_rate": 4.948453608247423e-06,
"loss": 1.1422,
"step": 147
},
{
"epoch": 0.5085910652920962,
"grad_norm": 0.1123046875,
"learning_rate": 4.914089347079038e-06,
"loss": 1.2039,
"step": 148
},
{
"epoch": 0.5120274914089347,
"grad_norm": 0.09814453125,
"learning_rate": 4.879725085910653e-06,
"loss": 1.2287,
"step": 149
},
{
"epoch": 0.5154639175257731,
"grad_norm": 0.10302734375,
"learning_rate": 4.845360824742268e-06,
"loss": 1.17,
"step": 150
},
{
"epoch": 0.5189003436426117,
"grad_norm": 0.1064453125,
"learning_rate": 4.810996563573884e-06,
"loss": 1.0933,
"step": 151
},
{
"epoch": 0.5223367697594502,
"grad_norm": 0.1005859375,
"learning_rate": 4.776632302405499e-06,
"loss": 1.2443,
"step": 152
},
{
"epoch": 0.5257731958762887,
"grad_norm": 0.1220703125,
"learning_rate": 4.742268041237113e-06,
"loss": 1.2075,
"step": 153
},
{
"epoch": 0.5292096219931272,
"grad_norm": 0.10888671875,
"learning_rate": 4.707903780068729e-06,
"loss": 1.1981,
"step": 154
},
{
"epoch": 0.5326460481099656,
"grad_norm": 0.10107421875,
"learning_rate": 4.673539518900344e-06,
"loss": 1.1483,
"step": 155
},
{
"epoch": 0.5360824742268041,
"grad_norm": 0.10205078125,
"learning_rate": 4.639175257731959e-06,
"loss": 1.1753,
"step": 156
},
{
"epoch": 0.5395189003436426,
"grad_norm": 0.099609375,
"learning_rate": 4.604810996563574e-06,
"loss": 1.1578,
"step": 157
},
{
"epoch": 0.5429553264604811,
"grad_norm": 0.10498046875,
"learning_rate": 4.570446735395189e-06,
"loss": 1.1467,
"step": 158
},
{
"epoch": 0.5463917525773195,
"grad_norm": 0.1044921875,
"learning_rate": 4.536082474226804e-06,
"loss": 1.188,
"step": 159
},
{
"epoch": 0.5498281786941581,
"grad_norm": 0.10595703125,
"learning_rate": 4.501718213058419e-06,
"loss": 1.1748,
"step": 160
},
{
"epoch": 0.5532646048109966,
"grad_norm": 0.10107421875,
"learning_rate": 4.467353951890035e-06,
"loss": 1.1563,
"step": 161
},
{
"epoch": 0.5567010309278351,
"grad_norm": 0.103515625,
"learning_rate": 4.4329896907216494e-06,
"loss": 1.2162,
"step": 162
},
{
"epoch": 0.5601374570446735,
"grad_norm": 0.1708984375,
"learning_rate": 4.3986254295532645e-06,
"loss": 1.1612,
"step": 163
},
{
"epoch": 0.563573883161512,
"grad_norm": 0.1015625,
"learning_rate": 4.36426116838488e-06,
"loss": 1.1738,
"step": 164
},
{
"epoch": 0.5670103092783505,
"grad_norm": 0.11669921875,
"learning_rate": 4.329896907216495e-06,
"loss": 1.2428,
"step": 165
},
{
"epoch": 0.570446735395189,
"grad_norm": 0.1767578125,
"learning_rate": 4.2955326460481105e-06,
"loss": 1.2499,
"step": 166
},
{
"epoch": 0.5738831615120275,
"grad_norm": 0.15234375,
"learning_rate": 4.2611683848797255e-06,
"loss": 1.1381,
"step": 167
},
{
"epoch": 0.5773195876288659,
"grad_norm": 0.107421875,
"learning_rate": 4.2268041237113405e-06,
"loss": 1.1218,
"step": 168
},
{
"epoch": 0.5807560137457045,
"grad_norm": 0.103515625,
"learning_rate": 4.192439862542956e-06,
"loss": 1.1917,
"step": 169
},
{
"epoch": 0.584192439862543,
"grad_norm": 0.1201171875,
"learning_rate": 4.158075601374571e-06,
"loss": 1.2737,
"step": 170
},
{
"epoch": 0.5876288659793815,
"grad_norm": 0.1376953125,
"learning_rate": 4.123711340206186e-06,
"loss": 1.1914,
"step": 171
},
{
"epoch": 0.5910652920962199,
"grad_norm": 0.10546875,
"learning_rate": 4.089347079037801e-06,
"loss": 1.1924,
"step": 172
},
{
"epoch": 0.5945017182130584,
"grad_norm": 0.10595703125,
"learning_rate": 4.054982817869416e-06,
"loss": 1.186,
"step": 173
},
{
"epoch": 0.5979381443298969,
"grad_norm": 0.1591796875,
"learning_rate": 4.020618556701032e-06,
"loss": 1.2316,
"step": 174
},
{
"epoch": 0.6013745704467354,
"grad_norm": 0.1015625,
"learning_rate": 3.986254295532647e-06,
"loss": 1.1319,
"step": 175
},
{
"epoch": 0.6048109965635738,
"grad_norm": 0.11083984375,
"learning_rate": 3.951890034364262e-06,
"loss": 1.1959,
"step": 176
},
{
"epoch": 0.6082474226804123,
"grad_norm": 0.1103515625,
"learning_rate": 3.917525773195877e-06,
"loss": 1.192,
"step": 177
},
{
"epoch": 0.6116838487972509,
"grad_norm": 0.10498046875,
"learning_rate": 3.883161512027492e-06,
"loss": 1.1374,
"step": 178
},
{
"epoch": 0.6151202749140894,
"grad_norm": 0.11865234375,
"learning_rate": 3.848797250859107e-06,
"loss": 1.2014,
"step": 179
},
{
"epoch": 0.6185567010309279,
"grad_norm": 0.12890625,
"learning_rate": 3.814432989690722e-06,
"loss": 1.2549,
"step": 180
},
{
"epoch": 0.6219931271477663,
"grad_norm": 0.10107421875,
"learning_rate": 3.780068728522337e-06,
"loss": 1.2026,
"step": 181
},
{
"epoch": 0.6254295532646048,
"grad_norm": 0.10693359375,
"learning_rate": 3.7457044673539524e-06,
"loss": 1.249,
"step": 182
},
{
"epoch": 0.6288659793814433,
"grad_norm": 0.11865234375,
"learning_rate": 3.7113402061855674e-06,
"loss": 1.2328,
"step": 183
},
{
"epoch": 0.6323024054982818,
"grad_norm": 0.10205078125,
"learning_rate": 3.6769759450171825e-06,
"loss": 1.1544,
"step": 184
},
{
"epoch": 0.6357388316151202,
"grad_norm": 0.099609375,
"learning_rate": 3.6426116838487975e-06,
"loss": 1.175,
"step": 185
},
{
"epoch": 0.6391752577319587,
"grad_norm": 0.1552734375,
"learning_rate": 3.6082474226804126e-06,
"loss": 1.122,
"step": 186
},
{
"epoch": 0.6426116838487973,
"grad_norm": 0.123046875,
"learning_rate": 3.573883161512028e-06,
"loss": 1.1623,
"step": 187
},
{
"epoch": 0.6460481099656358,
"grad_norm": 0.10302734375,
"learning_rate": 3.539518900343643e-06,
"loss": 1.1869,
"step": 188
},
{
"epoch": 0.6494845360824743,
"grad_norm": 0.0966796875,
"learning_rate": 3.5051546391752577e-06,
"loss": 1.1359,
"step": 189
},
{
"epoch": 0.6529209621993127,
"grad_norm": 0.1103515625,
"learning_rate": 3.470790378006873e-06,
"loss": 1.1882,
"step": 190
},
{
"epoch": 0.6563573883161512,
"grad_norm": 0.109375,
"learning_rate": 3.436426116838488e-06,
"loss": 1.245,
"step": 191
},
{
"epoch": 0.6597938144329897,
"grad_norm": 0.09912109375,
"learning_rate": 3.4020618556701037e-06,
"loss": 1.1544,
"step": 192
},
{
"epoch": 0.6632302405498282,
"grad_norm": 0.11474609375,
"learning_rate": 3.3676975945017187e-06,
"loss": 1.1664,
"step": 193
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.1123046875,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.1409,
"step": 194
},
{
"epoch": 0.6701030927835051,
"grad_norm": 0.1044921875,
"learning_rate": 3.298969072164949e-06,
"loss": 1.144,
"step": 195
},
{
"epoch": 0.6735395189003437,
"grad_norm": 0.111328125,
"learning_rate": 3.264604810996564e-06,
"loss": 1.2533,
"step": 196
},
{
"epoch": 0.6769759450171822,
"grad_norm": 0.10693359375,
"learning_rate": 3.2302405498281793e-06,
"loss": 1.1903,
"step": 197
},
{
"epoch": 0.6804123711340206,
"grad_norm": 0.12109375,
"learning_rate": 3.195876288659794e-06,
"loss": 1.1853,
"step": 198
},
{
"epoch": 0.6838487972508591,
"grad_norm": 0.1083984375,
"learning_rate": 3.161512027491409e-06,
"loss": 1.1456,
"step": 199
},
{
"epoch": 0.6872852233676976,
"grad_norm": 0.10498046875,
"learning_rate": 3.1271477663230244e-06,
"loss": 1.1724,
"step": 200
},
{
"epoch": 0.6907216494845361,
"grad_norm": 0.11181640625,
"learning_rate": 3.0927835051546395e-06,
"loss": 1.2242,
"step": 201
},
{
"epoch": 0.6941580756013745,
"grad_norm": 0.1494140625,
"learning_rate": 3.058419243986255e-06,
"loss": 1.2096,
"step": 202
},
{
"epoch": 0.697594501718213,
"grad_norm": 0.1171875,
"learning_rate": 3.0240549828178695e-06,
"loss": 1.1972,
"step": 203
},
{
"epoch": 0.7010309278350515,
"grad_norm": 0.1240234375,
"learning_rate": 2.9896907216494846e-06,
"loss": 1.2032,
"step": 204
},
{
"epoch": 0.7044673539518901,
"grad_norm": 0.11279296875,
"learning_rate": 2.9553264604811e-06,
"loss": 1.1783,
"step": 205
},
{
"epoch": 0.7079037800687286,
"grad_norm": 0.10986328125,
"learning_rate": 2.920962199312715e-06,
"loss": 1.1432,
"step": 206
},
{
"epoch": 0.711340206185567,
"grad_norm": 0.1083984375,
"learning_rate": 2.8865979381443297e-06,
"loss": 1.1197,
"step": 207
},
{
"epoch": 0.7147766323024055,
"grad_norm": 0.1064453125,
"learning_rate": 2.852233676975945e-06,
"loss": 1.1736,
"step": 208
},
{
"epoch": 0.718213058419244,
"grad_norm": 0.126953125,
"learning_rate": 2.8178694158075602e-06,
"loss": 1.1505,
"step": 209
},
{
"epoch": 0.7216494845360825,
"grad_norm": 0.1123046875,
"learning_rate": 2.7835051546391757e-06,
"loss": 1.2264,
"step": 210
},
{
"epoch": 0.7250859106529209,
"grad_norm": 0.12255859375,
"learning_rate": 2.7491408934707907e-06,
"loss": 1.1537,
"step": 211
},
{
"epoch": 0.7285223367697594,
"grad_norm": 0.1083984375,
"learning_rate": 2.7147766323024053e-06,
"loss": 1.2153,
"step": 212
},
{
"epoch": 0.7319587628865979,
"grad_norm": 0.10498046875,
"learning_rate": 2.680412371134021e-06,
"loss": 1.1621,
"step": 213
},
{
"epoch": 0.7353951890034365,
"grad_norm": 0.109375,
"learning_rate": 2.646048109965636e-06,
"loss": 1.1657,
"step": 214
},
{
"epoch": 0.738831615120275,
"grad_norm": 0.11376953125,
"learning_rate": 2.6116838487972513e-06,
"loss": 1.1345,
"step": 215
},
{
"epoch": 0.7422680412371134,
"grad_norm": 0.11328125,
"learning_rate": 2.577319587628866e-06,
"loss": 1.1935,
"step": 216
},
{
"epoch": 0.7457044673539519,
"grad_norm": 0.10302734375,
"learning_rate": 2.542955326460481e-06,
"loss": 1.1348,
"step": 217
},
{
"epoch": 0.7491408934707904,
"grad_norm": 0.11083984375,
"learning_rate": 2.5085910652920964e-06,
"loss": 1.1279,
"step": 218
},
{
"epoch": 0.7525773195876289,
"grad_norm": 0.115234375,
"learning_rate": 2.4742268041237115e-06,
"loss": 1.1244,
"step": 219
},
{
"epoch": 0.7560137457044673,
"grad_norm": 0.109375,
"learning_rate": 2.4398625429553265e-06,
"loss": 1.1416,
"step": 220
},
{
"epoch": 0.7594501718213058,
"grad_norm": 0.1396484375,
"learning_rate": 2.405498281786942e-06,
"loss": 1.2763,
"step": 221
},
{
"epoch": 0.7628865979381443,
"grad_norm": 0.10693359375,
"learning_rate": 2.3711340206185566e-06,
"loss": 1.112,
"step": 222
},
{
"epoch": 0.7663230240549829,
"grad_norm": 0.11279296875,
"learning_rate": 2.336769759450172e-06,
"loss": 1.2058,
"step": 223
},
{
"epoch": 0.7697594501718213,
"grad_norm": 0.125,
"learning_rate": 2.302405498281787e-06,
"loss": 1.217,
"step": 224
},
{
"epoch": 0.7731958762886598,
"grad_norm": 0.11962890625,
"learning_rate": 2.268041237113402e-06,
"loss": 1.1224,
"step": 225
},
{
"epoch": 0.7766323024054983,
"grad_norm": 0.1181640625,
"learning_rate": 2.2336769759450176e-06,
"loss": 1.2149,
"step": 226
},
{
"epoch": 0.7800687285223368,
"grad_norm": 0.10693359375,
"learning_rate": 2.1993127147766322e-06,
"loss": 1.1129,
"step": 227
},
{
"epoch": 0.7835051546391752,
"grad_norm": 0.11767578125,
"learning_rate": 2.1649484536082477e-06,
"loss": 1.1593,
"step": 228
},
{
"epoch": 0.7869415807560137,
"grad_norm": 0.11279296875,
"learning_rate": 2.1305841924398628e-06,
"loss": 1.1298,
"step": 229
},
{
"epoch": 0.7903780068728522,
"grad_norm": 0.1220703125,
"learning_rate": 2.096219931271478e-06,
"loss": 1.2249,
"step": 230
},
{
"epoch": 0.7938144329896907,
"grad_norm": 0.1162109375,
"learning_rate": 2.061855670103093e-06,
"loss": 1.1265,
"step": 231
},
{
"epoch": 0.7972508591065293,
"grad_norm": 0.10791015625,
"learning_rate": 2.027491408934708e-06,
"loss": 1.0844,
"step": 232
},
{
"epoch": 0.8006872852233677,
"grad_norm": 0.1083984375,
"learning_rate": 1.9931271477663233e-06,
"loss": 1.1828,
"step": 233
},
{
"epoch": 0.8041237113402062,
"grad_norm": 0.1572265625,
"learning_rate": 1.9587628865979384e-06,
"loss": 1.1879,
"step": 234
},
{
"epoch": 0.8075601374570447,
"grad_norm": 0.1220703125,
"learning_rate": 1.9243986254295534e-06,
"loss": 1.2128,
"step": 235
},
{
"epoch": 0.8109965635738832,
"grad_norm": 0.11474609375,
"learning_rate": 1.8900343642611685e-06,
"loss": 1.1451,
"step": 236
},
{
"epoch": 0.8144329896907216,
"grad_norm": 0.11279296875,
"learning_rate": 1.8556701030927837e-06,
"loss": 1.1279,
"step": 237
},
{
"epoch": 0.8178694158075601,
"grad_norm": 0.11083984375,
"learning_rate": 1.8213058419243988e-06,
"loss": 1.1636,
"step": 238
},
{
"epoch": 0.8213058419243986,
"grad_norm": 0.11083984375,
"learning_rate": 1.786941580756014e-06,
"loss": 1.1323,
"step": 239
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.1767578125,
"learning_rate": 1.7525773195876288e-06,
"loss": 1.1456,
"step": 240
},
{
"epoch": 0.8281786941580757,
"grad_norm": 0.1181640625,
"learning_rate": 1.718213058419244e-06,
"loss": 1.1559,
"step": 241
},
{
"epoch": 0.8316151202749141,
"grad_norm": 0.1591796875,
"learning_rate": 1.6838487972508594e-06,
"loss": 1.1903,
"step": 242
},
{
"epoch": 0.8350515463917526,
"grad_norm": 0.1376953125,
"learning_rate": 1.6494845360824744e-06,
"loss": 1.1933,
"step": 243
},
{
"epoch": 0.8384879725085911,
"grad_norm": 0.1328125,
"learning_rate": 1.6151202749140896e-06,
"loss": 1.2029,
"step": 244
},
{
"epoch": 0.8419243986254296,
"grad_norm": 0.1201171875,
"learning_rate": 1.5807560137457045e-06,
"loss": 1.1551,
"step": 245
},
{
"epoch": 0.845360824742268,
"grad_norm": 0.11279296875,
"learning_rate": 1.5463917525773197e-06,
"loss": 1.1213,
"step": 246
},
{
"epoch": 0.8487972508591065,
"grad_norm": 0.11572265625,
"learning_rate": 1.5120274914089348e-06,
"loss": 1.1897,
"step": 247
},
{
"epoch": 0.852233676975945,
"grad_norm": 0.1142578125,
"learning_rate": 1.47766323024055e-06,
"loss": 1.2245,
"step": 248
},
{
"epoch": 0.8556701030927835,
"grad_norm": 0.1162109375,
"learning_rate": 1.4432989690721649e-06,
"loss": 1.1553,
"step": 249
},
{
"epoch": 0.8591065292096219,
"grad_norm": 0.134765625,
"learning_rate": 1.4089347079037801e-06,
"loss": 1.12,
"step": 250
},
{
"epoch": 0.8625429553264605,
"grad_norm": 0.10986328125,
"learning_rate": 1.3745704467353954e-06,
"loss": 1.0931,
"step": 251
},
{
"epoch": 0.865979381443299,
"grad_norm": 0.208984375,
"learning_rate": 1.3402061855670104e-06,
"loss": 1.1994,
"step": 252
},
{
"epoch": 0.8694158075601375,
"grad_norm": 0.10888671875,
"learning_rate": 1.3058419243986257e-06,
"loss": 1.1478,
"step": 253
},
{
"epoch": 0.872852233676976,
"grad_norm": 0.1181640625,
"learning_rate": 1.2714776632302405e-06,
"loss": 1.1798,
"step": 254
},
{
"epoch": 0.8762886597938144,
"grad_norm": 0.1318359375,
"learning_rate": 1.2371134020618557e-06,
"loss": 1.1266,
"step": 255
},
{
"epoch": 0.8797250859106529,
"grad_norm": 0.11962890625,
"learning_rate": 1.202749140893471e-06,
"loss": 1.1522,
"step": 256
},
{
"epoch": 0.8831615120274914,
"grad_norm": 0.11083984375,
"learning_rate": 1.168384879725086e-06,
"loss": 1.1133,
"step": 257
},
{
"epoch": 0.8865979381443299,
"grad_norm": 0.11572265625,
"learning_rate": 1.134020618556701e-06,
"loss": 1.2044,
"step": 258
},
{
"epoch": 0.8900343642611683,
"grad_norm": 0.1279296875,
"learning_rate": 1.0996563573883161e-06,
"loss": 1.2362,
"step": 259
},
{
"epoch": 0.8934707903780069,
"grad_norm": 0.1728515625,
"learning_rate": 1.0652920962199314e-06,
"loss": 1.1711,
"step": 260
},
{
"epoch": 0.8969072164948454,
"grad_norm": 0.125,
"learning_rate": 1.0309278350515464e-06,
"loss": 1.1429,
"step": 261
},
{
"epoch": 0.9003436426116839,
"grad_norm": 0.14453125,
"learning_rate": 9.965635738831617e-07,
"loss": 1.2533,
"step": 262
},
{
"epoch": 0.9037800687285223,
"grad_norm": 0.126953125,
"learning_rate": 9.621993127147767e-07,
"loss": 1.1734,
"step": 263
},
{
"epoch": 0.9072164948453608,
"grad_norm": 0.11572265625,
"learning_rate": 9.278350515463919e-07,
"loss": 1.1519,
"step": 264
},
{
"epoch": 0.9106529209621993,
"grad_norm": 0.11865234375,
"learning_rate": 8.93470790378007e-07,
"loss": 1.1965,
"step": 265
},
{
"epoch": 0.9140893470790378,
"grad_norm": 0.123046875,
"learning_rate": 8.59106529209622e-07,
"loss": 1.1872,
"step": 266
},
{
"epoch": 0.9175257731958762,
"grad_norm": 0.12890625,
"learning_rate": 8.247422680412372e-07,
"loss": 1.28,
"step": 267
},
{
"epoch": 0.9209621993127147,
"grad_norm": 0.1416015625,
"learning_rate": 7.903780068728522e-07,
"loss": 1.0979,
"step": 268
},
{
"epoch": 0.9243986254295533,
"grad_norm": 0.1484375,
"learning_rate": 7.560137457044674e-07,
"loss": 1.1781,
"step": 269
},
{
"epoch": 0.9278350515463918,
"grad_norm": 0.115234375,
"learning_rate": 7.216494845360824e-07,
"loss": 1.1292,
"step": 270
},
{
"epoch": 0.9312714776632303,
"grad_norm": 0.11376953125,
"learning_rate": 6.872852233676977e-07,
"loss": 1.1431,
"step": 271
},
{
"epoch": 0.9347079037800687,
"grad_norm": 0.11669921875,
"learning_rate": 6.529209621993128e-07,
"loss": 1.1752,
"step": 272
},
{
"epoch": 0.9381443298969072,
"grad_norm": 0.11376953125,
"learning_rate": 6.185567010309279e-07,
"loss": 1.167,
"step": 273
},
{
"epoch": 0.9415807560137457,
"grad_norm": 0.11328125,
"learning_rate": 5.84192439862543e-07,
"loss": 1.1292,
"step": 274
},
{
"epoch": 0.9450171821305842,
"grad_norm": 0.125,
"learning_rate": 5.498281786941581e-07,
"loss": 1.2543,
"step": 275
},
{
"epoch": 0.9484536082474226,
"grad_norm": 0.12890625,
"learning_rate": 5.154639175257732e-07,
"loss": 1.2113,
"step": 276
},
{
"epoch": 0.9518900343642611,
"grad_norm": 0.119140625,
"learning_rate": 4.810996563573884e-07,
"loss": 1.1205,
"step": 277
},
{
"epoch": 0.9553264604810997,
"grad_norm": 0.12109375,
"learning_rate": 4.467353951890035e-07,
"loss": 1.1825,
"step": 278
},
{
"epoch": 0.9587628865979382,
"grad_norm": 0.39453125,
"learning_rate": 4.123711340206186e-07,
"loss": 1.1801,
"step": 279
},
{
"epoch": 0.9621993127147767,
"grad_norm": 0.16015625,
"learning_rate": 3.780068728522337e-07,
"loss": 1.1751,
"step": 280
},
{
"epoch": 0.9656357388316151,
"grad_norm": 0.1259765625,
"learning_rate": 3.4364261168384884e-07,
"loss": 1.1759,
"step": 281
},
{
"epoch": 0.9690721649484536,
"grad_norm": 0.1298828125,
"learning_rate": 3.0927835051546394e-07,
"loss": 1.1685,
"step": 282
},
{
"epoch": 0.9725085910652921,
"grad_norm": 0.1796875,
"learning_rate": 2.7491408934707903e-07,
"loss": 1.1825,
"step": 283
},
{
"epoch": 0.9759450171821306,
"grad_norm": 0.126953125,
"learning_rate": 2.405498281786942e-07,
"loss": 1.1749,
"step": 284
},
{
"epoch": 0.979381443298969,
"grad_norm": 0.1337890625,
"learning_rate": 2.061855670103093e-07,
"loss": 1.1566,
"step": 285
},
{
"epoch": 0.9828178694158075,
"grad_norm": 0.1201171875,
"learning_rate": 1.7182130584192442e-07,
"loss": 1.1753,
"step": 286
},
{
"epoch": 0.9862542955326461,
"grad_norm": 0.185546875,
"learning_rate": 1.3745704467353952e-07,
"loss": 1.149,
"step": 287
},
{
"epoch": 0.9896907216494846,
"grad_norm": 0.1171875,
"learning_rate": 1.0309278350515465e-07,
"loss": 1.114,
"step": 288
},
{
"epoch": 0.993127147766323,
"grad_norm": 0.130859375,
"learning_rate": 6.872852233676976e-08,
"loss": 1.1794,
"step": 289
},
{
"epoch": 0.9965635738831615,
"grad_norm": 0.14453125,
"learning_rate": 3.436426116838488e-08,
"loss": 1.2422,
"step": 290
},
{
"epoch": 1.0,
"grad_norm": 0.125,
"learning_rate": 0.0,
"loss": 1.257,
"step": 291
}
],
"logging_steps": 1.0,
"max_steps": 291,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.177843676804547e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}