{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 266,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0037593984962406013,
"grad_norm": 0.5625,
"learning_rate": 9.962406015037594e-06,
"loss": 2.1309,
"step": 1
},
{
"epoch": 0.007518796992481203,
"grad_norm": 0.546875,
"learning_rate": 9.924812030075189e-06,
"loss": 2.1148,
"step": 2
},
{
"epoch": 0.011278195488721804,
"grad_norm": 0.5625,
"learning_rate": 9.887218045112783e-06,
"loss": 2.1468,
"step": 3
},
{
"epoch": 0.015037593984962405,
"grad_norm": 0.51171875,
"learning_rate": 9.849624060150376e-06,
"loss": 2.079,
"step": 4
},
{
"epoch": 0.018796992481203006,
"grad_norm": 0.498046875,
"learning_rate": 9.812030075187971e-06,
"loss": 2.0392,
"step": 5
},
{
"epoch": 0.022556390977443608,
"grad_norm": 0.455078125,
"learning_rate": 9.774436090225564e-06,
"loss": 2.145,
"step": 6
},
{
"epoch": 0.02631578947368421,
"grad_norm": 0.384765625,
"learning_rate": 9.736842105263159e-06,
"loss": 2.0199,
"step": 7
},
{
"epoch": 0.03007518796992481,
"grad_norm": 0.3671875,
"learning_rate": 9.699248120300752e-06,
"loss": 2.0441,
"step": 8
},
{
"epoch": 0.03383458646616541,
"grad_norm": 0.33203125,
"learning_rate": 9.661654135338347e-06,
"loss": 1.9529,
"step": 9
},
{
"epoch": 0.03759398496240601,
"grad_norm": 0.33203125,
"learning_rate": 9.62406015037594e-06,
"loss": 2.0237,
"step": 10
},
{
"epoch": 0.041353383458646614,
"grad_norm": 0.27734375,
"learning_rate": 9.586466165413535e-06,
"loss": 1.955,
"step": 11
},
{
"epoch": 0.045112781954887216,
"grad_norm": 0.275390625,
"learning_rate": 9.54887218045113e-06,
"loss": 1.9995,
"step": 12
},
{
"epoch": 0.04887218045112782,
"grad_norm": 0.28515625,
"learning_rate": 9.511278195488722e-06,
"loss": 1.9685,
"step": 13
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.283203125,
"learning_rate": 9.473684210526315e-06,
"loss": 1.9314,
"step": 14
},
{
"epoch": 0.05639097744360902,
"grad_norm": 0.2734375,
"learning_rate": 9.43609022556391e-06,
"loss": 1.9443,
"step": 15
},
{
"epoch": 0.06015037593984962,
"grad_norm": 0.259765625,
"learning_rate": 9.398496240601505e-06,
"loss": 1.9512,
"step": 16
},
{
"epoch": 0.06390977443609022,
"grad_norm": 0.251953125,
"learning_rate": 9.360902255639098e-06,
"loss": 1.926,
"step": 17
},
{
"epoch": 0.06766917293233082,
"grad_norm": 0.244140625,
"learning_rate": 9.323308270676693e-06,
"loss": 1.8982,
"step": 18
},
{
"epoch": 0.07142857142857142,
"grad_norm": 0.255859375,
"learning_rate": 9.285714285714288e-06,
"loss": 1.9947,
"step": 19
},
{
"epoch": 0.07518796992481203,
"grad_norm": 0.2470703125,
"learning_rate": 9.24812030075188e-06,
"loss": 1.9373,
"step": 20
},
{
"epoch": 0.07894736842105263,
"grad_norm": 0.2275390625,
"learning_rate": 9.210526315789474e-06,
"loss": 1.8399,
"step": 21
},
{
"epoch": 0.08270676691729323,
"grad_norm": 0.205078125,
"learning_rate": 9.172932330827068e-06,
"loss": 1.8937,
"step": 22
},
{
"epoch": 0.08646616541353383,
"grad_norm": 0.1923828125,
"learning_rate": 9.135338345864663e-06,
"loss": 1.8808,
"step": 23
},
{
"epoch": 0.09022556390977443,
"grad_norm": 0.203125,
"learning_rate": 9.097744360902256e-06,
"loss": 1.8782,
"step": 24
},
{
"epoch": 0.09398496240601503,
"grad_norm": 0.1923828125,
"learning_rate": 9.06015037593985e-06,
"loss": 1.8855,
"step": 25
},
{
"epoch": 0.09774436090225563,
"grad_norm": 0.1865234375,
"learning_rate": 9.022556390977444e-06,
"loss": 1.9202,
"step": 26
},
{
"epoch": 0.10150375939849623,
"grad_norm": 0.1689453125,
"learning_rate": 8.984962406015039e-06,
"loss": 1.8394,
"step": 27
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.1806640625,
"learning_rate": 8.947368421052632e-06,
"loss": 1.8974,
"step": 28
},
{
"epoch": 0.10902255639097744,
"grad_norm": 0.1640625,
"learning_rate": 8.909774436090227e-06,
"loss": 1.8492,
"step": 29
},
{
"epoch": 0.11278195488721804,
"grad_norm": 0.162109375,
"learning_rate": 8.87218045112782e-06,
"loss": 1.8371,
"step": 30
},
{
"epoch": 0.11654135338345864,
"grad_norm": 0.1728515625,
"learning_rate": 8.834586466165414e-06,
"loss": 1.8237,
"step": 31
},
{
"epoch": 0.12030075187969924,
"grad_norm": 0.189453125,
"learning_rate": 8.796992481203007e-06,
"loss": 1.869,
"step": 32
},
{
"epoch": 0.12406015037593984,
"grad_norm": 0.169921875,
"learning_rate": 8.759398496240602e-06,
"loss": 1.8077,
"step": 33
},
{
"epoch": 0.12781954887218044,
"grad_norm": 0.1787109375,
"learning_rate": 8.721804511278195e-06,
"loss": 1.8324,
"step": 34
},
{
"epoch": 0.13157894736842105,
"grad_norm": 0.1806640625,
"learning_rate": 8.68421052631579e-06,
"loss": 1.7985,
"step": 35
},
{
"epoch": 0.13533834586466165,
"grad_norm": 0.169921875,
"learning_rate": 8.646616541353385e-06,
"loss": 1.8283,
"step": 36
},
{
"epoch": 0.13909774436090225,
"grad_norm": 0.1630859375,
"learning_rate": 8.609022556390978e-06,
"loss": 1.7779,
"step": 37
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.154296875,
"learning_rate": 8.571428571428571e-06,
"loss": 1.8007,
"step": 38
},
{
"epoch": 0.14661654135338345,
"grad_norm": 0.1552734375,
"learning_rate": 8.533834586466166e-06,
"loss": 1.7902,
"step": 39
},
{
"epoch": 0.15037593984962405,
"grad_norm": 0.19921875,
"learning_rate": 8.49624060150376e-06,
"loss": 1.7989,
"step": 40
},
{
"epoch": 0.15413533834586465,
"grad_norm": 0.1611328125,
"learning_rate": 8.458646616541353e-06,
"loss": 1.7651,
"step": 41
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.1591796875,
"learning_rate": 8.421052631578948e-06,
"loss": 1.7581,
"step": 42
},
{
"epoch": 0.16165413533834586,
"grad_norm": 0.15234375,
"learning_rate": 8.383458646616543e-06,
"loss": 1.743,
"step": 43
},
{
"epoch": 0.16541353383458646,
"grad_norm": 0.150390625,
"learning_rate": 8.345864661654136e-06,
"loss": 1.7409,
"step": 44
},
{
"epoch": 0.16917293233082706,
"grad_norm": 0.134765625,
"learning_rate": 8.308270676691729e-06,
"loss": 1.719,
"step": 45
},
{
"epoch": 0.17293233082706766,
"grad_norm": 0.1435546875,
"learning_rate": 8.270676691729324e-06,
"loss": 1.749,
"step": 46
},
{
"epoch": 0.17669172932330826,
"grad_norm": 0.154296875,
"learning_rate": 8.233082706766919e-06,
"loss": 1.7724,
"step": 47
},
{
"epoch": 0.18045112781954886,
"grad_norm": 0.1455078125,
"learning_rate": 8.195488721804512e-06,
"loss": 1.7485,
"step": 48
},
{
"epoch": 0.18421052631578946,
"grad_norm": 0.2265625,
"learning_rate": 8.157894736842106e-06,
"loss": 1.7398,
"step": 49
},
{
"epoch": 0.18796992481203006,
"grad_norm": 0.1416015625,
"learning_rate": 8.1203007518797e-06,
"loss": 1.7485,
"step": 50
},
{
"epoch": 0.19172932330827067,
"grad_norm": 0.140625,
"learning_rate": 8.082706766917294e-06,
"loss": 1.7284,
"step": 51
},
{
"epoch": 0.19548872180451127,
"grad_norm": 0.1533203125,
"learning_rate": 8.045112781954887e-06,
"loss": 1.6845,
"step": 52
},
{
"epoch": 0.19924812030075187,
"grad_norm": 0.1357421875,
"learning_rate": 8.007518796992482e-06,
"loss": 1.7095,
"step": 53
},
{
"epoch": 0.20300751879699247,
"grad_norm": 0.1279296875,
"learning_rate": 7.969924812030075e-06,
"loss": 1.7161,
"step": 54
},
{
"epoch": 0.20676691729323307,
"grad_norm": 0.1396484375,
"learning_rate": 7.93233082706767e-06,
"loss": 1.7005,
"step": 55
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.1298828125,
"learning_rate": 7.894736842105265e-06,
"loss": 1.6836,
"step": 56
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.126953125,
"learning_rate": 7.857142857142858e-06,
"loss": 1.7109,
"step": 57
},
{
"epoch": 0.21804511278195488,
"grad_norm": 0.166015625,
"learning_rate": 7.81954887218045e-06,
"loss": 1.7338,
"step": 58
},
{
"epoch": 0.22180451127819548,
"grad_norm": 0.12890625,
"learning_rate": 7.781954887218045e-06,
"loss": 1.6894,
"step": 59
},
{
"epoch": 0.22556390977443608,
"grad_norm": 0.1630859375,
"learning_rate": 7.74436090225564e-06,
"loss": 1.6767,
"step": 60
},
{
"epoch": 0.22932330827067668,
"grad_norm": 0.125,
"learning_rate": 7.706766917293233e-06,
"loss": 1.6984,
"step": 61
},
{
"epoch": 0.23308270676691728,
"grad_norm": 0.1337890625,
"learning_rate": 7.669172932330828e-06,
"loss": 1.6824,
"step": 62
},
{
"epoch": 0.23684210526315788,
"grad_norm": 0.130859375,
"learning_rate": 7.631578947368423e-06,
"loss": 1.7111,
"step": 63
},
{
"epoch": 0.24060150375939848,
"grad_norm": 0.12890625,
"learning_rate": 7.593984962406016e-06,
"loss": 1.68,
"step": 64
},
{
"epoch": 0.24436090225563908,
"grad_norm": 0.13671875,
"learning_rate": 7.55639097744361e-06,
"loss": 1.7161,
"step": 65
},
{
"epoch": 0.24812030075187969,
"grad_norm": 0.119140625,
"learning_rate": 7.518796992481203e-06,
"loss": 1.6781,
"step": 66
},
{
"epoch": 0.2518796992481203,
"grad_norm": 0.1298828125,
"learning_rate": 7.481203007518798e-06,
"loss": 1.6976,
"step": 67
},
{
"epoch": 0.2556390977443609,
"grad_norm": 0.12255859375,
"learning_rate": 7.4436090225563915e-06,
"loss": 1.6768,
"step": 68
},
{
"epoch": 0.2593984962406015,
"grad_norm": 0.1337890625,
"learning_rate": 7.406015037593985e-06,
"loss": 1.6507,
"step": 69
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.1259765625,
"learning_rate": 7.368421052631579e-06,
"loss": 1.6782,
"step": 70
},
{
"epoch": 0.2669172932330827,
"grad_norm": 0.123046875,
"learning_rate": 7.330827067669174e-06,
"loss": 1.6697,
"step": 71
},
{
"epoch": 0.2706766917293233,
"grad_norm": 0.1318359375,
"learning_rate": 7.293233082706768e-06,
"loss": 1.667,
"step": 72
},
{
"epoch": 0.2744360902255639,
"grad_norm": 0.123046875,
"learning_rate": 7.255639097744361e-06,
"loss": 1.6558,
"step": 73
},
{
"epoch": 0.2781954887218045,
"grad_norm": 0.12060546875,
"learning_rate": 7.218045112781955e-06,
"loss": 1.6728,
"step": 74
},
{
"epoch": 0.2819548872180451,
"grad_norm": 0.171875,
"learning_rate": 7.18045112781955e-06,
"loss": 1.6172,
"step": 75
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.12451171875,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.6714,
"step": 76
},
{
"epoch": 0.2894736842105263,
"grad_norm": 0.126953125,
"learning_rate": 7.1052631578947375e-06,
"loss": 1.6557,
"step": 77
},
{
"epoch": 0.2932330827067669,
"grad_norm": 0.14453125,
"learning_rate": 7.067669172932331e-06,
"loss": 1.6452,
"step": 78
},
{
"epoch": 0.29699248120300753,
"grad_norm": 0.11669921875,
"learning_rate": 7.030075187969926e-06,
"loss": 1.6577,
"step": 79
},
{
"epoch": 0.3007518796992481,
"grad_norm": 0.11572265625,
"learning_rate": 6.992481203007519e-06,
"loss": 1.6411,
"step": 80
},
{
"epoch": 0.30451127819548873,
"grad_norm": 0.1142578125,
"learning_rate": 6.954887218045113e-06,
"loss": 1.6033,
"step": 81
},
{
"epoch": 0.3082706766917293,
"grad_norm": 0.1357421875,
"learning_rate": 6.917293233082707e-06,
"loss": 1.6587,
"step": 82
},
{
"epoch": 0.31203007518796994,
"grad_norm": 0.11279296875,
"learning_rate": 6.879699248120302e-06,
"loss": 1.6648,
"step": 83
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.10986328125,
"learning_rate": 6.842105263157896e-06,
"loss": 1.6384,
"step": 84
},
{
"epoch": 0.31954887218045114,
"grad_norm": 0.1171875,
"learning_rate": 6.8045112781954896e-06,
"loss": 1.6531,
"step": 85
},
{
"epoch": 0.3233082706766917,
"grad_norm": 0.1123046875,
"learning_rate": 6.766917293233083e-06,
"loss": 1.6455,
"step": 86
},
{
"epoch": 0.32706766917293234,
"grad_norm": 0.11669921875,
"learning_rate": 6.729323308270677e-06,
"loss": 1.6568,
"step": 87
},
{
"epoch": 0.3308270676691729,
"grad_norm": 0.11328125,
"learning_rate": 6.691729323308271e-06,
"loss": 1.6319,
"step": 88
},
{
"epoch": 0.33458646616541354,
"grad_norm": 0.11328125,
"learning_rate": 6.654135338345865e-06,
"loss": 1.6211,
"step": 89
},
{
"epoch": 0.3383458646616541,
"grad_norm": 0.12060546875,
"learning_rate": 6.616541353383459e-06,
"loss": 1.6466,
"step": 90
},
{
"epoch": 0.34210526315789475,
"grad_norm": 0.12890625,
"learning_rate": 6.578947368421054e-06,
"loss": 1.6599,
"step": 91
},
{
"epoch": 0.3458646616541353,
"grad_norm": 0.11279296875,
"learning_rate": 6.541353383458648e-06,
"loss": 1.6457,
"step": 92
},
{
"epoch": 0.34962406015037595,
"grad_norm": 0.1328125,
"learning_rate": 6.503759398496241e-06,
"loss": 1.6366,
"step": 93
},
{
"epoch": 0.3533834586466165,
"grad_norm": 0.1103515625,
"learning_rate": 6.466165413533835e-06,
"loss": 1.6281,
"step": 94
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.12109375,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.6371,
"step": 95
},
{
"epoch": 0.3609022556390977,
"grad_norm": 0.10986328125,
"learning_rate": 6.390977443609023e-06,
"loss": 1.6179,
"step": 96
},
{
"epoch": 0.36466165413533835,
"grad_norm": 0.10888671875,
"learning_rate": 6.353383458646617e-06,
"loss": 1.6234,
"step": 97
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.1396484375,
"learning_rate": 6.31578947368421e-06,
"loss": 1.5895,
"step": 98
},
{
"epoch": 0.37218045112781956,
"grad_norm": 0.1298828125,
"learning_rate": 6.278195488721806e-06,
"loss": 1.5959,
"step": 99
},
{
"epoch": 0.37593984962406013,
"grad_norm": 0.1484375,
"learning_rate": 6.240601503759399e-06,
"loss": 1.6245,
"step": 100
},
{
"epoch": 0.37969924812030076,
"grad_norm": 0.111328125,
"learning_rate": 6.203007518796993e-06,
"loss": 1.6061,
"step": 101
},
{
"epoch": 0.38345864661654133,
"grad_norm": 0.1181640625,
"learning_rate": 6.165413533834587e-06,
"loss": 1.6062,
"step": 102
},
{
"epoch": 0.38721804511278196,
"grad_norm": 0.1123046875,
"learning_rate": 6.1278195488721816e-06,
"loss": 1.6603,
"step": 103
},
{
"epoch": 0.39097744360902253,
"grad_norm": 0.11083984375,
"learning_rate": 6.0902255639097755e-06,
"loss": 1.6465,
"step": 104
},
{
"epoch": 0.39473684210526316,
"grad_norm": 0.11328125,
"learning_rate": 6.0526315789473685e-06,
"loss": 1.5833,
"step": 105
},
{
"epoch": 0.39849624060150374,
"grad_norm": 0.125,
"learning_rate": 6.015037593984962e-06,
"loss": 1.5936,
"step": 106
},
{
"epoch": 0.40225563909774437,
"grad_norm": 0.126953125,
"learning_rate": 5.977443609022557e-06,
"loss": 1.6684,
"step": 107
},
{
"epoch": 0.40601503759398494,
"grad_norm": 0.115234375,
"learning_rate": 5.939849624060151e-06,
"loss": 1.616,
"step": 108
},
{
"epoch": 0.40977443609022557,
"grad_norm": 0.12451171875,
"learning_rate": 5.902255639097745e-06,
"loss": 1.6036,
"step": 109
},
{
"epoch": 0.41353383458646614,
"grad_norm": 0.1044921875,
"learning_rate": 5.864661654135339e-06,
"loss": 1.5928,
"step": 110
},
{
"epoch": 0.41729323308270677,
"grad_norm": 0.12060546875,
"learning_rate": 5.827067669172934e-06,
"loss": 1.6031,
"step": 111
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.1669921875,
"learning_rate": 5.789473684210527e-06,
"loss": 1.5906,
"step": 112
},
{
"epoch": 0.424812030075188,
"grad_norm": 0.10693359375,
"learning_rate": 5.751879699248121e-06,
"loss": 1.5912,
"step": 113
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.1142578125,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.6194,
"step": 114
},
{
"epoch": 0.4323308270676692,
"grad_norm": 0.12890625,
"learning_rate": 5.676691729323309e-06,
"loss": 1.6372,
"step": 115
},
{
"epoch": 0.43609022556390975,
"grad_norm": 0.1484375,
"learning_rate": 5.639097744360903e-06,
"loss": 1.5576,
"step": 116
},
{
"epoch": 0.4398496240601504,
"grad_norm": 0.1064453125,
"learning_rate": 5.601503759398497e-06,
"loss": 1.5777,
"step": 117
},
{
"epoch": 0.44360902255639095,
"grad_norm": 0.1083984375,
"learning_rate": 5.56390977443609e-06,
"loss": 1.6032,
"step": 118
},
{
"epoch": 0.4473684210526316,
"grad_norm": 0.11865234375,
"learning_rate": 5.526315789473685e-06,
"loss": 1.616,
"step": 119
},
{
"epoch": 0.45112781954887216,
"grad_norm": 0.1201171875,
"learning_rate": 5.488721804511279e-06,
"loss": 1.5957,
"step": 120
},
{
"epoch": 0.4548872180451128,
"grad_norm": 0.11279296875,
"learning_rate": 5.451127819548873e-06,
"loss": 1.6231,
"step": 121
},
{
"epoch": 0.45864661654135336,
"grad_norm": 0.1279296875,
"learning_rate": 5.413533834586467e-06,
"loss": 1.5908,
"step": 122
},
{
"epoch": 0.462406015037594,
"grad_norm": 0.1484375,
"learning_rate": 5.375939849624061e-06,
"loss": 1.6041,
"step": 123
},
{
"epoch": 0.46616541353383456,
"grad_norm": 0.11865234375,
"learning_rate": 5.338345864661654e-06,
"loss": 1.5849,
"step": 124
},
{
"epoch": 0.4699248120300752,
"grad_norm": 0.11865234375,
"learning_rate": 5.300751879699248e-06,
"loss": 1.594,
"step": 125
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.1123046875,
"learning_rate": 5.263157894736842e-06,
"loss": 1.5721,
"step": 126
},
{
"epoch": 0.4774436090225564,
"grad_norm": 0.1240234375,
"learning_rate": 5.225563909774437e-06,
"loss": 1.5949,
"step": 127
},
{
"epoch": 0.48120300751879697,
"grad_norm": 0.1455078125,
"learning_rate": 5.187969924812031e-06,
"loss": 1.61,
"step": 128
},
{
"epoch": 0.4849624060150376,
"grad_norm": 0.11865234375,
"learning_rate": 5.150375939849625e-06,
"loss": 1.5879,
"step": 129
},
{
"epoch": 0.48872180451127817,
"grad_norm": 0.11279296875,
"learning_rate": 5.112781954887218e-06,
"loss": 1.5907,
"step": 130
},
{
"epoch": 0.4924812030075188,
"grad_norm": 0.11328125,
"learning_rate": 5.075187969924813e-06,
"loss": 1.579,
"step": 131
},
{
"epoch": 0.49624060150375937,
"grad_norm": 0.10791015625,
"learning_rate": 5.0375939849624065e-06,
"loss": 1.5808,
"step": 132
},
{
"epoch": 0.5,
"grad_norm": 0.109375,
"learning_rate": 5e-06,
"loss": 1.5888,
"step": 133
},
{
"epoch": 0.5037593984962406,
"grad_norm": 0.12255859375,
"learning_rate": 4.962406015037594e-06,
"loss": 1.5729,
"step": 134
},
{
"epoch": 0.5075187969924813,
"grad_norm": 0.10888671875,
"learning_rate": 4.924812030075188e-06,
"loss": 1.5551,
"step": 135
},
{
"epoch": 0.5112781954887218,
"grad_norm": 0.1103515625,
"learning_rate": 4.887218045112782e-06,
"loss": 1.5706,
"step": 136
},
{
"epoch": 0.5150375939849624,
"grad_norm": 0.12109375,
"learning_rate": 4.849624060150376e-06,
"loss": 1.6047,
"step": 137
},
{
"epoch": 0.518796992481203,
"grad_norm": 0.11328125,
"learning_rate": 4.81203007518797e-06,
"loss": 1.594,
"step": 138
},
{
"epoch": 0.5225563909774437,
"grad_norm": 0.11865234375,
"learning_rate": 4.774436090225565e-06,
"loss": 1.6333,
"step": 139
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.11865234375,
"learning_rate": 4.736842105263158e-06,
"loss": 1.5992,
"step": 140
},
{
"epoch": 0.5300751879699248,
"grad_norm": 0.1474609375,
"learning_rate": 4.6992481203007525e-06,
"loss": 1.5606,
"step": 141
},
{
"epoch": 0.5338345864661654,
"grad_norm": 0.10888671875,
"learning_rate": 4.661654135338346e-06,
"loss": 1.6227,
"step": 142
},
{
"epoch": 0.5375939849624061,
"grad_norm": 0.11328125,
"learning_rate": 4.62406015037594e-06,
"loss": 1.5764,
"step": 143
},
{
"epoch": 0.5413533834586466,
"grad_norm": 0.1748046875,
"learning_rate": 4.586466165413534e-06,
"loss": 1.6476,
"step": 144
},
{
"epoch": 0.5451127819548872,
"grad_norm": 0.115234375,
"learning_rate": 4.548872180451128e-06,
"loss": 1.5807,
"step": 145
},
{
"epoch": 0.5488721804511278,
"grad_norm": 0.11865234375,
"learning_rate": 4.511278195488722e-06,
"loss": 1.609,
"step": 146
},
{
"epoch": 0.5526315789473685,
"grad_norm": 0.11083984375,
"learning_rate": 4.473684210526316e-06,
"loss": 1.5962,
"step": 147
},
{
"epoch": 0.556390977443609,
"grad_norm": 0.11376953125,
"learning_rate": 4.43609022556391e-06,
"loss": 1.6281,
"step": 148
},
{
"epoch": 0.5601503759398496,
"grad_norm": 0.109375,
"learning_rate": 4.398496240601504e-06,
"loss": 1.5872,
"step": 149
},
{
"epoch": 0.5639097744360902,
"grad_norm": 0.109375,
"learning_rate": 4.360902255639098e-06,
"loss": 1.5688,
"step": 150
},
{
"epoch": 0.5676691729323309,
"grad_norm": 0.1279296875,
"learning_rate": 4.323308270676692e-06,
"loss": 1.5516,
"step": 151
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.10693359375,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.5516,
"step": 152
},
{
"epoch": 0.575187969924812,
"grad_norm": 0.111328125,
"learning_rate": 4.24812030075188e-06,
"loss": 1.5927,
"step": 153
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.11181640625,
"learning_rate": 4.210526315789474e-06,
"loss": 1.5904,
"step": 154
},
{
"epoch": 0.5827067669172933,
"grad_norm": 0.1171875,
"learning_rate": 4.172932330827068e-06,
"loss": 1.6308,
"step": 155
},
{
"epoch": 0.5864661654135338,
"grad_norm": 0.115234375,
"learning_rate": 4.135338345864662e-06,
"loss": 1.5749,
"step": 156
},
{
"epoch": 0.5902255639097744,
"grad_norm": 0.111328125,
"learning_rate": 4.097744360902256e-06,
"loss": 1.5763,
"step": 157
},
{
"epoch": 0.5939849624060151,
"grad_norm": 0.11181640625,
"learning_rate": 4.06015037593985e-06,
"loss": 1.5667,
"step": 158
},
{
"epoch": 0.5977443609022557,
"grad_norm": 0.1201171875,
"learning_rate": 4.022556390977444e-06,
"loss": 1.563,
"step": 159
},
{
"epoch": 0.6015037593984962,
"grad_norm": 0.12451171875,
"learning_rate": 3.9849624060150376e-06,
"loss": 1.5973,
"step": 160
},
{
"epoch": 0.6052631578947368,
"grad_norm": 0.1240234375,
"learning_rate": 3.947368421052632e-06,
"loss": 1.5963,
"step": 161
},
{
"epoch": 0.6090225563909775,
"grad_norm": 0.10986328125,
"learning_rate": 3.909774436090225e-06,
"loss": 1.567,
"step": 162
},
{
"epoch": 0.6127819548872181,
"grad_norm": 0.12890625,
"learning_rate": 3.87218045112782e-06,
"loss": 1.5866,
"step": 163
},
{
"epoch": 0.6165413533834586,
"grad_norm": 0.11181640625,
"learning_rate": 3.834586466165414e-06,
"loss": 1.5829,
"step": 164
},
{
"epoch": 0.6203007518796992,
"grad_norm": 0.1259765625,
"learning_rate": 3.796992481203008e-06,
"loss": 1.5601,
"step": 165
},
{
"epoch": 0.6240601503759399,
"grad_norm": 0.1171875,
"learning_rate": 3.7593984962406014e-06,
"loss": 1.5983,
"step": 166
},
{
"epoch": 0.6278195488721805,
"grad_norm": 0.1611328125,
"learning_rate": 3.7218045112781957e-06,
"loss": 1.5801,
"step": 167
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.11181640625,
"learning_rate": 3.6842105263157896e-06,
"loss": 1.5545,
"step": 168
},
{
"epoch": 0.6353383458646616,
"grad_norm": 0.1103515625,
"learning_rate": 3.646616541353384e-06,
"loss": 1.5293,
"step": 169
},
{
"epoch": 0.6390977443609023,
"grad_norm": 0.1103515625,
"learning_rate": 3.6090225563909775e-06,
"loss": 1.5764,
"step": 170
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.11279296875,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.5941,
"step": 171
},
{
"epoch": 0.6466165413533834,
"grad_norm": 0.11181640625,
"learning_rate": 3.5338345864661657e-06,
"loss": 1.5842,
"step": 172
},
{
"epoch": 0.650375939849624,
"grad_norm": 0.119140625,
"learning_rate": 3.4962406015037596e-06,
"loss": 1.5493,
"step": 173
},
{
"epoch": 0.6541353383458647,
"grad_norm": 0.111328125,
"learning_rate": 3.4586466165413535e-06,
"loss": 1.5622,
"step": 174
},
{
"epoch": 0.6578947368421053,
"grad_norm": 0.244140625,
"learning_rate": 3.421052631578948e-06,
"loss": 1.5905,
"step": 175
},
{
"epoch": 0.6616541353383458,
"grad_norm": 0.1328125,
"learning_rate": 3.3834586466165413e-06,
"loss": 1.5928,
"step": 176
},
{
"epoch": 0.6654135338345865,
"grad_norm": 0.12890625,
"learning_rate": 3.3458646616541356e-06,
"loss": 1.569,
"step": 177
},
{
"epoch": 0.6691729323308271,
"grad_norm": 0.1083984375,
"learning_rate": 3.3082706766917295e-06,
"loss": 1.5706,
"step": 178
},
{
"epoch": 0.6729323308270677,
"grad_norm": 0.109375,
"learning_rate": 3.270676691729324e-06,
"loss": 1.5598,
"step": 179
},
{
"epoch": 0.6766917293233082,
"grad_norm": 0.134765625,
"learning_rate": 3.2330827067669174e-06,
"loss": 1.5968,
"step": 180
},
{
"epoch": 0.6804511278195489,
"grad_norm": 0.119140625,
"learning_rate": 3.1954887218045117e-06,
"loss": 1.5789,
"step": 181
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.11376953125,
"learning_rate": 3.157894736842105e-06,
"loss": 1.5565,
"step": 182
},
{
"epoch": 0.6879699248120301,
"grad_norm": 0.1171875,
"learning_rate": 3.1203007518796995e-06,
"loss": 1.6115,
"step": 183
},
{
"epoch": 0.6917293233082706,
"grad_norm": 0.123046875,
"learning_rate": 3.0827067669172934e-06,
"loss": 1.5572,
"step": 184
},
{
"epoch": 0.6954887218045113,
"grad_norm": 0.111328125,
"learning_rate": 3.0451127819548877e-06,
"loss": 1.5763,
"step": 185
},
{
"epoch": 0.6992481203007519,
"grad_norm": 0.1240234375,
"learning_rate": 3.007518796992481e-06,
"loss": 1.5728,
"step": 186
},
{
"epoch": 0.7030075187969925,
"grad_norm": 0.11767578125,
"learning_rate": 2.9699248120300755e-06,
"loss": 1.5665,
"step": 187
},
{
"epoch": 0.706766917293233,
"grad_norm": 0.12353515625,
"learning_rate": 2.9323308270676694e-06,
"loss": 1.5877,
"step": 188
},
{
"epoch": 0.7105263157894737,
"grad_norm": 0.1572265625,
"learning_rate": 2.8947368421052634e-06,
"loss": 1.5883,
"step": 189
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.11328125,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.5978,
"step": 190
},
{
"epoch": 0.7180451127819549,
"grad_norm": 0.1123046875,
"learning_rate": 2.8195488721804516e-06,
"loss": 1.5785,
"step": 191
},
{
"epoch": 0.7218045112781954,
"grad_norm": 0.11962890625,
"learning_rate": 2.781954887218045e-06,
"loss": 1.5598,
"step": 192
},
{
"epoch": 0.7255639097744361,
"grad_norm": 0.1123046875,
"learning_rate": 2.7443609022556394e-06,
"loss": 1.5426,
"step": 193
},
{
"epoch": 0.7293233082706767,
"grad_norm": 0.1201171875,
"learning_rate": 2.7067669172932333e-06,
"loss": 1.5618,
"step": 194
},
{
"epoch": 0.7330827067669173,
"grad_norm": 0.142578125,
"learning_rate": 2.669172932330827e-06,
"loss": 1.6137,
"step": 195
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.15625,
"learning_rate": 2.631578947368421e-06,
"loss": 1.5443,
"step": 196
},
{
"epoch": 0.7406015037593985,
"grad_norm": 0.130859375,
"learning_rate": 2.5939849624060154e-06,
"loss": 1.5559,
"step": 197
},
{
"epoch": 0.7443609022556391,
"grad_norm": 0.177734375,
"learning_rate": 2.556390977443609e-06,
"loss": 1.5526,
"step": 198
},
{
"epoch": 0.7481203007518797,
"grad_norm": 0.11962890625,
"learning_rate": 2.5187969924812033e-06,
"loss": 1.5839,
"step": 199
},
{
"epoch": 0.7518796992481203,
"grad_norm": 0.1123046875,
"learning_rate": 2.481203007518797e-06,
"loss": 1.5576,
"step": 200
},
{
"epoch": 0.7556390977443609,
"grad_norm": 0.1240234375,
"learning_rate": 2.443609022556391e-06,
"loss": 1.6115,
"step": 201
},
{
"epoch": 0.7593984962406015,
"grad_norm": 0.1162109375,
"learning_rate": 2.406015037593985e-06,
"loss": 1.5613,
"step": 202
},
{
"epoch": 0.7631578947368421,
"grad_norm": 0.11181640625,
"learning_rate": 2.368421052631579e-06,
"loss": 1.5773,
"step": 203
},
{
"epoch": 0.7669172932330827,
"grad_norm": 0.11962890625,
"learning_rate": 2.330827067669173e-06,
"loss": 1.5458,
"step": 204
},
{
"epoch": 0.7706766917293233,
"grad_norm": 0.1123046875,
"learning_rate": 2.293233082706767e-06,
"loss": 1.5696,
"step": 205
},
{
"epoch": 0.7744360902255639,
"grad_norm": 0.10986328125,
"learning_rate": 2.255639097744361e-06,
"loss": 1.5509,
"step": 206
},
{
"epoch": 0.7781954887218046,
"grad_norm": 0.11181640625,
"learning_rate": 2.218045112781955e-06,
"loss": 1.5625,
"step": 207
},
{
"epoch": 0.7819548872180451,
"grad_norm": 0.10888671875,
"learning_rate": 2.180451127819549e-06,
"loss": 1.5507,
"step": 208
},
{
"epoch": 0.7857142857142857,
"grad_norm": 0.12158203125,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.5591,
"step": 209
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.1533203125,
"learning_rate": 2.105263157894737e-06,
"loss": 1.5325,
"step": 210
},
{
"epoch": 0.793233082706767,
"grad_norm": 0.1435546875,
"learning_rate": 2.067669172932331e-06,
"loss": 1.5841,
"step": 211
},
{
"epoch": 0.7969924812030075,
"grad_norm": 0.11181640625,
"learning_rate": 2.030075187969925e-06,
"loss": 1.5669,
"step": 212
},
{
"epoch": 0.8007518796992481,
"grad_norm": 0.1142578125,
"learning_rate": 1.9924812030075188e-06,
"loss": 1.5734,
"step": 213
},
{
"epoch": 0.8045112781954887,
"grad_norm": 0.1474609375,
"learning_rate": 1.9548872180451127e-06,
"loss": 1.5846,
"step": 214
},
{
"epoch": 0.8082706766917294,
"grad_norm": 0.115234375,
"learning_rate": 1.917293233082707e-06,
"loss": 1.5544,
"step": 215
},
{
"epoch": 0.8120300751879699,
"grad_norm": 0.123046875,
"learning_rate": 1.8796992481203007e-06,
"loss": 1.5718,
"step": 216
},
{
"epoch": 0.8157894736842105,
"grad_norm": 0.1318359375,
"learning_rate": 1.8421052631578948e-06,
"loss": 1.5483,
"step": 217
},
{
"epoch": 0.8195488721804511,
"grad_norm": 0.11376953125,
"learning_rate": 1.8045112781954887e-06,
"loss": 1.5883,
"step": 218
},
{
"epoch": 0.8233082706766918,
"grad_norm": 0.12158203125,
"learning_rate": 1.7669172932330828e-06,
"loss": 1.5785,
"step": 219
},
{
"epoch": 0.8270676691729323,
"grad_norm": 0.11572265625,
"learning_rate": 1.7293233082706767e-06,
"loss": 1.5372,
"step": 220
},
{
"epoch": 0.8308270676691729,
"grad_norm": 0.12890625,
"learning_rate": 1.6917293233082707e-06,
"loss": 1.5527,
"step": 221
},
{
"epoch": 0.8345864661654135,
"grad_norm": 0.166015625,
"learning_rate": 1.6541353383458648e-06,
"loss": 1.5593,
"step": 222
},
{
"epoch": 0.8383458646616542,
"grad_norm": 0.126953125,
"learning_rate": 1.6165413533834587e-06,
"loss": 1.545,
"step": 223
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.11376953125,
"learning_rate": 1.5789473684210526e-06,
"loss": 1.5537,
"step": 224
},
{
"epoch": 0.8458646616541353,
"grad_norm": 0.13671875,
"learning_rate": 1.5413533834586467e-06,
"loss": 1.5667,
"step": 225
},
{
"epoch": 0.849624060150376,
"grad_norm": 0.12451171875,
"learning_rate": 1.5037593984962406e-06,
"loss": 1.5319,
"step": 226
},
{
"epoch": 0.8533834586466166,
"grad_norm": 0.12451171875,
"learning_rate": 1.4661654135338347e-06,
"loss": 1.5617,
"step": 227
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.146484375,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.5843,
"step": 228
},
{
"epoch": 0.8609022556390977,
"grad_norm": 0.1220703125,
"learning_rate": 1.3909774436090225e-06,
"loss": 1.5435,
"step": 229
},
{
"epoch": 0.8646616541353384,
"grad_norm": 0.10888671875,
"learning_rate": 1.3533834586466167e-06,
"loss": 1.5431,
"step": 230
},
{
"epoch": 0.868421052631579,
"grad_norm": 0.11328125,
"learning_rate": 1.3157894736842106e-06,
"loss": 1.5902,
"step": 231
},
{
"epoch": 0.8721804511278195,
"grad_norm": 0.13671875,
"learning_rate": 1.2781954887218045e-06,
"loss": 1.5973,
"step": 232
},
{
"epoch": 0.8759398496240601,
"grad_norm": 0.1162109375,
"learning_rate": 1.2406015037593986e-06,
"loss": 1.5482,
"step": 233
},
{
"epoch": 0.8796992481203008,
"grad_norm": 0.11279296875,
"learning_rate": 1.2030075187969925e-06,
"loss": 1.5471,
"step": 234
},
{
"epoch": 0.8834586466165414,
"grad_norm": 0.1513671875,
"learning_rate": 1.1654135338345866e-06,
"loss": 1.5542,
"step": 235
},
{
"epoch": 0.8872180451127819,
"grad_norm": 0.166015625,
"learning_rate": 1.1278195488721805e-06,
"loss": 1.6103,
"step": 236
},
{
"epoch": 0.8909774436090225,
"grad_norm": 0.119140625,
"learning_rate": 1.0902255639097744e-06,
"loss": 1.5798,
"step": 237
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.1162109375,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.5934,
"step": 238
},
{
"epoch": 0.8984962406015038,
"grad_norm": 0.11083984375,
"learning_rate": 1.0150375939849624e-06,
"loss": 1.5938,
"step": 239
},
{
"epoch": 0.9022556390977443,
"grad_norm": 0.130859375,
"learning_rate": 9.774436090225563e-07,
"loss": 1.5225,
"step": 240
},
{
"epoch": 0.9060150375939849,
"grad_norm": 0.1376953125,
"learning_rate": 9.398496240601504e-07,
"loss": 1.5115,
"step": 241
},
{
"epoch": 0.9097744360902256,
"grad_norm": 0.1279296875,
"learning_rate": 9.022556390977444e-07,
"loss": 1.5831,
"step": 242
},
{
"epoch": 0.9135338345864662,
"grad_norm": 0.140625,
"learning_rate": 8.646616541353384e-07,
"loss": 1.5898,
"step": 243
},
{
"epoch": 0.9172932330827067,
"grad_norm": 0.134765625,
"learning_rate": 8.270676691729324e-07,
"loss": 1.567,
"step": 244
},
{
"epoch": 0.9210526315789473,
"grad_norm": 0.111328125,
"learning_rate": 7.894736842105263e-07,
"loss": 1.5691,
"step": 245
},
{
"epoch": 0.924812030075188,
"grad_norm": 0.115234375,
"learning_rate": 7.518796992481203e-07,
"loss": 1.5861,
"step": 246
},
{
"epoch": 0.9285714285714286,
"grad_norm": 0.11474609375,
"learning_rate": 7.142857142857143e-07,
"loss": 1.5823,
"step": 247
},
{
"epoch": 0.9323308270676691,
"grad_norm": 0.1708984375,
"learning_rate": 6.766917293233083e-07,
"loss": 1.4993,
"step": 248
},
{
"epoch": 0.9360902255639098,
"grad_norm": 0.16796875,
"learning_rate": 6.390977443609022e-07,
"loss": 1.6202,
"step": 249
},
{
"epoch": 0.9398496240601504,
"grad_norm": 0.11376953125,
"learning_rate": 6.015037593984962e-07,
"loss": 1.5887,
"step": 250
},
{
"epoch": 0.943609022556391,
"grad_norm": 0.1982421875,
"learning_rate": 5.639097744360903e-07,
"loss": 1.5946,
"step": 251
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.125,
"learning_rate": 5.263157894736843e-07,
"loss": 1.597,
"step": 252
},
{
"epoch": 0.9511278195488722,
"grad_norm": 0.1337890625,
"learning_rate": 4.887218045112782e-07,
"loss": 1.5722,
"step": 253
},
{
"epoch": 0.9548872180451128,
"grad_norm": 0.142578125,
"learning_rate": 4.511278195488722e-07,
"loss": 1.5879,
"step": 254
},
{
"epoch": 0.9586466165413534,
"grad_norm": 0.19140625,
"learning_rate": 4.135338345864662e-07,
"loss": 1.6135,
"step": 255
},
{
"epoch": 0.9624060150375939,
"grad_norm": 0.115234375,
"learning_rate": 3.7593984962406015e-07,
"loss": 1.5847,
"step": 256
},
{
"epoch": 0.9661654135338346,
"grad_norm": 0.11328125,
"learning_rate": 3.3834586466165416e-07,
"loss": 1.5615,
"step": 257
},
{
"epoch": 0.9699248120300752,
"grad_norm": 0.1669921875,
"learning_rate": 3.007518796992481e-07,
"loss": 1.5342,
"step": 258
},
{
"epoch": 0.9736842105263158,
"grad_norm": 0.1640625,
"learning_rate": 2.6315789473684213e-07,
"loss": 1.5971,
"step": 259
},
{
"epoch": 0.9774436090225563,
"grad_norm": 0.1748046875,
"learning_rate": 2.255639097744361e-07,
"loss": 1.5573,
"step": 260
},
{
"epoch": 0.981203007518797,
"grad_norm": 0.1591796875,
"learning_rate": 1.8796992481203008e-07,
"loss": 1.5388,
"step": 261
},
{
"epoch": 0.9849624060150376,
"grad_norm": 0.11328125,
"learning_rate": 1.5037593984962406e-07,
"loss": 1.5811,
"step": 262
},
{
"epoch": 0.9887218045112782,
"grad_norm": 0.1494140625,
"learning_rate": 1.1278195488721805e-07,
"loss": 1.5266,
"step": 263
},
{
"epoch": 0.9924812030075187,
"grad_norm": 0.12060546875,
"learning_rate": 7.518796992481203e-08,
"loss": 1.5533,
"step": 264
},
{
"epoch": 0.9962406015037594,
"grad_norm": 0.123046875,
"learning_rate": 3.7593984962406015e-08,
"loss": 1.5889,
"step": 265
},
{
"epoch": 1.0,
"grad_norm": 0.11865234375,
"learning_rate": 0.0,
"loss": 1.5887,
"step": 266
}
],
"logging_steps": 1.0,
"max_steps": 266,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.317663665780163e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}