Vivid-7B-Instruct-Lora-PT-500 / trainer_state.json
dminhvu02's picture
Upload folder using huggingface_hub
4881a67 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4248539564524695,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 2.1944120442820862,
"learning_rate": 2.777777777777778e-06,
"loss": 1.6587,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 2.158646448617879,
"learning_rate": 5.555555555555556e-06,
"loss": 1.6685,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 2.1343466361951706,
"learning_rate": 8.333333333333334e-06,
"loss": 1.6836,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 2.0231531922210753,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.6523,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 1.6162620203223095,
"learning_rate": 1.388888888888889e-05,
"loss": 1.6636,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 1.3779619042576137,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.6289,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 1.223668866838671,
"learning_rate": 1.9444444444444445e-05,
"loss": 1.6201,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 1.5016416362830853,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.5596,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 1.465420671008811,
"learning_rate": 2.5e-05,
"loss": 1.6113,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 1.1965670018804309,
"learning_rate": 2.777777777777778e-05,
"loss": 1.5898,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.1117797752417102,
"learning_rate": 3.055555555555556e-05,
"loss": 1.6035,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 0.9878470790338667,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.5625,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 1.02494625138462,
"learning_rate": 3.611111111111111e-05,
"loss": 1.5547,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 1.0223917263016193,
"learning_rate": 3.888888888888889e-05,
"loss": 1.5615,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 0.9433437823947872,
"learning_rate": 4.166666666666667e-05,
"loss": 1.5728,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.8737056838198499,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.5327,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 0.847350291380953,
"learning_rate": 4.722222222222222e-05,
"loss": 1.4829,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 0.9546966542598146,
"learning_rate": 5e-05,
"loss": 1.5532,
"step": 18
},
{
"epoch": 0.02,
"grad_norm": 0.9232787655185869,
"learning_rate": 5.2777777777777784e-05,
"loss": 1.5303,
"step": 19
},
{
"epoch": 0.02,
"grad_norm": 0.879349873123116,
"learning_rate": 5.555555555555556e-05,
"loss": 1.502,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.8709535184620328,
"learning_rate": 5.833333333333334e-05,
"loss": 1.4585,
"step": 21
},
{
"epoch": 0.02,
"grad_norm": 0.8819786376627559,
"learning_rate": 6.111111111111112e-05,
"loss": 1.4858,
"step": 22
},
{
"epoch": 0.02,
"grad_norm": 0.8589200426275411,
"learning_rate": 6.388888888888888e-05,
"loss": 1.4644,
"step": 23
},
{
"epoch": 0.02,
"grad_norm": 0.8190182080399642,
"learning_rate": 6.666666666666667e-05,
"loss": 1.4561,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 0.8796864611649672,
"learning_rate": 6.944444444444444e-05,
"loss": 1.4546,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 0.8331325598252782,
"learning_rate": 7.222222222222222e-05,
"loss": 1.4624,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 0.8345520972989295,
"learning_rate": 7.500000000000001e-05,
"loss": 1.4453,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 0.8176489443002161,
"learning_rate": 7.777777777777778e-05,
"loss": 1.4541,
"step": 28
},
{
"epoch": 0.02,
"grad_norm": 0.7528421779691234,
"learning_rate": 8.055555555555556e-05,
"loss": 1.4434,
"step": 29
},
{
"epoch": 0.03,
"grad_norm": 0.7991219912695795,
"learning_rate": 8.333333333333334e-05,
"loss": 1.4546,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 0.7453012031751974,
"learning_rate": 8.611111111111112e-05,
"loss": 1.4541,
"step": 31
},
{
"epoch": 0.03,
"grad_norm": 0.7356336435000073,
"learning_rate": 8.888888888888889e-05,
"loss": 1.4565,
"step": 32
},
{
"epoch": 0.03,
"grad_norm": 0.7239229910223912,
"learning_rate": 9.166666666666667e-05,
"loss": 1.4253,
"step": 33
},
{
"epoch": 0.03,
"grad_norm": 0.6995782237549312,
"learning_rate": 9.444444444444444e-05,
"loss": 1.4116,
"step": 34
},
{
"epoch": 0.03,
"grad_norm": 0.7108394167974162,
"learning_rate": 9.722222222222223e-05,
"loss": 1.4053,
"step": 35
},
{
"epoch": 0.03,
"grad_norm": 0.7270375728618296,
"learning_rate": 0.0001,
"loss": 1.4214,
"step": 36
},
{
"epoch": 0.03,
"grad_norm": 0.7494051458635556,
"learning_rate": 9.999981014161752e-05,
"loss": 1.4644,
"step": 37
},
{
"epoch": 0.03,
"grad_norm": 0.733832068426627,
"learning_rate": 9.999924056791192e-05,
"loss": 1.4141,
"step": 38
},
{
"epoch": 0.03,
"grad_norm": 0.6719534359517587,
"learning_rate": 9.999829128320874e-05,
"loss": 1.4023,
"step": 39
},
{
"epoch": 0.03,
"grad_norm": 0.7406841100980851,
"learning_rate": 9.999696229471716e-05,
"loss": 1.4263,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.6561411493710649,
"learning_rate": 9.999525361252996e-05,
"loss": 1.4126,
"step": 41
},
{
"epoch": 0.04,
"grad_norm": 0.6703214903768667,
"learning_rate": 9.999316524962345e-05,
"loss": 1.3955,
"step": 42
},
{
"epoch": 0.04,
"grad_norm": 0.6952049638900921,
"learning_rate": 9.999069722185737e-05,
"loss": 1.4072,
"step": 43
},
{
"epoch": 0.04,
"grad_norm": 0.6806747265810544,
"learning_rate": 9.998784954797474e-05,
"loss": 1.4146,
"step": 44
},
{
"epoch": 0.04,
"grad_norm": 0.6761436892518071,
"learning_rate": 9.998462224960175e-05,
"loss": 1.4009,
"step": 45
},
{
"epoch": 0.04,
"grad_norm": 0.694044598842866,
"learning_rate": 9.998101535124758e-05,
"loss": 1.4268,
"step": 46
},
{
"epoch": 0.04,
"grad_norm": 0.6557563304435648,
"learning_rate": 9.997702888030423e-05,
"loss": 1.3794,
"step": 47
},
{
"epoch": 0.04,
"grad_norm": 0.6612841638564682,
"learning_rate": 9.997266286704631e-05,
"loss": 1.3892,
"step": 48
},
{
"epoch": 0.04,
"grad_norm": 0.6486556767977087,
"learning_rate": 9.996791734463077e-05,
"loss": 1.3652,
"step": 49
},
{
"epoch": 0.04,
"grad_norm": 0.6483540085185676,
"learning_rate": 9.996279234909671e-05,
"loss": 1.3984,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 0.6700432628612305,
"learning_rate": 9.995728791936504e-05,
"loss": 1.3999,
"step": 51
},
{
"epoch": 0.04,
"grad_norm": 0.6432744026831555,
"learning_rate": 9.99514040972383e-05,
"loss": 1.356,
"step": 52
},
{
"epoch": 0.05,
"grad_norm": 0.6216728827903856,
"learning_rate": 9.994514092740015e-05,
"loss": 1.3882,
"step": 53
},
{
"epoch": 0.05,
"grad_norm": 0.6467739800460915,
"learning_rate": 9.993849845741524e-05,
"loss": 1.3765,
"step": 54
},
{
"epoch": 0.05,
"grad_norm": 0.6503437970639988,
"learning_rate": 9.99314767377287e-05,
"loss": 1.373,
"step": 55
},
{
"epoch": 0.05,
"grad_norm": 0.6657501610674698,
"learning_rate": 9.992407582166581e-05,
"loss": 1.3838,
"step": 56
},
{
"epoch": 0.05,
"grad_norm": 0.6605689841115963,
"learning_rate": 9.991629576543163e-05,
"loss": 1.3716,
"step": 57
},
{
"epoch": 0.05,
"grad_norm": 0.6989365655033877,
"learning_rate": 9.990813662811051e-05,
"loss": 1.3882,
"step": 58
},
{
"epoch": 0.05,
"grad_norm": 0.6084957965701701,
"learning_rate": 9.989959847166567e-05,
"loss": 1.3545,
"step": 59
},
{
"epoch": 0.05,
"grad_norm": 0.6699929146209974,
"learning_rate": 9.989068136093873e-05,
"loss": 1.3418,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 0.6247455324530298,
"learning_rate": 9.988138536364922e-05,
"loss": 1.3486,
"step": 61
},
{
"epoch": 0.05,
"grad_norm": 0.6622758669061856,
"learning_rate": 9.987171055039408e-05,
"loss": 1.3892,
"step": 62
},
{
"epoch": 0.05,
"grad_norm": 0.6034683645026113,
"learning_rate": 9.986165699464705e-05,
"loss": 1.3491,
"step": 63
},
{
"epoch": 0.05,
"grad_norm": 0.6266046141102322,
"learning_rate": 9.985122477275824e-05,
"loss": 1.3452,
"step": 64
},
{
"epoch": 0.06,
"grad_norm": 0.6299383394087646,
"learning_rate": 9.984041396395343e-05,
"loss": 1.3569,
"step": 65
},
{
"epoch": 0.06,
"grad_norm": 0.6104287148111909,
"learning_rate": 9.98292246503335e-05,
"loss": 1.333,
"step": 66
},
{
"epoch": 0.06,
"grad_norm": 0.6519029188667027,
"learning_rate": 9.981765691687388e-05,
"loss": 1.3857,
"step": 67
},
{
"epoch": 0.06,
"grad_norm": 0.6142161143427214,
"learning_rate": 9.980571085142381e-05,
"loss": 1.3228,
"step": 68
},
{
"epoch": 0.06,
"grad_norm": 0.6229626554946482,
"learning_rate": 9.979338654470569e-05,
"loss": 1.3574,
"step": 69
},
{
"epoch": 0.06,
"grad_norm": 0.6071740965934106,
"learning_rate": 9.978068409031449e-05,
"loss": 1.3379,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 0.6180402576227992,
"learning_rate": 9.976760358471686e-05,
"loss": 1.3672,
"step": 71
},
{
"epoch": 0.06,
"grad_norm": 0.6254634252353867,
"learning_rate": 9.975414512725057e-05,
"loss": 1.3525,
"step": 72
},
{
"epoch": 0.06,
"grad_norm": 0.6146388668983097,
"learning_rate": 9.974030882012367e-05,
"loss": 1.3677,
"step": 73
},
{
"epoch": 0.06,
"grad_norm": 0.60548422624436,
"learning_rate": 9.972609476841367e-05,
"loss": 1.3271,
"step": 74
},
{
"epoch": 0.06,
"grad_norm": 0.5960020566477471,
"learning_rate": 9.97115030800669e-05,
"loss": 1.3203,
"step": 75
},
{
"epoch": 0.06,
"grad_norm": 0.5840389582557357,
"learning_rate": 9.969653386589748e-05,
"loss": 1.3457,
"step": 76
},
{
"epoch": 0.07,
"grad_norm": 0.6017170229389899,
"learning_rate": 9.968118723958668e-05,
"loss": 1.3555,
"step": 77
},
{
"epoch": 0.07,
"grad_norm": 0.59548463038904,
"learning_rate": 9.966546331768191e-05,
"loss": 1.312,
"step": 78
},
{
"epoch": 0.07,
"grad_norm": 0.6376362739222085,
"learning_rate": 9.96493622195959e-05,
"loss": 1.3896,
"step": 79
},
{
"epoch": 0.07,
"grad_norm": 0.5924552743524675,
"learning_rate": 9.963288406760582e-05,
"loss": 1.3882,
"step": 80
},
{
"epoch": 0.07,
"grad_norm": 0.5932204834859686,
"learning_rate": 9.961602898685226e-05,
"loss": 1.3228,
"step": 81
},
{
"epoch": 0.07,
"grad_norm": 0.6343152356114848,
"learning_rate": 9.959879710533835e-05,
"loss": 1.3418,
"step": 82
},
{
"epoch": 0.07,
"grad_norm": 0.619176447518611,
"learning_rate": 9.958118855392876e-05,
"loss": 1.3511,
"step": 83
},
{
"epoch": 0.07,
"grad_norm": 0.6059536670723797,
"learning_rate": 9.956320346634876e-05,
"loss": 1.3496,
"step": 84
},
{
"epoch": 0.07,
"grad_norm": 0.5833794255152709,
"learning_rate": 9.954484197918315e-05,
"loss": 1.3047,
"step": 85
},
{
"epoch": 0.07,
"grad_norm": 0.6201224777123214,
"learning_rate": 9.952610423187516e-05,
"loss": 1.3486,
"step": 86
},
{
"epoch": 0.07,
"grad_norm": 0.6007754625054771,
"learning_rate": 9.950699036672559e-05,
"loss": 1.3281,
"step": 87
},
{
"epoch": 0.07,
"grad_norm": 0.5942092191181105,
"learning_rate": 9.94875005288915e-05,
"loss": 1.3247,
"step": 88
},
{
"epoch": 0.08,
"grad_norm": 0.5943876215982206,
"learning_rate": 9.946763486638528e-05,
"loss": 1.3286,
"step": 89
},
{
"epoch": 0.08,
"grad_norm": 0.5851272119218502,
"learning_rate": 9.944739353007344e-05,
"loss": 1.333,
"step": 90
},
{
"epoch": 0.08,
"grad_norm": 0.5736452322086703,
"learning_rate": 9.942677667367541e-05,
"loss": 1.3281,
"step": 91
},
{
"epoch": 0.08,
"grad_norm": 0.5858368666665404,
"learning_rate": 9.940578445376258e-05,
"loss": 1.3408,
"step": 92
},
{
"epoch": 0.08,
"grad_norm": 0.5951660769871204,
"learning_rate": 9.938441702975689e-05,
"loss": 1.332,
"step": 93
},
{
"epoch": 0.08,
"grad_norm": 0.5673882981333603,
"learning_rate": 9.936267456392971e-05,
"loss": 1.29,
"step": 94
},
{
"epoch": 0.08,
"grad_norm": 0.57937244503091,
"learning_rate": 9.934055722140061e-05,
"loss": 1.3379,
"step": 95
},
{
"epoch": 0.08,
"grad_norm": 0.5404623165150114,
"learning_rate": 9.931806517013612e-05,
"loss": 1.2832,
"step": 96
},
{
"epoch": 0.08,
"grad_norm": 0.570458957982483,
"learning_rate": 9.929519858094843e-05,
"loss": 1.2827,
"step": 97
},
{
"epoch": 0.08,
"grad_norm": 0.6087016644937208,
"learning_rate": 9.927195762749405e-05,
"loss": 1.3218,
"step": 98
},
{
"epoch": 0.08,
"grad_norm": 0.5870682794787072,
"learning_rate": 9.92483424862726e-05,
"loss": 1.3135,
"step": 99
},
{
"epoch": 0.08,
"grad_norm": 0.5856272723632059,
"learning_rate": 9.922435333662536e-05,
"loss": 1.2881,
"step": 100
},
{
"epoch": 0.09,
"grad_norm": 0.6111832032418365,
"learning_rate": 9.9199990360734e-05,
"loss": 1.3203,
"step": 101
},
{
"epoch": 0.09,
"grad_norm": 0.5726861967258199,
"learning_rate": 9.917525374361912e-05,
"loss": 1.3179,
"step": 102
},
{
"epoch": 0.09,
"grad_norm": 0.5884861410749106,
"learning_rate": 9.915014367313888e-05,
"loss": 1.3228,
"step": 103
},
{
"epoch": 0.09,
"grad_norm": 0.6135395087168646,
"learning_rate": 9.912466033998757e-05,
"loss": 1.3335,
"step": 104
},
{
"epoch": 0.09,
"grad_norm": 0.5645793699483922,
"learning_rate": 9.90988039376942e-05,
"loss": 1.3125,
"step": 105
},
{
"epoch": 0.09,
"grad_norm": 0.5857606819583933,
"learning_rate": 9.90725746626209e-05,
"loss": 1.2744,
"step": 106
},
{
"epoch": 0.09,
"grad_norm": 0.5831028711698518,
"learning_rate": 9.904597271396162e-05,
"loss": 1.311,
"step": 107
},
{
"epoch": 0.09,
"grad_norm": 0.6348588219528606,
"learning_rate": 9.901899829374047e-05,
"loss": 1.3452,
"step": 108
},
{
"epoch": 0.09,
"grad_norm": 0.5603970131542654,
"learning_rate": 9.899165160681025e-05,
"loss": 1.2964,
"step": 109
},
{
"epoch": 0.09,
"grad_norm": 0.5865675897271678,
"learning_rate": 9.896393286085084e-05,
"loss": 1.3071,
"step": 110
},
{
"epoch": 0.09,
"grad_norm": 0.5784164637201951,
"learning_rate": 9.893584226636772e-05,
"loss": 1.3008,
"step": 111
},
{
"epoch": 0.1,
"grad_norm": 0.5710383398686641,
"learning_rate": 9.890738003669029e-05,
"loss": 1.2886,
"step": 112
},
{
"epoch": 0.1,
"grad_norm": 0.5846130781014983,
"learning_rate": 9.887854638797023e-05,
"loss": 1.3096,
"step": 113
},
{
"epoch": 0.1,
"grad_norm": 0.5880954219156209,
"learning_rate": 9.884934153917997e-05,
"loss": 1.3145,
"step": 114
},
{
"epoch": 0.1,
"grad_norm": 0.5637844902513754,
"learning_rate": 9.88197657121109e-05,
"loss": 1.29,
"step": 115
},
{
"epoch": 0.1,
"grad_norm": 0.5876469941096398,
"learning_rate": 9.878981913137179e-05,
"loss": 1.3418,
"step": 116
},
{
"epoch": 0.1,
"grad_norm": 0.6000214582832701,
"learning_rate": 9.8759502024387e-05,
"loss": 1.2896,
"step": 117
},
{
"epoch": 0.1,
"grad_norm": 0.5839730365390634,
"learning_rate": 9.872881462139479e-05,
"loss": 1.2705,
"step": 118
},
{
"epoch": 0.1,
"grad_norm": 0.5990677563429729,
"learning_rate": 9.869775715544562e-05,
"loss": 1.3071,
"step": 119
},
{
"epoch": 0.1,
"grad_norm": 0.5953706456388055,
"learning_rate": 9.86663298624003e-05,
"loss": 1.2959,
"step": 120
},
{
"epoch": 0.1,
"grad_norm": 0.5817561833721856,
"learning_rate": 9.86345329809282e-05,
"loss": 1.2852,
"step": 121
},
{
"epoch": 0.1,
"grad_norm": 0.5744763014438757,
"learning_rate": 9.860236675250552e-05,
"loss": 1.2783,
"step": 122
},
{
"epoch": 0.1,
"grad_norm": 0.597834970808429,
"learning_rate": 9.856983142141339e-05,
"loss": 1.2925,
"step": 123
},
{
"epoch": 0.11,
"grad_norm": 0.58712068164488,
"learning_rate": 9.8536927234736e-05,
"loss": 1.3042,
"step": 124
},
{
"epoch": 0.11,
"grad_norm": 0.5750527697531876,
"learning_rate": 9.85036544423588e-05,
"loss": 1.2734,
"step": 125
},
{
"epoch": 0.11,
"grad_norm": 0.5809004038351853,
"learning_rate": 9.847001329696653e-05,
"loss": 1.2886,
"step": 126
},
{
"epoch": 0.11,
"grad_norm": 0.5913258260888848,
"learning_rate": 9.843600405404131e-05,
"loss": 1.2871,
"step": 127
},
{
"epoch": 0.11,
"grad_norm": 0.6178847163930624,
"learning_rate": 9.840162697186075e-05,
"loss": 1.3066,
"step": 128
},
{
"epoch": 0.11,
"grad_norm": 0.569525516075595,
"learning_rate": 9.836688231149592e-05,
"loss": 1.2866,
"step": 129
},
{
"epoch": 0.11,
"grad_norm": 0.6019297014242129,
"learning_rate": 9.833177033680944e-05,
"loss": 1.2881,
"step": 130
},
{
"epoch": 0.11,
"grad_norm": 0.5570240623213132,
"learning_rate": 9.829629131445342e-05,
"loss": 1.2739,
"step": 131
},
{
"epoch": 0.11,
"grad_norm": 0.6690303999031133,
"learning_rate": 9.826044551386744e-05,
"loss": 1.3208,
"step": 132
},
{
"epoch": 0.11,
"grad_norm": 0.5869605252956118,
"learning_rate": 9.822423320727654e-05,
"loss": 1.3271,
"step": 133
},
{
"epoch": 0.11,
"grad_norm": 0.6041810553976592,
"learning_rate": 9.818765466968909e-05,
"loss": 1.3071,
"step": 134
},
{
"epoch": 0.11,
"grad_norm": 0.6055207100602872,
"learning_rate": 9.815071017889482e-05,
"loss": 1.3208,
"step": 135
},
{
"epoch": 0.12,
"grad_norm": 0.598286558624508,
"learning_rate": 9.811340001546251e-05,
"loss": 1.2842,
"step": 136
},
{
"epoch": 0.12,
"grad_norm": 0.581665353584805,
"learning_rate": 9.807572446273814e-05,
"loss": 1.2959,
"step": 137
},
{
"epoch": 0.12,
"grad_norm": 0.5983711752241108,
"learning_rate": 9.803768380684242e-05,
"loss": 1.3027,
"step": 138
},
{
"epoch": 0.12,
"grad_norm": 0.6044826878147297,
"learning_rate": 9.799927833666887e-05,
"loss": 1.3169,
"step": 139
},
{
"epoch": 0.12,
"grad_norm": 0.5879408855078629,
"learning_rate": 9.796050834388149e-05,
"loss": 1.2935,
"step": 140
},
{
"epoch": 0.12,
"grad_norm": 0.5963062749845591,
"learning_rate": 9.792137412291265e-05,
"loss": 1.2979,
"step": 141
},
{
"epoch": 0.12,
"grad_norm": 0.5897254261995125,
"learning_rate": 9.788187597096069e-05,
"loss": 1.3018,
"step": 142
},
{
"epoch": 0.12,
"grad_norm": 0.578076556919417,
"learning_rate": 9.784201418798786e-05,
"loss": 1.2939,
"step": 143
},
{
"epoch": 0.12,
"grad_norm": 0.5718451016522863,
"learning_rate": 9.780178907671789e-05,
"loss": 1.2871,
"step": 144
},
{
"epoch": 0.12,
"grad_norm": 0.5799606456697325,
"learning_rate": 9.776120094263376e-05,
"loss": 1.2803,
"step": 145
},
{
"epoch": 0.12,
"grad_norm": 0.5668567319879788,
"learning_rate": 9.772025009397537e-05,
"loss": 1.2905,
"step": 146
},
{
"epoch": 0.12,
"grad_norm": 0.5789124237655097,
"learning_rate": 9.767893684173721e-05,
"loss": 1.271,
"step": 147
},
{
"epoch": 0.13,
"grad_norm": 0.585619412138699,
"learning_rate": 9.763726149966596e-05,
"loss": 1.3115,
"step": 148
},
{
"epoch": 0.13,
"grad_norm": 0.5859239926184487,
"learning_rate": 9.759522438425813e-05,
"loss": 1.29,
"step": 149
},
{
"epoch": 0.13,
"grad_norm": 0.5735625871596765,
"learning_rate": 9.755282581475769e-05,
"loss": 1.2432,
"step": 150
},
{
"epoch": 0.13,
"grad_norm": 0.5854811026507778,
"learning_rate": 9.751006611315356e-05,
"loss": 1.3008,
"step": 151
},
{
"epoch": 0.13,
"grad_norm": 0.5900623773496287,
"learning_rate": 9.746694560417731e-05,
"loss": 1.2822,
"step": 152
},
{
"epoch": 0.13,
"grad_norm": 0.5822405028437867,
"learning_rate": 9.742346461530048e-05,
"loss": 1.2822,
"step": 153
},
{
"epoch": 0.13,
"grad_norm": 0.572768675663712,
"learning_rate": 9.737962347673231e-05,
"loss": 1.2783,
"step": 154
},
{
"epoch": 0.13,
"grad_norm": 0.6054400604030475,
"learning_rate": 9.733542252141711e-05,
"loss": 1.292,
"step": 155
},
{
"epoch": 0.13,
"grad_norm": 0.5813631045634108,
"learning_rate": 9.729086208503174e-05,
"loss": 1.2803,
"step": 156
},
{
"epoch": 0.13,
"grad_norm": 0.5807050077512271,
"learning_rate": 9.724594250598311e-05,
"loss": 1.2949,
"step": 157
},
{
"epoch": 0.13,
"grad_norm": 0.5909109801864655,
"learning_rate": 9.720066412540554e-05,
"loss": 1.2695,
"step": 158
},
{
"epoch": 0.14,
"grad_norm": 0.5997130269760653,
"learning_rate": 9.715502728715826e-05,
"loss": 1.3262,
"step": 159
},
{
"epoch": 0.14,
"grad_norm": 0.5949543658594536,
"learning_rate": 9.710903233782272e-05,
"loss": 1.2852,
"step": 160
},
{
"epoch": 0.14,
"grad_norm": 0.57306909788563,
"learning_rate": 9.706267962669998e-05,
"loss": 1.2896,
"step": 161
},
{
"epoch": 0.14,
"grad_norm": 0.5729420362088954,
"learning_rate": 9.701596950580806e-05,
"loss": 1.2944,
"step": 162
},
{
"epoch": 0.14,
"grad_norm": 0.5840523135217965,
"learning_rate": 9.696890232987931e-05,
"loss": 1.3315,
"step": 163
},
{
"epoch": 0.14,
"grad_norm": 0.5838059803168936,
"learning_rate": 9.692147845635761e-05,
"loss": 1.2759,
"step": 164
},
{
"epoch": 0.14,
"grad_norm": 0.598956713705172,
"learning_rate": 9.687369824539577e-05,
"loss": 1.2949,
"step": 165
},
{
"epoch": 0.14,
"grad_norm": 0.5684032035075478,
"learning_rate": 9.682556205985274e-05,
"loss": 1.2656,
"step": 166
},
{
"epoch": 0.14,
"grad_norm": 0.6042525592488565,
"learning_rate": 9.677707026529086e-05,
"loss": 1.2734,
"step": 167
},
{
"epoch": 0.14,
"grad_norm": 0.5948932876750508,
"learning_rate": 9.672822322997305e-05,
"loss": 1.3013,
"step": 168
},
{
"epoch": 0.14,
"grad_norm": 0.5849401513666068,
"learning_rate": 9.667902132486009e-05,
"loss": 1.2871,
"step": 169
},
{
"epoch": 0.14,
"grad_norm": 0.5735157149486471,
"learning_rate": 9.662946492360776e-05,
"loss": 1.2852,
"step": 170
},
{
"epoch": 0.15,
"grad_norm": 0.584305724268113,
"learning_rate": 9.657955440256395e-05,
"loss": 1.2622,
"step": 171
},
{
"epoch": 0.15,
"grad_norm": 0.5564763345369137,
"learning_rate": 9.652929014076593e-05,
"loss": 1.2876,
"step": 172
},
{
"epoch": 0.15,
"grad_norm": 0.5891098312264256,
"learning_rate": 9.647867251993734e-05,
"loss": 1.2642,
"step": 173
},
{
"epoch": 0.15,
"grad_norm": 0.5686838714294669,
"learning_rate": 9.642770192448536e-05,
"loss": 1.272,
"step": 174
},
{
"epoch": 0.15,
"grad_norm": 0.5750296942467902,
"learning_rate": 9.637637874149779e-05,
"loss": 1.2275,
"step": 175
},
{
"epoch": 0.15,
"grad_norm": 0.5643960417977013,
"learning_rate": 9.632470336074009e-05,
"loss": 1.2671,
"step": 176
},
{
"epoch": 0.15,
"grad_norm": 0.5852327056741696,
"learning_rate": 9.627267617465243e-05,
"loss": 1.2661,
"step": 177
},
{
"epoch": 0.15,
"grad_norm": 0.6187631168037819,
"learning_rate": 9.62202975783467e-05,
"loss": 1.3086,
"step": 178
},
{
"epoch": 0.15,
"grad_norm": 0.5975346392701849,
"learning_rate": 9.616756796960353e-05,
"loss": 1.2822,
"step": 179
},
{
"epoch": 0.15,
"grad_norm": 0.5861121272844693,
"learning_rate": 9.611448774886924e-05,
"loss": 1.2686,
"step": 180
},
{
"epoch": 0.15,
"grad_norm": 0.5894840252008043,
"learning_rate": 9.606105731925283e-05,
"loss": 1.2729,
"step": 181
},
{
"epoch": 0.15,
"grad_norm": 0.5611807927613827,
"learning_rate": 9.600727708652289e-05,
"loss": 1.2593,
"step": 182
},
{
"epoch": 0.16,
"grad_norm": 0.5963397000712469,
"learning_rate": 9.595314745910456e-05,
"loss": 1.2539,
"step": 183
},
{
"epoch": 0.16,
"grad_norm": 0.5735424943279671,
"learning_rate": 9.589866884807635e-05,
"loss": 1.2842,
"step": 184
},
{
"epoch": 0.16,
"grad_norm": 0.5721828195245187,
"learning_rate": 9.584384166716714e-05,
"loss": 1.2588,
"step": 185
},
{
"epoch": 0.16,
"grad_norm": 0.581984160350711,
"learning_rate": 9.578866633275288e-05,
"loss": 1.2769,
"step": 186
},
{
"epoch": 0.16,
"grad_norm": 0.5783634261178175,
"learning_rate": 9.573314326385359e-05,
"loss": 1.2812,
"step": 187
},
{
"epoch": 0.16,
"grad_norm": 0.5970813411028905,
"learning_rate": 9.567727288213005e-05,
"loss": 1.2666,
"step": 188
},
{
"epoch": 0.16,
"grad_norm": 0.586596312906345,
"learning_rate": 9.562105561188069e-05,
"loss": 1.269,
"step": 189
},
{
"epoch": 0.16,
"grad_norm": 0.598789269026695,
"learning_rate": 9.556449188003831e-05,
"loss": 1.312,
"step": 190
},
{
"epoch": 0.16,
"grad_norm": 0.5817408021330366,
"learning_rate": 9.550758211616684e-05,
"loss": 1.2749,
"step": 191
},
{
"epoch": 0.16,
"grad_norm": 0.5877525727442802,
"learning_rate": 9.545032675245813e-05,
"loss": 1.2949,
"step": 192
},
{
"epoch": 0.16,
"grad_norm": 0.565735316640337,
"learning_rate": 9.539272622372858e-05,
"loss": 1.2646,
"step": 193
},
{
"epoch": 0.16,
"grad_norm": 0.5758462655500972,
"learning_rate": 9.533478096741597e-05,
"loss": 1.2842,
"step": 194
},
{
"epoch": 0.17,
"grad_norm": 0.5684311218285671,
"learning_rate": 9.527649142357596e-05,
"loss": 1.2607,
"step": 195
},
{
"epoch": 0.17,
"grad_norm": 0.5618003842331919,
"learning_rate": 9.521785803487889e-05,
"loss": 1.248,
"step": 196
},
{
"epoch": 0.17,
"grad_norm": 0.5876045436622994,
"learning_rate": 9.515888124660638e-05,
"loss": 1.2642,
"step": 197
},
{
"epoch": 0.17,
"grad_norm": 0.5655105832120266,
"learning_rate": 9.509956150664796e-05,
"loss": 1.2764,
"step": 198
},
{
"epoch": 0.17,
"grad_norm": 0.5729197026608559,
"learning_rate": 9.50398992654976e-05,
"loss": 1.2812,
"step": 199
},
{
"epoch": 0.17,
"grad_norm": 0.6001679004926507,
"learning_rate": 9.497989497625035e-05,
"loss": 1.2935,
"step": 200
},
{
"epoch": 0.17,
"grad_norm": 0.5764744840651138,
"learning_rate": 9.491954909459895e-05,
"loss": 1.2363,
"step": 201
},
{
"epoch": 0.17,
"grad_norm": 0.5929339319221091,
"learning_rate": 9.485886207883022e-05,
"loss": 1.2974,
"step": 202
},
{
"epoch": 0.17,
"grad_norm": 0.5764385418022675,
"learning_rate": 9.479783438982172e-05,
"loss": 1.2925,
"step": 203
},
{
"epoch": 0.17,
"grad_norm": 0.5837028558309609,
"learning_rate": 9.473646649103818e-05,
"loss": 1.2891,
"step": 204
},
{
"epoch": 0.17,
"grad_norm": 0.5650735749077636,
"learning_rate": 9.4674758848528e-05,
"loss": 1.2334,
"step": 205
},
{
"epoch": 0.18,
"grad_norm": 0.5586611238688065,
"learning_rate": 9.46127119309197e-05,
"loss": 1.2305,
"step": 206
},
{
"epoch": 0.18,
"grad_norm": 0.6080687697649292,
"learning_rate": 9.45503262094184e-05,
"loss": 1.2827,
"step": 207
},
{
"epoch": 0.18,
"grad_norm": 0.5993996873094892,
"learning_rate": 9.448760215780217e-05,
"loss": 1.2695,
"step": 208
},
{
"epoch": 0.18,
"grad_norm": 0.5778443809302056,
"learning_rate": 9.442454025241847e-05,
"loss": 1.2744,
"step": 209
},
{
"epoch": 0.18,
"grad_norm": 0.5746167067960812,
"learning_rate": 9.43611409721806e-05,
"loss": 1.2754,
"step": 210
},
{
"epoch": 0.18,
"grad_norm": 0.5733796335171946,
"learning_rate": 9.42974047985639e-05,
"loss": 1.2627,
"step": 211
},
{
"epoch": 0.18,
"grad_norm": 0.5836676487926156,
"learning_rate": 9.42333322156023e-05,
"loss": 1.2583,
"step": 212
},
{
"epoch": 0.18,
"grad_norm": 0.5553156591047226,
"learning_rate": 9.416892370988444e-05,
"loss": 1.2373,
"step": 213
},
{
"epoch": 0.18,
"grad_norm": 0.582882964454643,
"learning_rate": 9.410417977055011e-05,
"loss": 1.2417,
"step": 214
},
{
"epoch": 0.18,
"grad_norm": 0.5669189146341135,
"learning_rate": 9.403910088928651e-05,
"loss": 1.248,
"step": 215
},
{
"epoch": 0.18,
"grad_norm": 0.5851076716461637,
"learning_rate": 9.397368756032445e-05,
"loss": 1.2485,
"step": 216
},
{
"epoch": 0.18,
"grad_norm": 0.5763454225514788,
"learning_rate": 9.390794028043474e-05,
"loss": 1.2559,
"step": 217
},
{
"epoch": 0.19,
"grad_norm": 0.5670534619234323,
"learning_rate": 9.384185954892422e-05,
"loss": 1.2524,
"step": 218
},
{
"epoch": 0.19,
"grad_norm": 0.5649816822215726,
"learning_rate": 9.377544586763215e-05,
"loss": 1.2646,
"step": 219
},
{
"epoch": 0.19,
"grad_norm": 0.5654358097466196,
"learning_rate": 9.370869974092629e-05,
"loss": 1.23,
"step": 220
},
{
"epoch": 0.19,
"grad_norm": 0.5870883099911602,
"learning_rate": 9.364162167569907e-05,
"loss": 1.2319,
"step": 221
},
{
"epoch": 0.19,
"grad_norm": 0.5828901563721925,
"learning_rate": 9.357421218136386e-05,
"loss": 1.2515,
"step": 222
},
{
"epoch": 0.19,
"grad_norm": 0.5601565336888377,
"learning_rate": 9.350647176985095e-05,
"loss": 1.2588,
"step": 223
},
{
"epoch": 0.19,
"grad_norm": 0.5817838980314579,
"learning_rate": 9.343840095560372e-05,
"loss": 1.2612,
"step": 224
},
{
"epoch": 0.19,
"grad_norm": 0.5789842413499999,
"learning_rate": 9.337000025557476e-05,
"loss": 1.2642,
"step": 225
},
{
"epoch": 0.19,
"grad_norm": 0.5840330292514332,
"learning_rate": 9.330127018922194e-05,
"loss": 1.2705,
"step": 226
},
{
"epoch": 0.19,
"grad_norm": 0.5505547742761182,
"learning_rate": 9.323221127850441e-05,
"loss": 1.2285,
"step": 227
},
{
"epoch": 0.19,
"grad_norm": 0.590833567081984,
"learning_rate": 9.316282404787871e-05,
"loss": 1.2666,
"step": 228
},
{
"epoch": 0.19,
"grad_norm": 0.5716485112550096,
"learning_rate": 9.309310902429472e-05,
"loss": 1.2563,
"step": 229
},
{
"epoch": 0.2,
"grad_norm": 0.5927741075240744,
"learning_rate": 9.30230667371917e-05,
"loss": 1.2559,
"step": 230
},
{
"epoch": 0.2,
"grad_norm": 0.5713472314685684,
"learning_rate": 9.295269771849427e-05,
"loss": 1.2632,
"step": 231
},
{
"epoch": 0.2,
"grad_norm": 0.5553599915751299,
"learning_rate": 9.288200250260836e-05,
"loss": 1.2393,
"step": 232
},
{
"epoch": 0.2,
"grad_norm": 0.5541231337910232,
"learning_rate": 9.281098162641714e-05,
"loss": 1.2393,
"step": 233
},
{
"epoch": 0.2,
"grad_norm": 0.5756192048225591,
"learning_rate": 9.273963562927695e-05,
"loss": 1.2627,
"step": 234
},
{
"epoch": 0.2,
"grad_norm": 0.5607724586820175,
"learning_rate": 9.266796505301322e-05,
"loss": 1.2319,
"step": 235
},
{
"epoch": 0.2,
"grad_norm": 0.5829558605752644,
"learning_rate": 9.259597044191636e-05,
"loss": 1.2144,
"step": 236
},
{
"epoch": 0.2,
"grad_norm": 0.5462589451489466,
"learning_rate": 9.252365234273755e-05,
"loss": 1.249,
"step": 237
},
{
"epoch": 0.2,
"grad_norm": 0.5728804325543755,
"learning_rate": 9.24510113046847e-05,
"loss": 1.2725,
"step": 238
},
{
"epoch": 0.2,
"grad_norm": 0.5661301120279436,
"learning_rate": 9.237804787941819e-05,
"loss": 1.251,
"step": 239
},
{
"epoch": 0.2,
"grad_norm": 0.5590909151931563,
"learning_rate": 9.230476262104677e-05,
"loss": 1.2544,
"step": 240
},
{
"epoch": 0.2,
"grad_norm": 0.547954794791917,
"learning_rate": 9.223115608612325e-05,
"loss": 1.2505,
"step": 241
},
{
"epoch": 0.21,
"grad_norm": 0.5532925940395454,
"learning_rate": 9.215722883364033e-05,
"loss": 1.2173,
"step": 242
},
{
"epoch": 0.21,
"grad_norm": 0.5384117130456804,
"learning_rate": 9.208298142502636e-05,
"loss": 1.27,
"step": 243
},
{
"epoch": 0.21,
"grad_norm": 0.5603423300490713,
"learning_rate": 9.200841442414106e-05,
"loss": 1.2266,
"step": 244
},
{
"epoch": 0.21,
"grad_norm": 0.5367634686371322,
"learning_rate": 9.193352839727121e-05,
"loss": 1.2163,
"step": 245
},
{
"epoch": 0.21,
"grad_norm": 0.5645847437540861,
"learning_rate": 9.185832391312644e-05,
"loss": 1.2354,
"step": 246
},
{
"epoch": 0.21,
"grad_norm": 0.5663009948987631,
"learning_rate": 9.17828015428348e-05,
"loss": 1.2354,
"step": 247
},
{
"epoch": 0.21,
"grad_norm": 0.5552105400298469,
"learning_rate": 9.17069618599385e-05,
"loss": 1.2383,
"step": 248
},
{
"epoch": 0.21,
"grad_norm": 0.5754960899676003,
"learning_rate": 9.163080544038952e-05,
"loss": 1.2456,
"step": 249
},
{
"epoch": 0.21,
"grad_norm": 0.563524941652744,
"learning_rate": 9.155433286254525e-05,
"loss": 1.2554,
"step": 250
},
{
"epoch": 0.21,
"grad_norm": 0.5517648385505713,
"learning_rate": 9.147754470716408e-05,
"loss": 1.2266,
"step": 251
},
{
"epoch": 0.21,
"grad_norm": 0.5559354777913459,
"learning_rate": 9.140044155740101e-05,
"loss": 1.2661,
"step": 252
},
{
"epoch": 0.21,
"grad_norm": 0.5533913100102068,
"learning_rate": 9.132302399880321e-05,
"loss": 1.2559,
"step": 253
},
{
"epoch": 0.22,
"grad_norm": 0.5643293887010528,
"learning_rate": 9.124529261930559e-05,
"loss": 1.2612,
"step": 254
},
{
"epoch": 0.22,
"grad_norm": 0.5582480686922173,
"learning_rate": 9.116724800922629e-05,
"loss": 1.2466,
"step": 255
},
{
"epoch": 0.22,
"grad_norm": 0.596671009095723,
"learning_rate": 9.108889076126226e-05,
"loss": 1.2827,
"step": 256
},
{
"epoch": 0.22,
"grad_norm": 0.5549405425453204,
"learning_rate": 9.101022147048473e-05,
"loss": 1.2354,
"step": 257
},
{
"epoch": 0.22,
"grad_norm": 0.5568898420832058,
"learning_rate": 9.093124073433463e-05,
"loss": 1.2285,
"step": 258
},
{
"epoch": 0.22,
"grad_norm": 0.5495005874150289,
"learning_rate": 9.085194915261818e-05,
"loss": 1.2461,
"step": 259
},
{
"epoch": 0.22,
"grad_norm": 0.5707362551733327,
"learning_rate": 9.077234732750224e-05,
"loss": 1.2231,
"step": 260
},
{
"epoch": 0.22,
"grad_norm": 0.5598850464506612,
"learning_rate": 9.069243586350975e-05,
"loss": 1.2583,
"step": 261
},
{
"epoch": 0.22,
"grad_norm": 0.5524944518616185,
"learning_rate": 9.061221536751517e-05,
"loss": 1.2222,
"step": 262
},
{
"epoch": 0.22,
"grad_norm": 0.545356383712922,
"learning_rate": 9.053168644873984e-05,
"loss": 1.2178,
"step": 263
},
{
"epoch": 0.22,
"grad_norm": 0.5762727272844113,
"learning_rate": 9.045084971874738e-05,
"loss": 1.2349,
"step": 264
},
{
"epoch": 0.23,
"grad_norm": 0.5725082474964974,
"learning_rate": 9.0369705791439e-05,
"loss": 1.2632,
"step": 265
},
{
"epoch": 0.23,
"grad_norm": 0.5621844240082758,
"learning_rate": 9.028825528304892e-05,
"loss": 1.2373,
"step": 266
},
{
"epoch": 0.23,
"grad_norm": 0.5634252832307994,
"learning_rate": 9.020649881213958e-05,
"loss": 1.2554,
"step": 267
},
{
"epoch": 0.23,
"grad_norm": 0.5813087529333412,
"learning_rate": 9.012443699959705e-05,
"loss": 1.2505,
"step": 268
},
{
"epoch": 0.23,
"grad_norm": 0.5606046951042896,
"learning_rate": 9.004207046862624e-05,
"loss": 1.2734,
"step": 269
},
{
"epoch": 0.23,
"grad_norm": 0.5619672577392087,
"learning_rate": 8.995939984474624e-05,
"loss": 1.2349,
"step": 270
},
{
"epoch": 0.23,
"grad_norm": 0.5872821822263775,
"learning_rate": 8.987642575578545e-05,
"loss": 1.2314,
"step": 271
},
{
"epoch": 0.23,
"grad_norm": 0.5715303843479439,
"learning_rate": 8.979314883187693e-05,
"loss": 1.2227,
"step": 272
},
{
"epoch": 0.23,
"grad_norm": 0.5741431027758869,
"learning_rate": 8.970956970545355e-05,
"loss": 1.2271,
"step": 273
},
{
"epoch": 0.23,
"grad_norm": 0.6059321787013856,
"learning_rate": 8.962568901124327e-05,
"loss": 1.2534,
"step": 274
},
{
"epoch": 0.23,
"grad_norm": 0.556501982362245,
"learning_rate": 8.954150738626414e-05,
"loss": 1.2363,
"step": 275
},
{
"epoch": 0.23,
"grad_norm": 0.6086209059398,
"learning_rate": 8.945702546981969e-05,
"loss": 1.2847,
"step": 276
},
{
"epoch": 0.24,
"grad_norm": 0.5564876767683732,
"learning_rate": 8.93722439034939e-05,
"loss": 1.2153,
"step": 277
},
{
"epoch": 0.24,
"grad_norm": 0.5942368515326768,
"learning_rate": 8.928716333114643e-05,
"loss": 1.2588,
"step": 278
},
{
"epoch": 0.24,
"grad_norm": 0.5662096725519149,
"learning_rate": 8.920178439890765e-05,
"loss": 1.2441,
"step": 279
},
{
"epoch": 0.24,
"grad_norm": 0.5683718909670799,
"learning_rate": 8.911610775517382e-05,
"loss": 1.2275,
"step": 280
},
{
"epoch": 0.24,
"grad_norm": 0.5430162332075814,
"learning_rate": 8.903013405060211e-05,
"loss": 1.2188,
"step": 281
},
{
"epoch": 0.24,
"grad_norm": 0.5646245018782939,
"learning_rate": 8.894386393810563e-05,
"loss": 1.2305,
"step": 282
},
{
"epoch": 0.24,
"grad_norm": 0.5775822002852916,
"learning_rate": 8.885729807284856e-05,
"loss": 1.2432,
"step": 283
},
{
"epoch": 0.24,
"grad_norm": 0.5984775169761717,
"learning_rate": 8.877043711224108e-05,
"loss": 1.2598,
"step": 284
},
{
"epoch": 0.24,
"grad_norm": 0.6320516038682688,
"learning_rate": 8.868328171593448e-05,
"loss": 1.2437,
"step": 285
},
{
"epoch": 0.24,
"grad_norm": 0.5529680708320875,
"learning_rate": 8.859583254581605e-05,
"loss": 1.2344,
"step": 286
},
{
"epoch": 0.24,
"grad_norm": 0.5742550531241745,
"learning_rate": 8.85080902660041e-05,
"loss": 1.23,
"step": 287
},
{
"epoch": 0.24,
"grad_norm": 0.5543548657925883,
"learning_rate": 8.842005554284296e-05,
"loss": 1.2632,
"step": 288
},
{
"epoch": 0.25,
"grad_norm": 0.5639830569646097,
"learning_rate": 8.83317290448978e-05,
"loss": 1.2314,
"step": 289
},
{
"epoch": 0.25,
"grad_norm": 0.5824488432279822,
"learning_rate": 8.824311144294965e-05,
"loss": 1.2661,
"step": 290
},
{
"epoch": 0.25,
"grad_norm": 0.5586136460629528,
"learning_rate": 8.815420340999033e-05,
"loss": 1.1987,
"step": 291
},
{
"epoch": 0.25,
"grad_norm": 0.5642272069241788,
"learning_rate": 8.806500562121723e-05,
"loss": 1.21,
"step": 292
},
{
"epoch": 0.25,
"grad_norm": 0.5987204031332185,
"learning_rate": 8.797551875402827e-05,
"loss": 1.2246,
"step": 293
},
{
"epoch": 0.25,
"grad_norm": 0.5530693719354123,
"learning_rate": 8.788574348801675e-05,
"loss": 1.2202,
"step": 294
},
{
"epoch": 0.25,
"grad_norm": 0.532182798905587,
"learning_rate": 8.77956805049661e-05,
"loss": 1.2026,
"step": 295
},
{
"epoch": 0.25,
"grad_norm": 0.5554774523967565,
"learning_rate": 8.770533048884482e-05,
"loss": 1.2256,
"step": 296
},
{
"epoch": 0.25,
"grad_norm": 0.5496250811271655,
"learning_rate": 8.761469412580125e-05,
"loss": 1.2197,
"step": 297
},
{
"epoch": 0.25,
"grad_norm": 0.5513817728642723,
"learning_rate": 8.75237721041583e-05,
"loss": 1.2026,
"step": 298
},
{
"epoch": 0.25,
"grad_norm": 0.5783631239023462,
"learning_rate": 8.74325651144083e-05,
"loss": 1.2666,
"step": 299
},
{
"epoch": 0.25,
"grad_norm": 0.5570391531595814,
"learning_rate": 8.73410738492077e-05,
"loss": 1.2158,
"step": 300
},
{
"epoch": 0.26,
"grad_norm": 0.5991957140636426,
"learning_rate": 8.724929900337186e-05,
"loss": 1.27,
"step": 301
},
{
"epoch": 0.26,
"grad_norm": 0.5642120830729553,
"learning_rate": 8.715724127386972e-05,
"loss": 1.2095,
"step": 302
},
{
"epoch": 0.26,
"grad_norm": 0.5848386239977618,
"learning_rate": 8.706490135981855e-05,
"loss": 1.2495,
"step": 303
},
{
"epoch": 0.26,
"grad_norm": 0.5511112850774736,
"learning_rate": 8.697227996247861e-05,
"loss": 1.2305,
"step": 304
},
{
"epoch": 0.26,
"grad_norm": 0.5846633795708265,
"learning_rate": 8.687937778524786e-05,
"loss": 1.209,
"step": 305
},
{
"epoch": 0.26,
"grad_norm": 0.5540295140780195,
"learning_rate": 8.678619553365659e-05,
"loss": 1.2354,
"step": 306
},
{
"epoch": 0.26,
"grad_norm": 0.5414463070611637,
"learning_rate": 8.669273391536204e-05,
"loss": 1.2344,
"step": 307
},
{
"epoch": 0.26,
"grad_norm": 0.5427098846896791,
"learning_rate": 8.659899364014309e-05,
"loss": 1.209,
"step": 308
},
{
"epoch": 0.26,
"grad_norm": 0.5574933062513657,
"learning_rate": 8.650497541989482e-05,
"loss": 1.2178,
"step": 309
},
{
"epoch": 0.26,
"grad_norm": 0.5444595039607957,
"learning_rate": 8.641067996862311e-05,
"loss": 1.2363,
"step": 310
},
{
"epoch": 0.26,
"grad_norm": 0.5655086471261345,
"learning_rate": 8.631610800243926e-05,
"loss": 1.2236,
"step": 311
},
{
"epoch": 0.27,
"grad_norm": 0.5676258965708015,
"learning_rate": 8.622126023955446e-05,
"loss": 1.2222,
"step": 312
},
{
"epoch": 0.27,
"grad_norm": 0.5738164390287753,
"learning_rate": 8.612613740027443e-05,
"loss": 1.2437,
"step": 313
},
{
"epoch": 0.27,
"grad_norm": 0.5763979630809666,
"learning_rate": 8.603074020699393e-05,
"loss": 1.2588,
"step": 314
},
{
"epoch": 0.27,
"grad_norm": 0.5694093785228663,
"learning_rate": 8.59350693841912e-05,
"loss": 1.2305,
"step": 315
},
{
"epoch": 0.27,
"grad_norm": 0.5668385068217152,
"learning_rate": 8.583912565842257e-05,
"loss": 1.2324,
"step": 316
},
{
"epoch": 0.27,
"grad_norm": 0.5681247446685661,
"learning_rate": 8.574290975831685e-05,
"loss": 1.2461,
"step": 317
},
{
"epoch": 0.27,
"grad_norm": 0.5705548704956539,
"learning_rate": 8.564642241456986e-05,
"loss": 1.2529,
"step": 318
},
{
"epoch": 0.27,
"grad_norm": 0.5490624808218363,
"learning_rate": 8.554966435993882e-05,
"loss": 1.2119,
"step": 319
},
{
"epoch": 0.27,
"grad_norm": 0.5327956614769455,
"learning_rate": 8.545263632923687e-05,
"loss": 1.2051,
"step": 320
},
{
"epoch": 0.27,
"grad_norm": 0.5394754263176863,
"learning_rate": 8.535533905932738e-05,
"loss": 1.2207,
"step": 321
},
{
"epoch": 0.27,
"grad_norm": 0.547540468306315,
"learning_rate": 8.525777328911846e-05,
"loss": 1.2241,
"step": 322
},
{
"epoch": 0.27,
"grad_norm": 0.5262754503627509,
"learning_rate": 8.515993975955727e-05,
"loss": 1.2227,
"step": 323
},
{
"epoch": 0.28,
"grad_norm": 0.5839641855087577,
"learning_rate": 8.506183921362443e-05,
"loss": 1.228,
"step": 324
},
{
"epoch": 0.28,
"grad_norm": 0.5658179501896037,
"learning_rate": 8.49634723963284e-05,
"loss": 1.2534,
"step": 325
},
{
"epoch": 0.28,
"grad_norm": 0.5500813153743531,
"learning_rate": 8.486484005469977e-05,
"loss": 1.2104,
"step": 326
},
{
"epoch": 0.28,
"grad_norm": 0.547119722986805,
"learning_rate": 8.476594293778561e-05,
"loss": 1.1938,
"step": 327
},
{
"epoch": 0.28,
"grad_norm": 0.5596429046688678,
"learning_rate": 8.466678179664379e-05,
"loss": 1.2148,
"step": 328
},
{
"epoch": 0.28,
"grad_norm": 0.5705084720451127,
"learning_rate": 8.456735738433723e-05,
"loss": 1.2432,
"step": 329
},
{
"epoch": 0.28,
"grad_norm": 0.5895336557197053,
"learning_rate": 8.44676704559283e-05,
"loss": 1.252,
"step": 330
},
{
"epoch": 0.28,
"grad_norm": 0.5774427874239505,
"learning_rate": 8.436772176847294e-05,
"loss": 1.2251,
"step": 331
},
{
"epoch": 0.28,
"grad_norm": 0.5394619079429321,
"learning_rate": 8.4267512081015e-05,
"loss": 1.2329,
"step": 332
},
{
"epoch": 0.28,
"grad_norm": 0.5755505544475856,
"learning_rate": 8.416704215458043e-05,
"loss": 1.2471,
"step": 333
},
{
"epoch": 0.28,
"grad_norm": 0.5725344759637591,
"learning_rate": 8.406631275217156e-05,
"loss": 1.2397,
"step": 334
},
{
"epoch": 0.28,
"grad_norm": 0.5518081872615708,
"learning_rate": 8.396532463876124e-05,
"loss": 1.248,
"step": 335
},
{
"epoch": 0.29,
"grad_norm": 0.5841438683442003,
"learning_rate": 8.386407858128706e-05,
"loss": 1.2339,
"step": 336
},
{
"epoch": 0.29,
"grad_norm": 0.5513391183560247,
"learning_rate": 8.376257534864553e-05,
"loss": 1.2373,
"step": 337
},
{
"epoch": 0.29,
"grad_norm": 0.5702720231441866,
"learning_rate": 8.366081571168625e-05,
"loss": 1.2202,
"step": 338
},
{
"epoch": 0.29,
"grad_norm": 0.5401170183476215,
"learning_rate": 8.355880044320598e-05,
"loss": 1.2036,
"step": 339
},
{
"epoch": 0.29,
"grad_norm": 0.5584668011986428,
"learning_rate": 8.345653031794292e-05,
"loss": 1.2109,
"step": 340
},
{
"epoch": 0.29,
"grad_norm": 0.5651374075473236,
"learning_rate": 8.335400611257067e-05,
"loss": 1.2305,
"step": 341
},
{
"epoch": 0.29,
"grad_norm": 0.5576999267528816,
"learning_rate": 8.32512286056924e-05,
"loss": 1.208,
"step": 342
},
{
"epoch": 0.29,
"grad_norm": 0.5417887475825113,
"learning_rate": 8.314819857783503e-05,
"loss": 1.2212,
"step": 343
},
{
"epoch": 0.29,
"grad_norm": 0.5697549522190456,
"learning_rate": 8.304491681144306e-05,
"loss": 1.2227,
"step": 344
},
{
"epoch": 0.29,
"grad_norm": 0.5916950215563477,
"learning_rate": 8.29413840908729e-05,
"loss": 1.2256,
"step": 345
},
{
"epoch": 0.29,
"grad_norm": 0.5429495837198551,
"learning_rate": 8.283760120238672e-05,
"loss": 1.2036,
"step": 346
},
{
"epoch": 0.29,
"grad_norm": 0.5586180705362634,
"learning_rate": 8.273356893414659e-05,
"loss": 1.2095,
"step": 347
},
{
"epoch": 0.3,
"grad_norm": 0.5575989259323167,
"learning_rate": 8.262928807620843e-05,
"loss": 1.2231,
"step": 348
},
{
"epoch": 0.3,
"grad_norm": 0.5441346695675087,
"learning_rate": 8.252475942051605e-05,
"loss": 1.2056,
"step": 349
},
{
"epoch": 0.3,
"grad_norm": 0.5639413959564339,
"learning_rate": 8.241998376089508e-05,
"loss": 1.2173,
"step": 350
},
{
"epoch": 0.3,
"grad_norm": 0.590712779900491,
"learning_rate": 8.231496189304704e-05,
"loss": 1.2568,
"step": 351
},
{
"epoch": 0.3,
"grad_norm": 0.5777848614323381,
"learning_rate": 8.220969461454322e-05,
"loss": 1.2393,
"step": 352
},
{
"epoch": 0.3,
"grad_norm": 0.5340681608181079,
"learning_rate": 8.210418272481859e-05,
"loss": 1.2041,
"step": 353
},
{
"epoch": 0.3,
"grad_norm": 0.5677393547154248,
"learning_rate": 8.199842702516583e-05,
"loss": 1.2192,
"step": 354
},
{
"epoch": 0.3,
"grad_norm": 0.5437413315036653,
"learning_rate": 8.18924283187292e-05,
"loss": 1.2139,
"step": 355
},
{
"epoch": 0.3,
"grad_norm": 0.5659533238197777,
"learning_rate": 8.178618741049842e-05,
"loss": 1.207,
"step": 356
},
{
"epoch": 0.3,
"grad_norm": 0.5070802266343845,
"learning_rate": 8.167970510730253e-05,
"loss": 1.1914,
"step": 357
},
{
"epoch": 0.3,
"grad_norm": 0.5556418994287101,
"learning_rate": 8.157298221780389e-05,
"loss": 1.1938,
"step": 358
},
{
"epoch": 0.31,
"grad_norm": 0.5473208831723899,
"learning_rate": 8.146601955249188e-05,
"loss": 1.2183,
"step": 359
},
{
"epoch": 0.31,
"grad_norm": 0.5703775361988737,
"learning_rate": 8.135881792367686e-05,
"loss": 1.2417,
"step": 360
},
{
"epoch": 0.31,
"grad_norm": 0.561879279645762,
"learning_rate": 8.125137814548393e-05,
"loss": 1.2148,
"step": 361
},
{
"epoch": 0.31,
"grad_norm": 0.550588101278353,
"learning_rate": 8.114370103384681e-05,
"loss": 1.228,
"step": 362
},
{
"epoch": 0.31,
"grad_norm": 0.5259832736436021,
"learning_rate": 8.103578740650156e-05,
"loss": 1.21,
"step": 363
},
{
"epoch": 0.31,
"grad_norm": 0.5147619335698311,
"learning_rate": 8.092763808298048e-05,
"loss": 1.2026,
"step": 364
},
{
"epoch": 0.31,
"grad_norm": 0.5504438033485523,
"learning_rate": 8.081925388460578e-05,
"loss": 1.2026,
"step": 365
},
{
"epoch": 0.31,
"grad_norm": 0.5446547745345772,
"learning_rate": 8.07106356344834e-05,
"loss": 1.2236,
"step": 366
},
{
"epoch": 0.31,
"grad_norm": 0.5282234491024778,
"learning_rate": 8.060178415749674e-05,
"loss": 1.2046,
"step": 367
},
{
"epoch": 0.31,
"grad_norm": 0.5685704417565632,
"learning_rate": 8.049270028030046e-05,
"loss": 1.1948,
"step": 368
},
{
"epoch": 0.31,
"grad_norm": 0.5387912641493281,
"learning_rate": 8.038338483131407e-05,
"loss": 1.1987,
"step": 369
},
{
"epoch": 0.31,
"grad_norm": 0.5524731924712689,
"learning_rate": 8.027383864071573e-05,
"loss": 1.2261,
"step": 370
},
{
"epoch": 0.32,
"grad_norm": 0.5326013165738928,
"learning_rate": 8.016406254043595e-05,
"loss": 1.1987,
"step": 371
},
{
"epoch": 0.32,
"grad_norm": 0.5694990420557361,
"learning_rate": 8.005405736415126e-05,
"loss": 1.2246,
"step": 372
},
{
"epoch": 0.32,
"grad_norm": 0.5512702902650021,
"learning_rate": 7.994382394727784e-05,
"loss": 1.25,
"step": 373
},
{
"epoch": 0.32,
"grad_norm": 0.5627692554190333,
"learning_rate": 7.983336312696522e-05,
"loss": 1.2344,
"step": 374
},
{
"epoch": 0.32,
"grad_norm": 0.5723746306616886,
"learning_rate": 7.972267574208991e-05,
"loss": 1.2266,
"step": 375
},
{
"epoch": 0.32,
"grad_norm": 0.5785199319910487,
"learning_rate": 7.961176263324901e-05,
"loss": 1.2046,
"step": 376
},
{
"epoch": 0.32,
"grad_norm": 0.5638166540487942,
"learning_rate": 7.950062464275387e-05,
"loss": 1.2124,
"step": 377
},
{
"epoch": 0.32,
"grad_norm": 0.5591166049939134,
"learning_rate": 7.938926261462366e-05,
"loss": 1.2251,
"step": 378
},
{
"epoch": 0.32,
"grad_norm": 0.5505127627644203,
"learning_rate": 7.927767739457897e-05,
"loss": 1.2158,
"step": 379
},
{
"epoch": 0.32,
"grad_norm": 0.5650623189448578,
"learning_rate": 7.916586983003533e-05,
"loss": 1.208,
"step": 380
},
{
"epoch": 0.32,
"grad_norm": 0.5691768474472332,
"learning_rate": 7.905384077009693e-05,
"loss": 1.1875,
"step": 381
},
{
"epoch": 0.32,
"grad_norm": 0.5660613389542086,
"learning_rate": 7.894159106554997e-05,
"loss": 1.2227,
"step": 382
},
{
"epoch": 0.33,
"grad_norm": 0.5582910482328445,
"learning_rate": 7.882912156885637e-05,
"loss": 1.2173,
"step": 383
},
{
"epoch": 0.33,
"grad_norm": 0.5687428665442464,
"learning_rate": 7.871643313414718e-05,
"loss": 1.2188,
"step": 384
},
{
"epoch": 0.33,
"grad_norm": 0.5700426706301734,
"learning_rate": 7.860352661721619e-05,
"loss": 1.2534,
"step": 385
},
{
"epoch": 0.33,
"grad_norm": 0.5640156767511431,
"learning_rate": 7.849040287551331e-05,
"loss": 1.2256,
"step": 386
},
{
"epoch": 0.33,
"grad_norm": 0.5730028379052688,
"learning_rate": 7.837706276813819e-05,
"loss": 1.2383,
"step": 387
},
{
"epoch": 0.33,
"grad_norm": 0.5729196218457621,
"learning_rate": 7.82635071558336e-05,
"loss": 1.2539,
"step": 388
},
{
"epoch": 0.33,
"grad_norm": 0.5389989913836808,
"learning_rate": 7.814973690097893e-05,
"loss": 1.2114,
"step": 389
},
{
"epoch": 0.33,
"grad_norm": 0.563817849905785,
"learning_rate": 7.803575286758364e-05,
"loss": 1.1978,
"step": 390
},
{
"epoch": 0.33,
"grad_norm": 0.5742098449729457,
"learning_rate": 7.79215559212807e-05,
"loss": 1.2104,
"step": 391
},
{
"epoch": 0.33,
"grad_norm": 0.5482109799457336,
"learning_rate": 7.780714692932002e-05,
"loss": 1.1978,
"step": 392
},
{
"epoch": 0.33,
"grad_norm": 0.5619582390386062,
"learning_rate": 7.769252676056187e-05,
"loss": 1.2197,
"step": 393
},
{
"epoch": 0.33,
"grad_norm": 0.5707713766775032,
"learning_rate": 7.757769628547018e-05,
"loss": 1.2349,
"step": 394
},
{
"epoch": 0.34,
"grad_norm": 0.5491475936260862,
"learning_rate": 7.746265637610613e-05,
"loss": 1.1758,
"step": 395
},
{
"epoch": 0.34,
"grad_norm": 0.5500938802153376,
"learning_rate": 7.734740790612136e-05,
"loss": 1.2041,
"step": 396
},
{
"epoch": 0.34,
"grad_norm": 0.5293011700429642,
"learning_rate": 7.723195175075136e-05,
"loss": 1.1821,
"step": 397
},
{
"epoch": 0.34,
"grad_norm": 0.5596089219360537,
"learning_rate": 7.711628878680892e-05,
"loss": 1.2539,
"step": 398
},
{
"epoch": 0.34,
"grad_norm": 0.5754494198608471,
"learning_rate": 7.700041989267736e-05,
"loss": 1.2378,
"step": 399
},
{
"epoch": 0.34,
"grad_norm": 0.5705286060826581,
"learning_rate": 7.688434594830392e-05,
"loss": 1.2192,
"step": 400
},
{
"epoch": 0.34,
"grad_norm": 0.5320392604087235,
"learning_rate": 7.676806783519304e-05,
"loss": 1.2021,
"step": 401
},
{
"epoch": 0.34,
"grad_norm": 0.5573096847631821,
"learning_rate": 7.66515864363997e-05,
"loss": 1.229,
"step": 402
},
{
"epoch": 0.34,
"grad_norm": 0.5670482309405055,
"learning_rate": 7.653490263652269e-05,
"loss": 1.2324,
"step": 403
},
{
"epoch": 0.34,
"grad_norm": 0.5214893797779285,
"learning_rate": 7.641801732169795e-05,
"loss": 1.1968,
"step": 404
},
{
"epoch": 0.34,
"grad_norm": 0.5729861431933309,
"learning_rate": 7.630093137959171e-05,
"loss": 1.2163,
"step": 405
},
{
"epoch": 0.34,
"grad_norm": 0.5552719599472679,
"learning_rate": 7.618364569939391e-05,
"loss": 1.2075,
"step": 406
},
{
"epoch": 0.35,
"grad_norm": 0.5465863524732877,
"learning_rate": 7.606616117181128e-05,
"loss": 1.1968,
"step": 407
},
{
"epoch": 0.35,
"grad_norm": 0.5748817703147777,
"learning_rate": 7.594847868906076e-05,
"loss": 1.2227,
"step": 408
},
{
"epoch": 0.35,
"grad_norm": 0.5439693138286017,
"learning_rate": 7.583059914486257e-05,
"loss": 1.2031,
"step": 409
},
{
"epoch": 0.35,
"grad_norm": 0.5635085757798922,
"learning_rate": 7.571252343443349e-05,
"loss": 1.2324,
"step": 410
},
{
"epoch": 0.35,
"grad_norm": 0.5490256924391643,
"learning_rate": 7.559425245448006e-05,
"loss": 1.1953,
"step": 411
},
{
"epoch": 0.35,
"grad_norm": 0.5581764204211717,
"learning_rate": 7.547578710319174e-05,
"loss": 1.2158,
"step": 412
},
{
"epoch": 0.35,
"grad_norm": 0.5554727329505762,
"learning_rate": 7.535712828023416e-05,
"loss": 1.2236,
"step": 413
},
{
"epoch": 0.35,
"grad_norm": 0.5395464303361225,
"learning_rate": 7.52382768867422e-05,
"loss": 1.2114,
"step": 414
},
{
"epoch": 0.35,
"grad_norm": 0.5613556527369445,
"learning_rate": 7.511923382531317e-05,
"loss": 1.1792,
"step": 415
},
{
"epoch": 0.35,
"grad_norm": 0.5802200387551621,
"learning_rate": 7.500000000000001e-05,
"loss": 1.1899,
"step": 416
},
{
"epoch": 0.35,
"grad_norm": 0.5673594518905887,
"learning_rate": 7.488057631630437e-05,
"loss": 1.2236,
"step": 417
},
{
"epoch": 0.36,
"grad_norm": 0.5439832310153052,
"learning_rate": 7.476096368116974e-05,
"loss": 1.2168,
"step": 418
},
{
"epoch": 0.36,
"grad_norm": 0.601300632455188,
"learning_rate": 7.464116300297458e-05,
"loss": 1.2534,
"step": 419
},
{
"epoch": 0.36,
"grad_norm": 0.5489145931334055,
"learning_rate": 7.452117519152542e-05,
"loss": 1.2007,
"step": 420
},
{
"epoch": 0.36,
"grad_norm": 0.5398362128758979,
"learning_rate": 7.440100115804991e-05,
"loss": 1.1743,
"step": 421
},
{
"epoch": 0.36,
"grad_norm": 0.5816646831232165,
"learning_rate": 7.428064181518997e-05,
"loss": 1.2344,
"step": 422
},
{
"epoch": 0.36,
"grad_norm": 0.5633431228968494,
"learning_rate": 7.416009807699482e-05,
"loss": 1.2017,
"step": 423
},
{
"epoch": 0.36,
"grad_norm": 0.5589499609875991,
"learning_rate": 7.403937085891397e-05,
"loss": 1.2095,
"step": 424
},
{
"epoch": 0.36,
"grad_norm": 0.5493151850885487,
"learning_rate": 7.391846107779047e-05,
"loss": 1.1865,
"step": 425
},
{
"epoch": 0.36,
"grad_norm": 0.540833710956897,
"learning_rate": 7.379736965185368e-05,
"loss": 1.2041,
"step": 426
},
{
"epoch": 0.36,
"grad_norm": 0.524036751110743,
"learning_rate": 7.367609750071252e-05,
"loss": 1.1826,
"step": 427
},
{
"epoch": 0.36,
"grad_norm": 0.569282857600664,
"learning_rate": 7.355464554534837e-05,
"loss": 1.187,
"step": 428
},
{
"epoch": 0.36,
"grad_norm": 0.5524428783713417,
"learning_rate": 7.343301470810808e-05,
"loss": 1.2202,
"step": 429
},
{
"epoch": 0.37,
"grad_norm": 0.5623114711099751,
"learning_rate": 7.331120591269701e-05,
"loss": 1.1899,
"step": 430
},
{
"epoch": 0.37,
"grad_norm": 0.5301869310349011,
"learning_rate": 7.318922008417203e-05,
"loss": 1.1919,
"step": 431
},
{
"epoch": 0.37,
"grad_norm": 0.5687590875970512,
"learning_rate": 7.30670581489344e-05,
"loss": 1.2056,
"step": 432
},
{
"epoch": 0.37,
"grad_norm": 0.5589775032893604,
"learning_rate": 7.294472103472281e-05,
"loss": 1.2188,
"step": 433
},
{
"epoch": 0.37,
"grad_norm": 0.578465226917116,
"learning_rate": 7.282220967060633e-05,
"loss": 1.2158,
"step": 434
},
{
"epoch": 0.37,
"grad_norm": 0.542415194752666,
"learning_rate": 7.269952498697734e-05,
"loss": 1.187,
"step": 435
},
{
"epoch": 0.37,
"grad_norm": 0.5438469592749641,
"learning_rate": 7.257666791554448e-05,
"loss": 1.1494,
"step": 436
},
{
"epoch": 0.37,
"grad_norm": 0.56103545827402,
"learning_rate": 7.245363938932551e-05,
"loss": 1.2085,
"step": 437
},
{
"epoch": 0.37,
"grad_norm": 0.5543439263354124,
"learning_rate": 7.233044034264034e-05,
"loss": 1.186,
"step": 438
},
{
"epoch": 0.37,
"grad_norm": 0.5621095189445257,
"learning_rate": 7.220707171110382e-05,
"loss": 1.2036,
"step": 439
},
{
"epoch": 0.37,
"grad_norm": 0.578507048853321,
"learning_rate": 7.20835344316187e-05,
"loss": 1.2158,
"step": 440
},
{
"epoch": 0.37,
"grad_norm": 0.5393466899110284,
"learning_rate": 7.195982944236851e-05,
"loss": 1.1807,
"step": 441
},
{
"epoch": 0.38,
"grad_norm": 0.5390437694953595,
"learning_rate": 7.183595768281043e-05,
"loss": 1.1914,
"step": 442
},
{
"epoch": 0.38,
"grad_norm": 0.525445074400616,
"learning_rate": 7.171192009366814e-05,
"loss": 1.1655,
"step": 443
},
{
"epoch": 0.38,
"grad_norm": 0.5397543259123138,
"learning_rate": 7.158771761692464e-05,
"loss": 1.2139,
"step": 444
},
{
"epoch": 0.38,
"grad_norm": 0.5301908429870645,
"learning_rate": 7.146335119581523e-05,
"loss": 1.2163,
"step": 445
},
{
"epoch": 0.38,
"grad_norm": 0.5528560850589617,
"learning_rate": 7.133882177482019e-05,
"loss": 1.2046,
"step": 446
},
{
"epoch": 0.38,
"grad_norm": 0.5223068286596114,
"learning_rate": 7.121413029965769e-05,
"loss": 1.1855,
"step": 447
},
{
"epoch": 0.38,
"grad_norm": 0.5375221567597188,
"learning_rate": 7.108927771727661e-05,
"loss": 1.1841,
"step": 448
},
{
"epoch": 0.38,
"grad_norm": 0.5528153279757392,
"learning_rate": 7.096426497584933e-05,
"loss": 1.2002,
"step": 449
},
{
"epoch": 0.38,
"grad_norm": 0.5641067857153084,
"learning_rate": 7.083909302476453e-05,
"loss": 1.1914,
"step": 450
},
{
"epoch": 0.38,
"grad_norm": 0.5543757360519023,
"learning_rate": 7.071376281461994e-05,
"loss": 1.2026,
"step": 451
},
{
"epoch": 0.38,
"grad_norm": 0.5521178435951897,
"learning_rate": 7.058827529721525e-05,
"loss": 1.1816,
"step": 452
},
{
"epoch": 0.38,
"grad_norm": 0.545288440889916,
"learning_rate": 7.04626314255447e-05,
"loss": 1.2202,
"step": 453
},
{
"epoch": 0.39,
"grad_norm": 0.5437815680869471,
"learning_rate": 7.033683215379002e-05,
"loss": 1.2031,
"step": 454
},
{
"epoch": 0.39,
"grad_norm": 0.5551130898620283,
"learning_rate": 7.021087843731302e-05,
"loss": 1.189,
"step": 455
},
{
"epoch": 0.39,
"grad_norm": 0.5553626624921362,
"learning_rate": 7.008477123264848e-05,
"loss": 1.2261,
"step": 456
},
{
"epoch": 0.39,
"grad_norm": 0.5452825817203325,
"learning_rate": 6.99585114974968e-05,
"loss": 1.1768,
"step": 457
},
{
"epoch": 0.39,
"grad_norm": 0.5642586993770626,
"learning_rate": 6.98321001907167e-05,
"loss": 1.1841,
"step": 458
},
{
"epoch": 0.39,
"grad_norm": 0.5398272756531507,
"learning_rate": 6.97055382723181e-05,
"loss": 1.1865,
"step": 459
},
{
"epoch": 0.39,
"grad_norm": 0.5522752869750288,
"learning_rate": 6.957882670345458e-05,
"loss": 1.2061,
"step": 460
},
{
"epoch": 0.39,
"grad_norm": 0.5308798815860719,
"learning_rate": 6.94519664464163e-05,
"loss": 1.1948,
"step": 461
},
{
"epoch": 0.39,
"grad_norm": 0.5495493844679439,
"learning_rate": 6.932495846462261e-05,
"loss": 1.1914,
"step": 462
},
{
"epoch": 0.39,
"grad_norm": 0.5482416736955646,
"learning_rate": 6.91978037226147e-05,
"loss": 1.2017,
"step": 463
},
{
"epoch": 0.39,
"grad_norm": 0.5503012321222697,
"learning_rate": 6.90705031860483e-05,
"loss": 1.2119,
"step": 464
},
{
"epoch": 0.4,
"grad_norm": 0.5725638118414337,
"learning_rate": 6.894305782168638e-05,
"loss": 1.1899,
"step": 465
},
{
"epoch": 0.4,
"grad_norm": 0.5624448379824697,
"learning_rate": 6.881546859739179e-05,
"loss": 1.23,
"step": 466
},
{
"epoch": 0.4,
"grad_norm": 0.5561279665840796,
"learning_rate": 6.868773648211983e-05,
"loss": 1.2017,
"step": 467
},
{
"epoch": 0.4,
"grad_norm": 0.5396579505393837,
"learning_rate": 6.855986244591104e-05,
"loss": 1.1733,
"step": 468
},
{
"epoch": 0.4,
"grad_norm": 0.5593889291768533,
"learning_rate": 6.843184745988373e-05,
"loss": 1.2119,
"step": 469
},
{
"epoch": 0.4,
"grad_norm": 0.5517593854010368,
"learning_rate": 6.830369249622662e-05,
"loss": 1.2114,
"step": 470
},
{
"epoch": 0.4,
"grad_norm": 0.565602467217196,
"learning_rate": 6.817539852819149e-05,
"loss": 1.1968,
"step": 471
},
{
"epoch": 0.4,
"grad_norm": 0.5454828029904284,
"learning_rate": 6.804696653008575e-05,
"loss": 1.1938,
"step": 472
},
{
"epoch": 0.4,
"grad_norm": 0.5312848354649821,
"learning_rate": 6.7918397477265e-05,
"loss": 1.1909,
"step": 473
},
{
"epoch": 0.4,
"grad_norm": 0.5379892410248488,
"learning_rate": 6.778969234612584e-05,
"loss": 1.1733,
"step": 474
},
{
"epoch": 0.4,
"grad_norm": 0.5640301128196445,
"learning_rate": 6.76608521140981e-05,
"loss": 1.1938,
"step": 475
},
{
"epoch": 0.4,
"grad_norm": 0.5659554102145757,
"learning_rate": 6.753187775963773e-05,
"loss": 1.2192,
"step": 476
},
{
"epoch": 0.41,
"grad_norm": 0.5330483555414703,
"learning_rate": 6.740277026221923e-05,
"loss": 1.2163,
"step": 477
},
{
"epoch": 0.41,
"grad_norm": 0.5518117687862835,
"learning_rate": 6.727353060232822e-05,
"loss": 1.1904,
"step": 478
},
{
"epoch": 0.41,
"grad_norm": 0.556594787840222,
"learning_rate": 6.714415976145402e-05,
"loss": 1.2056,
"step": 479
},
{
"epoch": 0.41,
"grad_norm": 0.5718671551889151,
"learning_rate": 6.701465872208216e-05,
"loss": 1.2271,
"step": 480
},
{
"epoch": 0.41,
"grad_norm": 0.5512989214363787,
"learning_rate": 6.688502846768696e-05,
"loss": 1.2031,
"step": 481
},
{
"epoch": 0.41,
"grad_norm": 0.5494732510246357,
"learning_rate": 6.675526998272405e-05,
"loss": 1.2119,
"step": 482
},
{
"epoch": 0.41,
"grad_norm": 0.5378563166396855,
"learning_rate": 6.662538425262285e-05,
"loss": 1.1621,
"step": 483
},
{
"epoch": 0.41,
"grad_norm": 0.5480052049772425,
"learning_rate": 6.649537226377915e-05,
"loss": 1.1841,
"step": 484
},
{
"epoch": 0.41,
"grad_norm": 0.5636382985336955,
"learning_rate": 6.636523500354759e-05,
"loss": 1.2056,
"step": 485
},
{
"epoch": 0.41,
"grad_norm": 0.5416198773488824,
"learning_rate": 6.623497346023418e-05,
"loss": 1.1646,
"step": 486
},
{
"epoch": 0.41,
"grad_norm": 0.5478977270550187,
"learning_rate": 6.610458862308872e-05,
"loss": 1.1914,
"step": 487
},
{
"epoch": 0.41,
"grad_norm": 0.5607213727964705,
"learning_rate": 6.59740814822974e-05,
"loss": 1.2012,
"step": 488
},
{
"epoch": 0.42,
"grad_norm": 0.5434978380272973,
"learning_rate": 6.584345302897523e-05,
"loss": 1.167,
"step": 489
},
{
"epoch": 0.42,
"grad_norm": 0.5616431237123319,
"learning_rate": 6.571270425515843e-05,
"loss": 1.1938,
"step": 490
},
{
"epoch": 0.42,
"grad_norm": 0.5718848045230116,
"learning_rate": 6.558183615379707e-05,
"loss": 1.1968,
"step": 491
},
{
"epoch": 0.42,
"grad_norm": 0.5468507624315109,
"learning_rate": 6.545084971874738e-05,
"loss": 1.1719,
"step": 492
},
{
"epoch": 0.42,
"grad_norm": 0.5648647170806229,
"learning_rate": 6.531974594476425e-05,
"loss": 1.207,
"step": 493
},
{
"epoch": 0.42,
"grad_norm": 0.5503576509488237,
"learning_rate": 6.518852582749373e-05,
"loss": 1.1992,
"step": 494
},
{
"epoch": 0.42,
"grad_norm": 0.5609821252964683,
"learning_rate": 6.505719036346539e-05,
"loss": 1.1997,
"step": 495
},
{
"epoch": 0.42,
"grad_norm": 0.5498062769196067,
"learning_rate": 6.492574055008473e-05,
"loss": 1.1875,
"step": 496
},
{
"epoch": 0.42,
"grad_norm": 0.5539230993166063,
"learning_rate": 6.479417738562576e-05,
"loss": 1.1909,
"step": 497
},
{
"epoch": 0.42,
"grad_norm": 0.5496978328792177,
"learning_rate": 6.466250186922325e-05,
"loss": 1.2139,
"step": 498
},
{
"epoch": 0.42,
"grad_norm": 0.558021764032463,
"learning_rate": 6.45307150008652e-05,
"loss": 1.1943,
"step": 499
},
{
"epoch": 0.42,
"grad_norm": 0.5762104557001627,
"learning_rate": 6.439881778138531e-05,
"loss": 1.2148,
"step": 500
}
],
"logging_steps": 1.0,
"max_steps": 1176,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"total_flos": 4.0453184645024973e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}