|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.4248539564524695, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.1944120442820862, |
|
"learning_rate": 2.777777777777778e-06, |
|
"loss": 1.6587, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.158646448617879, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 1.6685, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.1343466361951706, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.6836, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.0231531922210753, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 1.6523, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.6162620203223095, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 1.6636, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.3779619042576137, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.6289, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.223668866838671, |
|
"learning_rate": 1.9444444444444445e-05, |
|
"loss": 1.6201, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.5016416362830853, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 1.5596, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.465420671008811, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.6113, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.1965670018804309, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 1.5898, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.1117797752417102, |
|
"learning_rate": 3.055555555555556e-05, |
|
"loss": 1.6035, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9878470790338667, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.5625, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.02494625138462, |
|
"learning_rate": 3.611111111111111e-05, |
|
"loss": 1.5547, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.0223917263016193, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 1.5615, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9433437823947872, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.5728, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8737056838198499, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 1.5327, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.847350291380953, |
|
"learning_rate": 4.722222222222222e-05, |
|
"loss": 1.4829, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9546966542598146, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5532, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9232787655185869, |
|
"learning_rate": 5.2777777777777784e-05, |
|
"loss": 1.5303, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.879349873123116, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 1.502, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8709535184620328, |
|
"learning_rate": 5.833333333333334e-05, |
|
"loss": 1.4585, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8819786376627559, |
|
"learning_rate": 6.111111111111112e-05, |
|
"loss": 1.4858, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8589200426275411, |
|
"learning_rate": 6.388888888888888e-05, |
|
"loss": 1.4644, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8190182080399642, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.4561, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8796864611649672, |
|
"learning_rate": 6.944444444444444e-05, |
|
"loss": 1.4546, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8331325598252782, |
|
"learning_rate": 7.222222222222222e-05, |
|
"loss": 1.4624, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8345520972989295, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.4453, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8176489443002161, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 1.4541, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.7528421779691234, |
|
"learning_rate": 8.055555555555556e-05, |
|
"loss": 1.4434, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7991219912695795, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.4546, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7453012031751974, |
|
"learning_rate": 8.611111111111112e-05, |
|
"loss": 1.4541, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7356336435000073, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 1.4565, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7239229910223912, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 1.4253, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6995782237549312, |
|
"learning_rate": 9.444444444444444e-05, |
|
"loss": 1.4116, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7108394167974162, |
|
"learning_rate": 9.722222222222223e-05, |
|
"loss": 1.4053, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7270375728618296, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4214, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7494051458635556, |
|
"learning_rate": 9.999981014161752e-05, |
|
"loss": 1.4644, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.733832068426627, |
|
"learning_rate": 9.999924056791192e-05, |
|
"loss": 1.4141, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6719534359517587, |
|
"learning_rate": 9.999829128320874e-05, |
|
"loss": 1.4023, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7406841100980851, |
|
"learning_rate": 9.999696229471716e-05, |
|
"loss": 1.4263, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6561411493710649, |
|
"learning_rate": 9.999525361252996e-05, |
|
"loss": 1.4126, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6703214903768667, |
|
"learning_rate": 9.999316524962345e-05, |
|
"loss": 1.3955, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6952049638900921, |
|
"learning_rate": 9.999069722185737e-05, |
|
"loss": 1.4072, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6806747265810544, |
|
"learning_rate": 9.998784954797474e-05, |
|
"loss": 1.4146, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6761436892518071, |
|
"learning_rate": 9.998462224960175e-05, |
|
"loss": 1.4009, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.694044598842866, |
|
"learning_rate": 9.998101535124758e-05, |
|
"loss": 1.4268, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6557563304435648, |
|
"learning_rate": 9.997702888030423e-05, |
|
"loss": 1.3794, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6612841638564682, |
|
"learning_rate": 9.997266286704631e-05, |
|
"loss": 1.3892, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6486556767977087, |
|
"learning_rate": 9.996791734463077e-05, |
|
"loss": 1.3652, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6483540085185676, |
|
"learning_rate": 9.996279234909671e-05, |
|
"loss": 1.3984, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6700432628612305, |
|
"learning_rate": 9.995728791936504e-05, |
|
"loss": 1.3999, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6432744026831555, |
|
"learning_rate": 9.99514040972383e-05, |
|
"loss": 1.356, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6216728827903856, |
|
"learning_rate": 9.994514092740015e-05, |
|
"loss": 1.3882, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6467739800460915, |
|
"learning_rate": 9.993849845741524e-05, |
|
"loss": 1.3765, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6503437970639988, |
|
"learning_rate": 9.99314767377287e-05, |
|
"loss": 1.373, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6657501610674698, |
|
"learning_rate": 9.992407582166581e-05, |
|
"loss": 1.3838, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6605689841115963, |
|
"learning_rate": 9.991629576543163e-05, |
|
"loss": 1.3716, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6989365655033877, |
|
"learning_rate": 9.990813662811051e-05, |
|
"loss": 1.3882, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6084957965701701, |
|
"learning_rate": 9.989959847166567e-05, |
|
"loss": 1.3545, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6699929146209974, |
|
"learning_rate": 9.989068136093873e-05, |
|
"loss": 1.3418, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6247455324530298, |
|
"learning_rate": 9.988138536364922e-05, |
|
"loss": 1.3486, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6622758669061856, |
|
"learning_rate": 9.987171055039408e-05, |
|
"loss": 1.3892, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6034683645026113, |
|
"learning_rate": 9.986165699464705e-05, |
|
"loss": 1.3491, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6266046141102322, |
|
"learning_rate": 9.985122477275824e-05, |
|
"loss": 1.3452, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6299383394087646, |
|
"learning_rate": 9.984041396395343e-05, |
|
"loss": 1.3569, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6104287148111909, |
|
"learning_rate": 9.98292246503335e-05, |
|
"loss": 1.333, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6519029188667027, |
|
"learning_rate": 9.981765691687388e-05, |
|
"loss": 1.3857, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6142161143427214, |
|
"learning_rate": 9.980571085142381e-05, |
|
"loss": 1.3228, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6229626554946482, |
|
"learning_rate": 9.979338654470569e-05, |
|
"loss": 1.3574, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6071740965934106, |
|
"learning_rate": 9.978068409031449e-05, |
|
"loss": 1.3379, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6180402576227992, |
|
"learning_rate": 9.976760358471686e-05, |
|
"loss": 1.3672, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6254634252353867, |
|
"learning_rate": 9.975414512725057e-05, |
|
"loss": 1.3525, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6146388668983097, |
|
"learning_rate": 9.974030882012367e-05, |
|
"loss": 1.3677, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.60548422624436, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 1.3271, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5960020566477471, |
|
"learning_rate": 9.97115030800669e-05, |
|
"loss": 1.3203, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5840389582557357, |
|
"learning_rate": 9.969653386589748e-05, |
|
"loss": 1.3457, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6017170229389899, |
|
"learning_rate": 9.968118723958668e-05, |
|
"loss": 1.3555, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.59548463038904, |
|
"learning_rate": 9.966546331768191e-05, |
|
"loss": 1.312, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6376362739222085, |
|
"learning_rate": 9.96493622195959e-05, |
|
"loss": 1.3896, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5924552743524675, |
|
"learning_rate": 9.963288406760582e-05, |
|
"loss": 1.3882, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5932204834859686, |
|
"learning_rate": 9.961602898685226e-05, |
|
"loss": 1.3228, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6343152356114848, |
|
"learning_rate": 9.959879710533835e-05, |
|
"loss": 1.3418, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.619176447518611, |
|
"learning_rate": 9.958118855392876e-05, |
|
"loss": 1.3511, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6059536670723797, |
|
"learning_rate": 9.956320346634876e-05, |
|
"loss": 1.3496, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5833794255152709, |
|
"learning_rate": 9.954484197918315e-05, |
|
"loss": 1.3047, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6201224777123214, |
|
"learning_rate": 9.952610423187516e-05, |
|
"loss": 1.3486, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6007754625054771, |
|
"learning_rate": 9.950699036672559e-05, |
|
"loss": 1.3281, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5942092191181105, |
|
"learning_rate": 9.94875005288915e-05, |
|
"loss": 1.3247, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5943876215982206, |
|
"learning_rate": 9.946763486638528e-05, |
|
"loss": 1.3286, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5851272119218502, |
|
"learning_rate": 9.944739353007344e-05, |
|
"loss": 1.333, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5736452322086703, |
|
"learning_rate": 9.942677667367541e-05, |
|
"loss": 1.3281, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5858368666665404, |
|
"learning_rate": 9.940578445376258e-05, |
|
"loss": 1.3408, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5951660769871204, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 1.332, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5673882981333603, |
|
"learning_rate": 9.936267456392971e-05, |
|
"loss": 1.29, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.57937244503091, |
|
"learning_rate": 9.934055722140061e-05, |
|
"loss": 1.3379, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5404623165150114, |
|
"learning_rate": 9.931806517013612e-05, |
|
"loss": 1.2832, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.570458957982483, |
|
"learning_rate": 9.929519858094843e-05, |
|
"loss": 1.2827, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6087016644937208, |
|
"learning_rate": 9.927195762749405e-05, |
|
"loss": 1.3218, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5870682794787072, |
|
"learning_rate": 9.92483424862726e-05, |
|
"loss": 1.3135, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5856272723632059, |
|
"learning_rate": 9.922435333662536e-05, |
|
"loss": 1.2881, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6111832032418365, |
|
"learning_rate": 9.9199990360734e-05, |
|
"loss": 1.3203, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5726861967258199, |
|
"learning_rate": 9.917525374361912e-05, |
|
"loss": 1.3179, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5884861410749106, |
|
"learning_rate": 9.915014367313888e-05, |
|
"loss": 1.3228, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6135395087168646, |
|
"learning_rate": 9.912466033998757e-05, |
|
"loss": 1.3335, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5645793699483922, |
|
"learning_rate": 9.90988039376942e-05, |
|
"loss": 1.3125, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5857606819583933, |
|
"learning_rate": 9.90725746626209e-05, |
|
"loss": 1.2744, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5831028711698518, |
|
"learning_rate": 9.904597271396162e-05, |
|
"loss": 1.311, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6348588219528606, |
|
"learning_rate": 9.901899829374047e-05, |
|
"loss": 1.3452, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5603970131542654, |
|
"learning_rate": 9.899165160681025e-05, |
|
"loss": 1.2964, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5865675897271678, |
|
"learning_rate": 9.896393286085084e-05, |
|
"loss": 1.3071, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5784164637201951, |
|
"learning_rate": 9.893584226636772e-05, |
|
"loss": 1.3008, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5710383398686641, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 1.2886, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5846130781014983, |
|
"learning_rate": 9.887854638797023e-05, |
|
"loss": 1.3096, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5880954219156209, |
|
"learning_rate": 9.884934153917997e-05, |
|
"loss": 1.3145, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5637844902513754, |
|
"learning_rate": 9.88197657121109e-05, |
|
"loss": 1.29, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5876469941096398, |
|
"learning_rate": 9.878981913137179e-05, |
|
"loss": 1.3418, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6000214582832701, |
|
"learning_rate": 9.8759502024387e-05, |
|
"loss": 1.2896, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5839730365390634, |
|
"learning_rate": 9.872881462139479e-05, |
|
"loss": 1.2705, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5990677563429729, |
|
"learning_rate": 9.869775715544562e-05, |
|
"loss": 1.3071, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5953706456388055, |
|
"learning_rate": 9.86663298624003e-05, |
|
"loss": 1.2959, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5817561833721856, |
|
"learning_rate": 9.86345329809282e-05, |
|
"loss": 1.2852, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5744763014438757, |
|
"learning_rate": 9.860236675250552e-05, |
|
"loss": 1.2783, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.597834970808429, |
|
"learning_rate": 9.856983142141339e-05, |
|
"loss": 1.2925, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.58712068164488, |
|
"learning_rate": 9.8536927234736e-05, |
|
"loss": 1.3042, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5750527697531876, |
|
"learning_rate": 9.85036544423588e-05, |
|
"loss": 1.2734, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5809004038351853, |
|
"learning_rate": 9.847001329696653e-05, |
|
"loss": 1.2886, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5913258260888848, |
|
"learning_rate": 9.843600405404131e-05, |
|
"loss": 1.2871, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6178847163930624, |
|
"learning_rate": 9.840162697186075e-05, |
|
"loss": 1.3066, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.569525516075595, |
|
"learning_rate": 9.836688231149592e-05, |
|
"loss": 1.2866, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6019297014242129, |
|
"learning_rate": 9.833177033680944e-05, |
|
"loss": 1.2881, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5570240623213132, |
|
"learning_rate": 9.829629131445342e-05, |
|
"loss": 1.2739, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6690303999031133, |
|
"learning_rate": 9.826044551386744e-05, |
|
"loss": 1.3208, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5869605252956118, |
|
"learning_rate": 9.822423320727654e-05, |
|
"loss": 1.3271, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6041810553976592, |
|
"learning_rate": 9.818765466968909e-05, |
|
"loss": 1.3071, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6055207100602872, |
|
"learning_rate": 9.815071017889482e-05, |
|
"loss": 1.3208, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.598286558624508, |
|
"learning_rate": 9.811340001546251e-05, |
|
"loss": 1.2842, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.581665353584805, |
|
"learning_rate": 9.807572446273814e-05, |
|
"loss": 1.2959, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5983711752241108, |
|
"learning_rate": 9.803768380684242e-05, |
|
"loss": 1.3027, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.6044826878147297, |
|
"learning_rate": 9.799927833666887e-05, |
|
"loss": 1.3169, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5879408855078629, |
|
"learning_rate": 9.796050834388149e-05, |
|
"loss": 1.2935, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5963062749845591, |
|
"learning_rate": 9.792137412291265e-05, |
|
"loss": 1.2979, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5897254261995125, |
|
"learning_rate": 9.788187597096069e-05, |
|
"loss": 1.3018, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.578076556919417, |
|
"learning_rate": 9.784201418798786e-05, |
|
"loss": 1.2939, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5718451016522863, |
|
"learning_rate": 9.780178907671789e-05, |
|
"loss": 1.2871, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5799606456697325, |
|
"learning_rate": 9.776120094263376e-05, |
|
"loss": 1.2803, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5668567319879788, |
|
"learning_rate": 9.772025009397537e-05, |
|
"loss": 1.2905, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5789124237655097, |
|
"learning_rate": 9.767893684173721e-05, |
|
"loss": 1.271, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.585619412138699, |
|
"learning_rate": 9.763726149966596e-05, |
|
"loss": 1.3115, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5859239926184487, |
|
"learning_rate": 9.759522438425813e-05, |
|
"loss": 1.29, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5735625871596765, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 1.2432, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5854811026507778, |
|
"learning_rate": 9.751006611315356e-05, |
|
"loss": 1.3008, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5900623773496287, |
|
"learning_rate": 9.746694560417731e-05, |
|
"loss": 1.2822, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5822405028437867, |
|
"learning_rate": 9.742346461530048e-05, |
|
"loss": 1.2822, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.572768675663712, |
|
"learning_rate": 9.737962347673231e-05, |
|
"loss": 1.2783, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6054400604030475, |
|
"learning_rate": 9.733542252141711e-05, |
|
"loss": 1.292, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5813631045634108, |
|
"learning_rate": 9.729086208503174e-05, |
|
"loss": 1.2803, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5807050077512271, |
|
"learning_rate": 9.724594250598311e-05, |
|
"loss": 1.2949, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5909109801864655, |
|
"learning_rate": 9.720066412540554e-05, |
|
"loss": 1.2695, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5997130269760653, |
|
"learning_rate": 9.715502728715826e-05, |
|
"loss": 1.3262, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5949543658594536, |
|
"learning_rate": 9.710903233782272e-05, |
|
"loss": 1.2852, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.57306909788563, |
|
"learning_rate": 9.706267962669998e-05, |
|
"loss": 1.2896, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5729420362088954, |
|
"learning_rate": 9.701596950580806e-05, |
|
"loss": 1.2944, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5840523135217965, |
|
"learning_rate": 9.696890232987931e-05, |
|
"loss": 1.3315, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5838059803168936, |
|
"learning_rate": 9.692147845635761e-05, |
|
"loss": 1.2759, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.598956713705172, |
|
"learning_rate": 9.687369824539577e-05, |
|
"loss": 1.2949, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5684032035075478, |
|
"learning_rate": 9.682556205985274e-05, |
|
"loss": 1.2656, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6042525592488565, |
|
"learning_rate": 9.677707026529086e-05, |
|
"loss": 1.2734, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5948932876750508, |
|
"learning_rate": 9.672822322997305e-05, |
|
"loss": 1.3013, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5849401513666068, |
|
"learning_rate": 9.667902132486009e-05, |
|
"loss": 1.2871, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5735157149486471, |
|
"learning_rate": 9.662946492360776e-05, |
|
"loss": 1.2852, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.584305724268113, |
|
"learning_rate": 9.657955440256395e-05, |
|
"loss": 1.2622, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5564763345369137, |
|
"learning_rate": 9.652929014076593e-05, |
|
"loss": 1.2876, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5891098312264256, |
|
"learning_rate": 9.647867251993734e-05, |
|
"loss": 1.2642, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5686838714294669, |
|
"learning_rate": 9.642770192448536e-05, |
|
"loss": 1.272, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5750296942467902, |
|
"learning_rate": 9.637637874149779e-05, |
|
"loss": 1.2275, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5643960417977013, |
|
"learning_rate": 9.632470336074009e-05, |
|
"loss": 1.2671, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5852327056741696, |
|
"learning_rate": 9.627267617465243e-05, |
|
"loss": 1.2661, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6187631168037819, |
|
"learning_rate": 9.62202975783467e-05, |
|
"loss": 1.3086, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5975346392701849, |
|
"learning_rate": 9.616756796960353e-05, |
|
"loss": 1.2822, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5861121272844693, |
|
"learning_rate": 9.611448774886924e-05, |
|
"loss": 1.2686, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5894840252008043, |
|
"learning_rate": 9.606105731925283e-05, |
|
"loss": 1.2729, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5611807927613827, |
|
"learning_rate": 9.600727708652289e-05, |
|
"loss": 1.2593, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5963397000712469, |
|
"learning_rate": 9.595314745910456e-05, |
|
"loss": 1.2539, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5735424943279671, |
|
"learning_rate": 9.589866884807635e-05, |
|
"loss": 1.2842, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5721828195245187, |
|
"learning_rate": 9.584384166716714e-05, |
|
"loss": 1.2588, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.581984160350711, |
|
"learning_rate": 9.578866633275288e-05, |
|
"loss": 1.2769, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5783634261178175, |
|
"learning_rate": 9.573314326385359e-05, |
|
"loss": 1.2812, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5970813411028905, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 1.2666, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.586596312906345, |
|
"learning_rate": 9.562105561188069e-05, |
|
"loss": 1.269, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.598789269026695, |
|
"learning_rate": 9.556449188003831e-05, |
|
"loss": 1.312, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5817408021330366, |
|
"learning_rate": 9.550758211616684e-05, |
|
"loss": 1.2749, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5877525727442802, |
|
"learning_rate": 9.545032675245813e-05, |
|
"loss": 1.2949, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.565735316640337, |
|
"learning_rate": 9.539272622372858e-05, |
|
"loss": 1.2646, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5758462655500972, |
|
"learning_rate": 9.533478096741597e-05, |
|
"loss": 1.2842, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5684311218285671, |
|
"learning_rate": 9.527649142357596e-05, |
|
"loss": 1.2607, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5618003842331919, |
|
"learning_rate": 9.521785803487889e-05, |
|
"loss": 1.248, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5876045436622994, |
|
"learning_rate": 9.515888124660638e-05, |
|
"loss": 1.2642, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5655105832120266, |
|
"learning_rate": 9.509956150664796e-05, |
|
"loss": 1.2764, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5729197026608559, |
|
"learning_rate": 9.50398992654976e-05, |
|
"loss": 1.2812, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6001679004926507, |
|
"learning_rate": 9.497989497625035e-05, |
|
"loss": 1.2935, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5764744840651138, |
|
"learning_rate": 9.491954909459895e-05, |
|
"loss": 1.2363, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5929339319221091, |
|
"learning_rate": 9.485886207883022e-05, |
|
"loss": 1.2974, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5764385418022675, |
|
"learning_rate": 9.479783438982172e-05, |
|
"loss": 1.2925, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5837028558309609, |
|
"learning_rate": 9.473646649103818e-05, |
|
"loss": 1.2891, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5650735749077636, |
|
"learning_rate": 9.4674758848528e-05, |
|
"loss": 1.2334, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5586611238688065, |
|
"learning_rate": 9.46127119309197e-05, |
|
"loss": 1.2305, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6080687697649292, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 1.2827, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5993996873094892, |
|
"learning_rate": 9.448760215780217e-05, |
|
"loss": 1.2695, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5778443809302056, |
|
"learning_rate": 9.442454025241847e-05, |
|
"loss": 1.2744, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5746167067960812, |
|
"learning_rate": 9.43611409721806e-05, |
|
"loss": 1.2754, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5733796335171946, |
|
"learning_rate": 9.42974047985639e-05, |
|
"loss": 1.2627, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5836676487926156, |
|
"learning_rate": 9.42333322156023e-05, |
|
"loss": 1.2583, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5553156591047226, |
|
"learning_rate": 9.416892370988444e-05, |
|
"loss": 1.2373, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.582882964454643, |
|
"learning_rate": 9.410417977055011e-05, |
|
"loss": 1.2417, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5669189146341135, |
|
"learning_rate": 9.403910088928651e-05, |
|
"loss": 1.248, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5851076716461637, |
|
"learning_rate": 9.397368756032445e-05, |
|
"loss": 1.2485, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5763454225514788, |
|
"learning_rate": 9.390794028043474e-05, |
|
"loss": 1.2559, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5670534619234323, |
|
"learning_rate": 9.384185954892422e-05, |
|
"loss": 1.2524, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5649816822215726, |
|
"learning_rate": 9.377544586763215e-05, |
|
"loss": 1.2646, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5654358097466196, |
|
"learning_rate": 9.370869974092629e-05, |
|
"loss": 1.23, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5870883099911602, |
|
"learning_rate": 9.364162167569907e-05, |
|
"loss": 1.2319, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5828901563721925, |
|
"learning_rate": 9.357421218136386e-05, |
|
"loss": 1.2515, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5601565336888377, |
|
"learning_rate": 9.350647176985095e-05, |
|
"loss": 1.2588, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5817838980314579, |
|
"learning_rate": 9.343840095560372e-05, |
|
"loss": 1.2612, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5789842413499999, |
|
"learning_rate": 9.337000025557476e-05, |
|
"loss": 1.2642, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5840330292514332, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 1.2705, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5505547742761182, |
|
"learning_rate": 9.323221127850441e-05, |
|
"loss": 1.2285, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.590833567081984, |
|
"learning_rate": 9.316282404787871e-05, |
|
"loss": 1.2666, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5716485112550096, |
|
"learning_rate": 9.309310902429472e-05, |
|
"loss": 1.2563, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5927741075240744, |
|
"learning_rate": 9.30230667371917e-05, |
|
"loss": 1.2559, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5713472314685684, |
|
"learning_rate": 9.295269771849427e-05, |
|
"loss": 1.2632, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5553599915751299, |
|
"learning_rate": 9.288200250260836e-05, |
|
"loss": 1.2393, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5541231337910232, |
|
"learning_rate": 9.281098162641714e-05, |
|
"loss": 1.2393, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5756192048225591, |
|
"learning_rate": 9.273963562927695e-05, |
|
"loss": 1.2627, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5607724586820175, |
|
"learning_rate": 9.266796505301322e-05, |
|
"loss": 1.2319, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5829558605752644, |
|
"learning_rate": 9.259597044191636e-05, |
|
"loss": 1.2144, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5462589451489466, |
|
"learning_rate": 9.252365234273755e-05, |
|
"loss": 1.249, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5728804325543755, |
|
"learning_rate": 9.24510113046847e-05, |
|
"loss": 1.2725, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5661301120279436, |
|
"learning_rate": 9.237804787941819e-05, |
|
"loss": 1.251, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5590909151931563, |
|
"learning_rate": 9.230476262104677e-05, |
|
"loss": 1.2544, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.547954794791917, |
|
"learning_rate": 9.223115608612325e-05, |
|
"loss": 1.2505, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5532925940395454, |
|
"learning_rate": 9.215722883364033e-05, |
|
"loss": 1.2173, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5384117130456804, |
|
"learning_rate": 9.208298142502636e-05, |
|
"loss": 1.27, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5603423300490713, |
|
"learning_rate": 9.200841442414106e-05, |
|
"loss": 1.2266, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5367634686371322, |
|
"learning_rate": 9.193352839727121e-05, |
|
"loss": 1.2163, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5645847437540861, |
|
"learning_rate": 9.185832391312644e-05, |
|
"loss": 1.2354, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5663009948987631, |
|
"learning_rate": 9.17828015428348e-05, |
|
"loss": 1.2354, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5552105400298469, |
|
"learning_rate": 9.17069618599385e-05, |
|
"loss": 1.2383, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5754960899676003, |
|
"learning_rate": 9.163080544038952e-05, |
|
"loss": 1.2456, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.563524941652744, |
|
"learning_rate": 9.155433286254525e-05, |
|
"loss": 1.2554, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5517648385505713, |
|
"learning_rate": 9.147754470716408e-05, |
|
"loss": 1.2266, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5559354777913459, |
|
"learning_rate": 9.140044155740101e-05, |
|
"loss": 1.2661, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5533913100102068, |
|
"learning_rate": 9.132302399880321e-05, |
|
"loss": 1.2559, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5643293887010528, |
|
"learning_rate": 9.124529261930559e-05, |
|
"loss": 1.2612, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5582480686922173, |
|
"learning_rate": 9.116724800922629e-05, |
|
"loss": 1.2466, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.596671009095723, |
|
"learning_rate": 9.108889076126226e-05, |
|
"loss": 1.2827, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5549405425453204, |
|
"learning_rate": 9.101022147048473e-05, |
|
"loss": 1.2354, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5568898420832058, |
|
"learning_rate": 9.093124073433463e-05, |
|
"loss": 1.2285, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5495005874150289, |
|
"learning_rate": 9.085194915261818e-05, |
|
"loss": 1.2461, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5707362551733327, |
|
"learning_rate": 9.077234732750224e-05, |
|
"loss": 1.2231, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5598850464506612, |
|
"learning_rate": 9.069243586350975e-05, |
|
"loss": 1.2583, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5524944518616185, |
|
"learning_rate": 9.061221536751517e-05, |
|
"loss": 1.2222, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.545356383712922, |
|
"learning_rate": 9.053168644873984e-05, |
|
"loss": 1.2178, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5762727272844113, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 1.2349, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5725082474964974, |
|
"learning_rate": 9.0369705791439e-05, |
|
"loss": 1.2632, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5621844240082758, |
|
"learning_rate": 9.028825528304892e-05, |
|
"loss": 1.2373, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5634252832307994, |
|
"learning_rate": 9.020649881213958e-05, |
|
"loss": 1.2554, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5813087529333412, |
|
"learning_rate": 9.012443699959705e-05, |
|
"loss": 1.2505, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5606046951042896, |
|
"learning_rate": 9.004207046862624e-05, |
|
"loss": 1.2734, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5619672577392087, |
|
"learning_rate": 8.995939984474624e-05, |
|
"loss": 1.2349, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5872821822263775, |
|
"learning_rate": 8.987642575578545e-05, |
|
"loss": 1.2314, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5715303843479439, |
|
"learning_rate": 8.979314883187693e-05, |
|
"loss": 1.2227, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5741431027758869, |
|
"learning_rate": 8.970956970545355e-05, |
|
"loss": 1.2271, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6059321787013856, |
|
"learning_rate": 8.962568901124327e-05, |
|
"loss": 1.2534, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.556501982362245, |
|
"learning_rate": 8.954150738626414e-05, |
|
"loss": 1.2363, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6086209059398, |
|
"learning_rate": 8.945702546981969e-05, |
|
"loss": 1.2847, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5564876767683732, |
|
"learning_rate": 8.93722439034939e-05, |
|
"loss": 1.2153, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5942368515326768, |
|
"learning_rate": 8.928716333114643e-05, |
|
"loss": 1.2588, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5662096725519149, |
|
"learning_rate": 8.920178439890765e-05, |
|
"loss": 1.2441, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5683718909670799, |
|
"learning_rate": 8.911610775517382e-05, |
|
"loss": 1.2275, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5430162332075814, |
|
"learning_rate": 8.903013405060211e-05, |
|
"loss": 1.2188, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5646245018782939, |
|
"learning_rate": 8.894386393810563e-05, |
|
"loss": 1.2305, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5775822002852916, |
|
"learning_rate": 8.885729807284856e-05, |
|
"loss": 1.2432, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5984775169761717, |
|
"learning_rate": 8.877043711224108e-05, |
|
"loss": 1.2598, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.6320516038682688, |
|
"learning_rate": 8.868328171593448e-05, |
|
"loss": 1.2437, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5529680708320875, |
|
"learning_rate": 8.859583254581605e-05, |
|
"loss": 1.2344, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5742550531241745, |
|
"learning_rate": 8.85080902660041e-05, |
|
"loss": 1.23, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5543548657925883, |
|
"learning_rate": 8.842005554284296e-05, |
|
"loss": 1.2632, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5639830569646097, |
|
"learning_rate": 8.83317290448978e-05, |
|
"loss": 1.2314, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5824488432279822, |
|
"learning_rate": 8.824311144294965e-05, |
|
"loss": 1.2661, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5586136460629528, |
|
"learning_rate": 8.815420340999033e-05, |
|
"loss": 1.1987, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5642272069241788, |
|
"learning_rate": 8.806500562121723e-05, |
|
"loss": 1.21, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5987204031332185, |
|
"learning_rate": 8.797551875402827e-05, |
|
"loss": 1.2246, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5530693719354123, |
|
"learning_rate": 8.788574348801675e-05, |
|
"loss": 1.2202, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.532182798905587, |
|
"learning_rate": 8.77956805049661e-05, |
|
"loss": 1.2026, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5554774523967565, |
|
"learning_rate": 8.770533048884482e-05, |
|
"loss": 1.2256, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5496250811271655, |
|
"learning_rate": 8.761469412580125e-05, |
|
"loss": 1.2197, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5513817728642723, |
|
"learning_rate": 8.75237721041583e-05, |
|
"loss": 1.2026, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5783631239023462, |
|
"learning_rate": 8.74325651144083e-05, |
|
"loss": 1.2666, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5570391531595814, |
|
"learning_rate": 8.73410738492077e-05, |
|
"loss": 1.2158, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5991957140636426, |
|
"learning_rate": 8.724929900337186e-05, |
|
"loss": 1.27, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5642120830729553, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 1.2095, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5848386239977618, |
|
"learning_rate": 8.706490135981855e-05, |
|
"loss": 1.2495, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5511112850774736, |
|
"learning_rate": 8.697227996247861e-05, |
|
"loss": 1.2305, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5846633795708265, |
|
"learning_rate": 8.687937778524786e-05, |
|
"loss": 1.209, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5540295140780195, |
|
"learning_rate": 8.678619553365659e-05, |
|
"loss": 1.2354, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5414463070611637, |
|
"learning_rate": 8.669273391536204e-05, |
|
"loss": 1.2344, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5427098846896791, |
|
"learning_rate": 8.659899364014309e-05, |
|
"loss": 1.209, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5574933062513657, |
|
"learning_rate": 8.650497541989482e-05, |
|
"loss": 1.2178, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5444595039607957, |
|
"learning_rate": 8.641067996862311e-05, |
|
"loss": 1.2363, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5655086471261345, |
|
"learning_rate": 8.631610800243926e-05, |
|
"loss": 1.2236, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5676258965708015, |
|
"learning_rate": 8.622126023955446e-05, |
|
"loss": 1.2222, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5738164390287753, |
|
"learning_rate": 8.612613740027443e-05, |
|
"loss": 1.2437, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5763979630809666, |
|
"learning_rate": 8.603074020699393e-05, |
|
"loss": 1.2588, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5694093785228663, |
|
"learning_rate": 8.59350693841912e-05, |
|
"loss": 1.2305, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5668385068217152, |
|
"learning_rate": 8.583912565842257e-05, |
|
"loss": 1.2324, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5681247446685661, |
|
"learning_rate": 8.574290975831685e-05, |
|
"loss": 1.2461, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5705548704956539, |
|
"learning_rate": 8.564642241456986e-05, |
|
"loss": 1.2529, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5490624808218363, |
|
"learning_rate": 8.554966435993882e-05, |
|
"loss": 1.2119, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5327956614769455, |
|
"learning_rate": 8.545263632923687e-05, |
|
"loss": 1.2051, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5394754263176863, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 1.2207, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.547540468306315, |
|
"learning_rate": 8.525777328911846e-05, |
|
"loss": 1.2241, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5262754503627509, |
|
"learning_rate": 8.515993975955727e-05, |
|
"loss": 1.2227, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5839641855087577, |
|
"learning_rate": 8.506183921362443e-05, |
|
"loss": 1.228, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5658179501896037, |
|
"learning_rate": 8.49634723963284e-05, |
|
"loss": 1.2534, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5500813153743531, |
|
"learning_rate": 8.486484005469977e-05, |
|
"loss": 1.2104, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.547119722986805, |
|
"learning_rate": 8.476594293778561e-05, |
|
"loss": 1.1938, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5596429046688678, |
|
"learning_rate": 8.466678179664379e-05, |
|
"loss": 1.2148, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5705084720451127, |
|
"learning_rate": 8.456735738433723e-05, |
|
"loss": 1.2432, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5895336557197053, |
|
"learning_rate": 8.44676704559283e-05, |
|
"loss": 1.252, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5774427874239505, |
|
"learning_rate": 8.436772176847294e-05, |
|
"loss": 1.2251, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5394619079429321, |
|
"learning_rate": 8.4267512081015e-05, |
|
"loss": 1.2329, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5755505544475856, |
|
"learning_rate": 8.416704215458043e-05, |
|
"loss": 1.2471, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5725344759637591, |
|
"learning_rate": 8.406631275217156e-05, |
|
"loss": 1.2397, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5518081872615708, |
|
"learning_rate": 8.396532463876124e-05, |
|
"loss": 1.248, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5841438683442003, |
|
"learning_rate": 8.386407858128706e-05, |
|
"loss": 1.2339, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5513391183560247, |
|
"learning_rate": 8.376257534864553e-05, |
|
"loss": 1.2373, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5702720231441866, |
|
"learning_rate": 8.366081571168625e-05, |
|
"loss": 1.2202, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5401170183476215, |
|
"learning_rate": 8.355880044320598e-05, |
|
"loss": 1.2036, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5584668011986428, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 1.2109, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5651374075473236, |
|
"learning_rate": 8.335400611257067e-05, |
|
"loss": 1.2305, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5576999267528816, |
|
"learning_rate": 8.32512286056924e-05, |
|
"loss": 1.208, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5417887475825113, |
|
"learning_rate": 8.314819857783503e-05, |
|
"loss": 1.2212, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5697549522190456, |
|
"learning_rate": 8.304491681144306e-05, |
|
"loss": 1.2227, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5916950215563477, |
|
"learning_rate": 8.29413840908729e-05, |
|
"loss": 1.2256, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5429495837198551, |
|
"learning_rate": 8.283760120238672e-05, |
|
"loss": 1.2036, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5586180705362634, |
|
"learning_rate": 8.273356893414659e-05, |
|
"loss": 1.2095, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5575989259323167, |
|
"learning_rate": 8.262928807620843e-05, |
|
"loss": 1.2231, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5441346695675087, |
|
"learning_rate": 8.252475942051605e-05, |
|
"loss": 1.2056, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5639413959564339, |
|
"learning_rate": 8.241998376089508e-05, |
|
"loss": 1.2173, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.590712779900491, |
|
"learning_rate": 8.231496189304704e-05, |
|
"loss": 1.2568, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5777848614323381, |
|
"learning_rate": 8.220969461454322e-05, |
|
"loss": 1.2393, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5340681608181079, |
|
"learning_rate": 8.210418272481859e-05, |
|
"loss": 1.2041, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5677393547154248, |
|
"learning_rate": 8.199842702516583e-05, |
|
"loss": 1.2192, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5437413315036653, |
|
"learning_rate": 8.18924283187292e-05, |
|
"loss": 1.2139, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5659533238197777, |
|
"learning_rate": 8.178618741049842e-05, |
|
"loss": 1.207, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5070802266343845, |
|
"learning_rate": 8.167970510730253e-05, |
|
"loss": 1.1914, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5556418994287101, |
|
"learning_rate": 8.157298221780389e-05, |
|
"loss": 1.1938, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5473208831723899, |
|
"learning_rate": 8.146601955249188e-05, |
|
"loss": 1.2183, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5703775361988737, |
|
"learning_rate": 8.135881792367686e-05, |
|
"loss": 1.2417, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.561879279645762, |
|
"learning_rate": 8.125137814548393e-05, |
|
"loss": 1.2148, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.550588101278353, |
|
"learning_rate": 8.114370103384681e-05, |
|
"loss": 1.228, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5259832736436021, |
|
"learning_rate": 8.103578740650156e-05, |
|
"loss": 1.21, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5147619335698311, |
|
"learning_rate": 8.092763808298048e-05, |
|
"loss": 1.2026, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5504438033485523, |
|
"learning_rate": 8.081925388460578e-05, |
|
"loss": 1.2026, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5446547745345772, |
|
"learning_rate": 8.07106356344834e-05, |
|
"loss": 1.2236, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5282234491024778, |
|
"learning_rate": 8.060178415749674e-05, |
|
"loss": 1.2046, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5685704417565632, |
|
"learning_rate": 8.049270028030046e-05, |
|
"loss": 1.1948, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5387912641493281, |
|
"learning_rate": 8.038338483131407e-05, |
|
"loss": 1.1987, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5524731924712689, |
|
"learning_rate": 8.027383864071573e-05, |
|
"loss": 1.2261, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5326013165738928, |
|
"learning_rate": 8.016406254043595e-05, |
|
"loss": 1.1987, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5694990420557361, |
|
"learning_rate": 8.005405736415126e-05, |
|
"loss": 1.2246, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5512702902650021, |
|
"learning_rate": 7.994382394727784e-05, |
|
"loss": 1.25, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5627692554190333, |
|
"learning_rate": 7.983336312696522e-05, |
|
"loss": 1.2344, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5723746306616886, |
|
"learning_rate": 7.972267574208991e-05, |
|
"loss": 1.2266, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5785199319910487, |
|
"learning_rate": 7.961176263324901e-05, |
|
"loss": 1.2046, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5638166540487942, |
|
"learning_rate": 7.950062464275387e-05, |
|
"loss": 1.2124, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5591166049939134, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 1.2251, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5505127627644203, |
|
"learning_rate": 7.927767739457897e-05, |
|
"loss": 1.2158, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5650623189448578, |
|
"learning_rate": 7.916586983003533e-05, |
|
"loss": 1.208, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5691768474472332, |
|
"learning_rate": 7.905384077009693e-05, |
|
"loss": 1.1875, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5660613389542086, |
|
"learning_rate": 7.894159106554997e-05, |
|
"loss": 1.2227, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5582910482328445, |
|
"learning_rate": 7.882912156885637e-05, |
|
"loss": 1.2173, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5687428665442464, |
|
"learning_rate": 7.871643313414718e-05, |
|
"loss": 1.2188, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5700426706301734, |
|
"learning_rate": 7.860352661721619e-05, |
|
"loss": 1.2534, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5640156767511431, |
|
"learning_rate": 7.849040287551331e-05, |
|
"loss": 1.2256, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5730028379052688, |
|
"learning_rate": 7.837706276813819e-05, |
|
"loss": 1.2383, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5729196218457621, |
|
"learning_rate": 7.82635071558336e-05, |
|
"loss": 1.2539, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5389989913836808, |
|
"learning_rate": 7.814973690097893e-05, |
|
"loss": 1.2114, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.563817849905785, |
|
"learning_rate": 7.803575286758364e-05, |
|
"loss": 1.1978, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5742098449729457, |
|
"learning_rate": 7.79215559212807e-05, |
|
"loss": 1.2104, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5482109799457336, |
|
"learning_rate": 7.780714692932002e-05, |
|
"loss": 1.1978, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5619582390386062, |
|
"learning_rate": 7.769252676056187e-05, |
|
"loss": 1.2197, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5707713766775032, |
|
"learning_rate": 7.757769628547018e-05, |
|
"loss": 1.2349, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5491475936260862, |
|
"learning_rate": 7.746265637610613e-05, |
|
"loss": 1.1758, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5500938802153376, |
|
"learning_rate": 7.734740790612136e-05, |
|
"loss": 1.2041, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5293011700429642, |
|
"learning_rate": 7.723195175075136e-05, |
|
"loss": 1.1821, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5596089219360537, |
|
"learning_rate": 7.711628878680892e-05, |
|
"loss": 1.2539, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5754494198608471, |
|
"learning_rate": 7.700041989267736e-05, |
|
"loss": 1.2378, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5705286060826581, |
|
"learning_rate": 7.688434594830392e-05, |
|
"loss": 1.2192, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5320392604087235, |
|
"learning_rate": 7.676806783519304e-05, |
|
"loss": 1.2021, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5573096847631821, |
|
"learning_rate": 7.66515864363997e-05, |
|
"loss": 1.229, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5670482309405055, |
|
"learning_rate": 7.653490263652269e-05, |
|
"loss": 1.2324, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5214893797779285, |
|
"learning_rate": 7.641801732169795e-05, |
|
"loss": 1.1968, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5729861431933309, |
|
"learning_rate": 7.630093137959171e-05, |
|
"loss": 1.2163, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5552719599472679, |
|
"learning_rate": 7.618364569939391e-05, |
|
"loss": 1.2075, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5465863524732877, |
|
"learning_rate": 7.606616117181128e-05, |
|
"loss": 1.1968, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5748817703147777, |
|
"learning_rate": 7.594847868906076e-05, |
|
"loss": 1.2227, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5439693138286017, |
|
"learning_rate": 7.583059914486257e-05, |
|
"loss": 1.2031, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5635085757798922, |
|
"learning_rate": 7.571252343443349e-05, |
|
"loss": 1.2324, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5490256924391643, |
|
"learning_rate": 7.559425245448006e-05, |
|
"loss": 1.1953, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5581764204211717, |
|
"learning_rate": 7.547578710319174e-05, |
|
"loss": 1.2158, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5554727329505762, |
|
"learning_rate": 7.535712828023416e-05, |
|
"loss": 1.2236, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5395464303361225, |
|
"learning_rate": 7.52382768867422e-05, |
|
"loss": 1.2114, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5613556527369445, |
|
"learning_rate": 7.511923382531317e-05, |
|
"loss": 1.1792, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5802200387551621, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.1899, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5673594518905887, |
|
"learning_rate": 7.488057631630437e-05, |
|
"loss": 1.2236, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5439832310153052, |
|
"learning_rate": 7.476096368116974e-05, |
|
"loss": 1.2168, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.601300632455188, |
|
"learning_rate": 7.464116300297458e-05, |
|
"loss": 1.2534, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5489145931334055, |
|
"learning_rate": 7.452117519152542e-05, |
|
"loss": 1.2007, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5398362128758979, |
|
"learning_rate": 7.440100115804991e-05, |
|
"loss": 1.1743, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5816646831232165, |
|
"learning_rate": 7.428064181518997e-05, |
|
"loss": 1.2344, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5633431228968494, |
|
"learning_rate": 7.416009807699482e-05, |
|
"loss": 1.2017, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5589499609875991, |
|
"learning_rate": 7.403937085891397e-05, |
|
"loss": 1.2095, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5493151850885487, |
|
"learning_rate": 7.391846107779047e-05, |
|
"loss": 1.1865, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.540833710956897, |
|
"learning_rate": 7.379736965185368e-05, |
|
"loss": 1.2041, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.524036751110743, |
|
"learning_rate": 7.367609750071252e-05, |
|
"loss": 1.1826, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.569282857600664, |
|
"learning_rate": 7.355464554534837e-05, |
|
"loss": 1.187, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5524428783713417, |
|
"learning_rate": 7.343301470810808e-05, |
|
"loss": 1.2202, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5623114711099751, |
|
"learning_rate": 7.331120591269701e-05, |
|
"loss": 1.1899, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5301869310349011, |
|
"learning_rate": 7.318922008417203e-05, |
|
"loss": 1.1919, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5687590875970512, |
|
"learning_rate": 7.30670581489344e-05, |
|
"loss": 1.2056, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5589775032893604, |
|
"learning_rate": 7.294472103472281e-05, |
|
"loss": 1.2188, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.578465226917116, |
|
"learning_rate": 7.282220967060633e-05, |
|
"loss": 1.2158, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.542415194752666, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 1.187, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5438469592749641, |
|
"learning_rate": 7.257666791554448e-05, |
|
"loss": 1.1494, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.56103545827402, |
|
"learning_rate": 7.245363938932551e-05, |
|
"loss": 1.2085, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5543439263354124, |
|
"learning_rate": 7.233044034264034e-05, |
|
"loss": 1.186, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5621095189445257, |
|
"learning_rate": 7.220707171110382e-05, |
|
"loss": 1.2036, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.578507048853321, |
|
"learning_rate": 7.20835344316187e-05, |
|
"loss": 1.2158, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5393466899110284, |
|
"learning_rate": 7.195982944236851e-05, |
|
"loss": 1.1807, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5390437694953595, |
|
"learning_rate": 7.183595768281043e-05, |
|
"loss": 1.1914, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.525445074400616, |
|
"learning_rate": 7.171192009366814e-05, |
|
"loss": 1.1655, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5397543259123138, |
|
"learning_rate": 7.158771761692464e-05, |
|
"loss": 1.2139, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5301908429870645, |
|
"learning_rate": 7.146335119581523e-05, |
|
"loss": 1.2163, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5528560850589617, |
|
"learning_rate": 7.133882177482019e-05, |
|
"loss": 1.2046, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5223068286596114, |
|
"learning_rate": 7.121413029965769e-05, |
|
"loss": 1.1855, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5375221567597188, |
|
"learning_rate": 7.108927771727661e-05, |
|
"loss": 1.1841, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5528153279757392, |
|
"learning_rate": 7.096426497584933e-05, |
|
"loss": 1.2002, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5641067857153084, |
|
"learning_rate": 7.083909302476453e-05, |
|
"loss": 1.1914, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5543757360519023, |
|
"learning_rate": 7.071376281461994e-05, |
|
"loss": 1.2026, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5521178435951897, |
|
"learning_rate": 7.058827529721525e-05, |
|
"loss": 1.1816, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.545288440889916, |
|
"learning_rate": 7.04626314255447e-05, |
|
"loss": 1.2202, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5437815680869471, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 1.2031, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5551130898620283, |
|
"learning_rate": 7.021087843731302e-05, |
|
"loss": 1.189, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5553626624921362, |
|
"learning_rate": 7.008477123264848e-05, |
|
"loss": 1.2261, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5452825817203325, |
|
"learning_rate": 6.99585114974968e-05, |
|
"loss": 1.1768, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5642586993770626, |
|
"learning_rate": 6.98321001907167e-05, |
|
"loss": 1.1841, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5398272756531507, |
|
"learning_rate": 6.97055382723181e-05, |
|
"loss": 1.1865, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5522752869750288, |
|
"learning_rate": 6.957882670345458e-05, |
|
"loss": 1.2061, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5308798815860719, |
|
"learning_rate": 6.94519664464163e-05, |
|
"loss": 1.1948, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5495493844679439, |
|
"learning_rate": 6.932495846462261e-05, |
|
"loss": 1.1914, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5482416736955646, |
|
"learning_rate": 6.91978037226147e-05, |
|
"loss": 1.2017, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5503012321222697, |
|
"learning_rate": 6.90705031860483e-05, |
|
"loss": 1.2119, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5725638118414337, |
|
"learning_rate": 6.894305782168638e-05, |
|
"loss": 1.1899, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5624448379824697, |
|
"learning_rate": 6.881546859739179e-05, |
|
"loss": 1.23, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5561279665840796, |
|
"learning_rate": 6.868773648211983e-05, |
|
"loss": 1.2017, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5396579505393837, |
|
"learning_rate": 6.855986244591104e-05, |
|
"loss": 1.1733, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5593889291768533, |
|
"learning_rate": 6.843184745988373e-05, |
|
"loss": 1.2119, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5517593854010368, |
|
"learning_rate": 6.830369249622662e-05, |
|
"loss": 1.2114, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.565602467217196, |
|
"learning_rate": 6.817539852819149e-05, |
|
"loss": 1.1968, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5454828029904284, |
|
"learning_rate": 6.804696653008575e-05, |
|
"loss": 1.1938, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5312848354649821, |
|
"learning_rate": 6.7918397477265e-05, |
|
"loss": 1.1909, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5379892410248488, |
|
"learning_rate": 6.778969234612584e-05, |
|
"loss": 1.1733, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5640301128196445, |
|
"learning_rate": 6.76608521140981e-05, |
|
"loss": 1.1938, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5659554102145757, |
|
"learning_rate": 6.753187775963773e-05, |
|
"loss": 1.2192, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5330483555414703, |
|
"learning_rate": 6.740277026221923e-05, |
|
"loss": 1.2163, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5518117687862835, |
|
"learning_rate": 6.727353060232822e-05, |
|
"loss": 1.1904, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.556594787840222, |
|
"learning_rate": 6.714415976145402e-05, |
|
"loss": 1.2056, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5718671551889151, |
|
"learning_rate": 6.701465872208216e-05, |
|
"loss": 1.2271, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5512989214363787, |
|
"learning_rate": 6.688502846768696e-05, |
|
"loss": 1.2031, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5494732510246357, |
|
"learning_rate": 6.675526998272405e-05, |
|
"loss": 1.2119, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5378563166396855, |
|
"learning_rate": 6.662538425262285e-05, |
|
"loss": 1.1621, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5480052049772425, |
|
"learning_rate": 6.649537226377915e-05, |
|
"loss": 1.1841, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5636382985336955, |
|
"learning_rate": 6.636523500354759e-05, |
|
"loss": 1.2056, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5416198773488824, |
|
"learning_rate": 6.623497346023418e-05, |
|
"loss": 1.1646, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5478977270550187, |
|
"learning_rate": 6.610458862308872e-05, |
|
"loss": 1.1914, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5607213727964705, |
|
"learning_rate": 6.59740814822974e-05, |
|
"loss": 1.2012, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5434978380272973, |
|
"learning_rate": 6.584345302897523e-05, |
|
"loss": 1.167, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5616431237123319, |
|
"learning_rate": 6.571270425515843e-05, |
|
"loss": 1.1938, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5718848045230116, |
|
"learning_rate": 6.558183615379707e-05, |
|
"loss": 1.1968, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5468507624315109, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 1.1719, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5648647170806229, |
|
"learning_rate": 6.531974594476425e-05, |
|
"loss": 1.207, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5503576509488237, |
|
"learning_rate": 6.518852582749373e-05, |
|
"loss": 1.1992, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5609821252964683, |
|
"learning_rate": 6.505719036346539e-05, |
|
"loss": 1.1997, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5498062769196067, |
|
"learning_rate": 6.492574055008473e-05, |
|
"loss": 1.1875, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5539230993166063, |
|
"learning_rate": 6.479417738562576e-05, |
|
"loss": 1.1909, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5496978328792177, |
|
"learning_rate": 6.466250186922325e-05, |
|
"loss": 1.2139, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.558021764032463, |
|
"learning_rate": 6.45307150008652e-05, |
|
"loss": 1.1943, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5762104557001627, |
|
"learning_rate": 6.439881778138531e-05, |
|
"loss": 1.2148, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 1176, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"total_flos": 4.0453184645024973e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|