{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9977324263038548,
"eval_steps": 500,
"global_step": 330,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030234315948601664,
"grad_norm": 0.6349862119317693,
"learning_rate": 5.000000000000001e-07,
"loss": 1.3237,
"step": 1
},
{
"epoch": 0.006046863189720333,
"grad_norm": 0.6915137231647266,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.3595,
"step": 2
},
{
"epoch": 0.009070294784580499,
"grad_norm": 0.623700079073619,
"learning_rate": 1.5e-06,
"loss": 1.343,
"step": 3
},
{
"epoch": 0.012093726379440665,
"grad_norm": 0.7242880491963869,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.3527,
"step": 4
},
{
"epoch": 0.015117157974300832,
"grad_norm": 0.6516906859598985,
"learning_rate": 2.5e-06,
"loss": 1.3319,
"step": 5
},
{
"epoch": 0.018140589569160998,
"grad_norm": 0.5742747957897,
"learning_rate": 3e-06,
"loss": 1.342,
"step": 6
},
{
"epoch": 0.021164021164021163,
"grad_norm": 0.557815390462239,
"learning_rate": 3.5e-06,
"loss": 1.3152,
"step": 7
},
{
"epoch": 0.02418745275888133,
"grad_norm": 0.4620246107786041,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2963,
"step": 8
},
{
"epoch": 0.027210884353741496,
"grad_norm": 0.44763809541022137,
"learning_rate": 4.5e-06,
"loss": 1.2895,
"step": 9
},
{
"epoch": 0.030234315948601664,
"grad_norm": 0.3416187088663793,
"learning_rate": 5e-06,
"loss": 1.2531,
"step": 10
},
{
"epoch": 0.03325774754346183,
"grad_norm": 0.31917539621933483,
"learning_rate": 4.999970800043822e-06,
"loss": 1.2006,
"step": 11
},
{
"epoch": 0.036281179138321996,
"grad_norm": 0.27239571970104204,
"learning_rate": 4.9998832008573975e-06,
"loss": 1.1767,
"step": 12
},
{
"epoch": 0.039304610733182165,
"grad_norm": 0.32495241030295385,
"learning_rate": 4.999737204487039e-06,
"loss": 1.1951,
"step": 13
},
{
"epoch": 0.042328042328042326,
"grad_norm": 0.31114523478470957,
"learning_rate": 4.999532814343219e-06,
"loss": 1.1474,
"step": 14
},
{
"epoch": 0.045351473922902494,
"grad_norm": 0.26573282398874887,
"learning_rate": 4.999270035200483e-06,
"loss": 1.1684,
"step": 15
},
{
"epoch": 0.04837490551776266,
"grad_norm": 0.27675989125666167,
"learning_rate": 4.998948873197342e-06,
"loss": 1.142,
"step": 16
},
{
"epoch": 0.05139833711262283,
"grad_norm": 0.2341024474066861,
"learning_rate": 4.99856933583613e-06,
"loss": 1.1735,
"step": 17
},
{
"epoch": 0.05442176870748299,
"grad_norm": 0.20679018253539813,
"learning_rate": 4.998131431982826e-06,
"loss": 1.0896,
"step": 18
},
{
"epoch": 0.05744520030234316,
"grad_norm": 0.21159362728987222,
"learning_rate": 4.9976351718668485e-06,
"loss": 1.1191,
"step": 19
},
{
"epoch": 0.06046863189720333,
"grad_norm": 0.19379985234830382,
"learning_rate": 4.9970805670808174e-06,
"loss": 1.1162,
"step": 20
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.2039064731806591,
"learning_rate": 4.9964676305802794e-06,
"loss": 1.1155,
"step": 21
},
{
"epoch": 0.06651549508692366,
"grad_norm": 0.22133580902562022,
"learning_rate": 4.995796376683411e-06,
"loss": 1.0603,
"step": 22
},
{
"epoch": 0.06953892668178382,
"grad_norm": 0.24913058306438574,
"learning_rate": 4.9950668210706795e-06,
"loss": 1.0854,
"step": 23
},
{
"epoch": 0.07256235827664399,
"grad_norm": 0.22434864947712013,
"learning_rate": 4.994278980784478e-06,
"loss": 1.0601,
"step": 24
},
{
"epoch": 0.07558578987150416,
"grad_norm": 0.18349247230596857,
"learning_rate": 4.9934328742287285e-06,
"loss": 1.1042,
"step": 25
},
{
"epoch": 0.07860922146636433,
"grad_norm": 0.1585429266996897,
"learning_rate": 4.992528521168449e-06,
"loss": 1.0409,
"step": 26
},
{
"epoch": 0.08163265306122448,
"grad_norm": 0.16168593725598268,
"learning_rate": 4.991565942729298e-06,
"loss": 1.0341,
"step": 27
},
{
"epoch": 0.08465608465608465,
"grad_norm": 0.19566832668054185,
"learning_rate": 4.990545161397073e-06,
"loss": 1.0689,
"step": 28
},
{
"epoch": 0.08767951625094482,
"grad_norm": 0.2499930738278608,
"learning_rate": 4.989466201017188e-06,
"loss": 1.0096,
"step": 29
},
{
"epoch": 0.09070294784580499,
"grad_norm": 0.2779488624344162,
"learning_rate": 4.988329086794122e-06,
"loss": 1.0609,
"step": 30
},
{
"epoch": 0.09372637944066516,
"grad_norm": 0.2244846945907016,
"learning_rate": 4.987133845290823e-06,
"loss": 1.0366,
"step": 31
},
{
"epoch": 0.09674981103552532,
"grad_norm": 0.17994766023159892,
"learning_rate": 4.98588050442809e-06,
"loss": 1.0314,
"step": 32
},
{
"epoch": 0.09977324263038549,
"grad_norm": 0.22279237142259942,
"learning_rate": 4.984569093483922e-06,
"loss": 1.0445,
"step": 33
},
{
"epoch": 0.10279667422524566,
"grad_norm": 0.2494526014297992,
"learning_rate": 4.983199643092833e-06,
"loss": 1.0344,
"step": 34
},
{
"epoch": 0.10582010582010581,
"grad_norm": 0.21434458455232053,
"learning_rate": 4.981772185245135e-06,
"loss": 1.0421,
"step": 35
},
{
"epoch": 0.10884353741496598,
"grad_norm": 0.18307769428152484,
"learning_rate": 4.980286753286196e-06,
"loss": 0.9864,
"step": 36
},
{
"epoch": 0.11186696900982615,
"grad_norm": 0.21179293089346243,
"learning_rate": 4.97874338191565e-06,
"loss": 0.9842,
"step": 37
},
{
"epoch": 0.11489040060468632,
"grad_norm": 0.23379777419897857,
"learning_rate": 4.977142107186602e-06,
"loss": 0.9955,
"step": 38
},
{
"epoch": 0.11791383219954649,
"grad_norm": 0.20298340697744424,
"learning_rate": 4.975482966504772e-06,
"loss": 0.9957,
"step": 39
},
{
"epoch": 0.12093726379440665,
"grad_norm": 0.22788321802784506,
"learning_rate": 4.973765998627628e-06,
"loss": 0.9909,
"step": 40
},
{
"epoch": 0.12396069538926682,
"grad_norm": 0.22447377185154144,
"learning_rate": 4.97199124366348e-06,
"loss": 0.9995,
"step": 41
},
{
"epoch": 0.12698412698412698,
"grad_norm": 0.19695029744427425,
"learning_rate": 4.970158743070542e-06,
"loss": 0.9781,
"step": 42
},
{
"epoch": 0.13000755857898716,
"grad_norm": 0.178963231333608,
"learning_rate": 4.9682685396559625e-06,
"loss": 0.9779,
"step": 43
},
{
"epoch": 0.1330309901738473,
"grad_norm": 0.1873471219218099,
"learning_rate": 4.966320677574828e-06,
"loss": 0.9796,
"step": 44
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.22949932135410833,
"learning_rate": 4.964315202329127e-06,
"loss": 0.9965,
"step": 45
},
{
"epoch": 0.13907785336356765,
"grad_norm": 0.2274052062281532,
"learning_rate": 4.9622521607666936e-06,
"loss": 0.9625,
"step": 46
},
{
"epoch": 0.1421012849584278,
"grad_norm": 0.1806669455946557,
"learning_rate": 4.960131601080104e-06,
"loss": 0.9807,
"step": 47
},
{
"epoch": 0.14512471655328799,
"grad_norm": 0.19467061044424094,
"learning_rate": 4.957953572805558e-06,
"loss": 0.9615,
"step": 48
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.2731410757300855,
"learning_rate": 4.9557181268217225e-06,
"loss": 0.9819,
"step": 49
},
{
"epoch": 0.15117157974300832,
"grad_norm": 0.19042718807008738,
"learning_rate": 4.953425315348534e-06,
"loss": 0.9547,
"step": 50
},
{
"epoch": 0.15419501133786848,
"grad_norm": 0.16643927370098177,
"learning_rate": 4.9510751919459895e-06,
"loss": 0.9892,
"step": 51
},
{
"epoch": 0.15721844293272866,
"grad_norm": 0.2524323083468839,
"learning_rate": 4.94866781151289e-06,
"loss": 1.0181,
"step": 52
},
{
"epoch": 0.1602418745275888,
"grad_norm": 0.27545197371921265,
"learning_rate": 4.946203230285558e-06,
"loss": 0.9713,
"step": 53
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.17013540947461778,
"learning_rate": 4.943681505836523e-06,
"loss": 1.0005,
"step": 54
},
{
"epoch": 0.16628873771730915,
"grad_norm": 0.18283369295290966,
"learning_rate": 4.941102697073181e-06,
"loss": 0.9183,
"step": 55
},
{
"epoch": 0.1693121693121693,
"grad_norm": 0.2189807492467087,
"learning_rate": 4.938466864236413e-06,
"loss": 0.9683,
"step": 56
},
{
"epoch": 0.17233560090702948,
"grad_norm": 0.2766806847549335,
"learning_rate": 4.935774068899184e-06,
"loss": 0.958,
"step": 57
},
{
"epoch": 0.17535903250188964,
"grad_norm": 0.2295270706172793,
"learning_rate": 4.933024373965097e-06,
"loss": 0.9399,
"step": 58
},
{
"epoch": 0.17838246409674982,
"grad_norm": 0.20415845821236425,
"learning_rate": 4.930217843666929e-06,
"loss": 0.9677,
"step": 59
},
{
"epoch": 0.18140589569160998,
"grad_norm": 0.18705886763979152,
"learning_rate": 4.927354543565131e-06,
"loss": 0.9453,
"step": 60
},
{
"epoch": 0.18442932728647016,
"grad_norm": 0.25228689054978015,
"learning_rate": 4.924434540546291e-06,
"loss": 0.9639,
"step": 61
},
{
"epoch": 0.1874527588813303,
"grad_norm": 0.2685784416971121,
"learning_rate": 4.921457902821578e-06,
"loss": 0.9561,
"step": 62
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.24674154778238747,
"learning_rate": 4.918424699925146e-06,
"loss": 0.952,
"step": 63
},
{
"epoch": 0.19349962207105065,
"grad_norm": 0.19937803912058571,
"learning_rate": 4.915335002712506e-06,
"loss": 0.9158,
"step": 64
},
{
"epoch": 0.1965230536659108,
"grad_norm": 0.21943107617585558,
"learning_rate": 4.912188883358879e-06,
"loss": 0.9622,
"step": 65
},
{
"epoch": 0.19954648526077098,
"grad_norm": 0.20789781104002328,
"learning_rate": 4.9089864153575016e-06,
"loss": 0.9432,
"step": 66
},
{
"epoch": 0.20256991685563114,
"grad_norm": 0.21625333461538526,
"learning_rate": 4.9057276735179134e-06,
"loss": 0.9136,
"step": 67
},
{
"epoch": 0.20559334845049132,
"grad_norm": 0.20774782340550482,
"learning_rate": 4.902412733964212e-06,
"loss": 0.9205,
"step": 68
},
{
"epoch": 0.20861678004535147,
"grad_norm": 0.23205941698573587,
"learning_rate": 4.899041674133266e-06,
"loss": 0.9193,
"step": 69
},
{
"epoch": 0.21164021164021163,
"grad_norm": 0.20096610581169602,
"learning_rate": 4.895614572772916e-06,
"loss": 0.9332,
"step": 70
},
{
"epoch": 0.2146636432350718,
"grad_norm": 0.18733010074274722,
"learning_rate": 4.89213150994013e-06,
"loss": 0.9562,
"step": 71
},
{
"epoch": 0.21768707482993196,
"grad_norm": 0.2131500035254074,
"learning_rate": 4.888592566999134e-06,
"loss": 0.978,
"step": 72
},
{
"epoch": 0.22071050642479215,
"grad_norm": 0.25995206465303416,
"learning_rate": 4.884997826619512e-06,
"loss": 0.9615,
"step": 73
},
{
"epoch": 0.2237339380196523,
"grad_norm": 0.20122899473383501,
"learning_rate": 4.88134737277427e-06,
"loss": 0.9223,
"step": 74
},
{
"epoch": 0.22675736961451248,
"grad_norm": 0.20082627865414718,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.9129,
"step": 75
},
{
"epoch": 0.22978080120937264,
"grad_norm": 0.22559902896183986,
"learning_rate": 4.873879667084301e-06,
"loss": 0.9331,
"step": 76
},
{
"epoch": 0.2328042328042328,
"grad_norm": 0.24097328648057836,
"learning_rate": 4.870062589684917e-06,
"loss": 0.9302,
"step": 77
},
{
"epoch": 0.23582766439909297,
"grad_norm": 0.2191859905396367,
"learning_rate": 4.866190147706525e-06,
"loss": 0.906,
"step": 78
},
{
"epoch": 0.23885109599395313,
"grad_norm": 0.1927603541449588,
"learning_rate": 4.862262431609235e-06,
"loss": 0.9158,
"step": 79
},
{
"epoch": 0.2418745275888133,
"grad_norm": 0.20091846606347583,
"learning_rate": 4.858279533144358e-06,
"loss": 0.9241,
"step": 80
},
{
"epoch": 0.24489795918367346,
"grad_norm": 0.19776572498006212,
"learning_rate": 4.854241545352262e-06,
"loss": 0.908,
"step": 81
},
{
"epoch": 0.24792139077853365,
"grad_norm": 0.19142342325998066,
"learning_rate": 4.8501485625602e-06,
"loss": 0.9031,
"step": 82
},
{
"epoch": 0.2509448223733938,
"grad_norm": 0.255824517812554,
"learning_rate": 4.846000680380106e-06,
"loss": 0.896,
"step": 83
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.23838401037023174,
"learning_rate": 4.841797995706362e-06,
"loss": 0.9169,
"step": 84
},
{
"epoch": 0.25699168556311414,
"grad_norm": 0.20594758086068155,
"learning_rate": 4.837540606713538e-06,
"loss": 0.9293,
"step": 85
},
{
"epoch": 0.2600151171579743,
"grad_norm": 0.21813818048500913,
"learning_rate": 4.833228612854088e-06,
"loss": 0.9194,
"step": 86
},
{
"epoch": 0.26303854875283444,
"grad_norm": 0.23454835369326738,
"learning_rate": 4.828862114856038e-06,
"loss": 0.9214,
"step": 87
},
{
"epoch": 0.2660619803476946,
"grad_norm": 0.2204000662732641,
"learning_rate": 4.824441214720629e-06,
"loss": 0.907,
"step": 88
},
{
"epoch": 0.2690854119425548,
"grad_norm": 0.2250848297991148,
"learning_rate": 4.819966015719933e-06,
"loss": 0.9032,
"step": 89
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.2535347696118056,
"learning_rate": 4.815436622394442e-06,
"loss": 0.9149,
"step": 90
},
{
"epoch": 0.2751322751322751,
"grad_norm": 0.22450012032883543,
"learning_rate": 4.810853140550625e-06,
"loss": 0.9055,
"step": 91
},
{
"epoch": 0.2781557067271353,
"grad_norm": 0.17386208282106705,
"learning_rate": 4.806215677258456e-06,
"loss": 0.8933,
"step": 92
},
{
"epoch": 0.2811791383219955,
"grad_norm": 0.19053752177477154,
"learning_rate": 4.801524340848917e-06,
"loss": 0.8915,
"step": 93
},
{
"epoch": 0.2842025699168556,
"grad_norm": 0.2725320545499666,
"learning_rate": 4.796779240911461e-06,
"loss": 0.9251,
"step": 94
},
{
"epoch": 0.2872260015117158,
"grad_norm": 0.2386183196781376,
"learning_rate": 4.791980488291457e-06,
"loss": 0.8928,
"step": 95
},
{
"epoch": 0.29024943310657597,
"grad_norm": 0.1817710733957378,
"learning_rate": 4.787128195087596e-06,
"loss": 0.9165,
"step": 96
},
{
"epoch": 0.29327286470143615,
"grad_norm": 0.17308690210240787,
"learning_rate": 4.782222474649279e-06,
"loss": 0.887,
"step": 97
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.2404735832702819,
"learning_rate": 4.777263441573963e-06,
"loss": 0.9012,
"step": 98
},
{
"epoch": 0.29931972789115646,
"grad_norm": 0.28779677911496493,
"learning_rate": 4.772251211704487e-06,
"loss": 0.9016,
"step": 99
},
{
"epoch": 0.30234315948601664,
"grad_norm": 0.15787837522906498,
"learning_rate": 4.7671859021263635e-06,
"loss": 0.9051,
"step": 100
},
{
"epoch": 0.30536659108087677,
"grad_norm": 0.1575234808015298,
"learning_rate": 4.762067631165049e-06,
"loss": 0.8917,
"step": 101
},
{
"epoch": 0.30839002267573695,
"grad_norm": 0.17558403452861931,
"learning_rate": 4.756896518383173e-06,
"loss": 0.9174,
"step": 102
},
{
"epoch": 0.31141345427059713,
"grad_norm": 0.28974349430226604,
"learning_rate": 4.751672684577747e-06,
"loss": 0.8929,
"step": 103
},
{
"epoch": 0.3144368858654573,
"grad_norm": 0.24411092218088543,
"learning_rate": 4.746396251777348e-06,
"loss": 0.8811,
"step": 104
},
{
"epoch": 0.31746031746031744,
"grad_norm": 0.16801064806045637,
"learning_rate": 4.74106734323926e-06,
"loss": 0.8758,
"step": 105
},
{
"epoch": 0.3204837490551776,
"grad_norm": 0.19248014461061233,
"learning_rate": 4.7356860834466e-06,
"loss": 0.9103,
"step": 106
},
{
"epoch": 0.3235071806500378,
"grad_norm": 0.27209908752286666,
"learning_rate": 4.730252598105407e-06,
"loss": 0.8843,
"step": 107
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.2293714752972601,
"learning_rate": 4.72476701414171e-06,
"loss": 0.9231,
"step": 108
},
{
"epoch": 0.3295540438397581,
"grad_norm": 0.18392800235656956,
"learning_rate": 4.7192294596985564e-06,
"loss": 0.8552,
"step": 109
},
{
"epoch": 0.3325774754346183,
"grad_norm": 0.1893627518175467,
"learning_rate": 4.7136400641330245e-06,
"loss": 0.8811,
"step": 110
},
{
"epoch": 0.3356009070294785,
"grad_norm": 0.27532406651290064,
"learning_rate": 4.7079989580132005e-06,
"loss": 0.9032,
"step": 111
},
{
"epoch": 0.3386243386243386,
"grad_norm": 0.21281637805817608,
"learning_rate": 4.702306273115122e-06,
"loss": 0.8731,
"step": 112
},
{
"epoch": 0.3416477702191988,
"grad_norm": 0.21685692387167585,
"learning_rate": 4.696562142419712e-06,
"loss": 0.8713,
"step": 113
},
{
"epoch": 0.34467120181405897,
"grad_norm": 0.27021306476550466,
"learning_rate": 4.690766700109659e-06,
"loss": 0.88,
"step": 114
},
{
"epoch": 0.3476946334089191,
"grad_norm": 0.23439835580439225,
"learning_rate": 4.684920081566295e-06,
"loss": 0.8814,
"step": 115
},
{
"epoch": 0.3507180650037793,
"grad_norm": 0.21025681348048122,
"learning_rate": 4.679022423366424e-06,
"loss": 0.8535,
"step": 116
},
{
"epoch": 0.35374149659863946,
"grad_norm": 0.21924118290065314,
"learning_rate": 4.673073863279133e-06,
"loss": 0.8869,
"step": 117
},
{
"epoch": 0.35676492819349964,
"grad_norm": 0.2875708297089177,
"learning_rate": 4.667074540262577e-06,
"loss": 0.8646,
"step": 118
},
{
"epoch": 0.35978835978835977,
"grad_norm": 0.20014737080144987,
"learning_rate": 4.661024594460733e-06,
"loss": 0.8718,
"step": 119
},
{
"epoch": 0.36281179138321995,
"grad_norm": 0.19119381829230253,
"learning_rate": 4.654924167200124e-06,
"loss": 0.8683,
"step": 120
},
{
"epoch": 0.36583522297808013,
"grad_norm": 0.2655620248145862,
"learning_rate": 4.648773400986513e-06,
"loss": 0.8655,
"step": 121
},
{
"epoch": 0.3688586545729403,
"grad_norm": 0.25081787812962225,
"learning_rate": 4.6425724395015865e-06,
"loss": 0.8582,
"step": 122
},
{
"epoch": 0.37188208616780044,
"grad_norm": 0.2146047325963571,
"learning_rate": 4.636321427599586e-06,
"loss": 0.8893,
"step": 123
},
{
"epoch": 0.3749055177626606,
"grad_norm": 0.2309806267470169,
"learning_rate": 4.63002051130393e-06,
"loss": 0.8486,
"step": 124
},
{
"epoch": 0.3779289493575208,
"grad_norm": 0.27736367362748365,
"learning_rate": 4.623669837803803e-06,
"loss": 0.8687,
"step": 125
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.2224756513405458,
"learning_rate": 4.617269555450715e-06,
"loss": 0.8825,
"step": 126
},
{
"epoch": 0.3839758125472411,
"grad_norm": 0.17936830170379472,
"learning_rate": 4.610819813755038e-06,
"loss": 0.8546,
"step": 127
},
{
"epoch": 0.3869992441421013,
"grad_norm": 0.18923636586433076,
"learning_rate": 4.604320763382512e-06,
"loss": 0.87,
"step": 128
},
{
"epoch": 0.3900226757369615,
"grad_norm": 0.18724186374787236,
"learning_rate": 4.597772556150724e-06,
"loss": 0.8676,
"step": 129
},
{
"epoch": 0.3930461073318216,
"grad_norm": 0.2914426770268331,
"learning_rate": 4.591175345025567e-06,
"loss": 0.8799,
"step": 130
},
{
"epoch": 0.3960695389266818,
"grad_norm": 0.23506817928141502,
"learning_rate": 4.584529284117662e-06,
"loss": 0.8895,
"step": 131
},
{
"epoch": 0.39909297052154197,
"grad_norm": 0.19429487340998514,
"learning_rate": 4.5778345286787575e-06,
"loss": 0.8272,
"step": 132
},
{
"epoch": 0.4021164021164021,
"grad_norm": 0.24906142354962724,
"learning_rate": 4.5710912350981066e-06,
"loss": 0.8647,
"step": 133
},
{
"epoch": 0.4051398337112623,
"grad_norm": 0.25795927507557026,
"learning_rate": 4.56429956089881e-06,
"loss": 0.8653,
"step": 134
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.18224019982541997,
"learning_rate": 4.5574596647341414e-06,
"loss": 0.8555,
"step": 135
},
{
"epoch": 0.41118669690098264,
"grad_norm": 0.20473182208619398,
"learning_rate": 4.550571706383833e-06,
"loss": 0.8664,
"step": 136
},
{
"epoch": 0.41421012849584277,
"grad_norm": 0.22168708013084754,
"learning_rate": 4.543635846750351e-06,
"loss": 0.8515,
"step": 137
},
{
"epoch": 0.41723356009070295,
"grad_norm": 0.21632029243557258,
"learning_rate": 4.536652247855133e-06,
"loss": 0.8619,
"step": 138
},
{
"epoch": 0.42025699168556313,
"grad_norm": 0.1920055931208493,
"learning_rate": 4.529621072834805e-06,
"loss": 0.8566,
"step": 139
},
{
"epoch": 0.42328042328042326,
"grad_norm": 0.1880614895437287,
"learning_rate": 4.522542485937369e-06,
"loss": 0.8243,
"step": 140
},
{
"epoch": 0.42630385487528344,
"grad_norm": 0.25600769805101486,
"learning_rate": 4.515416652518366e-06,
"loss": 0.8551,
"step": 141
},
{
"epoch": 0.4293272864701436,
"grad_norm": 0.2034314626277561,
"learning_rate": 4.508243739037016e-06,
"loss": 0.8603,
"step": 142
},
{
"epoch": 0.4323507180650038,
"grad_norm": 0.23508415301120186,
"learning_rate": 4.501023913052326e-06,
"loss": 0.8826,
"step": 143
},
{
"epoch": 0.43537414965986393,
"grad_norm": 0.2775448226015208,
"learning_rate": 4.4937573432191766e-06,
"loss": 0.8764,
"step": 144
},
{
"epoch": 0.4383975812547241,
"grad_norm": 0.24618223106362153,
"learning_rate": 4.486444199284386e-06,
"loss": 0.8973,
"step": 145
},
{
"epoch": 0.4414210128495843,
"grad_norm": 0.23424108283949535,
"learning_rate": 4.47908465208274e-06,
"loss": 0.8736,
"step": 146
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.22742376996470443,
"learning_rate": 4.471678873533002e-06,
"loss": 0.8581,
"step": 147
},
{
"epoch": 0.4474678760393046,
"grad_norm": 0.24653243269473768,
"learning_rate": 4.464227036633901e-06,
"loss": 0.8489,
"step": 148
},
{
"epoch": 0.4504913076341648,
"grad_norm": 0.2408835452466121,
"learning_rate": 4.456729315460084e-06,
"loss": 0.8637,
"step": 149
},
{
"epoch": 0.45351473922902497,
"grad_norm": 0.20149761505503935,
"learning_rate": 4.449185885158056e-06,
"loss": 0.8671,
"step": 150
},
{
"epoch": 0.4565381708238851,
"grad_norm": 0.19127590785183332,
"learning_rate": 4.4415969219420846e-06,
"loss": 0.8792,
"step": 151
},
{
"epoch": 0.4595616024187453,
"grad_norm": 0.22390628054581238,
"learning_rate": 4.433962603090083e-06,
"loss": 0.8468,
"step": 152
},
{
"epoch": 0.46258503401360546,
"grad_norm": 0.2957253215613366,
"learning_rate": 4.426283106939474e-06,
"loss": 0.8268,
"step": 153
},
{
"epoch": 0.4656084656084656,
"grad_norm": 0.20506648122584112,
"learning_rate": 4.418558612883016e-06,
"loss": 0.8772,
"step": 154
},
{
"epoch": 0.46863189720332576,
"grad_norm": 0.18636265474604682,
"learning_rate": 4.410789301364621e-06,
"loss": 0.858,
"step": 155
},
{
"epoch": 0.47165532879818595,
"grad_norm": 0.2674232446173923,
"learning_rate": 4.402975353875134e-06,
"loss": 0.8683,
"step": 156
},
{
"epoch": 0.47467876039304613,
"grad_norm": 0.2747499333038218,
"learning_rate": 4.3951169529480934e-06,
"loss": 0.8439,
"step": 157
},
{
"epoch": 0.47770219198790626,
"grad_norm": 0.18463338955505504,
"learning_rate": 4.3872142821554695e-06,
"loss": 0.8321,
"step": 158
},
{
"epoch": 0.48072562358276644,
"grad_norm": 0.19683973897761153,
"learning_rate": 4.379267526103374e-06,
"loss": 0.8378,
"step": 159
},
{
"epoch": 0.4837490551776266,
"grad_norm": 0.23093724944543254,
"learning_rate": 4.3712768704277535e-06,
"loss": 0.8342,
"step": 160
},
{
"epoch": 0.48677248677248675,
"grad_norm": 0.25457828536678356,
"learning_rate": 4.36324250179004e-06,
"loss": 0.8438,
"step": 161
},
{
"epoch": 0.4897959183673469,
"grad_norm": 0.2341347444247441,
"learning_rate": 4.355164607872806e-06,
"loss": 0.874,
"step": 162
},
{
"epoch": 0.4928193499622071,
"grad_norm": 0.19832386653308293,
"learning_rate": 4.347043377375369e-06,
"loss": 0.8871,
"step": 163
},
{
"epoch": 0.4958427815570673,
"grad_norm": 0.23548674821464477,
"learning_rate": 4.338879000009389e-06,
"loss": 0.8571,
"step": 164
},
{
"epoch": 0.4988662131519274,
"grad_norm": 0.2564635876122362,
"learning_rate": 4.3306716664944345e-06,
"loss": 0.8441,
"step": 165
},
{
"epoch": 0.5018896447467877,
"grad_norm": 0.22937827244764553,
"learning_rate": 4.322421568553529e-06,
"loss": 0.8435,
"step": 166
},
{
"epoch": 0.5049130763416477,
"grad_norm": 0.20546938114609037,
"learning_rate": 4.314128898908672e-06,
"loss": 0.8427,
"step": 167
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.24461216551872245,
"learning_rate": 4.305793851276335e-06,
"loss": 0.8488,
"step": 168
},
{
"epoch": 0.5109599395313681,
"grad_norm": 0.2280451372713774,
"learning_rate": 4.297416620362939e-06,
"loss": 0.8493,
"step": 169
},
{
"epoch": 0.5139833711262283,
"grad_norm": 0.2202142714476725,
"learning_rate": 4.288997401860303e-06,
"loss": 0.8514,
"step": 170
},
{
"epoch": 0.5170068027210885,
"grad_norm": 0.2426775141297586,
"learning_rate": 4.280536392441078e-06,
"loss": 0.8501,
"step": 171
},
{
"epoch": 0.5200302343159486,
"grad_norm": 0.1998543423805206,
"learning_rate": 4.272033789754146e-06,
"loss": 0.8313,
"step": 172
},
{
"epoch": 0.5230536659108088,
"grad_norm": 0.1847895892138973,
"learning_rate": 4.263489792420008e-06,
"loss": 0.8195,
"step": 173
},
{
"epoch": 0.5260770975056689,
"grad_norm": 0.23817124539909545,
"learning_rate": 4.254904600026143e-06,
"loss": 0.8581,
"step": 174
},
{
"epoch": 0.5291005291005291,
"grad_norm": 0.2575742303999011,
"learning_rate": 4.246278413122344e-06,
"loss": 0.8511,
"step": 175
},
{
"epoch": 0.5321239606953893,
"grad_norm": 0.22609359204972732,
"learning_rate": 4.2376114332160325e-06,
"loss": 0.843,
"step": 176
},
{
"epoch": 0.5351473922902494,
"grad_norm": 0.22696322689045012,
"learning_rate": 4.2289038627675585e-06,
"loss": 0.833,
"step": 177
},
{
"epoch": 0.5381708238851096,
"grad_norm": 0.2083064134180325,
"learning_rate": 4.220155905185461e-06,
"loss": 0.8707,
"step": 178
},
{
"epoch": 0.5411942554799698,
"grad_norm": 0.2188998951871127,
"learning_rate": 4.211367764821722e-06,
"loss": 0.8756,
"step": 179
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.21174182945781866,
"learning_rate": 4.202539646966993e-06,
"loss": 0.8431,
"step": 180
},
{
"epoch": 0.54724111866969,
"grad_norm": 0.26921219919236117,
"learning_rate": 4.193671757845797e-06,
"loss": 0.8346,
"step": 181
},
{
"epoch": 0.5502645502645502,
"grad_norm": 0.2410488610748255,
"learning_rate": 4.184764304611715e-06,
"loss": 0.8323,
"step": 182
},
{
"epoch": 0.5532879818594104,
"grad_norm": 0.19188924232191892,
"learning_rate": 4.17581749534254e-06,
"loss": 0.8275,
"step": 183
},
{
"epoch": 0.5563114134542706,
"grad_norm": 0.24965929389660024,
"learning_rate": 4.166831539035423e-06,
"loss": 0.8558,
"step": 184
},
{
"epoch": 0.5593348450491308,
"grad_norm": 0.2715497253670651,
"learning_rate": 4.1578066456019885e-06,
"loss": 0.8667,
"step": 185
},
{
"epoch": 0.562358276643991,
"grad_norm": 0.19906288449082996,
"learning_rate": 4.148743025863432e-06,
"loss": 0.8535,
"step": 186
},
{
"epoch": 0.5653817082388511,
"grad_norm": 0.22076525732705374,
"learning_rate": 4.139640891545591e-06,
"loss": 0.8296,
"step": 187
},
{
"epoch": 0.5684051398337112,
"grad_norm": 0.25483531753570576,
"learning_rate": 4.130500455274005e-06,
"loss": 0.8355,
"step": 188
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.24421069561222894,
"learning_rate": 4.121321930568946e-06,
"loss": 0.8357,
"step": 189
},
{
"epoch": 0.5744520030234316,
"grad_norm": 0.20339394657166124,
"learning_rate": 4.112105531840427e-06,
"loss": 0.8357,
"step": 190
},
{
"epoch": 0.5774754346182918,
"grad_norm": 0.24233770822338466,
"learning_rate": 4.1028514743832e-06,
"loss": 0.8313,
"step": 191
},
{
"epoch": 0.5804988662131519,
"grad_norm": 0.2829777666494022,
"learning_rate": 4.093559974371725e-06,
"loss": 0.8378,
"step": 192
},
{
"epoch": 0.5835222978080121,
"grad_norm": 0.1699407087734907,
"learning_rate": 4.084231248855113e-06,
"loss": 0.8208,
"step": 193
},
{
"epoch": 0.5865457294028723,
"grad_norm": 0.17498689950665328,
"learning_rate": 4.074865515752068e-06,
"loss": 0.838,
"step": 194
},
{
"epoch": 0.5895691609977324,
"grad_norm": 0.2475691965670073,
"learning_rate": 4.065462993845785e-06,
"loss": 0.849,
"step": 195
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.24997313540826083,
"learning_rate": 4.056023902778846e-06,
"loss": 0.8229,
"step": 196
},
{
"epoch": 0.5956160241874527,
"grad_norm": 0.19976933217581305,
"learning_rate": 4.046548463048089e-06,
"loss": 0.8301,
"step": 197
},
{
"epoch": 0.5986394557823129,
"grad_norm": 0.24028559185538167,
"learning_rate": 4.037036895999453e-06,
"loss": 0.8462,
"step": 198
},
{
"epoch": 0.6016628873771731,
"grad_norm": 0.27335949880058813,
"learning_rate": 4.0274894238228115e-06,
"loss": 0.8364,
"step": 199
},
{
"epoch": 0.6046863189720333,
"grad_norm": 0.18909543268493909,
"learning_rate": 4.017906269546778e-06,
"loss": 0.8083,
"step": 200
},
{
"epoch": 0.6077097505668935,
"grad_norm": 0.20724602824279856,
"learning_rate": 4.0082876570335025e-06,
"loss": 0.8193,
"step": 201
},
{
"epoch": 0.6107331821617535,
"grad_norm": 0.26651899455610345,
"learning_rate": 3.9986338109734354e-06,
"loss": 0.8299,
"step": 202
},
{
"epoch": 0.6137566137566137,
"grad_norm": 0.20515478118259406,
"learning_rate": 3.988944956880082e-06,
"loss": 0.8323,
"step": 203
},
{
"epoch": 0.6167800453514739,
"grad_norm": 0.1823781343576012,
"learning_rate": 3.979221321084734e-06,
"loss": 0.8224,
"step": 204
},
{
"epoch": 0.6198034769463341,
"grad_norm": 0.19460227890197035,
"learning_rate": 3.969463130731183e-06,
"loss": 0.8243,
"step": 205
},
{
"epoch": 0.6228269085411943,
"grad_norm": 0.25256274653870814,
"learning_rate": 3.959670613770414e-06,
"loss": 0.834,
"step": 206
},
{
"epoch": 0.6258503401360545,
"grad_norm": 0.2099371278262912,
"learning_rate": 3.949843998955279e-06,
"loss": 0.8001,
"step": 207
},
{
"epoch": 0.6288737717309146,
"grad_norm": 0.18831071399800087,
"learning_rate": 3.939983515835157e-06,
"loss": 0.846,
"step": 208
},
{
"epoch": 0.6318972033257747,
"grad_norm": 0.20326222391630303,
"learning_rate": 3.9300893947505865e-06,
"loss": 0.813,
"step": 209
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.28946931059014386,
"learning_rate": 3.92016186682789e-06,
"loss": 0.8252,
"step": 210
},
{
"epoch": 0.6379440665154951,
"grad_norm": 0.20146394091804065,
"learning_rate": 3.9102011639737715e-06,
"loss": 0.8273,
"step": 211
},
{
"epoch": 0.6409674981103552,
"grad_norm": 0.16554710439809656,
"learning_rate": 3.900207518869901e-06,
"loss": 0.8294,
"step": 212
},
{
"epoch": 0.6439909297052154,
"grad_norm": 0.19154551239872575,
"learning_rate": 3.890181164967476e-06,
"loss": 0.8331,
"step": 213
},
{
"epoch": 0.6470143613000756,
"grad_norm": 0.2863695398034112,
"learning_rate": 3.880122336481774e-06,
"loss": 0.8156,
"step": 214
},
{
"epoch": 0.6500377928949358,
"grad_norm": 0.21052777788511692,
"learning_rate": 3.870031268386676e-06,
"loss": 0.7963,
"step": 215
},
{
"epoch": 0.6530612244897959,
"grad_norm": 0.1566104119157067,
"learning_rate": 3.859908196409177e-06,
"loss": 0.8247,
"step": 216
},
{
"epoch": 0.656084656084656,
"grad_norm": 0.17376065755010325,
"learning_rate": 3.849753357023885e-06,
"loss": 0.8412,
"step": 217
},
{
"epoch": 0.6591080876795162,
"grad_norm": 0.2775570184417396,
"learning_rate": 3.839566987447492e-06,
"loss": 0.8444,
"step": 218
},
{
"epoch": 0.6621315192743764,
"grad_norm": 0.3002446727716999,
"learning_rate": 3.829349325633233e-06,
"loss": 0.8353,
"step": 219
},
{
"epoch": 0.6651549508692366,
"grad_norm": 0.17501583537193782,
"learning_rate": 3.819100610265332e-06,
"loss": 0.8406,
"step": 220
},
{
"epoch": 0.6681783824640968,
"grad_norm": 0.16018543435725524,
"learning_rate": 3.8088210807534185e-06,
"loss": 0.8143,
"step": 221
},
{
"epoch": 0.671201814058957,
"grad_norm": 0.26632239617155334,
"learning_rate": 3.7985109772269435e-06,
"loss": 0.8099,
"step": 222
},
{
"epoch": 0.674225245653817,
"grad_norm": 0.2502372675648549,
"learning_rate": 3.7881705405295623e-06,
"loss": 0.828,
"step": 223
},
{
"epoch": 0.6772486772486772,
"grad_norm": 0.21825897588135384,
"learning_rate": 3.777800012213514e-06,
"loss": 0.8246,
"step": 224
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.27497686942905814,
"learning_rate": 3.767399634533976e-06,
"loss": 0.8131,
"step": 225
},
{
"epoch": 0.6832955404383976,
"grad_norm": 0.22856597196018685,
"learning_rate": 3.756969650443408e-06,
"loss": 0.8098,
"step": 226
},
{
"epoch": 0.6863189720332578,
"grad_norm": 0.21059170940590144,
"learning_rate": 3.7465103035858718e-06,
"loss": 0.8187,
"step": 227
},
{
"epoch": 0.6893424036281179,
"grad_norm": 0.2289160214691356,
"learning_rate": 3.7360218382913426e-06,
"loss": 0.8265,
"step": 228
},
{
"epoch": 0.6923658352229781,
"grad_norm": 0.22771294742249917,
"learning_rate": 3.7255044995700024e-06,
"loss": 0.8063,
"step": 229
},
{
"epoch": 0.6953892668178382,
"grad_norm": 0.220912987205476,
"learning_rate": 3.714958533106515e-06,
"loss": 0.8141,
"step": 230
},
{
"epoch": 0.6984126984126984,
"grad_norm": 0.2331093248404988,
"learning_rate": 3.7043841852542884e-06,
"loss": 0.7967,
"step": 231
},
{
"epoch": 0.7014361300075586,
"grad_norm": 0.24044315675315245,
"learning_rate": 3.6937817030297164e-06,
"loss": 0.8202,
"step": 232
},
{
"epoch": 0.7044595616024187,
"grad_norm": 0.17808063026487772,
"learning_rate": 3.6831513341064128e-06,
"loss": 0.824,
"step": 233
},
{
"epoch": 0.7074829931972789,
"grad_norm": 0.1686282272216412,
"learning_rate": 3.672493326809422e-06,
"loss": 0.8265,
"step": 234
},
{
"epoch": 0.7105064247921391,
"grad_norm": 0.2620354561369418,
"learning_rate": 3.661807930109422e-06,
"loss": 0.8156,
"step": 235
},
{
"epoch": 0.7135298563869993,
"grad_norm": 0.325482330440253,
"learning_rate": 3.651095393616904e-06,
"loss": 0.828,
"step": 236
},
{
"epoch": 0.7165532879818595,
"grad_norm": 0.15080114640909387,
"learning_rate": 3.6403559675763457e-06,
"loss": 0.7995,
"step": 237
},
{
"epoch": 0.7195767195767195,
"grad_norm": 0.14745127928311055,
"learning_rate": 3.629589902860363e-06,
"loss": 0.8087,
"step": 238
},
{
"epoch": 0.7226001511715797,
"grad_norm": 0.2799111726866219,
"learning_rate": 3.6187974509638496e-06,
"loss": 0.8176,
"step": 239
},
{
"epoch": 0.7256235827664399,
"grad_norm": 0.2502547915239206,
"learning_rate": 3.607978863998104e-06,
"loss": 0.8064,
"step": 240
},
{
"epoch": 0.7286470143613001,
"grad_norm": 0.13777657856560566,
"learning_rate": 3.5971343946849374e-06,
"loss": 0.8178,
"step": 241
},
{
"epoch": 0.7316704459561603,
"grad_norm": 0.1385328283480905,
"learning_rate": 3.586264296350775e-06,
"loss": 0.8027,
"step": 242
},
{
"epoch": 0.7346938775510204,
"grad_norm": 0.17341004642304678,
"learning_rate": 3.57536882292073e-06,
"loss": 0.8096,
"step": 243
},
{
"epoch": 0.7377173091458806,
"grad_norm": 0.3691916406878038,
"learning_rate": 3.564448228912682e-06,
"loss": 0.8338,
"step": 244
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.21689653213933718,
"learning_rate": 3.5535027694313233e-06,
"loss": 0.7977,
"step": 245
},
{
"epoch": 0.7437641723356009,
"grad_norm": 0.16595312089208156,
"learning_rate": 3.5425327001622034e-06,
"loss": 0.7987,
"step": 246
},
{
"epoch": 0.7467876039304611,
"grad_norm": 0.21979225164562236,
"learning_rate": 3.5315382773657563e-06,
"loss": 0.8181,
"step": 247
},
{
"epoch": 0.7498110355253212,
"grad_norm": 0.31450056661452935,
"learning_rate": 3.520519757871313e-06,
"loss": 0.8128,
"step": 248
},
{
"epoch": 0.7528344671201814,
"grad_norm": 0.155403218509628,
"learning_rate": 3.5094773990711024e-06,
"loss": 0.807,
"step": 249
},
{
"epoch": 0.7558578987150416,
"grad_norm": 0.14490425331756726,
"learning_rate": 3.4984114589142388e-06,
"loss": 0.7883,
"step": 250
},
{
"epoch": 0.7588813303099018,
"grad_norm": 0.21380341083079393,
"learning_rate": 3.4873221959006973e-06,
"loss": 0.8162,
"step": 251
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.35920542267660566,
"learning_rate": 3.476209869075273e-06,
"loss": 0.7852,
"step": 252
},
{
"epoch": 0.764928193499622,
"grad_norm": 0.14693329979199346,
"learning_rate": 3.4650747380215296e-06,
"loss": 0.8164,
"step": 253
},
{
"epoch": 0.7679516250944822,
"grad_norm": 0.2613621433404773,
"learning_rate": 3.4539170628557383e-06,
"loss": 0.8083,
"step": 254
},
{
"epoch": 0.7709750566893424,
"grad_norm": 0.3665112092678806,
"learning_rate": 3.442737104220801e-06,
"loss": 0.8181,
"step": 255
},
{
"epoch": 0.7739984882842026,
"grad_norm": 0.16067983638579006,
"learning_rate": 3.4315351232801597e-06,
"loss": 0.8162,
"step": 256
},
{
"epoch": 0.7770219198790628,
"grad_norm": 0.24580578582443013,
"learning_rate": 3.4203113817116955e-06,
"loss": 0.8199,
"step": 257
},
{
"epoch": 0.780045351473923,
"grad_norm": 0.331248956918326,
"learning_rate": 3.409066141701618e-06,
"loss": 0.7913,
"step": 258
},
{
"epoch": 0.783068783068783,
"grad_norm": 0.16426278470075412,
"learning_rate": 3.3977996659383396e-06,
"loss": 0.8166,
"step": 259
},
{
"epoch": 0.7860922146636432,
"grad_norm": 0.2057865252683302,
"learning_rate": 3.386512217606339e-06,
"loss": 0.8018,
"step": 260
},
{
"epoch": 0.7891156462585034,
"grad_norm": 0.3793459602253602,
"learning_rate": 3.3752040603800148e-06,
"loss": 0.8243,
"step": 261
},
{
"epoch": 0.7921390778533636,
"grad_norm": 0.14811638555402215,
"learning_rate": 3.3638754584175222e-06,
"loss": 0.8144,
"step": 262
},
{
"epoch": 0.7951625094482238,
"grad_norm": 0.3237839618432774,
"learning_rate": 3.352526676354606e-06,
"loss": 0.7933,
"step": 263
},
{
"epoch": 0.7981859410430839,
"grad_norm": 0.21169351866452582,
"learning_rate": 3.3411579792984178e-06,
"loss": 0.8125,
"step": 264
},
{
"epoch": 0.8012093726379441,
"grad_norm": 0.14502913140221696,
"learning_rate": 3.3297696328213215e-06,
"loss": 0.7919,
"step": 265
},
{
"epoch": 0.8042328042328042,
"grad_norm": 0.130046065883626,
"learning_rate": 3.318361902954692e-06,
"loss": 0.7925,
"step": 266
},
{
"epoch": 0.8072562358276644,
"grad_norm": 0.1806023890937921,
"learning_rate": 3.3069350561826997e-06,
"loss": 0.7977,
"step": 267
},
{
"epoch": 0.8102796674225246,
"grad_norm": 0.3661239179855748,
"learning_rate": 3.295489359436083e-06,
"loss": 0.8121,
"step": 268
},
{
"epoch": 0.8133030990173847,
"grad_norm": 0.15684544823299335,
"learning_rate": 3.2840250800859185e-06,
"loss": 0.8439,
"step": 269
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.1442117724504863,
"learning_rate": 3.272542485937369e-06,
"loss": 0.8205,
"step": 270
},
{
"epoch": 0.8193499622071051,
"grad_norm": 0.1630144971387636,
"learning_rate": 3.2610418452234315e-06,
"loss": 0.8116,
"step": 271
},
{
"epoch": 0.8223733938019653,
"grad_norm": 0.2272302138625313,
"learning_rate": 3.249523426598669e-06,
"loss": 0.7889,
"step": 272
},
{
"epoch": 0.8253968253968254,
"grad_norm": 0.2630488611954438,
"learning_rate": 3.2379874991329374e-06,
"loss": 0.8101,
"step": 273
},
{
"epoch": 0.8284202569916855,
"grad_norm": 0.1636882510390679,
"learning_rate": 3.2264343323050985e-06,
"loss": 0.8067,
"step": 274
},
{
"epoch": 0.8314436885865457,
"grad_norm": 0.1800718434777349,
"learning_rate": 3.214864195996723e-06,
"loss": 0.8267,
"step": 275
},
{
"epoch": 0.8344671201814059,
"grad_norm": 0.27772170659214646,
"learning_rate": 3.2032773604857915e-06,
"loss": 0.8021,
"step": 276
},
{
"epoch": 0.8374905517762661,
"grad_norm": 0.2524388193093376,
"learning_rate": 3.1916740964403736e-06,
"loss": 0.8067,
"step": 277
},
{
"epoch": 0.8405139833711263,
"grad_norm": 0.18970600852145528,
"learning_rate": 3.1800546749123108e-06,
"loss": 0.8073,
"step": 278
},
{
"epoch": 0.8435374149659864,
"grad_norm": 0.19923073362072904,
"learning_rate": 3.168419367330883e-06,
"loss": 0.799,
"step": 279
},
{
"epoch": 0.8465608465608465,
"grad_norm": 0.25436094223895794,
"learning_rate": 3.1567684454964674e-06,
"loss": 0.8041,
"step": 280
},
{
"epoch": 0.8495842781557067,
"grad_norm": 0.21128266721448266,
"learning_rate": 3.14510218157419e-06,
"loss": 0.8113,
"step": 281
},
{
"epoch": 0.8526077097505669,
"grad_norm": 0.22163072880133364,
"learning_rate": 3.133420848087566e-06,
"loss": 0.7889,
"step": 282
},
{
"epoch": 0.8556311413454271,
"grad_norm": 0.22883591781527274,
"learning_rate": 3.121724717912138e-06,
"loss": 0.7917,
"step": 283
},
{
"epoch": 0.8586545729402872,
"grad_norm": 0.2032672012417271,
"learning_rate": 3.110014064269094e-06,
"loss": 0.8032,
"step": 284
},
{
"epoch": 0.8616780045351474,
"grad_norm": 0.1740199158625731,
"learning_rate": 3.0982891607188948e-06,
"loss": 0.7827,
"step": 285
},
{
"epoch": 0.8647014361300076,
"grad_norm": 0.18106353392739202,
"learning_rate": 3.0865502811548755e-06,
"loss": 0.7896,
"step": 286
},
{
"epoch": 0.8677248677248677,
"grad_norm": 0.2292881686201471,
"learning_rate": 3.0747976997968513e-06,
"loss": 0.8159,
"step": 287
},
{
"epoch": 0.8707482993197279,
"grad_norm": 0.27476966438745903,
"learning_rate": 3.0630316911847112e-06,
"loss": 0.7938,
"step": 288
},
{
"epoch": 0.873771730914588,
"grad_norm": 0.21250803524552264,
"learning_rate": 3.051252530172003e-06,
"loss": 0.7912,
"step": 289
},
{
"epoch": 0.8767951625094482,
"grad_norm": 0.20109882386036412,
"learning_rate": 3.039460491919516e-06,
"loss": 0.8005,
"step": 290
},
{
"epoch": 0.8798185941043084,
"grad_norm": 0.22987450725486983,
"learning_rate": 3.0276558518888496e-06,
"loss": 0.8081,
"step": 291
},
{
"epoch": 0.8828420256991686,
"grad_norm": 0.20495650915854588,
"learning_rate": 3.015838885835981e-06,
"loss": 0.8115,
"step": 292
},
{
"epoch": 0.8858654572940288,
"grad_norm": 0.17141615072214778,
"learning_rate": 3.0040098698048232e-06,
"loss": 0.7813,
"step": 293
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.18881546824196338,
"learning_rate": 2.992169080120776e-06,
"loss": 0.8113,
"step": 294
},
{
"epoch": 0.891912320483749,
"grad_norm": 0.20261508334609984,
"learning_rate": 2.9803167933842712e-06,
"loss": 0.7993,
"step": 295
},
{
"epoch": 0.8949357520786092,
"grad_norm": 0.2637865639683421,
"learning_rate": 2.9684532864643123e-06,
"loss": 0.8025,
"step": 296
},
{
"epoch": 0.8979591836734694,
"grad_norm": 0.20588016874386464,
"learning_rate": 2.9565788364920034e-06,
"loss": 0.7869,
"step": 297
},
{
"epoch": 0.9009826152683296,
"grad_norm": 0.1838418464531271,
"learning_rate": 2.944693720854081e-06,
"loss": 0.7976,
"step": 298
},
{
"epoch": 0.9040060468631897,
"grad_norm": 0.2238627689541774,
"learning_rate": 2.932798217186429e-06,
"loss": 0.7886,
"step": 299
},
{
"epoch": 0.9070294784580499,
"grad_norm": 0.2223361558094008,
"learning_rate": 2.920892603367596e-06,
"loss": 0.8163,
"step": 300
},
{
"epoch": 0.91005291005291,
"grad_norm": 0.1664138917818463,
"learning_rate": 2.908977157512305e-06,
"loss": 0.7859,
"step": 301
},
{
"epoch": 0.9130763416477702,
"grad_norm": 0.218098712406248,
"learning_rate": 2.897052157964952e-06,
"loss": 0.818,
"step": 302
},
{
"epoch": 0.9160997732426304,
"grad_norm": 0.25476932805817953,
"learning_rate": 2.8851178832931076e-06,
"loss": 0.7936,
"step": 303
},
{
"epoch": 0.9191232048374905,
"grad_norm": 0.20454797870655053,
"learning_rate": 2.8731746122810105e-06,
"loss": 0.8009,
"step": 304
},
{
"epoch": 0.9221466364323507,
"grad_norm": 0.2171163509058848,
"learning_rate": 2.8612226239230536e-06,
"loss": 0.8012,
"step": 305
},
{
"epoch": 0.9251700680272109,
"grad_norm": 0.3201406418230194,
"learning_rate": 2.8492621974172653e-06,
"loss": 0.8347,
"step": 306
},
{
"epoch": 0.9281934996220711,
"grad_norm": 0.20044446217181253,
"learning_rate": 2.8372936121587895e-06,
"loss": 0.8066,
"step": 307
},
{
"epoch": 0.9312169312169312,
"grad_norm": 0.16283549638272465,
"learning_rate": 2.8253171477333585e-06,
"loss": 0.8049,
"step": 308
},
{
"epoch": 0.9342403628117913,
"grad_norm": 0.20912249423273097,
"learning_rate": 2.813333083910761e-06,
"loss": 0.8112,
"step": 309
},
{
"epoch": 0.9372637944066515,
"grad_norm": 0.28501513792396893,
"learning_rate": 2.8013417006383078e-06,
"loss": 0.8033,
"step": 310
},
{
"epoch": 0.9402872260015117,
"grad_norm": 0.17569005132324075,
"learning_rate": 2.7893432780342928e-06,
"loss": 0.7905,
"step": 311
},
{
"epoch": 0.9433106575963719,
"grad_norm": 0.1707451012967817,
"learning_rate": 2.7773380963814454e-06,
"loss": 0.7992,
"step": 312
},
{
"epoch": 0.9463340891912321,
"grad_norm": 0.23658188962283105,
"learning_rate": 2.76532643612039e-06,
"loss": 0.7959,
"step": 313
},
{
"epoch": 0.9493575207860923,
"grad_norm": 0.2417426081720488,
"learning_rate": 2.7533085778430884e-06,
"loss": 0.7719,
"step": 314
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.21779534491141914,
"learning_rate": 2.7412848022862883e-06,
"loss": 0.8148,
"step": 315
},
{
"epoch": 0.9554043839758125,
"grad_norm": 0.1937439406511132,
"learning_rate": 2.729255390324966e-06,
"loss": 0.8099,
"step": 316
},
{
"epoch": 0.9584278155706727,
"grad_norm": 0.22418232835047394,
"learning_rate": 2.717220622965762e-06,
"loss": 0.8029,
"step": 317
},
{
"epoch": 0.9614512471655329,
"grad_norm": 0.24163066601859826,
"learning_rate": 2.7051807813404213e-06,
"loss": 0.8069,
"step": 318
},
{
"epoch": 0.9644746787603931,
"grad_norm": 0.17718761833134763,
"learning_rate": 2.6931361466992225e-06,
"loss": 0.7964,
"step": 319
},
{
"epoch": 0.9674981103552532,
"grad_norm": 0.21359305838545312,
"learning_rate": 2.6810870004044065e-06,
"loss": 0.7777,
"step": 320
},
{
"epoch": 0.9705215419501134,
"grad_norm": 0.2951108231827231,
"learning_rate": 2.6690336239236097e-06,
"loss": 0.7654,
"step": 321
},
{
"epoch": 0.9735449735449735,
"grad_norm": 0.17887426724913263,
"learning_rate": 2.6569762988232838e-06,
"loss": 0.8021,
"step": 322
},
{
"epoch": 0.9765684051398337,
"grad_norm": 0.16446650801438847,
"learning_rate": 2.644915306762121e-06,
"loss": 0.7996,
"step": 323
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.18349619699553313,
"learning_rate": 2.632850929484472e-06,
"loss": 0.769,
"step": 324
},
{
"epoch": 0.982615268329554,
"grad_norm": 0.23290485597057656,
"learning_rate": 2.620783448813768e-06,
"loss": 0.8104,
"step": 325
},
{
"epoch": 0.9856386999244142,
"grad_norm": 0.21697778026585082,
"learning_rate": 2.6087131466459344e-06,
"loss": 0.7919,
"step": 326
},
{
"epoch": 0.9886621315192744,
"grad_norm": 0.18436604515216662,
"learning_rate": 2.5966403049428056e-06,
"loss": 0.7819,
"step": 327
},
{
"epoch": 0.9916855631141346,
"grad_norm": 0.1916879714375915,
"learning_rate": 2.5845652057255414e-06,
"loss": 0.7565,
"step": 328
},
{
"epoch": 0.9947089947089947,
"grad_norm": 0.2338419771871179,
"learning_rate": 2.572488131068037e-06,
"loss": 0.8002,
"step": 329
},
{
"epoch": 0.9977324263038548,
"grad_norm": 0.19973120898443514,
"learning_rate": 2.560409363090331e-06,
"loss": 0.8019,
"step": 330
}
],
"logging_steps": 1,
"max_steps": 660,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 330,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.219445850938278e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}