{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.988870339454646,
"eval_steps": 1000,
"global_step": 3584,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.044518642181413465,
"grad_norm": 33.75,
"learning_rate": 2.785515320334262e-07,
"loss": 2.0545,
"step": 10
},
{
"epoch": 0.08903728436282693,
"grad_norm": 28.125,
"learning_rate": 5.571030640668524e-07,
"loss": 2.0294,
"step": 20
},
{
"epoch": 0.1335559265442404,
"grad_norm": 20.875,
"learning_rate": 8.356545961002786e-07,
"loss": 1.9841,
"step": 30
},
{
"epoch": 0.17807456872565386,
"grad_norm": 14.25,
"learning_rate": 1.1142061281337048e-06,
"loss": 1.8176,
"step": 40
},
{
"epoch": 0.22259321090706732,
"grad_norm": 23.5,
"learning_rate": 1.392757660167131e-06,
"loss": 1.6325,
"step": 50
},
{
"epoch": 0.2671118530884808,
"grad_norm": 19.25,
"learning_rate": 1.6713091922005572e-06,
"loss": 1.6341,
"step": 60
},
{
"epoch": 0.3116304952698943,
"grad_norm": 13.8125,
"learning_rate": 1.9498607242339835e-06,
"loss": 1.4943,
"step": 70
},
{
"epoch": 0.3561491374513077,
"grad_norm": 12.0,
"learning_rate": 2.2284122562674097e-06,
"loss": 1.4708,
"step": 80
},
{
"epoch": 0.4006677796327212,
"grad_norm": 11.125,
"learning_rate": 2.506963788300836e-06,
"loss": 1.415,
"step": 90
},
{
"epoch": 0.44518642181413465,
"grad_norm": 3.203125,
"learning_rate": 2.785515320334262e-06,
"loss": 1.4596,
"step": 100
},
{
"epoch": 0.48970506399554814,
"grad_norm": 2.71875,
"learning_rate": 3.064066852367688e-06,
"loss": 1.4339,
"step": 110
},
{
"epoch": 0.5342237061769616,
"grad_norm": 2.6875,
"learning_rate": 3.3426183844011143e-06,
"loss": 1.4009,
"step": 120
},
{
"epoch": 0.5787423483583751,
"grad_norm": 3.109375,
"learning_rate": 3.6211699164345405e-06,
"loss": 1.3688,
"step": 130
},
{
"epoch": 0.6232609905397886,
"grad_norm": 2.84375,
"learning_rate": 3.899721448467967e-06,
"loss": 1.3595,
"step": 140
},
{
"epoch": 0.667779632721202,
"grad_norm": 2.65625,
"learning_rate": 4.178272980501394e-06,
"loss": 1.3609,
"step": 150
},
{
"epoch": 0.7122982749026154,
"grad_norm": 2.953125,
"learning_rate": 4.456824512534819e-06,
"loss": 1.3777,
"step": 160
},
{
"epoch": 0.756816917084029,
"grad_norm": 2.71875,
"learning_rate": 4.735376044568246e-06,
"loss": 1.3374,
"step": 170
},
{
"epoch": 0.8013355592654424,
"grad_norm": 2.90625,
"learning_rate": 5.013927576601672e-06,
"loss": 1.3524,
"step": 180
},
{
"epoch": 0.8458542014468559,
"grad_norm": 2.5625,
"learning_rate": 5.292479108635098e-06,
"loss": 1.3153,
"step": 190
},
{
"epoch": 0.8903728436282693,
"grad_norm": 2.375,
"learning_rate": 5.571030640668524e-06,
"loss": 1.3519,
"step": 200
},
{
"epoch": 0.9348914858096828,
"grad_norm": 3.140625,
"learning_rate": 5.849582172701951e-06,
"loss": 1.348,
"step": 210
},
{
"epoch": 0.9794101279910963,
"grad_norm": 2.8125,
"learning_rate": 6.128133704735376e-06,
"loss": 1.3062,
"step": 220
},
{
"epoch": 1.0261547022815805,
"grad_norm": 2.5,
"learning_rate": 6.406685236768803e-06,
"loss": 1.4358,
"step": 230
},
{
"epoch": 1.070673344462994,
"grad_norm": 2.40625,
"learning_rate": 6.685236768802229e-06,
"loss": 1.2481,
"step": 240
},
{
"epoch": 1.1151919866444073,
"grad_norm": 2.8125,
"learning_rate": 6.963788300835655e-06,
"loss": 1.2833,
"step": 250
},
{
"epoch": 1.1597106288258208,
"grad_norm": 2.140625,
"learning_rate": 7.242339832869081e-06,
"loss": 1.1941,
"step": 260
},
{
"epoch": 1.2042292710072342,
"grad_norm": 2.46875,
"learning_rate": 7.5208913649025075e-06,
"loss": 1.2831,
"step": 270
},
{
"epoch": 1.2487479131886476,
"grad_norm": 2.671875,
"learning_rate": 7.799442896935934e-06,
"loss": 1.2854,
"step": 280
},
{
"epoch": 1.293266555370061,
"grad_norm": 2.46875,
"learning_rate": 8.07799442896936e-06,
"loss": 1.257,
"step": 290
},
{
"epoch": 1.3377851975514747,
"grad_norm": 2.03125,
"learning_rate": 8.356545961002787e-06,
"loss": 1.2468,
"step": 300
},
{
"epoch": 1.3823038397328882,
"grad_norm": 2.0625,
"learning_rate": 8.635097493036211e-06,
"loss": 1.2743,
"step": 310
},
{
"epoch": 1.4268224819143016,
"grad_norm": 2.109375,
"learning_rate": 8.913649025069639e-06,
"loss": 1.2265,
"step": 320
},
{
"epoch": 1.471341124095715,
"grad_norm": 2.078125,
"learning_rate": 9.192200557103064e-06,
"loss": 1.2898,
"step": 330
},
{
"epoch": 1.5158597662771287,
"grad_norm": 2.0,
"learning_rate": 9.470752089136492e-06,
"loss": 1.2406,
"step": 340
},
{
"epoch": 1.5603784084585421,
"grad_norm": 2.21875,
"learning_rate": 9.749303621169918e-06,
"loss": 1.2098,
"step": 350
},
{
"epoch": 1.6048970506399556,
"grad_norm": 1.953125,
"learning_rate": 9.9999976276417e-06,
"loss": 1.2067,
"step": 360
},
{
"epoch": 1.649415692821369,
"grad_norm": 1.9765625,
"learning_rate": 9.999712947369595e-06,
"loss": 1.2338,
"step": 370
},
{
"epoch": 1.6939343350027825,
"grad_norm": 1.859375,
"learning_rate": 9.998953826391322e-06,
"loss": 1.2546,
"step": 380
},
{
"epoch": 1.738452977184196,
"grad_norm": 2.015625,
"learning_rate": 9.997720336742596e-06,
"loss": 1.201,
"step": 390
},
{
"epoch": 1.7829716193656093,
"grad_norm": 2.09375,
"learning_rate": 9.996012595473676e-06,
"loss": 1.1761,
"step": 400
},
{
"epoch": 1.8274902615470228,
"grad_norm": 2.015625,
"learning_rate": 9.993830764638262e-06,
"loss": 1.1884,
"step": 410
},
{
"epoch": 1.8720089037284362,
"grad_norm": 1.90625,
"learning_rate": 9.991175051278111e-06,
"loss": 1.1951,
"step": 420
},
{
"epoch": 1.9165275459098496,
"grad_norm": 2.09375,
"learning_rate": 9.988045707403394e-06,
"loss": 1.175,
"step": 430
},
{
"epoch": 1.961046188091263,
"grad_norm": 2.0625,
"learning_rate": 9.984443029968786e-06,
"loss": 1.2045,
"step": 440
},
{
"epoch": 2.0077907623817475,
"grad_norm": 2.03125,
"learning_rate": 9.980367360845278e-06,
"loss": 1.3052,
"step": 450
},
{
"epoch": 2.052309404563161,
"grad_norm": 1.984375,
"learning_rate": 9.975819086787743e-06,
"loss": 1.1092,
"step": 460
},
{
"epoch": 2.0968280467445743,
"grad_norm": 2.1875,
"learning_rate": 9.970798639398228e-06,
"loss": 1.1435,
"step": 470
},
{
"epoch": 2.141346688925988,
"grad_norm": 1.8828125,
"learning_rate": 9.965306495085005e-06,
"loss": 1.0927,
"step": 480
},
{
"epoch": 2.185865331107401,
"grad_norm": 2.03125,
"learning_rate": 9.959343175017362e-06,
"loss": 1.0692,
"step": 490
},
{
"epoch": 2.2303839732888147,
"grad_norm": 1.8671875,
"learning_rate": 9.952909245076141e-06,
"loss": 1.0603,
"step": 500
},
{
"epoch": 2.274902615470228,
"grad_norm": 2.03125,
"learning_rate": 9.946005315800047e-06,
"loss": 1.0717,
"step": 510
},
{
"epoch": 2.3194212576516415,
"grad_norm": 1.9140625,
"learning_rate": 9.93863204232771e-06,
"loss": 1.0808,
"step": 520
},
{
"epoch": 2.363939899833055,
"grad_norm": 1.609375,
"learning_rate": 9.930790124335511e-06,
"loss": 1.0297,
"step": 530
},
{
"epoch": 2.4084585420144684,
"grad_norm": 1.84375,
"learning_rate": 9.922480305971193e-06,
"loss": 1.0481,
"step": 540
},
{
"epoch": 2.452977184195882,
"grad_norm": 1.90625,
"learning_rate": 9.91370337578325e-06,
"loss": 1.0919,
"step": 550
},
{
"epoch": 2.4974958263772953,
"grad_norm": 2.09375,
"learning_rate": 9.904460166646084e-06,
"loss": 1.0835,
"step": 560
},
{
"epoch": 2.542014468558709,
"grad_norm": 1.8046875,
"learning_rate": 9.894751555680988e-06,
"loss": 1.0336,
"step": 570
},
{
"epoch": 2.586533110740122,
"grad_norm": 2.0625,
"learning_rate": 9.884578464172901e-06,
"loss": 1.0728,
"step": 580
},
{
"epoch": 2.631051752921536,
"grad_norm": 1.6484375,
"learning_rate": 9.873941857482988e-06,
"loss": 1.0493,
"step": 590
},
{
"epoch": 2.6755703951029495,
"grad_norm": 1.796875,
"learning_rate": 9.862842744957037e-06,
"loss": 1.0346,
"step": 600
},
{
"epoch": 2.720089037284363,
"grad_norm": 1.7421875,
"learning_rate": 9.85128217982967e-06,
"loss": 1.0483,
"step": 610
},
{
"epoch": 2.7646076794657763,
"grad_norm": 1.6796875,
"learning_rate": 9.8392612591244e-06,
"loss": 1.0384,
"step": 620
},
{
"epoch": 2.80912632164719,
"grad_norm": 1.765625,
"learning_rate": 9.826781123549542e-06,
"loss": 1.0266,
"step": 630
},
{
"epoch": 2.853644963828603,
"grad_norm": 1.578125,
"learning_rate": 9.813842957389953e-06,
"loss": 1.0352,
"step": 640
},
{
"epoch": 2.8981636060100167,
"grad_norm": 1.796875,
"learning_rate": 9.800447988394657e-06,
"loss": 1.009,
"step": 650
},
{
"epoch": 2.94268224819143,
"grad_norm": 1.796875,
"learning_rate": 9.786597487660336e-06,
"loss": 1.0834,
"step": 660
},
{
"epoch": 2.9872008903728435,
"grad_norm": 1.8046875,
"learning_rate": 9.772292769510718e-06,
"loss": 1.0735,
"step": 670
},
{
"epoch": 3.033945464663328,
"grad_norm": 1.6484375,
"learning_rate": 9.75753519137185e-06,
"loss": 1.0532,
"step": 680
},
{
"epoch": 3.0784641068447414,
"grad_norm": 1.8125,
"learning_rate": 9.742326153643285e-06,
"loss": 0.9169,
"step": 690
},
{
"epoch": 3.122982749026155,
"grad_norm": 1.671875,
"learning_rate": 9.726667099565202e-06,
"loss": 0.9443,
"step": 700
},
{
"epoch": 3.1675013912075682,
"grad_norm": 1.6015625,
"learning_rate": 9.710559515081446e-06,
"loss": 0.9023,
"step": 710
},
{
"epoch": 3.2120200333889817,
"grad_norm": 1.7265625,
"learning_rate": 9.69400492869852e-06,
"loss": 0.9227,
"step": 720
},
{
"epoch": 3.256538675570395,
"grad_norm": 1.71875,
"learning_rate": 9.677004911340539e-06,
"loss": 0.9329,
"step": 730
},
{
"epoch": 3.3010573177518086,
"grad_norm": 1.875,
"learning_rate": 9.659561076200173e-06,
"loss": 0.903,
"step": 740
},
{
"epoch": 3.345575959933222,
"grad_norm": 1.484375,
"learning_rate": 9.64167507858554e-06,
"loss": 0.9046,
"step": 750
},
{
"epoch": 3.3900946021146354,
"grad_norm": 1.65625,
"learning_rate": 9.62334861576315e-06,
"loss": 0.927,
"step": 760
},
{
"epoch": 3.434613244296049,
"grad_norm": 1.8046875,
"learning_rate": 9.604583426796837e-06,
"loss": 0.9274,
"step": 770
},
{
"epoch": 3.4791318864774623,
"grad_norm": 1.53125,
"learning_rate": 9.585381292382734e-06,
"loss": 0.9127,
"step": 780
},
{
"epoch": 3.5236505286588757,
"grad_norm": 1.59375,
"learning_rate": 9.565744034680291e-06,
"loss": 0.9269,
"step": 790
},
{
"epoch": 3.5681691708402896,
"grad_norm": 1.6796875,
"learning_rate": 9.545673517139376e-06,
"loss": 0.8863,
"step": 800
},
{
"epoch": 3.6126878130217026,
"grad_norm": 1.359375,
"learning_rate": 9.52517164432343e-06,
"loss": 0.8776,
"step": 810
},
{
"epoch": 3.6572064552031165,
"grad_norm": 1.3359375,
"learning_rate": 9.50424036172875e-06,
"loss": 0.9424,
"step": 820
},
{
"epoch": 3.70172509738453,
"grad_norm": 1.4296875,
"learning_rate": 9.482881655599867e-06,
"loss": 0.8712,
"step": 830
},
{
"epoch": 3.7462437395659434,
"grad_norm": 1.5390625,
"learning_rate": 9.461097552741065e-06,
"loss": 0.9157,
"step": 840
},
{
"epoch": 3.790762381747357,
"grad_norm": 1.296875,
"learning_rate": 9.438890120324049e-06,
"loss": 0.8571,
"step": 850
},
{
"epoch": 3.8352810239287702,
"grad_norm": 1.328125,
"learning_rate": 9.416261465691786e-06,
"loss": 0.861,
"step": 860
},
{
"epoch": 3.8797996661101837,
"grad_norm": 1.265625,
"learning_rate": 9.393213736158532e-06,
"loss": 0.8952,
"step": 870
},
{
"epoch": 3.924318308291597,
"grad_norm": 1.328125,
"learning_rate": 9.369749118806063e-06,
"loss": 0.8598,
"step": 880
},
{
"epoch": 3.9688369504730105,
"grad_norm": 1.3359375,
"learning_rate": 9.345869840276138e-06,
"loss": 0.8614,
"step": 890
},
{
"epoch": 4.015581524763495,
"grad_norm": 1.0625,
"learning_rate": 9.321578166559202e-06,
"loss": 0.8842,
"step": 900
},
{
"epoch": 4.060100166944908,
"grad_norm": 1.25,
"learning_rate": 9.296876402779357e-06,
"loss": 0.7889,
"step": 910
},
{
"epoch": 4.104618809126322,
"grad_norm": 1.2109375,
"learning_rate": 9.271766892975632e-06,
"loss": 0.8188,
"step": 920
},
{
"epoch": 4.149137451307735,
"grad_norm": 1.1796875,
"learning_rate": 9.246252019879526e-06,
"loss": 0.7822,
"step": 930
},
{
"epoch": 4.193656093489149,
"grad_norm": 1.0546875,
"learning_rate": 9.22033420468893e-06,
"loss": 0.8268,
"step": 940
},
{
"epoch": 4.238174735670562,
"grad_norm": 1.015625,
"learning_rate": 9.194015906838345e-06,
"loss": 0.7838,
"step": 950
},
{
"epoch": 4.282693377851976,
"grad_norm": 1.2109375,
"learning_rate": 9.167299623765515e-06,
"loss": 0.7691,
"step": 960
},
{
"epoch": 4.3272120200333895,
"grad_norm": 1.171875,
"learning_rate": 9.14018789067443e-06,
"loss": 0.7575,
"step": 970
},
{
"epoch": 4.371730662214802,
"grad_norm": 1.1640625,
"learning_rate": 9.11268328029475e-06,
"loss": 0.8305,
"step": 980
},
{
"epoch": 4.416249304396216,
"grad_norm": 1.0390625,
"learning_rate": 9.08478840263767e-06,
"loss": 0.7607,
"step": 990
},
{
"epoch": 4.460767946577629,
"grad_norm": 0.99609375,
"learning_rate": 9.05650590474825e-06,
"loss": 0.7759,
"step": 1000
},
{
"epoch": 4.460767946577629,
"eval_loss": 1.0106589794158936,
"eval_runtime": 46.0703,
"eval_samples_per_second": 8.682,
"eval_steps_per_second": 8.682,
"step": 1000
},
{
"epoch": 4.505286588759043,
"grad_norm": 0.9140625,
"learning_rate": 9.027838470454222e-06,
"loss": 0.7025,
"step": 1010
},
{
"epoch": 4.549805230940456,
"grad_norm": 1.046875,
"learning_rate": 8.998788820111323e-06,
"loss": 0.776,
"step": 1020
},
{
"epoch": 4.59432387312187,
"grad_norm": 1.0078125,
"learning_rate": 8.969359710345132e-06,
"loss": 0.8328,
"step": 1030
},
{
"epoch": 4.638842515303283,
"grad_norm": 1.0546875,
"learning_rate": 8.939553933789499e-06,
"loss": 0.7564,
"step": 1040
},
{
"epoch": 4.683361157484697,
"grad_norm": 1.1796875,
"learning_rate": 8.90937431882154e-06,
"loss": 0.7684,
"step": 1050
},
{
"epoch": 4.72787979966611,
"grad_norm": 1.109375,
"learning_rate": 8.878823729293238e-06,
"loss": 0.8135,
"step": 1060
},
{
"epoch": 4.772398441847524,
"grad_norm": 1.1953125,
"learning_rate": 8.847905064259683e-06,
"loss": 0.8271,
"step": 1070
},
{
"epoch": 4.816917084028937,
"grad_norm": 1.0625,
"learning_rate": 8.816621257703969e-06,
"loss": 0.8179,
"step": 1080
},
{
"epoch": 4.861435726210351,
"grad_norm": 1.0546875,
"learning_rate": 8.784975278258783e-06,
"loss": 0.7721,
"step": 1090
},
{
"epoch": 4.905954368391764,
"grad_norm": 1.0546875,
"learning_rate": 8.752970128924696e-06,
"loss": 0.7752,
"step": 1100
},
{
"epoch": 4.950473010573178,
"grad_norm": 0.91015625,
"learning_rate": 8.7206088467852e-06,
"loss": 0.788,
"step": 1110
},
{
"epoch": 4.994991652754591,
"grad_norm": 0.9609375,
"learning_rate": 8.687894502718503e-06,
"loss": 0.8012,
"step": 1120
},
{
"epoch": 5.041736227045075,
"grad_norm": 0.8984375,
"learning_rate": 8.654830201106133e-06,
"loss": 0.8055,
"step": 1130
},
{
"epoch": 5.086254869226488,
"grad_norm": 0.98046875,
"learning_rate": 8.621419079538337e-06,
"loss": 0.7483,
"step": 1140
},
{
"epoch": 5.130773511407902,
"grad_norm": 0.9375,
"learning_rate": 8.587664308516361e-06,
"loss": 0.7349,
"step": 1150
},
{
"epoch": 5.175292153589315,
"grad_norm": 0.80859375,
"learning_rate": 8.553569091151576e-06,
"loss": 0.7454,
"step": 1160
},
{
"epoch": 5.219810795770729,
"grad_norm": 0.953125,
"learning_rate": 8.519136662861531e-06,
"loss": 0.6866,
"step": 1170
},
{
"epoch": 5.264329437952142,
"grad_norm": 0.859375,
"learning_rate": 8.484370291062927e-06,
"loss": 0.7269,
"step": 1180
},
{
"epoch": 5.308848080133556,
"grad_norm": 0.94140625,
"learning_rate": 8.449273274861566e-06,
"loss": 0.6977,
"step": 1190
},
{
"epoch": 5.353366722314969,
"grad_norm": 0.8984375,
"learning_rate": 8.413848944739282e-06,
"loss": 0.6814,
"step": 1200
},
{
"epoch": 5.397885364496383,
"grad_norm": 0.97265625,
"learning_rate": 8.378100662237904e-06,
"loss": 0.7206,
"step": 1210
},
{
"epoch": 5.442404006677796,
"grad_norm": 0.85546875,
"learning_rate": 8.342031819640263e-06,
"loss": 0.7317,
"step": 1220
},
{
"epoch": 5.48692264885921,
"grad_norm": 0.859375,
"learning_rate": 8.305645839648287e-06,
"loss": 0.7149,
"step": 1230
},
{
"epoch": 5.531441291040624,
"grad_norm": 0.8984375,
"learning_rate": 8.268946175058214e-06,
"loss": 0.6568,
"step": 1240
},
{
"epoch": 5.575959933222037,
"grad_norm": 0.859375,
"learning_rate": 8.231936308432935e-06,
"loss": 0.7292,
"step": 1250
},
{
"epoch": 5.6204785754034505,
"grad_norm": 0.82421875,
"learning_rate": 8.194619751771527e-06,
"loss": 0.6966,
"step": 1260
},
{
"epoch": 5.6649972175848635,
"grad_norm": 0.85546875,
"learning_rate": 8.157000046175984e-06,
"loss": 0.7128,
"step": 1270
},
{
"epoch": 5.709515859766277,
"grad_norm": 0.77734375,
"learning_rate": 8.119080761515197e-06,
"loss": 0.7343,
"step": 1280
},
{
"epoch": 5.75403450194769,
"grad_norm": 0.953125,
"learning_rate": 8.080865496086177e-06,
"loss": 0.7454,
"step": 1290
},
{
"epoch": 5.798553144129104,
"grad_norm": 0.89453125,
"learning_rate": 8.042357876272626e-06,
"loss": 0.7337,
"step": 1300
},
{
"epoch": 5.843071786310517,
"grad_norm": 0.796875,
"learning_rate": 8.003561556200796e-06,
"loss": 0.7011,
"step": 1310
},
{
"epoch": 5.887590428491931,
"grad_norm": 0.83984375,
"learning_rate": 7.964480217392739e-06,
"loss": 0.6969,
"step": 1320
},
{
"epoch": 5.932109070673344,
"grad_norm": 0.96484375,
"learning_rate": 7.925117568416966e-06,
"loss": 0.7272,
"step": 1330
},
{
"epoch": 5.976627712854758,
"grad_norm": 0.89453125,
"learning_rate": 7.885477344536516e-06,
"loss": 0.6795,
"step": 1340
},
{
"epoch": 6.023372287145242,
"grad_norm": 0.8828125,
"learning_rate": 7.845563307354506e-06,
"loss": 0.7507,
"step": 1350
},
{
"epoch": 6.067890929326656,
"grad_norm": 0.83984375,
"learning_rate": 7.80537924445718e-06,
"loss": 0.6812,
"step": 1360
},
{
"epoch": 6.112409571508069,
"grad_norm": 0.94921875,
"learning_rate": 7.764928969054493e-06,
"loss": 0.694,
"step": 1370
},
{
"epoch": 6.156928213689483,
"grad_norm": 0.87890625,
"learning_rate": 7.724216319618257e-06,
"loss": 0.6636,
"step": 1380
},
{
"epoch": 6.201446855870896,
"grad_norm": 0.828125,
"learning_rate": 7.683245159517903e-06,
"loss": 0.6817,
"step": 1390
},
{
"epoch": 6.24596549805231,
"grad_norm": 1.0546875,
"learning_rate": 7.642019376653858e-06,
"loss": 0.6709,
"step": 1400
},
{
"epoch": 6.290484140233723,
"grad_norm": 0.8515625,
"learning_rate": 7.600542883088629e-06,
"loss": 0.6755,
"step": 1410
},
{
"epoch": 6.3350027824151365,
"grad_norm": 1.03125,
"learning_rate": 7.5588196146755526e-06,
"loss": 0.7135,
"step": 1420
},
{
"epoch": 6.3795214245965495,
"grad_norm": 0.78515625,
"learning_rate": 7.5168535306853155e-06,
"loss": 0.6461,
"step": 1430
},
{
"epoch": 6.424040066777963,
"grad_norm": 0.77734375,
"learning_rate": 7.474648613430252e-06,
"loss": 0.6194,
"step": 1440
},
{
"epoch": 6.468558708959376,
"grad_norm": 1.2421875,
"learning_rate": 7.432208867886439e-06,
"loss": 0.6871,
"step": 1450
},
{
"epoch": 6.51307735114079,
"grad_norm": 1.3828125,
"learning_rate": 7.389538321313652e-06,
"loss": 0.6691,
"step": 1460
},
{
"epoch": 6.557595993322204,
"grad_norm": 1.625,
"learning_rate": 7.346641022873205e-06,
"loss": 0.6686,
"step": 1470
},
{
"epoch": 6.602114635503617,
"grad_norm": 1.8046875,
"learning_rate": 7.303521043243711e-06,
"loss": 0.648,
"step": 1480
},
{
"epoch": 6.646633277685031,
"grad_norm": 2.40625,
"learning_rate": 7.2601824742347985e-06,
"loss": 0.7131,
"step": 1490
},
{
"epoch": 6.691151919866444,
"grad_norm": 1.09375,
"learning_rate": 7.2166294283988315e-06,
"loss": 0.7121,
"step": 1500
},
{
"epoch": 6.735670562047858,
"grad_norm": 0.9765625,
"learning_rate": 7.172866038640644e-06,
"loss": 0.6216,
"step": 1510
},
{
"epoch": 6.780189204229271,
"grad_norm": 1.0390625,
"learning_rate": 7.128896457825364e-06,
"loss": 0.6726,
"step": 1520
},
{
"epoch": 6.824707846410685,
"grad_norm": 1.1328125,
"learning_rate": 7.084724858384326e-06,
"loss": 0.6597,
"step": 1530
},
{
"epoch": 6.869226488592098,
"grad_norm": 1.0546875,
"learning_rate": 7.04035543191914e-06,
"loss": 0.6608,
"step": 1540
},
{
"epoch": 6.913745130773512,
"grad_norm": 1.78125,
"learning_rate": 6.995792388803929e-06,
"loss": 0.6419,
"step": 1550
},
{
"epoch": 6.958263772954925,
"grad_norm": 1.7734375,
"learning_rate": 6.9510399577857976e-06,
"loss": 0.6505,
"step": 1560
},
{
"epoch": 7.005008347245409,
"grad_norm": 2.015625,
"learning_rate": 6.906102385583548e-06,
"loss": 0.734,
"step": 1570
},
{
"epoch": 7.049526989426822,
"grad_norm": 1.9140625,
"learning_rate": 6.860983936484689e-06,
"loss": 0.6262,
"step": 1580
},
{
"epoch": 7.094045631608236,
"grad_norm": 1.953125,
"learning_rate": 6.815688891940796e-06,
"loss": 0.6499,
"step": 1590
},
{
"epoch": 7.138564273789649,
"grad_norm": 4.53125,
"learning_rate": 6.770221550161214e-06,
"loss": 0.6259,
"step": 1600
},
{
"epoch": 7.183082915971063,
"grad_norm": 5.1875,
"learning_rate": 6.724586225705191e-06,
"loss": 0.6564,
"step": 1610
},
{
"epoch": 7.227601558152476,
"grad_norm": 6.1875,
"learning_rate": 6.678787249072456e-06,
"loss": 0.6358,
"step": 1620
},
{
"epoch": 7.27212020033389,
"grad_norm": 4.9375,
"learning_rate": 6.632828966292279e-06,
"loss": 0.6883,
"step": 1630
},
{
"epoch": 7.316638842515303,
"grad_norm": 4.625,
"learning_rate": 6.586715738511067e-06,
"loss": 0.6618,
"step": 1640
},
{
"epoch": 7.361157484696717,
"grad_norm": 10.9375,
"learning_rate": 6.540451941578505e-06,
"loss": 0.6233,
"step": 1650
},
{
"epoch": 7.40567612687813,
"grad_norm": 12.625,
"learning_rate": 6.494041965632335e-06,
"loss": 0.6973,
"step": 1660
},
{
"epoch": 7.450194769059544,
"grad_norm": 11.5,
"learning_rate": 6.447490214681742e-06,
"loss": 0.6683,
"step": 1670
},
{
"epoch": 7.494713411240957,
"grad_norm": 10.625,
"learning_rate": 6.400801106189457e-06,
"loss": 0.5964,
"step": 1680
},
{
"epoch": 7.539232053422371,
"grad_norm": 9.875,
"learning_rate": 6.353979070652555e-06,
"loss": 0.6784,
"step": 1690
},
{
"epoch": 7.583750695603785,
"grad_norm": 3.671875,
"learning_rate": 6.307028551182041e-06,
"loss": 0.6335,
"step": 1700
},
{
"epoch": 7.628269337785198,
"grad_norm": 3.421875,
"learning_rate": 6.259954003081215e-06,
"loss": 0.6539,
"step": 1710
},
{
"epoch": 7.6727879799666105,
"grad_norm": 2.828125,
"learning_rate": 6.212759893422908e-06,
"loss": 0.6371,
"step": 1720
},
{
"epoch": 7.717306622148024,
"grad_norm": 3.734375,
"learning_rate": 6.165450700625565e-06,
"loss": 0.6426,
"step": 1730
},
{
"epoch": 7.761825264329438,
"grad_norm": 3.0,
"learning_rate": 6.118030914028292e-06,
"loss": 0.6587,
"step": 1740
},
{
"epoch": 7.806343906510851,
"grad_norm": 3.234375,
"learning_rate": 6.070505033464835e-06,
"loss": 0.5994,
"step": 1750
},
{
"epoch": 7.850862548692265,
"grad_norm": 3.0,
"learning_rate": 6.022877568836579e-06,
"loss": 0.6387,
"step": 1760
},
{
"epoch": 7.895381190873678,
"grad_norm": 2.90625,
"learning_rate": 5.975153039684579e-06,
"loss": 0.6704,
"step": 1770
},
{
"epoch": 7.939899833055092,
"grad_norm": 2.609375,
"learning_rate": 5.927335974760699e-06,
"loss": 0.6274,
"step": 1780
},
{
"epoch": 7.984418475236505,
"grad_norm": 3.03125,
"learning_rate": 5.87943091159785e-06,
"loss": 0.6611,
"step": 1790
},
{
"epoch": 8.03116304952699,
"grad_norm": 2.609375,
"learning_rate": 5.831442396079413e-06,
"loss": 0.6732,
"step": 1800
},
{
"epoch": 8.075681691708404,
"grad_norm": 3.078125,
"learning_rate": 5.78337498200786e-06,
"loss": 0.5774,
"step": 1810
},
{
"epoch": 8.120200333889816,
"grad_norm": 2.71875,
"learning_rate": 5.735233230672636e-06,
"loss": 0.6312,
"step": 1820
},
{
"epoch": 8.16471897607123,
"grad_norm": 2.96875,
"learning_rate": 5.687021710417308e-06,
"loss": 0.6262,
"step": 1830
},
{
"epoch": 8.209237618252644,
"grad_norm": 2.765625,
"learning_rate": 5.638744996206074e-06,
"loss": 0.5604,
"step": 1840
},
{
"epoch": 8.253756260434058,
"grad_norm": 3.078125,
"learning_rate": 5.590407669189612e-06,
"loss": 0.6017,
"step": 1850
},
{
"epoch": 8.29827490261547,
"grad_norm": 2.140625,
"learning_rate": 5.542014316270377e-06,
"loss": 0.5133,
"step": 1860
},
{
"epoch": 8.342793544796884,
"grad_norm": 2.5,
"learning_rate": 5.493569529667312e-06,
"loss": 0.5995,
"step": 1870
},
{
"epoch": 8.387312186978297,
"grad_norm": 2.96875,
"learning_rate": 5.445077906480095e-06,
"loss": 0.6081,
"step": 1880
},
{
"epoch": 8.431830829159711,
"grad_norm": 2.359375,
"learning_rate": 5.396544048252893e-06,
"loss": 0.6193,
"step": 1890
},
{
"epoch": 8.476349471341123,
"grad_norm": 2.5625,
"learning_rate": 5.3479725605377065e-06,
"loss": 0.568,
"step": 1900
},
{
"epoch": 8.520868113522537,
"grad_norm": 2.59375,
"learning_rate": 5.299368052457332e-06,
"loss": 0.5966,
"step": 1910
},
{
"epoch": 8.565386755703951,
"grad_norm": 2.90625,
"learning_rate": 5.250735136267993e-06,
"loss": 0.6217,
"step": 1920
},
{
"epoch": 8.609905397885365,
"grad_norm": 2.375,
"learning_rate": 5.2020784269216515e-06,
"loss": 0.554,
"step": 1930
},
{
"epoch": 8.654424040066779,
"grad_norm": 2.40625,
"learning_rate": 5.153402541628097e-06,
"loss": 0.562,
"step": 1940
},
{
"epoch": 8.698942682248191,
"grad_norm": 2.203125,
"learning_rate": 5.1047120994167855e-06,
"loss": 0.598,
"step": 1950
},
{
"epoch": 8.743461324429605,
"grad_norm": 2.96875,
"learning_rate": 5.056011720698536e-06,
"loss": 0.6065,
"step": 1960
},
{
"epoch": 8.787979966611019,
"grad_norm": 2.53125,
"learning_rate": 5.007306026827076e-06,
"loss": 0.5696,
"step": 1970
},
{
"epoch": 8.832498608792433,
"grad_norm": 2.171875,
"learning_rate": 4.958599639660508e-06,
"loss": 0.5824,
"step": 1980
},
{
"epoch": 8.877017250973845,
"grad_norm": 2.5625,
"learning_rate": 4.909897181122725e-06,
"loss": 0.6082,
"step": 1990
},
{
"epoch": 8.921535893155259,
"grad_norm": 2.84375,
"learning_rate": 4.861203272764813e-06,
"loss": 0.554,
"step": 2000
},
{
"epoch": 8.921535893155259,
"eval_loss": 0.9391384720802307,
"eval_runtime": 17.1717,
"eval_samples_per_second": 23.294,
"eval_steps_per_second": 23.294,
"step": 2000
},
{
"epoch": 8.966054535336673,
"grad_norm": 2.125,
"learning_rate": 4.8125225353265085e-06,
"loss": 0.5373,
"step": 2010
},
{
"epoch": 9.012799109627156,
"grad_norm": 2.203125,
"learning_rate": 4.7638595882977064e-06,
"loss": 0.6353,
"step": 2020
},
{
"epoch": 9.05731775180857,
"grad_norm": 2.25,
"learning_rate": 4.71521904948011e-06,
"loss": 0.5151,
"step": 2030
},
{
"epoch": 9.101836393989982,
"grad_norm": 1.9296875,
"learning_rate": 4.666605534549021e-06,
"loss": 0.5314,
"step": 2040
},
{
"epoch": 9.146355036171396,
"grad_norm": 2.296875,
"learning_rate": 4.618023656615352e-06,
"loss": 0.5424,
"step": 2050
},
{
"epoch": 9.19087367835281,
"grad_norm": 2.21875,
"learning_rate": 4.569478025787869e-06,
"loss": 0.4959,
"step": 2060
},
{
"epoch": 9.235392320534224,
"grad_norm": 2.078125,
"learning_rate": 4.520973248735715e-06,
"loss": 0.5301,
"step": 2070
},
{
"epoch": 9.279910962715638,
"grad_norm": 2.265625,
"learning_rate": 4.472513928251275e-06,
"loss": 0.5219,
"step": 2080
},
{
"epoch": 9.32442960489705,
"grad_norm": 3.078125,
"learning_rate": 4.424104662813396e-06,
"loss": 0.5537,
"step": 2090
},
{
"epoch": 9.368948247078464,
"grad_norm": 2.21875,
"learning_rate": 4.375750046151023e-06,
"loss": 0.5269,
"step": 2100
},
{
"epoch": 9.413466889259878,
"grad_norm": 1.84375,
"learning_rate": 4.3274546668072835e-06,
"loss": 0.5535,
"step": 2110
},
{
"epoch": 9.457985531441292,
"grad_norm": 2.234375,
"learning_rate": 4.279223107704058e-06,
"loss": 0.5382,
"step": 2120
},
{
"epoch": 9.502504173622704,
"grad_norm": 2.1875,
"learning_rate": 4.2310599457071e-06,
"loss": 0.5643,
"step": 2130
},
{
"epoch": 9.547022815804118,
"grad_norm": 2.171875,
"learning_rate": 4.1829697511917146e-06,
"loss": 0.5493,
"step": 2140
},
{
"epoch": 9.591541457985532,
"grad_norm": 2.328125,
"learning_rate": 4.134957087609065e-06,
"loss": 0.5457,
"step": 2150
},
{
"epoch": 9.636060100166945,
"grad_norm": 2.484375,
"learning_rate": 4.087026511053116e-06,
"loss": 0.4859,
"step": 2160
},
{
"epoch": 9.680578742348358,
"grad_norm": 2.1875,
"learning_rate": 4.0391825698283084e-06,
"loss": 0.4969,
"step": 2170
},
{
"epoch": 9.725097384529771,
"grad_norm": 2.125,
"learning_rate": 3.991429804017944e-06,
"loss": 0.5311,
"step": 2180
},
{
"epoch": 9.769616026711185,
"grad_norm": 1.9921875,
"learning_rate": 3.9437727450533605e-06,
"loss": 0.5437,
"step": 2190
},
{
"epoch": 9.8141346688926,
"grad_norm": 1.9609375,
"learning_rate": 3.89621591528393e-06,
"loss": 0.5197,
"step": 2200
},
{
"epoch": 9.858653311074011,
"grad_norm": 1.90625,
"learning_rate": 3.848763827547915e-06,
"loss": 0.5104,
"step": 2210
},
{
"epoch": 9.903171953255425,
"grad_norm": 1.859375,
"learning_rate": 3.8014209847442345e-06,
"loss": 0.55,
"step": 2220
},
{
"epoch": 9.947690595436839,
"grad_norm": 1.8671875,
"learning_rate": 3.7541918794051637e-06,
"loss": 0.53,
"step": 2230
},
{
"epoch": 9.992209237618253,
"grad_norm": 2.203125,
"learning_rate": 3.7070809932700134e-06,
"loss": 0.4882,
"step": 2240
},
{
"epoch": 10.038953811908737,
"grad_norm": 1.75,
"learning_rate": 3.6600927968598588e-06,
"loss": 0.4714,
"step": 2250
},
{
"epoch": 10.08347245409015,
"grad_norm": 1.90625,
"learning_rate": 3.613231749053304e-06,
"loss": 0.4774,
"step": 2260
},
{
"epoch": 10.127991096271563,
"grad_norm": 2.046875,
"learning_rate": 3.5665022966633678e-06,
"loss": 0.4764,
"step": 2270
},
{
"epoch": 10.172509738452977,
"grad_norm": 1.953125,
"learning_rate": 3.519908874015501e-06,
"loss": 0.4632,
"step": 2280
},
{
"epoch": 10.21702838063439,
"grad_norm": 1.9453125,
"learning_rate": 3.473455902526809e-06,
"loss": 0.4604,
"step": 2290
},
{
"epoch": 10.261547022815805,
"grad_norm": 1.6484375,
"learning_rate": 3.4271477902864836e-06,
"loss": 0.4753,
"step": 2300
},
{
"epoch": 10.306065664997218,
"grad_norm": 1.6875,
"learning_rate": 3.3809889316375012e-06,
"loss": 0.4323,
"step": 2310
},
{
"epoch": 10.35058430717863,
"grad_norm": 1.734375,
"learning_rate": 3.334983706759627e-06,
"loss": 0.4659,
"step": 2320
},
{
"epoch": 10.395102949360044,
"grad_norm": 1.9453125,
"learning_rate": 3.2891364812537686e-06,
"loss": 0.4896,
"step": 2330
},
{
"epoch": 10.439621591541458,
"grad_norm": 1.8515625,
"learning_rate": 3.2434516057277055e-06,
"loss": 0.478,
"step": 2340
},
{
"epoch": 10.484140233722872,
"grad_norm": 1.765625,
"learning_rate": 3.1979334153832486e-06,
"loss": 0.4453,
"step": 2350
},
{
"epoch": 10.528658875904284,
"grad_norm": 1.7421875,
"learning_rate": 3.1525862296048446e-06,
"loss": 0.5075,
"step": 2360
},
{
"epoch": 10.573177518085698,
"grad_norm": 1.5546875,
"learning_rate": 3.1074143515497114e-06,
"loss": 0.4865,
"step": 2370
},
{
"epoch": 10.617696160267112,
"grad_norm": 1.5234375,
"learning_rate": 3.0624220677394854e-06,
"loss": 0.5178,
"step": 2380
},
{
"epoch": 10.662214802448526,
"grad_norm": 2.015625,
"learning_rate": 3.017613647653461e-06,
"loss": 0.5069,
"step": 2390
},
{
"epoch": 10.706733444629938,
"grad_norm": 1.4375,
"learning_rate": 2.9729933433234402e-06,
"loss": 0.4423,
"step": 2400
},
{
"epoch": 10.751252086811352,
"grad_norm": 1.609375,
"learning_rate": 2.9285653889302514e-06,
"loss": 0.4359,
"step": 2410
},
{
"epoch": 10.795770728992766,
"grad_norm": 1.296875,
"learning_rate": 2.8843340004019427e-06,
"loss": 0.4517,
"step": 2420
},
{
"epoch": 10.84028937117418,
"grad_norm": 1.578125,
"learning_rate": 2.8403033750137255e-06,
"loss": 0.4775,
"step": 2430
},
{
"epoch": 10.884808013355592,
"grad_norm": 1.5,
"learning_rate": 2.7964776909896733e-06,
"loss": 0.5064,
"step": 2440
},
{
"epoch": 10.929326655537006,
"grad_norm": 1.4609375,
"learning_rate": 2.7528611071062366e-06,
"loss": 0.4651,
"step": 2450
},
{
"epoch": 10.97384529771842,
"grad_norm": 1.4921875,
"learning_rate": 2.7094577622976096e-06,
"loss": 0.4909,
"step": 2460
},
{
"epoch": 11.020589872008903,
"grad_norm": 1.3984375,
"learning_rate": 2.6662717752629597e-06,
"loss": 0.4996,
"step": 2470
},
{
"epoch": 11.065108514190317,
"grad_norm": 1.3984375,
"learning_rate": 2.6233072440755934e-06,
"loss": 0.4445,
"step": 2480
},
{
"epoch": 11.109627156371731,
"grad_norm": 1.375,
"learning_rate": 2.580568245794085e-06,
"loss": 0.4471,
"step": 2490
},
{
"epoch": 11.154145798553143,
"grad_norm": 1.0703125,
"learning_rate": 2.538058836075373e-06,
"loss": 0.49,
"step": 2500
},
{
"epoch": 11.198664440734557,
"grad_norm": 0.9453125,
"learning_rate": 2.4957830487899224e-06,
"loss": 0.4148,
"step": 2510
},
{
"epoch": 11.243183082915971,
"grad_norm": 1.078125,
"learning_rate": 2.4537448956389146e-06,
"loss": 0.4247,
"step": 2520
},
{
"epoch": 11.287701725097385,
"grad_norm": 1.1953125,
"learning_rate": 2.411948365773588e-06,
"loss": 0.4368,
"step": 2530
},
{
"epoch": 11.332220367278797,
"grad_norm": 1.203125,
"learning_rate": 2.3703974254166704e-06,
"loss": 0.4273,
"step": 2540
},
{
"epoch": 11.376739009460211,
"grad_norm": 1.0,
"learning_rate": 2.3290960174860293e-06,
"loss": 0.4421,
"step": 2550
},
{
"epoch": 11.421257651641625,
"grad_norm": 0.97265625,
"learning_rate": 2.2880480612204925e-06,
"loss": 0.4072,
"step": 2560
},
{
"epoch": 11.465776293823039,
"grad_norm": 1.2421875,
"learning_rate": 2.247257451807961e-06,
"loss": 0.4472,
"step": 2570
},
{
"epoch": 11.510294936004453,
"grad_norm": 1.1328125,
"learning_rate": 2.206728060015761e-06,
"loss": 0.4613,
"step": 2580
},
{
"epoch": 11.554813578185865,
"grad_norm": 1.34375,
"learning_rate": 2.1664637318233484e-06,
"loss": 0.4111,
"step": 2590
},
{
"epoch": 11.599332220367279,
"grad_norm": 1.1796875,
"learning_rate": 2.1264682880573374e-06,
"loss": 0.4385,
"step": 2600
},
{
"epoch": 11.643850862548693,
"grad_norm": 0.98046875,
"learning_rate": 2.086745524028933e-06,
"loss": 0.4448,
"step": 2610
},
{
"epoch": 11.688369504730106,
"grad_norm": 1.1328125,
"learning_rate": 2.0472992091737886e-06,
"loss": 0.4292,
"step": 2620
},
{
"epoch": 11.732888146911518,
"grad_norm": 1.2421875,
"learning_rate": 2.0081330866942962e-06,
"loss": 0.425,
"step": 2630
},
{
"epoch": 11.777406789092932,
"grad_norm": 1.0859375,
"learning_rate": 1.96925087320439e-06,
"loss": 0.4311,
"step": 2640
},
{
"epoch": 11.821925431274346,
"grad_norm": 0.9375,
"learning_rate": 1.930656258376859e-06,
"loss": 0.4725,
"step": 2650
},
{
"epoch": 11.86644407345576,
"grad_norm": 1.0,
"learning_rate": 1.8923529045932292e-06,
"loss": 0.4149,
"step": 2660
},
{
"epoch": 11.910962715637172,
"grad_norm": 1.0625,
"learning_rate": 1.8543444465962147e-06,
"loss": 0.4436,
"step": 2670
},
{
"epoch": 11.955481357818586,
"grad_norm": 0.9375,
"learning_rate": 1.8166344911448115e-06,
"loss": 0.4254,
"step": 2680
},
{
"epoch": 12.00222593210907,
"grad_norm": 3.515625,
"learning_rate": 1.7792266166720368e-06,
"loss": 0.5129,
"step": 2690
},
{
"epoch": 12.046744574290484,
"grad_norm": 1.0625,
"learning_rate": 1.742124372945364e-06,
"loss": 0.4114,
"step": 2700
},
{
"epoch": 12.091263216471898,
"grad_norm": 0.98046875,
"learning_rate": 1.7053312807298633e-06,
"loss": 0.4351,
"step": 2710
},
{
"epoch": 12.135781858653312,
"grad_norm": 0.890625,
"learning_rate": 1.6688508314541086e-06,
"loss": 0.404,
"step": 2720
},
{
"epoch": 12.180300500834724,
"grad_norm": 0.99609375,
"learning_rate": 1.6326864868788678e-06,
"loss": 0.4349,
"step": 2730
},
{
"epoch": 12.224819143016138,
"grad_norm": 1.0234375,
"learning_rate": 1.5968416787685919e-06,
"loss": 0.4581,
"step": 2740
},
{
"epoch": 12.269337785197552,
"grad_norm": 0.9921875,
"learning_rate": 1.5613198085657804e-06,
"loss": 0.4589,
"step": 2750
},
{
"epoch": 12.313856427378965,
"grad_norm": 0.95703125,
"learning_rate": 1.5261242470681813e-06,
"loss": 0.4357,
"step": 2760
},
{
"epoch": 12.358375069560378,
"grad_norm": 0.78515625,
"learning_rate": 1.4912583341089516e-06,
"loss": 0.3949,
"step": 2770
},
{
"epoch": 12.402893711741791,
"grad_norm": 0.9140625,
"learning_rate": 1.4567253782397073e-06,
"loss": 0.4184,
"step": 2780
},
{
"epoch": 12.447412353923205,
"grad_norm": 0.91796875,
"learning_rate": 1.4225286564165785e-06,
"loss": 0.4309,
"step": 2790
},
{
"epoch": 12.49193099610462,
"grad_norm": 0.9453125,
"learning_rate": 1.3886714136892287e-06,
"loss": 0.4539,
"step": 2800
},
{
"epoch": 12.536449638286033,
"grad_norm": 0.77734375,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.4243,
"step": 2810
},
{
"epoch": 12.580968280467445,
"grad_norm": 0.83984375,
"learning_rate": 1.321988184343732e-06,
"loss": 0.4039,
"step": 2820
},
{
"epoch": 12.625486922648859,
"grad_norm": 1.0078125,
"learning_rate": 1.2891685255365517e-06,
"loss": 0.4182,
"step": 2830
},
{
"epoch": 12.670005564830273,
"grad_norm": 0.96875,
"learning_rate": 1.256701000846619e-06,
"loss": 0.4146,
"step": 2840
},
{
"epoch": 12.714524207011687,
"grad_norm": 0.8984375,
"learning_rate": 1.22458869123388e-06,
"loss": 0.434,
"step": 2850
},
{
"epoch": 12.759042849193099,
"grad_norm": 0.890625,
"learning_rate": 1.1928346439506526e-06,
"loss": 0.4356,
"step": 2860
},
{
"epoch": 12.803561491374513,
"grad_norm": 0.75390625,
"learning_rate": 1.1614418722524506e-06,
"loss": 0.4073,
"step": 2870
},
{
"epoch": 12.848080133555927,
"grad_norm": 0.84375,
"learning_rate": 1.1304133551120532e-06,
"loss": 0.4376,
"step": 2880
},
{
"epoch": 12.89259877573734,
"grad_norm": 0.72265625,
"learning_rate": 1.0997520369368158e-06,
"loss": 0.4078,
"step": 2890
},
{
"epoch": 12.937117417918753,
"grad_norm": 0.87109375,
"learning_rate": 1.0694608272892698e-06,
"loss": 0.4329,
"step": 2900
},
{
"epoch": 12.981636060100167,
"grad_norm": 0.9765625,
"learning_rate": 1.0395426006110164e-06,
"loss": 0.3766,
"step": 2910
},
{
"epoch": 13.02838063439065,
"grad_norm": 0.8046875,
"learning_rate": 1.0100001959499644e-06,
"loss": 0.3808,
"step": 2920
},
{
"epoch": 13.072899276572064,
"grad_norm": 0.64453125,
"learning_rate": 9.808364166909256e-07,
"loss": 0.4232,
"step": 2930
},
{
"epoch": 13.117417918753478,
"grad_norm": 0.8046875,
"learning_rate": 9.520540302895847e-07,
"loss": 0.4332,
"step": 2940
},
{
"epoch": 13.161936560934892,
"grad_norm": 0.6640625,
"learning_rate": 9.236557680098918e-07,
"loss": 0.4059,
"step": 2950
},
{
"epoch": 13.206455203116304,
"grad_norm": 0.83984375,
"learning_rate": 8.956443246648771e-07,
"loss": 0.3704,
"step": 2960
},
{
"epoch": 13.250973845297718,
"grad_norm": 0.8046875,
"learning_rate": 8.680223583609399e-07,
"loss": 0.4327,
"step": 2970
},
{
"epoch": 13.295492487479132,
"grad_norm": 0.875,
"learning_rate": 8.407924902455983e-07,
"loss": 0.4229,
"step": 2980
},
{
"epoch": 13.340011129660546,
"grad_norm": 0.80859375,
"learning_rate": 8.139573042587729e-07,
"loss": 0.4121,
"step": 2990
},
{
"epoch": 13.384529771841958,
"grad_norm": 0.70703125,
"learning_rate": 7.875193468875719e-07,
"loss": 0.423,
"step": 3000
},
{
"epoch": 13.384529771841958,
"eval_loss": 0.9250730872154236,
"eval_runtime": 17.1049,
"eval_samples_per_second": 23.385,
"eval_steps_per_second": 23.385,
"step": 3000
},
{
"epoch": 13.429048414023372,
"grad_norm": 0.90234375,
"learning_rate": 7.614811269246631e-07,
"loss": 0.4316,
"step": 3010
},
{
"epoch": 13.473567056204786,
"grad_norm": 0.87890625,
"learning_rate": 7.35845115230191e-07,
"loss": 0.4104,
"step": 3020
},
{
"epoch": 13.5180856983862,
"grad_norm": 0.76953125,
"learning_rate": 7.106137444973177e-07,
"loss": 0.4367,
"step": 3030
},
{
"epoch": 13.562604340567614,
"grad_norm": 0.83984375,
"learning_rate": 6.857894090213702e-07,
"loss": 0.417,
"step": 3040
},
{
"epoch": 13.607122982749026,
"grad_norm": 1.4765625,
"learning_rate": 6.613744644726383e-07,
"loss": 0.394,
"step": 3050
},
{
"epoch": 13.65164162493044,
"grad_norm": 0.91796875,
"learning_rate": 6.3737122767284e-07,
"loss": 0.4172,
"step": 3060
},
{
"epoch": 13.696160267111853,
"grad_norm": 0.86328125,
"learning_rate": 6.137819763752656e-07,
"loss": 0.4517,
"step": 3070
},
{
"epoch": 13.740678909293267,
"grad_norm": 1.046875,
"learning_rate": 5.90608949048635e-07,
"loss": 0.4256,
"step": 3080
},
{
"epoch": 13.78519755147468,
"grad_norm": 1.21875,
"learning_rate": 5.678543446646811e-07,
"loss": 0.4019,
"step": 3090
},
{
"epoch": 13.829716193656093,
"grad_norm": 0.96875,
"learning_rate": 5.455203224894857e-07,
"loss": 0.453,
"step": 3100
},
{
"epoch": 13.874234835837507,
"grad_norm": 0.92578125,
"learning_rate": 5.236090018785705e-07,
"loss": 0.4107,
"step": 3110
},
{
"epoch": 13.918753478018921,
"grad_norm": 0.8984375,
"learning_rate": 5.021224620757914e-07,
"loss": 0.4475,
"step": 3120
},
{
"epoch": 13.963272120200333,
"grad_norm": 0.9296875,
"learning_rate": 4.810627420160269e-07,
"loss": 0.4322,
"step": 3130
},
{
"epoch": 14.010016694490819,
"grad_norm": 0.91796875,
"learning_rate": 4.604318401317009e-07,
"loss": 0.4318,
"step": 3140
},
{
"epoch": 14.054535336672231,
"grad_norm": 1.3046875,
"learning_rate": 4.402317141631407e-07,
"loss": 0.4489,
"step": 3150
},
{
"epoch": 14.099053978853645,
"grad_norm": 1.40625,
"learning_rate": 4.2046428097279766e-07,
"loss": 0.4381,
"step": 3160
},
{
"epoch": 14.143572621035059,
"grad_norm": 1.1796875,
"learning_rate": 4.011314163633573e-07,
"loss": 0.4107,
"step": 3170
},
{
"epoch": 14.188091263216473,
"grad_norm": 1.3203125,
"learning_rate": 3.822349548997295e-07,
"loss": 0.4399,
"step": 3180
},
{
"epoch": 14.232609905397885,
"grad_norm": 1.3984375,
"learning_rate": 3.637766897349654e-07,
"loss": 0.417,
"step": 3190
},
{
"epoch": 14.277128547579299,
"grad_norm": 3.5,
"learning_rate": 3.4575837244009367e-07,
"loss": 0.4449,
"step": 3200
},
{
"epoch": 14.321647189760712,
"grad_norm": 3.671875,
"learning_rate": 3.281817128379139e-07,
"loss": 0.3875,
"step": 3210
},
{
"epoch": 14.366165831942126,
"grad_norm": 4.625,
"learning_rate": 3.1104837884073866e-07,
"loss": 0.4187,
"step": 3220
},
{
"epoch": 14.410684474123538,
"grad_norm": 3.875,
"learning_rate": 2.943599962921279e-07,
"loss": 0.41,
"step": 3230
},
{
"epoch": 14.455203116304952,
"grad_norm": 3.828125,
"learning_rate": 2.7811814881259503e-07,
"loss": 0.4016,
"step": 3240
},
{
"epoch": 14.499721758486366,
"grad_norm": 8.1875,
"learning_rate": 2.623243776493434e-07,
"loss": 0.3906,
"step": 3250
},
{
"epoch": 14.54424040066778,
"grad_norm": 8.125,
"learning_rate": 2.469801815300027e-07,
"loss": 0.4241,
"step": 3260
},
{
"epoch": 14.588759042849194,
"grad_norm": 9.5625,
"learning_rate": 2.3208701652041697e-07,
"loss": 0.4104,
"step": 3270
},
{
"epoch": 14.633277685030606,
"grad_norm": 7.59375,
"learning_rate": 2.1764629588646667e-07,
"loss": 0.4031,
"step": 3280
},
{
"epoch": 14.67779632721202,
"grad_norm": 7.125,
"learning_rate": 2.036593899599615e-07,
"loss": 0.3911,
"step": 3290
},
{
"epoch": 14.722314969393434,
"grad_norm": 2.546875,
"learning_rate": 1.9012762600860656e-07,
"loss": 0.4137,
"step": 3300
},
{
"epoch": 14.766833611574848,
"grad_norm": 2.609375,
"learning_rate": 1.7705228811005004e-07,
"loss": 0.4559,
"step": 3310
},
{
"epoch": 14.81135225375626,
"grad_norm": 2.640625,
"learning_rate": 1.6443461703003427e-07,
"loss": 0.3986,
"step": 3320
},
{
"epoch": 14.855870895937674,
"grad_norm": 2.484375,
"learning_rate": 1.5227581010465341e-07,
"loss": 0.4073,
"step": 3330
},
{
"epoch": 14.900389538119088,
"grad_norm": 2.734375,
"learning_rate": 1.4057702112673765e-07,
"loss": 0.4137,
"step": 3340
},
{
"epoch": 14.944908180300501,
"grad_norm": 2.25,
"learning_rate": 1.2933936023636073e-07,
"loss": 0.4283,
"step": 3350
},
{
"epoch": 14.989426822481914,
"grad_norm": 2.234375,
"learning_rate": 1.185638938154976e-07,
"loss": 0.4097,
"step": 3360
},
{
"epoch": 15.0361713967724,
"grad_norm": 2.203125,
"learning_rate": 1.08251644386832e-07,
"loss": 0.4326,
"step": 3370
},
{
"epoch": 15.080690038953811,
"grad_norm": 3.0,
"learning_rate": 9.84035905167241e-08,
"loss": 0.4338,
"step": 3380
},
{
"epoch": 15.125208681135225,
"grad_norm": 2.359375,
"learning_rate": 8.902066672235144e-08,
"loss": 0.4197,
"step": 3390
},
{
"epoch": 15.16972732331664,
"grad_norm": 2.234375,
"learning_rate": 8.010376338302872e-08,
"loss": 0.4277,
"step": 3400
},
{
"epoch": 15.214245965498053,
"grad_norm": 2.25,
"learning_rate": 7.165372665571879e-08,
"loss": 0.4325,
"step": 3410
},
{
"epoch": 15.258764607679465,
"grad_norm": 2.171875,
"learning_rate": 6.367135839473349e-08,
"loss": 0.399,
"step": 3420
},
{
"epoch": 15.303283249860879,
"grad_norm": 2.578125,
"learning_rate": 5.6157416075648954e-08,
"loss": 0.4368,
"step": 3430
},
{
"epoch": 15.347801892042293,
"grad_norm": 2.359375,
"learning_rate": 4.911261272341872e-08,
"loss": 0.4029,
"step": 3440
},
{
"epoch": 15.392320534223707,
"grad_norm": 2.296875,
"learning_rate": 4.25376168447178e-08,
"loss": 0.4269,
"step": 3450
},
{
"epoch": 15.436839176405119,
"grad_norm": 2.359375,
"learning_rate": 3.643305236450345e-08,
"loss": 0.4442,
"step": 3460
},
{
"epoch": 15.481357818586533,
"grad_norm": 2.28125,
"learning_rate": 3.079949856680975e-08,
"loss": 0.4207,
"step": 3470
},
{
"epoch": 15.525876460767947,
"grad_norm": 2.671875,
"learning_rate": 2.5637490039775447e-08,
"loss": 0.4257,
"step": 3480
},
{
"epoch": 15.57039510294936,
"grad_norm": 2.296875,
"learning_rate": 2.0947516624917898e-08,
"loss": 0.4161,
"step": 3490
},
{
"epoch": 15.614913745130773,
"grad_norm": 2.015625,
"learning_rate": 1.6730023370645775e-08,
"loss": 0.3976,
"step": 3500
},
{
"epoch": 15.659432387312187,
"grad_norm": 2.140625,
"learning_rate": 1.298541049003288e-08,
"loss": 0.4074,
"step": 3510
},
{
"epoch": 15.7039510294936,
"grad_norm": 2.421875,
"learning_rate": 9.714033322833494e-09,
"loss": 0.4155,
"step": 3520
},
{
"epoch": 15.748469671675014,
"grad_norm": 2.203125,
"learning_rate": 6.9162023017699255e-09,
"loss": 0.3747,
"step": 3530
},
{
"epoch": 15.792988313856428,
"grad_norm": 2.09375,
"learning_rate": 4.592182923068289e-09,
"loss": 0.3766,
"step": 3540
},
{
"epoch": 15.83750695603784,
"grad_norm": 2.125,
"learning_rate": 2.7421957212697692e-09,
"loss": 0.4017,
"step": 3550
},
{
"epoch": 15.882025598219254,
"grad_norm": 1.90625,
"learning_rate": 1.3664162482990296e-09,
"loss": 0.42,
"step": 3560
},
{
"epoch": 15.926544240400668,
"grad_norm": 1.7421875,
"learning_rate": 4.649750568080924e-10,
"loss": 0.44,
"step": 3570
},
{
"epoch": 15.971062882582082,
"grad_norm": 2.421875,
"learning_rate": 3.795768778680487e-11,
"loss": 0.4156,
"step": 3580
},
{
"epoch": 15.988870339454646,
"step": 3584,
"total_flos": 8.551781210951516e+17,
"train_loss": 0.7042842949075359,
"train_runtime": 4850.1152,
"train_samples_per_second": 11.853,
"train_steps_per_second": 0.739
}
],
"logging_steps": 10,
"max_steps": 3584,
"num_input_tokens_seen": 0,
"num_train_epochs": 16,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.551781210951516e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}