Training in progress, step 4800, checkpoint
{
"best_metric": 1.1947814226150513,
"best_model_checkpoint": "./output/checkpoint-4650",
"epoch": 0.21500559910414332,
"eval_steps": 150,
"global_step": 4800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004479283314669653,
"grad_norm": 6.820243835449219,
"learning_rate": 7.500000000000001e-07,
"loss": 1.2628,
"step": 10
},
{
"epoch": 0.0008958566629339306,
"grad_norm": 5.822151184082031,
"learning_rate": 1.5000000000000002e-06,
"loss": 1.3652,
"step": 20
},
{
"epoch": 0.0013437849944008958,
"grad_norm": 4.442959785461426,
"learning_rate": 2.25e-06,
"loss": 1.412,
"step": 30
},
{
"epoch": 0.0017917133258678612,
"grad_norm": 9.916281700134277,
"learning_rate": 3.0000000000000005e-06,
"loss": 1.5213,
"step": 40
},
{
"epoch": 0.0022396416573348264,
"grad_norm": 22.53717613220215,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3189,
"step": 50
},
{
"epoch": 0.0026875699888017916,
"grad_norm": 5.07314920425415,
"learning_rate": 4.5e-06,
"loss": 1.3022,
"step": 60
},
{
"epoch": 0.003135498320268757,
"grad_norm": 9.401494026184082,
"learning_rate": 5.2500000000000006e-06,
"loss": 1.5065,
"step": 70
},
{
"epoch": 0.0035834266517357225,
"grad_norm": 8.749906539916992,
"learning_rate": 6.000000000000001e-06,
"loss": 1.1579,
"step": 80
},
{
"epoch": 0.004031354983202688,
"grad_norm": 6.749314785003662,
"learning_rate": 6.7500000000000014e-06,
"loss": 1.2524,
"step": 90
},
{
"epoch": 0.004479283314669653,
"grad_norm": 8.411529541015625,
"learning_rate": 7.500000000000001e-06,
"loss": 1.3242,
"step": 100
},
{
"epoch": 0.004927211646136618,
"grad_norm": 5.293492794036865,
"learning_rate": 7.499922926093874e-06,
"loss": 0.9967,
"step": 110
},
{
"epoch": 0.005375139977603583,
"grad_norm": 8.860544204711914,
"learning_rate": 7.499691707543699e-06,
"loss": 1.1881,
"step": 120
},
{
"epoch": 0.0058230683090705485,
"grad_norm": 9.859148979187012,
"learning_rate": 7.499306353853963e-06,
"loss": 1.0598,
"step": 130
},
{
"epoch": 0.006270996640537514,
"grad_norm": 4.37281608581543,
"learning_rate": 7.49876688086505e-06,
"loss": 1.1233,
"step": 140
},
{
"epoch": 0.006718924972004479,
"grad_norm": 4.489595890045166,
"learning_rate": 7.4980733107525805e-06,
"loss": 1.2183,
"step": 150
},
{
"epoch": 0.006718924972004479,
"eval_loss": 1.282976508140564,
"eval_runtime": 51.7095,
"eval_samples_per_second": 9.669,
"eval_steps_per_second": 9.669,
"step": 150
},
{
"epoch": 0.007166853303471445,
"grad_norm": 6.339463233947754,
"learning_rate": 7.4972256720265044e-06,
"loss": 1.1818,
"step": 160
},
{
"epoch": 0.00761478163493841,
"grad_norm": 6.762680530548096,
"learning_rate": 7.496223999529932e-06,
"loss": 1.0349,
"step": 170
},
{
"epoch": 0.008062709966405375,
"grad_norm": 7.486023426055908,
"learning_rate": 7.4950683344376926e-06,
"loss": 1.1735,
"step": 180
},
{
"epoch": 0.00851063829787234,
"grad_norm": 4.099631309509277,
"learning_rate": 7.4937587242546544e-06,
"loss": 1.2452,
"step": 190
},
{
"epoch": 0.008958566629339306,
"grad_norm": 5.422396183013916,
"learning_rate": 7.492295222813762e-06,
"loss": 1.1032,
"step": 200
},
{
"epoch": 0.009406494960806271,
"grad_norm": 6.336536407470703,
"learning_rate": 7.490677890273828e-06,
"loss": 1.0852,
"step": 210
},
{
"epoch": 0.009854423292273236,
"grad_norm": 4.766495704650879,
"learning_rate": 7.488906793117058e-06,
"loss": 1.2168,
"step": 220
},
{
"epoch": 0.010302351623740201,
"grad_norm": 5.892153263092041,
"learning_rate": 7.486982004146319e-06,
"loss": 1.1595,
"step": 230
},
{
"epoch": 0.010750279955207167,
"grad_norm": 4.957208633422852,
"learning_rate": 7.484903602482148e-06,
"loss": 1.1423,
"step": 240
},
{
"epoch": 0.011198208286674132,
"grad_norm": 4.198282718658447,
"learning_rate": 7.4826716735594945e-06,
"loss": 1.0562,
"step": 250
},
{
"epoch": 0.011646136618141097,
"grad_norm": 3.4756815433502197,
"learning_rate": 7.480286309124216e-06,
"loss": 0.9894,
"step": 260
},
{
"epoch": 0.012094064949608062,
"grad_norm": 4.725418567657471,
"learning_rate": 7.477747607229302e-06,
"loss": 1.1761,
"step": 270
},
{
"epoch": 0.012541993281075027,
"grad_norm": 4.241955280303955,
"learning_rate": 7.475055672230844e-06,
"loss": 1.1118,
"step": 280
},
{
"epoch": 0.012989921612541993,
"grad_norm": 5.7904863357543945,
"learning_rate": 7.472210614783745e-06,
"loss": 1.0932,
"step": 290
},
{
"epoch": 0.013437849944008958,
"grad_norm": 4.546011924743652,
"learning_rate": 7.469212551837173e-06,
"loss": 1.1187,
"step": 300
},
{
"epoch": 0.013437849944008958,
"eval_loss": 1.26471745967865,
"eval_runtime": 51.7822,
"eval_samples_per_second": 9.656,
"eval_steps_per_second": 9.656,
"step": 300
},
{
"epoch": 0.013885778275475923,
"grad_norm": 6.256772994995117,
"learning_rate": 7.4660616066297565e-06,
"loss": 1.2176,
"step": 310
},
{
"epoch": 0.01433370660694289,
"grad_norm": 7.437366485595703,
"learning_rate": 7.462757908684509e-06,
"loss": 1.046,
"step": 320
},
{
"epoch": 0.014781634938409855,
"grad_norm": 8.049488067626953,
"learning_rate": 7.459301593803512e-06,
"loss": 1.2396,
"step": 330
},
{
"epoch": 0.01522956326987682,
"grad_norm": 5.115020751953125,
"learning_rate": 7.455692804062335e-06,
"loss": 1.1018,
"step": 340
},
{
"epoch": 0.015677491601343786,
"grad_norm": 5.805201530456543,
"learning_rate": 7.451931687804189e-06,
"loss": 1.0083,
"step": 350
},
{
"epoch": 0.01612541993281075,
"grad_norm": 5.960669040679932,
"learning_rate": 7.448018399633831e-06,
"loss": 1.1773,
"step": 360
},
{
"epoch": 0.016573348264277716,
"grad_norm": 4.82655143737793,
"learning_rate": 7.443953100411214e-06,
"loss": 1.2279,
"step": 370
},
{
"epoch": 0.01702127659574468,
"grad_norm": 5.768619060516357,
"learning_rate": 7.439735957244862e-06,
"loss": 1.0924,
"step": 380
},
{
"epoch": 0.017469204927211646,
"grad_norm": 4.603348731994629,
"learning_rate": 7.435367143485015e-06,
"loss": 0.9547,
"step": 390
},
{
"epoch": 0.01791713325867861,
"grad_norm": 3.802041530609131,
"learning_rate": 7.430846838716496e-06,
"loss": 1.0569,
"step": 400
},
{
"epoch": 0.018365061590145577,
"grad_norm": 4.473762035369873,
"learning_rate": 7.426175228751328e-06,
"loss": 1.1299,
"step": 410
},
{
"epoch": 0.018812989921612542,
"grad_norm": 4.674028396606445,
"learning_rate": 7.421352505621099e-06,
"loss": 1.0512,
"step": 420
},
{
"epoch": 0.019260918253079507,
"grad_norm": 5.1446852684021,
"learning_rate": 7.416378867569069e-06,
"loss": 1.2024,
"step": 430
},
{
"epoch": 0.019708846584546472,
"grad_norm": 3.742156744003296,
"learning_rate": 7.411254519042017e-06,
"loss": 1.1778,
"step": 440
},
{
"epoch": 0.020156774916013438,
"grad_norm": 4.0376200675964355,
"learning_rate": 7.4059796706818396e-06,
"loss": 1.1754,
"step": 450
},
{
"epoch": 0.020156774916013438,
"eval_loss": 1.2499778270721436,
"eval_runtime": 51.5995,
"eval_samples_per_second": 9.69,
"eval_steps_per_second": 9.69,
"step": 450
},
{
"epoch": 0.020604703247480403,
"grad_norm": 3.672325372695923,
"learning_rate": 7.400554539316894e-06,
"loss": 1.1627,
"step": 460
},
{
"epoch": 0.021052631578947368,
"grad_norm": 4.949635982513428,
"learning_rate": 7.394979347953081e-06,
"loss": 1.3115,
"step": 470
},
{
"epoch": 0.021500559910414333,
"grad_norm": 4.03855037689209,
"learning_rate": 7.389254325764681e-06,
"loss": 1.1176,
"step": 480
},
{
"epoch": 0.0219484882418813,
"grad_norm": 4.981250762939453,
"learning_rate": 7.383379708084934e-06,
"loss": 1.0668,
"step": 490
},
{
"epoch": 0.022396416573348264,
"grad_norm": 4.68571138381958,
"learning_rate": 7.377355736396362e-06,
"loss": 1.1235,
"step": 500
},
{
"epoch": 0.02284434490481523,
"grad_norm": 5.7003326416015625,
"learning_rate": 7.371182658320847e-06,
"loss": 1.0535,
"step": 510
},
{
"epoch": 0.023292273236282194,
"grad_norm": 2.357079029083252,
"learning_rate": 7.36486072760945e-06,
"loss": 0.9768,
"step": 520
},
{
"epoch": 0.02374020156774916,
"grad_norm": 4.828664779663086,
"learning_rate": 7.358390204131984e-06,
"loss": 1.0385,
"step": 530
},
{
"epoch": 0.024188129899216124,
"grad_norm": 3.4303321838378906,
"learning_rate": 7.3517713538663235e-06,
"loss": 0.9826,
"step": 540
},
{
"epoch": 0.02463605823068309,
"grad_norm": 8.705097198486328,
"learning_rate": 7.345004448887478e-06,
"loss": 1.0988,
"step": 550
},
{
"epoch": 0.025083986562150055,
"grad_norm": 4.806099891662598,
"learning_rate": 7.3380897673564085e-06,
"loss": 1.2765,
"step": 560
},
{
"epoch": 0.02553191489361702,
"grad_norm": 3.948829174041748,
"learning_rate": 7.33102759350859e-06,
"loss": 1.2548,
"step": 570
},
{
"epoch": 0.025979843225083985,
"grad_norm": 8.706982612609863,
"learning_rate": 7.323818217642328e-06,
"loss": 1.1907,
"step": 580
},
{
"epoch": 0.02642777155655095,
"grad_norm": 4.196287155151367,
"learning_rate": 7.316461936106827e-06,
"loss": 1.1541,
"step": 590
},
{
"epoch": 0.026875699888017916,
"grad_norm": 4.2185187339782715,
"learning_rate": 7.3089590512900084e-06,
"loss": 1.0761,
"step": 600
},
{
"epoch": 0.026875699888017916,
"eval_loss": 1.2407419681549072,
"eval_runtime": 51.6589,
"eval_samples_per_second": 9.679,
"eval_steps_per_second": 9.679,
"step": 600
},
{
"epoch": 0.02732362821948488,
"grad_norm": 4.50939416885376,
"learning_rate": 7.301309871606081e-06,
"loss": 1.1746,
"step": 610
},
{
"epoch": 0.027771556550951846,
"grad_norm": 5.48988676071167,
"learning_rate": 7.293514711482861e-06,
"loss": 1.0518,
"step": 620
},
{
"epoch": 0.028219484882418815,
"grad_norm": 4.441885471343994,
"learning_rate": 7.285573891348849e-06,
"loss": 1.0679,
"step": 630
},
{
"epoch": 0.02866741321388578,
"grad_norm": 6.711030006408691,
"learning_rate": 7.27748773762006e-06,
"loss": 1.2901,
"step": 640
},
{
"epoch": 0.029115341545352745,
"grad_norm": 5.328275680541992,
"learning_rate": 7.269256582686603e-06,
"loss": 1.1749,
"step": 650
},
{
"epoch": 0.02956326987681971,
"grad_norm": 3.016313314437866,
"learning_rate": 7.260880764899016e-06,
"loss": 1.1398,
"step": 660
},
{
"epoch": 0.030011198208286675,
"grad_norm": 4.6470866203308105,
"learning_rate": 7.252360628554363e-06,
"loss": 1.0427,
"step": 670
},
{
"epoch": 0.03045912653975364,
"grad_norm": 9.044170379638672,
"learning_rate": 7.243696523882079e-06,
"loss": 1.0913,
"step": 680
},
{
"epoch": 0.030907054871220606,
"grad_norm": 4.983870029449463,
"learning_rate": 7.2348888070295705e-06,
"loss": 1.1174,
"step": 690
},
{
"epoch": 0.03135498320268757,
"grad_norm": 10.38315486907959,
"learning_rate": 7.225937840047583e-06,
"loss": 1.2386,
"step": 700
},
{
"epoch": 0.031802911534154536,
"grad_norm": 5.104282855987549,
"learning_rate": 7.216843990875307e-06,
"loss": 1.1014,
"step": 710
},
{
"epoch": 0.0322508398656215,
"grad_norm": 5.493166446685791,
"learning_rate": 7.207607633325266e-06,
"loss": 1.2569,
"step": 720
},
{
"epoch": 0.03269876819708847,
"grad_norm": 5.069271564483643,
"learning_rate": 7.198229147067941e-06,
"loss": 1.1938,
"step": 730
},
{
"epoch": 0.03314669652855543,
"grad_norm": 5.183401107788086,
"learning_rate": 7.18870891761617e-06,
"loss": 0.9859,
"step": 740
},
{
"epoch": 0.0335946248600224,
"grad_norm": 4.3622965812683105,
"learning_rate": 7.1790473363092974e-06,
"loss": 1.1359,
"step": 750
},
{
"epoch": 0.0335946248600224,
"eval_loss": 1.2344202995300293,
"eval_runtime": 51.6321,
"eval_samples_per_second": 9.684,
"eval_steps_per_second": 9.684,
"step": 750
},
{
"epoch": 0.03404255319148936,
"grad_norm": 4.141931056976318,
"learning_rate": 7.169244800297089e-06,
"loss": 1.2613,
"step": 760
},
{
"epoch": 0.03449048152295633,
"grad_norm": 4.191932201385498,
"learning_rate": 7.159301712523407e-06,
"loss": 1.1802,
"step": 770
},
{
"epoch": 0.03493840985442329,
"grad_norm": 4.759700775146484,
"learning_rate": 7.149218481709644e-06,
"loss": 1.0651,
"step": 780
},
{
"epoch": 0.03538633818589026,
"grad_norm": 3.969430923461914,
"learning_rate": 7.1389955223379266e-06,
"loss": 0.9129,
"step": 790
},
{
"epoch": 0.03583426651735722,
"grad_norm": 5.1956467628479,
"learning_rate": 7.128633254634072e-06,
"loss": 1.2688,
"step": 800
},
{
"epoch": 0.03628219484882419,
"grad_norm": 3.615705966949463,
"learning_rate": 7.118132104550322e-06,
"loss": 1.1092,
"step": 810
},
{
"epoch": 0.036730123180291153,
"grad_norm": 3.635277271270752,
"learning_rate": 7.107492503747826e-06,
"loss": 1.0265,
"step": 820
},
{
"epoch": 0.03717805151175812,
"grad_norm": 4.518077373504639,
"learning_rate": 7.096714889578898e-06,
"loss": 1.0817,
"step": 830
},
{
"epoch": 0.037625979843225084,
"grad_norm": 6.652565002441406,
"learning_rate": 7.085799705069046e-06,
"loss": 0.9709,
"step": 840
},
{
"epoch": 0.03807390817469205,
"grad_norm": 5.337361812591553,
"learning_rate": 7.0747473988987515e-06,
"loss": 1.0883,
"step": 850
},
{
"epoch": 0.038521836506159014,
"grad_norm": 5.067249774932861,
"learning_rate": 7.063558425385033e-06,
"loss": 1.08,
"step": 860
},
{
"epoch": 0.03896976483762598,
"grad_norm": 3.9859232902526855,
"learning_rate": 7.052233244462769e-06,
"loss": 1.0063,
"step": 870
},
{
"epoch": 0.039417693169092945,
"grad_norm": 5.297623634338379,
"learning_rate": 7.040772321665788e-06,
"loss": 0.9638,
"step": 880
},
{
"epoch": 0.03986562150055991,
"grad_norm": 6.088709354400635,
"learning_rate": 7.029176128107734e-06,
"loss": 1.2673,
"step": 890
},
{
"epoch": 0.040313549832026875,
"grad_norm": 7.997159957885742,
"learning_rate": 7.017445140462711e-06,
"loss": 0.9986,
"step": 900
},
{
"epoch": 0.040313549832026875,
"eval_loss": 1.2309150695800781,
"eval_runtime": 51.612,
"eval_samples_per_second": 9.688,
"eval_steps_per_second": 9.688,
"step": 900
},
{
"epoch": 0.04076147816349384,
"grad_norm": 6.393094062805176,
"learning_rate": 7.00557984094567e-06,
"loss": 1.066,
"step": 910
},
{
"epoch": 0.041209406494960805,
"grad_norm": 4.47462797164917,
"learning_rate": 6.993580717292601e-06,
"loss": 1.3117,
"step": 920
},
{
"epoch": 0.04165733482642777,
"grad_norm": 4.160079479217529,
"learning_rate": 6.981448262740483e-06,
"loss": 1.3003,
"step": 930
},
{
"epoch": 0.042105263157894736,
"grad_norm": 5.260162353515625,
"learning_rate": 6.969182976006999e-06,
"loss": 1.312,
"step": 940
},
{
"epoch": 0.0425531914893617,
"grad_norm": 4.503716468811035,
"learning_rate": 6.95678536127005e-06,
"loss": 1.185,
"step": 950
},
{
"epoch": 0.043001119820828666,
"grad_norm": 3.7414872646331787,
"learning_rate": 6.944255928147017e-06,
"loss": 1.1585,
"step": 960
},
{
"epoch": 0.04344904815229563,
"grad_norm": 5.410964012145996,
"learning_rate": 6.931595191673823e-06,
"loss": 1.1403,
"step": 970
},
{
"epoch": 0.0438969764837626,
"grad_norm": 4.388716220855713,
"learning_rate": 6.9188036722837555e-06,
"loss": 1.0452,
"step": 980
},
{
"epoch": 0.04434490481522956,
"grad_norm": 2.7749533653259277,
"learning_rate": 6.905881895786076e-06,
"loss": 1.0638,
"step": 990
},
{
"epoch": 0.04479283314669653,
"grad_norm": 5.431761741638184,
"learning_rate": 6.892830393344403e-06,
"loss": 1.2718,
"step": 1000
},
{
"epoch": 0.04524076147816349,
"grad_norm": 4.384571552276611,
"learning_rate": 6.879649701454886e-06,
"loss": 1.0594,
"step": 1010
},
{
"epoch": 0.04568868980963046,
"grad_norm": 5.040534019470215,
"learning_rate": 6.866340361924141e-06,
"loss": 1.2255,
"step": 1020
},
{
"epoch": 0.04613661814109742,
"grad_norm": 4.800682544708252,
"learning_rate": 6.852902921846988e-06,
"loss": 1.1093,
"step": 1030
},
{
"epoch": 0.04658454647256439,
"grad_norm": 5.662080764770508,
"learning_rate": 6.8393379335839565e-06,
"loss": 1.2003,
"step": 1040
},
{
"epoch": 0.04703247480403135,
"grad_norm": 3.93361234664917,
"learning_rate": 6.825645954738586e-06,
"loss": 1.0652,
"step": 1050
},
{
"epoch": 0.04703247480403135,
"eval_loss": 1.2271474599838257,
"eval_runtime": 51.5746,
"eval_samples_per_second": 9.695,
"eval_steps_per_second": 9.695,
"step": 1050
},
{
"epoch": 0.04748040313549832,
"grad_norm": 4.918002605438232,
"learning_rate": 6.811827548134495e-06,
"loss": 1.156,
"step": 1060
},
{
"epoch": 0.047928331466965284,
"grad_norm": 3.533487319946289,
"learning_rate": 6.797883281792261e-06,
"loss": 1.0533,
"step": 1070
},
{
"epoch": 0.04837625979843225,
"grad_norm": 4.698348045349121,
"learning_rate": 6.783813728906054e-06,
"loss": 1.2621,
"step": 1080
},
{
"epoch": 0.048824188129899214,
"grad_norm": 3.90852427482605,
"learning_rate": 6.769619467820086e-06,
"loss": 1.0754,
"step": 1090
},
{
"epoch": 0.04927211646136618,
"grad_norm": 6.924786567687988,
"learning_rate": 6.755301082004838e-06,
"loss": 1.0617,
"step": 1100
},
{
"epoch": 0.049720044792833144,
"grad_norm": 5.685960292816162,
"learning_rate": 6.740859160033068e-06,
"loss": 1.2185,
"step": 1110
},
{
"epoch": 0.05016797312430011,
"grad_norm": 5.533092975616455,
"learning_rate": 6.726294295555623e-06,
"loss": 1.0583,
"step": 1120
},
{
"epoch": 0.050615901455767075,
"grad_norm": 4.5029988288879395,
"learning_rate": 6.711607087277034e-06,
"loss": 1.1781,
"step": 1130
},
{
"epoch": 0.05106382978723404,
"grad_norm": 3.2203736305236816,
"learning_rate": 6.69679813893091e-06,
"loss": 1.151,
"step": 1140
},
{
"epoch": 0.051511758118701005,
"grad_norm": 6.602795600891113,
"learning_rate": 6.681868059255113e-06,
"loss": 1.1373,
"step": 1150
},
{
"epoch": 0.05195968645016797,
"grad_norm": 3.071552038192749,
"learning_rate": 6.666817461966741e-06,
"loss": 1.1554,
"step": 1160
},
{
"epoch": 0.052407614781634936,
"grad_norm": 5.886751174926758,
"learning_rate": 6.651646965736902e-06,
"loss": 1.1328,
"step": 1170
},
{
"epoch": 0.0528555431131019,
"grad_norm": 4.323307991027832,
"learning_rate": 6.636357194165274e-06,
"loss": 1.1535,
"step": 1180
},
{
"epoch": 0.053303471444568866,
"grad_norm": 4.585876941680908,
"learning_rate": 6.620948775754481e-06,
"loss": 1.1636,
"step": 1190
},
{
"epoch": 0.05375139977603583,
"grad_norm": 3.9351437091827393,
"learning_rate": 6.605422343884255e-06,
"loss": 1.2689,
"step": 1200
},
{
"epoch": 0.05375139977603583,
"eval_loss": 1.2224195003509521,
"eval_runtime": 51.5936,
"eval_samples_per_second": 9.691,
"eval_steps_per_second": 9.691,
"step": 1200
},
{
"epoch": 0.054199328107502796,
"grad_norm": 3.1242146492004395,
"learning_rate": 6.589778536785396e-06,
"loss": 1.2646,
"step": 1210
},
{
"epoch": 0.05464725643896976,
"grad_norm": 3.1645703315734863,
"learning_rate": 6.5740179975135426e-06,
"loss": 0.9831,
"step": 1220
},
{
"epoch": 0.05509518477043673,
"grad_norm": 6.550941467285156,
"learning_rate": 6.5581413739227314e-06,
"loss": 1.1777,
"step": 1230
},
{
"epoch": 0.05554311310190369,
"grad_norm": 17.51181983947754,
"learning_rate": 6.542149318638777e-06,
"loss": 1.0765,
"step": 1240
},
{
"epoch": 0.055991041433370664,
"grad_norm": 6.8737664222717285,
"learning_rate": 6.526042489032434e-06,
"loss": 1.0107,
"step": 1250
},
{
"epoch": 0.05643896976483763,
"grad_norm": 3.5256145000457764,
"learning_rate": 6.509821547192383e-06,
"loss": 1.1973,
"step": 1260
},
{
"epoch": 0.056886898096304594,
"grad_norm": 5.974047660827637,
"learning_rate": 6.493487159898006e-06,
"loss": 1.2409,
"step": 1270
},
{
"epoch": 0.05733482642777156,
"grad_norm": 3.98787522315979,
"learning_rate": 6.477039998591991e-06,
"loss": 1.3272,
"step": 1280
},
{
"epoch": 0.057782754759238525,
"grad_norm": 5.225778102874756,
"learning_rate": 6.460480739352719e-06,
"loss": 1.2937,
"step": 1290
},
{
"epoch": 0.05823068309070549,
"grad_norm": 3.719729423522949,
"learning_rate": 6.4438100628664795e-06,
"loss": 1.0965,
"step": 1300
},
{
"epoch": 0.058678611422172455,
"grad_norm": 2.8820245265960693,
"learning_rate": 6.4270286543994874e-06,
"loss": 1.2178,
"step": 1310
},
{
"epoch": 0.05912653975363942,
"grad_norm": 3.031202793121338,
"learning_rate": 6.410137203769718e-06,
"loss": 1.354,
"step": 1320
},
{
"epoch": 0.059574468085106386,
"grad_norm": 3.010680675506592,
"learning_rate": 6.393136405318545e-06,
"loss": 1.185,
"step": 1330
},
{
"epoch": 0.06002239641657335,
"grad_norm": 3.756014823913574,
"learning_rate": 6.376026957882207e-06,
"loss": 1.1636,
"step": 1340
},
{
"epoch": 0.060470324748040316,
"grad_norm": 4.391636848449707,
"learning_rate": 6.3588095647630754e-06,
"loss": 1.2252,
"step": 1350
},
{
"epoch": 0.060470324748040316,
"eval_loss": 1.222408652305603,
"eval_runtime": 51.5211,
"eval_samples_per_second": 9.705,
"eval_steps_per_second": 9.705,
"step": 1350
},
{
"epoch": 0.06091825307950728,
"grad_norm": 3.5359737873077393,
"learning_rate": 6.341484933700744e-06,
"loss": 1.0688,
"step": 1360
},
{
"epoch": 0.061366181410974247,
"grad_norm": 4.412395477294922,
"learning_rate": 6.32405377684294e-06,
"loss": 1.1889,
"step": 1370
},
{
"epoch": 0.06181410974244121,
"grad_norm": 7.099231719970703,
"learning_rate": 6.306516810716249e-06,
"loss": 1.0922,
"step": 1380
},
{
"epoch": 0.06226203807390818,
"grad_norm": 3.257270097732544,
"learning_rate": 6.288874756196662e-06,
"loss": 1.2291,
"step": 1390
},
{
"epoch": 0.06270996640537514,
"grad_norm": 3.6133875846862793,
"learning_rate": 6.271128338479939e-06,
"loss": 1.0567,
"step": 1400
},
{
"epoch": 0.06315789473684211,
"grad_norm": 4.996825695037842,
"learning_rate": 6.253278287051806e-06,
"loss": 1.1242,
"step": 1410
},
{
"epoch": 0.06360582306830907,
"grad_norm": 5.642391204833984,
"learning_rate": 6.235325335657962e-06,
"loss": 1.1998,
"step": 1420
},
{
"epoch": 0.06405375139977604,
"grad_norm": 4.652320384979248,
"learning_rate": 6.217270222273923e-06,
"loss": 1.0647,
"step": 1430
},
{
"epoch": 0.064501679731243,
"grad_norm": 8.814513206481934,
"learning_rate": 6.1991136890746825e-06,
"loss": 0.97,
"step": 1440
},
{
"epoch": 0.06494960806270997,
"grad_norm": 4.535324573516846,
"learning_rate": 6.180856482404208e-06,
"loss": 1.0829,
"step": 1450
},
{
"epoch": 0.06539753639417693,
"grad_norm": 5.13389778137207,
"learning_rate": 6.162499352744754e-06,
"loss": 1.3333,
"step": 1460
},
{
"epoch": 0.0658454647256439,
"grad_norm": 4.871939182281494,
"learning_rate": 6.144043054686022e-06,
"loss": 1.1397,
"step": 1470
},
{
"epoch": 0.06629339305711086,
"grad_norm": 3.31581449508667,
"learning_rate": 6.125488346894139e-06,
"loss": 1.0983,
"step": 1480
},
{
"epoch": 0.06674132138857783,
"grad_norm": 6.067586898803711,
"learning_rate": 6.106835992080464e-06,
"loss": 1.0931,
"step": 1490
},
{
"epoch": 0.0671892497200448,
"grad_norm": 4.4560465812683105,
"learning_rate": 6.088086756970252e-06,
"loss": 1.0743,
"step": 1500
},
{
"epoch": 0.0671892497200448,
"eval_loss": 1.21743643283844,
"eval_runtime": 51.7437,
"eval_samples_per_second": 9.663,
"eval_steps_per_second": 9.663,
"step": 1500
},
{
"epoch": 0.06763717805151176,
"grad_norm": 6.724518775939941,
"learning_rate": 6.0692414122711184e-06,
"loss": 1.2655,
"step": 1510
},
{
"epoch": 0.06808510638297872,
"grad_norm": 4.3255085945129395,
"learning_rate": 6.050300732641376e-06,
"loss": 1.0058,
"step": 1520
},
{
"epoch": 0.06853303471444569,
"grad_norm": 2.7948145866394043,
"learning_rate": 6.0312654966581755e-06,
"loss": 1.1331,
"step": 1530
},
{
"epoch": 0.06898096304591265,
"grad_norm": 4.223801612854004,
"learning_rate": 6.012136486785512e-06,
"loss": 0.9267,
"step": 1540
},
{
"epoch": 0.06942889137737962,
"grad_norm": 8.328617095947266,
"learning_rate": 5.992914489342061e-06,
"loss": 1.0601,
"step": 1550
},
{
"epoch": 0.06987681970884659,
"grad_norm": 3.9401023387908936,
"learning_rate": 5.9736002944688474e-06,
"loss": 1.1296,
"step": 1560
},
{
"epoch": 0.07032474804031355,
"grad_norm": 4.462929725646973,
"learning_rate": 5.954194696096775e-06,
"loss": 1.1266,
"step": 1570
},
{
"epoch": 0.07077267637178052,
"grad_norm": 9.879998207092285,
"learning_rate": 5.9346984919139865e-06,
"loss": 1.0835,
"step": 1580
},
{
"epoch": 0.07122060470324748,
"grad_norm": 4.088196277618408,
"learning_rate": 5.9151124833330745e-06,
"loss": 1.1256,
"step": 1590
},
{
"epoch": 0.07166853303471445,
"grad_norm": 6.066174030303955,
"learning_rate": 5.895437475458137e-06,
"loss": 1.2295,
"step": 1600
},
{
"epoch": 0.07211646136618141,
"grad_norm": 4.754509449005127,
"learning_rate": 5.875674277051688e-06,
"loss": 1.1676,
"step": 1610
},
{
"epoch": 0.07256438969764838,
"grad_norm": 3.898282289505005,
"learning_rate": 5.855823700501406e-06,
"loss": 1.2583,
"step": 1620
},
{
"epoch": 0.07301231802911534,
"grad_norm": 5.35301399230957,
"learning_rate": 5.835886561786744e-06,
"loss": 1.3667,
"step": 1630
},
{
"epoch": 0.07346024636058231,
"grad_norm": 6.24777889251709,
"learning_rate": 5.815863680445385e-06,
"loss": 1.1099,
"step": 1640
},
{
"epoch": 0.07390817469204927,
"grad_norm": 3.7771286964416504,
"learning_rate": 5.795755879539558e-06,
"loss": 0.9985,
"step": 1650
},
{
"epoch": 0.07390817469204927,
"eval_loss": 1.2118867635726929,
"eval_runtime": 51.6701,
"eval_samples_per_second": 9.677,
"eval_steps_per_second": 9.677,
"step": 1650
},
{
"epoch": 0.07435610302351624,
"grad_norm": 4.368626117706299,
"learning_rate": 5.775563985622202e-06,
"loss": 1.1,
"step": 1660
},
{
"epoch": 0.0748040313549832,
"grad_norm": 6.341384410858154,
"learning_rate": 5.755288828702987e-06,
"loss": 1.0292,
"step": 1670
},
{
"epoch": 0.07525195968645017,
"grad_norm": 5.869757652282715,
"learning_rate": 5.734931242214204e-06,
"loss": 1.0937,
"step": 1680
},
{
"epoch": 0.07569988801791713,
"grad_norm": 4.857089042663574,
"learning_rate": 5.7144920629764955e-06,
"loss": 1.0987,
"step": 1690
},
{
"epoch": 0.0761478163493841,
"grad_norm": 5.114626884460449,
"learning_rate": 5.693972131164471e-06,
"loss": 0.9623,
"step": 1700
},
{
"epoch": 0.07659574468085106,
"grad_norm": 5.152310371398926,
"learning_rate": 5.673372290272149e-06,
"loss": 1.1423,
"step": 1710
},
{
"epoch": 0.07704367301231803,
"grad_norm": 3.8204965591430664,
"learning_rate": 5.652693387078309e-06,
"loss": 1.0523,
"step": 1720
},
{
"epoch": 0.077491601343785,
"grad_norm": 3.0346767902374268,
"learning_rate": 5.631936271611667e-06,
"loss": 1.0483,
"step": 1730
},
{
"epoch": 0.07793952967525196,
"grad_norm": 4.436351299285889,
"learning_rate": 5.611101797115939e-06,
"loss": 1.0144,
"step": 1740
},
{
"epoch": 0.07838745800671892,
"grad_norm": 5.614783763885498,
"learning_rate": 5.5901908200147685e-06,
"loss": 1.078,
"step": 1750
},
{
"epoch": 0.07883538633818589,
"grad_norm": 4.0426926612854,
"learning_rate": 5.56920419987652e-06,
"loss": 1.2628,
"step": 1760
},
{
"epoch": 0.07928331466965285,
"grad_norm": 5.30089807510376,
"learning_rate": 5.5481427993789534e-06,
"loss": 1.1257,
"step": 1770
},
{
"epoch": 0.07973124300111982,
"grad_norm": 3.5508739948272705,
"learning_rate": 5.527007484273746e-06,
"loss": 1.0355,
"step": 1780
},
{
"epoch": 0.08017917133258678,
"grad_norm": 4.027277946472168,
"learning_rate": 5.5057991233509225e-06,
"loss": 0.9196,
"step": 1790
},
{
"epoch": 0.08062709966405375,
"grad_norm": 7.427858352661133,
"learning_rate": 5.484518588403134e-06,
"loss": 1.1913,
"step": 1800
},
{
"epoch": 0.08062709966405375,
"eval_loss": 1.2111696004867554,
"eval_runtime": 51.6854,
"eval_samples_per_second": 9.674,
"eval_steps_per_second": 9.674,
"step": 1800
},
{
"epoch": 0.08107502799552072,
"grad_norm": 6.3730597496032715,
"learning_rate": 5.463166754189819e-06,
"loss": 1.171,
"step": 1810
},
{
"epoch": 0.08152295632698768,
"grad_norm": 5.194447994232178,
"learning_rate": 5.441744498401255e-06,
"loss": 1.2202,
"step": 1820
},
{
"epoch": 0.08197088465845465,
"grad_norm": 4.3045454025268555,
"learning_rate": 5.4202527016224725e-06,
"loss": 1.1318,
"step": 1830
},
{
"epoch": 0.08241881298992161,
"grad_norm": 5.316900253295898,
"learning_rate": 5.398692247297059e-06,
"loss": 1.2107,
"step": 1840
},
{
"epoch": 0.08286674132138858,
"grad_norm": 8.284939765930176,
"learning_rate": 5.377064021690844e-06,
"loss": 1.1683,
"step": 1850
},
{
"epoch": 0.08331466965285554,
"grad_norm": 4.051226615905762,
"learning_rate": 5.355368913855472e-06,
"loss": 1.2974,
"step": 1860
},
{
"epoch": 0.0837625979843225,
"grad_norm": 5.353118896484375,
"learning_rate": 5.333607815591851e-06,
"loss": 1.235,
"step": 1870
},
{
"epoch": 0.08421052631578947,
"grad_norm": 5.097784996032715,
"learning_rate": 5.311781621413497e-06,
"loss": 1.0172,
"step": 1880
},
{
"epoch": 0.08465845464725644,
"grad_norm": 3.437659978866577,
"learning_rate": 5.289891228509769e-06,
"loss": 1.0104,
"step": 1890
},
{
"epoch": 0.0851063829787234,
"grad_norm": 4.631069660186768,
"learning_rate": 5.267937536708977e-06,
"loss": 1.0368,
"step": 1900
},
{
"epoch": 0.08555431131019037,
"grad_norm": 5.044907569885254,
"learning_rate": 5.245921448441407e-06,
"loss": 1.0732,
"step": 1910
},
{
"epoch": 0.08600223964165733,
"grad_norm": 3.2756667137145996,
"learning_rate": 5.223843868702214e-06,
"loss": 1.2815,
"step": 1920
},
{
"epoch": 0.0864501679731243,
"grad_norm": 5.061473369598389,
"learning_rate": 5.201705705014231e-06,
"loss": 1.1059,
"step": 1930
},
{
"epoch": 0.08689809630459126,
"grad_norm": 4.924319744110107,
"learning_rate": 5.1795078673906575e-06,
"loss": 1.0561,
"step": 1940
},
{
"epoch": 0.08734602463605823,
"grad_norm": 4.019739627838135,
"learning_rate": 5.1572512682976546e-06,
"loss": 0.9889,
"step": 1950
},
{
"epoch": 0.08734602463605823,
"eval_loss": 1.2077045440673828,
"eval_runtime": 51.7283,
"eval_samples_per_second": 9.666,
"eval_steps_per_second": 9.666,
"step": 1950
},
{
"epoch": 0.0877939529675252,
"grad_norm": 6.297740459442139,
"learning_rate": 5.134936822616837e-06,
"loss": 1.1664,
"step": 1960
},
{
"epoch": 0.08824188129899216,
"grad_norm": 5.478749752044678,
"learning_rate": 5.112565447607669e-06,
"loss": 1.2503,
"step": 1970
},
{
"epoch": 0.08868980963045912,
"grad_norm": 4.692316055297852,
"learning_rate": 5.090138062869755e-06,
"loss": 1.1421,
"step": 1980
},
{
"epoch": 0.08913773796192609,
"grad_norm": 3.5623536109924316,
"learning_rate": 5.067655590305036e-06,
"loss": 1.1203,
"step": 1990
},
{
"epoch": 0.08958566629339305,
"grad_norm": 6.875621318817139,
"learning_rate": 5.045118954079904e-06,
"loss": 1.1348,
"step": 2000
},
{
"epoch": 0.09003359462486002,
"grad_norm": 5.2604756355285645,
"learning_rate": 5.022529080587205e-06,
"loss": 1.0326,
"step": 2010
},
{
"epoch": 0.09048152295632698,
"grad_norm": 5.012307643890381,
"learning_rate": 4.999886898408157e-06,
"loss": 1.12,
"step": 2020
},
{
"epoch": 0.09092945128779395,
"grad_norm": 5.246688365936279,
"learning_rate": 4.977193338274189e-06,
"loss": 1.1164,
"step": 2030
},
{
"epoch": 0.09137737961926092,
"grad_norm": 3.9779398441314697,
"learning_rate": 4.954449333028672e-06,
"loss": 1.0607,
"step": 2040
},
{
"epoch": 0.09182530795072788,
"grad_norm": 5.392056465148926,
"learning_rate": 4.931655817588579e-06,
"loss": 1.1102,
"step": 2050
},
{
"epoch": 0.09227323628219485,
"grad_norm": 5.144470691680908,
"learning_rate": 4.9088137289060535e-06,
"loss": 1.0649,
"step": 2060
},
{
"epoch": 0.09272116461366181,
"grad_norm": 3.7060792446136475,
"learning_rate": 4.885924005929896e-06,
"loss": 1.0718,
"step": 2070
},
{
"epoch": 0.09316909294512878,
"grad_norm": 3.357794761657715,
"learning_rate": 4.862987589566965e-06,
"loss": 1.1003,
"step": 2080
},
{
"epoch": 0.09361702127659574,
"grad_norm": 5.704718589782715,
"learning_rate": 4.840005422643503e-06,
"loss": 1.2042,
"step": 2090
},
{
"epoch": 0.0940649496080627,
"grad_norm": 5.481514930725098,
"learning_rate": 4.816978449866372e-06,
"loss": 1.0777,
"step": 2100
},
{
"epoch": 0.0940649496080627,
"eval_loss": 1.2093305587768555,
"eval_runtime": 51.7975,
"eval_samples_per_second": 9.653,
"eval_steps_per_second": 9.653,
"step": 2100
},
{
"epoch": 0.09451287793952967,
"grad_norm": 5.508385181427002,
"learning_rate": 4.793907617784238e-06,
"loss": 1.5375,
"step": 2110
},
{
"epoch": 0.09496080627099664,
"grad_norm": 4.192409515380859,
"learning_rate": 4.770793874748642e-06,
"loss": 0.9964,
"step": 2120
},
{
"epoch": 0.0954087346024636,
"grad_norm": 4.068387508392334,
"learning_rate": 4.747638170875032e-06,
"loss": 0.9244,
"step": 2130
},
{
"epoch": 0.09585666293393057,
"grad_norm": 2.513946771621704,
"learning_rate": 4.724441458003699e-06,
"loss": 1.1329,
"step": 2140
},
{
"epoch": 0.09630459126539753,
"grad_norm": 4.470638275146484,
"learning_rate": 4.701204689660653e-06,
"loss": 1.0299,
"step": 2150
},
{
"epoch": 0.0967525195968645,
"grad_norm": 5.644805908203125,
"learning_rate": 4.67792882101843e-06,
"loss": 1.2654,
"step": 2160
},
{
"epoch": 0.09720044792833146,
"grad_norm": 5.1912736892700195,
"learning_rate": 4.654614808856823e-06,
"loss": 1.2265,
"step": 2170
},
{
"epoch": 0.09764837625979843,
"grad_norm": 11.092533111572266,
"learning_rate": 4.631263611523557e-06,
"loss": 1.2182,
"step": 2180
},
{
"epoch": 0.09809630459126539,
"grad_norm": 4.138496398925781,
"learning_rate": 4.607876188894896e-06,
"loss": 1.2283,
"step": 2190
},
{
"epoch": 0.09854423292273236,
"grad_norm": 5.229914665222168,
"learning_rate": 4.58445350233618e-06,
"loss": 1.1319,
"step": 2200
},
{
"epoch": 0.09899216125419932,
"grad_norm": 4.059961318969727,
"learning_rate": 4.560996514662314e-06,
"loss": 1.0411,
"step": 2210
},
{
"epoch": 0.09944008958566629,
"grad_norm": 4.80086088180542,
"learning_rate": 4.5375061900981855e-06,
"loss": 1.23,
"step": 2220
},
{
"epoch": 0.09988801791713325,
"grad_norm": 5.166756629943848,
"learning_rate": 4.513983494239034e-06,
"loss": 1.219,
"step": 2230
},
{
"epoch": 0.10033594624860022,
"grad_norm": 5.53660249710083,
"learning_rate": 4.490429394010752e-06,
"loss": 1.1245,
"step": 2240
},
{
"epoch": 0.10078387458006718,
"grad_norm": 2.9756040573120117,
"learning_rate": 4.466844857630147e-06,
"loss": 1.1395,
"step": 2250
},
{
"epoch": 0.10078387458006718,
"eval_loss": 1.2089135646820068,
"eval_runtime": 51.6342,
"eval_samples_per_second": 9.684,
"eval_steps_per_second": 9.684,
"step": 2250
},
{
"epoch": 0.10123180291153415,
"grad_norm": 3.644266128540039,
"learning_rate": 4.443230854565133e-06,
"loss": 1.0985,
"step": 2260
},
{
"epoch": 0.10167973124300111,
"grad_norm": 4.662050724029541,
"learning_rate": 4.4195883554948885e-06,
"loss": 1.3397,
"step": 2270
},
{
"epoch": 0.10212765957446808,
"grad_norm": 5.3237385749816895,
"learning_rate": 4.3959183322699466e-06,
"loss": 1.1351,
"step": 2280
},
{
"epoch": 0.10257558790593505,
"grad_norm": 4.3604207038879395,
"learning_rate": 4.372221757872255e-06,
"loss": 1.1208,
"step": 2290
},
{
"epoch": 0.10302351623740201,
"grad_norm": 3.731410264968872,
"learning_rate": 4.3484996063751725e-06,
"loss": 1.1584,
"step": 2300
},
{
"epoch": 0.10347144456886898,
"grad_norm": 4.031397342681885,
"learning_rate": 4.324752852903435e-06,
"loss": 0.9656,
"step": 2310
},
{
"epoch": 0.10391937290033594,
"grad_norm": 3.564148187637329,
"learning_rate": 4.300982473593068e-06,
"loss": 1.0031,
"step": 2320
},
{
"epoch": 0.1043673012318029,
"grad_norm": 5.459331035614014,
"learning_rate": 4.277189445551261e-06,
"loss": 1.0037,
"step": 2330
},
{
"epoch": 0.10481522956326987,
"grad_norm": 4.870905876159668,
"learning_rate": 4.253374746816209e-06,
"loss": 0.9615,
"step": 2340
},
{
"epoch": 0.10526315789473684,
"grad_norm": 5.284097671508789,
"learning_rate": 4.229539356316898e-06,
"loss": 1.3278,
"step": 2350
},
{
"epoch": 0.1057110862262038,
"grad_norm": 5.323864459991455,
"learning_rate": 4.205684253832877e-06,
"loss": 1.1903,
"step": 2360
},
{
"epoch": 0.10615901455767077,
"grad_norm": 7.844208717346191,
"learning_rate": 4.1818104199539735e-06,
"loss": 1.056,
"step": 2370
},
{
"epoch": 0.10660694288913773,
"grad_norm": 4.325316905975342,
"learning_rate": 4.1579188360399916e-06,
"loss": 1.2431,
"step": 2380
},
{
"epoch": 0.1070548712206047,
"grad_norm": 3.5362424850463867,
"learning_rate": 4.134010484180368e-06,
"loss": 1.1804,
"step": 2390
},
{
"epoch": 0.10750279955207166,
"grad_norm": 3.2404041290283203,
"learning_rate": 4.110086347153807e-06,
"loss": 1.1556,
"step": 2400
},
{
"epoch": 0.10750279955207166,
"eval_loss": 1.2038679122924805,
"eval_runtime": 51.7303,
"eval_samples_per_second": 9.666,
"eval_steps_per_second": 9.666,
"step": 2400
},
{
"epoch": 0.10795072788353863,
"grad_norm": 3.8270246982574463,
"learning_rate": 4.0861474083878765e-06,
"loss": 1.0918,
"step": 2410
},
{
"epoch": 0.10839865621500559,
"grad_norm": 5.627485752105713,
"learning_rate": 4.062194651918585e-06,
"loss": 1.257,
"step": 2420
},
{
"epoch": 0.10884658454647256,
"grad_norm": 4.910660743713379,
"learning_rate": 4.0382290623499384e-06,
"loss": 1.2748,
"step": 2430
},
{
"epoch": 0.10929451287793952,
"grad_norm": 2.3609941005706787,
"learning_rate": 4.014251624813453e-06,
"loss": 0.9422,
"step": 2440
},
{
"epoch": 0.10974244120940649,
"grad_norm": 3.063828706741333,
"learning_rate": 3.990263324927675e-06,
"loss": 1.1829,
"step": 2450
},
{
"epoch": 0.11019036954087345,
"grad_norm": 2.658452033996582,
"learning_rate": 3.966265148757655e-06,
"loss": 1.0062,
"step": 2460
},
{
"epoch": 0.11063829787234042,
"grad_norm": 6.130062103271484,
"learning_rate": 3.9422580827744224e-06,
"loss": 1.1504,
"step": 2470
},
{
"epoch": 0.11108622620380738,
"grad_norm": 3.3496034145355225,
"learning_rate": 3.9182431138144315e-06,
"loss": 0.8731,
"step": 2480
},
{
"epoch": 0.11153415453527436,
"grad_norm": 3.8455569744110107,
"learning_rate": 3.894221229038995e-06,
"loss": 1.0125,
"step": 2490
},
{
"epoch": 0.11198208286674133,
"grad_norm": 4.499962329864502,
"learning_rate": 3.870193415893709e-06,
"loss": 1.0228,
"step": 2500
},
{
"epoch": 0.1124300111982083,
"grad_norm": 6.230105876922607,
"learning_rate": 3.846160662067859e-06,
"loss": 1.1794,
"step": 2510
},
{
"epoch": 0.11287793952967526,
"grad_norm": 7.316727638244629,
"learning_rate": 3.8221239554538275e-06,
"loss": 1.2728,
"step": 2520
},
{
"epoch": 0.11332586786114222,
"grad_norm": 3.291714906692505,
"learning_rate": 3.798084284106478e-06,
"loss": 1.167,
"step": 2530
},
{
"epoch": 0.11377379619260919,
"grad_norm": 5.075141429901123,
"learning_rate": 3.7740426362025424e-06,
"loss": 1.0547,
"step": 2540
},
{
"epoch": 0.11422172452407615,
"grad_norm": 3.961540937423706,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.0713,
"step": 2550
},
{
"epoch": 0.11422172452407615,
"eval_loss": 1.2046430110931396,
"eval_runtime": 51.7175,
"eval_samples_per_second": 9.668,
"eval_steps_per_second": 9.668,
"step": 2550
},
{
"epoch": 0.11466965285554312,
"grad_norm": 6.124125003814697,
"learning_rate": 3.7259573637974587e-06,
"loss": 1.0568,
"step": 2560
},
{
"epoch": 0.11511758118701008,
"grad_norm": 4.3748602867126465,
"learning_rate": 3.701915715893523e-06,
"loss": 1.4124,
"step": 2570
},
{
"epoch": 0.11556550951847705,
"grad_norm": 7.382061004638672,
"learning_rate": 3.677876044546174e-06,
"loss": 1.1357,
"step": 2580
},
{
"epoch": 0.11601343784994401,
"grad_norm": 4.097735404968262,
"learning_rate": 3.6538393379321427e-06,
"loss": 1.0885,
"step": 2590
},
{
"epoch": 0.11646136618141098,
"grad_norm": 5.039736270904541,
"learning_rate": 3.6298065841062934e-06,
"loss": 1.107,
"step": 2600
},
{
"epoch": 0.11690929451287795,
"grad_norm": 4.383152008056641,
"learning_rate": 3.6057787709610064e-06,
"loss": 1.1695,
"step": 2610
},
{
"epoch": 0.11735722284434491,
"grad_norm": 4.900496482849121,
"learning_rate": 3.5817568861855708e-06,
"loss": 1.1107,
"step": 2620
},
{
"epoch": 0.11780515117581188,
"grad_norm": 6.267992973327637,
"learning_rate": 3.557741917225579e-06,
"loss": 1.1896,
"step": 2630
},
{
"epoch": 0.11825307950727884,
"grad_norm": 3.8060693740844727,
"learning_rate": 3.5337348512423468e-06,
"loss": 1.2245,
"step": 2640
},
{
"epoch": 0.1187010078387458,
"grad_norm": 3.5068161487579346,
"learning_rate": 3.5097366750723275e-06,
"loss": 1.0629,
"step": 2650
},
{
"epoch": 0.11914893617021277,
"grad_norm": 4.6765360832214355,
"learning_rate": 3.4857483751865478e-06,
"loss": 1.1783,
"step": 2660
},
{
"epoch": 0.11959686450167974,
"grad_norm": 7.864380836486816,
"learning_rate": 3.461770937650064e-06,
"loss": 1.0683,
"step": 2670
},
{
"epoch": 0.1200447928331467,
"grad_norm": 3.138843297958374,
"learning_rate": 3.437805348081416e-06,
"loss": 0.9814,
"step": 2680
},
{
"epoch": 0.12049272116461367,
"grad_norm": 5.134324550628662,
"learning_rate": 3.413852591612125e-06,
"loss": 1.1631,
"step": 2690
},
{
"epoch": 0.12094064949608063,
"grad_norm": 4.688596725463867,
"learning_rate": 3.389913652846194e-06,
"loss": 1.0644,
"step": 2700
},
{
"epoch": 0.12094064949608063,
"eval_loss": 1.2033374309539795,
"eval_runtime": 51.6099,
"eval_samples_per_second": 9.688,
"eval_steps_per_second": 9.688,
"step": 2700
},
{
"epoch": 0.1213885778275476,
"grad_norm": 4.218849182128906,
"learning_rate": 3.365989515819633e-06,
"loss": 1.1395,
"step": 2710
},
{
"epoch": 0.12183650615901456,
"grad_norm": 5.043267726898193,
"learning_rate": 3.34208116396001e-06,
"loss": 1.2327,
"step": 2720
},
{
"epoch": 0.12228443449048153,
"grad_norm": 7.991638660430908,
"learning_rate": 3.318189580046028e-06,
"loss": 1.0106,
"step": 2730
},
{
"epoch": 0.12273236282194849,
"grad_norm": 4.103755474090576,
"learning_rate": 3.294315746167124e-06,
"loss": 0.9751,
"step": 2740
},
{
"epoch": 0.12318029115341546,
"grad_norm": 4.224274635314941,
"learning_rate": 3.2704606436831023e-06,
"loss": 1.1427,
"step": 2750
},
{
"epoch": 0.12362821948488242,
"grad_norm": 5.190283298492432,
"learning_rate": 3.2466252531837934e-06,
"loss": 1.1758,
"step": 2760
},
{
"epoch": 0.12407614781634939,
"grad_norm": 6.470210075378418,
"learning_rate": 3.2228105544487405e-06,
"loss": 1.2584,
"step": 2770
},
{
"epoch": 0.12452407614781635,
"grad_norm": 4.470674514770508,
"learning_rate": 3.1990175264069333e-06,
"loss": 1.0279,
"step": 2780
},
{
"epoch": 0.12497200447928332,
"grad_norm": 4.63865327835083,
"learning_rate": 3.1752471470965653e-06,
"loss": 1.2431,
"step": 2790
},
{
"epoch": 0.12541993281075028,
"grad_norm": 5.2822089195251465,
"learning_rate": 3.151500393624829e-06,
"loss": 1.0206,
"step": 2800
},
{
"epoch": 0.12586786114221724,
"grad_norm": 3.3929495811462402,
"learning_rate": 3.127778242127747e-06,
"loss": 0.9654,
"step": 2810
},
{
"epoch": 0.12631578947368421,
"grad_norm": 3.526858329772949,
"learning_rate": 3.104081667730055e-06,
"loss": 1.0832,
"step": 2820
},
{
"epoch": 0.12676371780515117,
"grad_norm": 5.531039714813232,
"learning_rate": 3.0804116445051133e-06,
"loss": 1.1649,
"step": 2830
},
{
"epoch": 0.12721164613661815,
"grad_norm": 5.811004161834717,
"learning_rate": 3.0567691454348674e-06,
"loss": 1.095,
"step": 2840
},
{
"epoch": 0.1276595744680851,
"grad_norm": 4.319146633148193,
"learning_rate": 3.033155142369855e-06,
"loss": 0.9761,
"step": 2850
},
{
"epoch": 0.1276595744680851,
"eval_loss": 1.2028086185455322,
"eval_runtime": 51.6383,
"eval_samples_per_second": 9.683,
"eval_steps_per_second": 9.683,
"step": 2850
},
{
"epoch": 0.12810750279955208,
"grad_norm": 5.54340124130249,
"learning_rate": 3.009570605989249e-06,
"loss": 0.999,
"step": 2860
},
{
"epoch": 0.12855543113101903,
"grad_norm": 3.859863758087158,
"learning_rate": 2.986016505760967e-06,
"loss": 1.025,
"step": 2870
},
{
"epoch": 0.129003359462486,
"grad_norm": 5.119099140167236,
"learning_rate": 2.962493809901815e-06,
"loss": 1.3963,
"step": 2880
},
{
"epoch": 0.12945128779395296,
"grad_norm": 5.8379130363464355,
"learning_rate": 2.9390034853376875e-06,
"loss": 1.0822,
"step": 2890
},
{
"epoch": 0.12989921612541994,
"grad_norm": 3.261016845703125,
"learning_rate": 2.9155464976638217e-06,
"loss": 1.0526,
"step": 2900
},
{
"epoch": 0.1303471444568869,
"grad_norm": 3.678527355194092,
"learning_rate": 2.8921238111051057e-06,
"loss": 1.1167,
"step": 2910
},
{
"epoch": 0.13079507278835387,
"grad_norm": 4.787365436553955,
"learning_rate": 2.8687363884764434e-06,
"loss": 1.0829,
"step": 2920
},
{
"epoch": 0.13124300111982082,
"grad_norm": 3.475607395172119,
"learning_rate": 2.8453851911431783e-06,
"loss": 1.0801,
"step": 2930
},
{
"epoch": 0.1316909294512878,
"grad_norm": 6.456125736236572,
"learning_rate": 2.822071178981572e-06,
"loss": 1.1287,
"step": 2940
},
{
"epoch": 0.13213885778275475,
"grad_norm": 3.778585910797119,
"learning_rate": 2.7987953103393484e-06,
"loss": 1.1359,
"step": 2950
},
{
"epoch": 0.13258678611422173,
"grad_norm": 3.37793231010437,
"learning_rate": 2.7755585419963026e-06,
"loss": 1.0584,
"step": 2960
},
{
"epoch": 0.13303471444568868,
"grad_norm": 5.2485575675964355,
"learning_rate": 2.7523618291249687e-06,
"loss": 1.2037,
"step": 2970
},
{
"epoch": 0.13348264277715566,
"grad_norm": 4.524936676025391,
"learning_rate": 2.729206125251359e-06,
"loss": 0.9778,
"step": 2980
},
{
"epoch": 0.1339305711086226,
"grad_norm": 5.820756912231445,
"learning_rate": 2.7060923822157638e-06,
"loss": 1.0351,
"step": 2990
},
{
"epoch": 0.1343784994400896,
"grad_norm": 5.031400680541992,
"learning_rate": 2.6830215501336288e-06,
"loss": 1.1926,
"step": 3000
},
{
"epoch": 0.1343784994400896,
"eval_loss": 1.199351191520691,
"eval_runtime": 51.5688,
"eval_samples_per_second": 9.696,
"eval_steps_per_second": 9.696,
"step": 3000
},
{
"epoch": 0.13482642777155654,
"grad_norm": 4.307104587554932,
"learning_rate": 2.6599945773564997e-06,
"loss": 1.1743,
"step": 3010
},
{
"epoch": 0.13527435610302352,
"grad_norm": 4.9457221031188965,
"learning_rate": 2.6370124104330357e-06,
"loss": 1.1287,
"step": 3020
},
{
"epoch": 0.13572228443449047,
"grad_norm": 3.17401385307312,
"learning_rate": 2.614075994070105e-06,
"loss": 1.1686,
"step": 3030
},
{
"epoch": 0.13617021276595745,
"grad_norm": 6.098177433013916,
"learning_rate": 2.591186271093948e-06,
"loss": 1.1546,
"step": 3040
},
{
"epoch": 0.1366181410974244,
"grad_norm": 4.12905216217041,
"learning_rate": 2.568344182411423e-06,
"loss": 1.0909,
"step": 3050
},
{
"epoch": 0.13706606942889138,
"grad_norm": 4.946627616882324,
"learning_rate": 2.5455506669713293e-06,
"loss": 1.2223,
"step": 3060
},
{
"epoch": 0.13751399776035833,
"grad_norm": 4.25789737701416,
"learning_rate": 2.522806661725812e-06,
"loss": 1.0383,
"step": 3070
},
{
"epoch": 0.1379619260918253,
"grad_norm": 6.536715030670166,
"learning_rate": 2.5001131015918444e-06,
"loss": 0.9992,
"step": 3080
},
{
"epoch": 0.13840985442329226,
"grad_norm": 5.861030578613281,
"learning_rate": 2.4774709194127973e-06,
"loss": 1.1678,
"step": 3090
},
{
"epoch": 0.13885778275475924,
"grad_norm": 4.58046293258667,
"learning_rate": 2.4548810459200973e-06,
"loss": 1.2545,
"step": 3100
},
{
"epoch": 0.1393057110862262,
"grad_norm": 6.048022270202637,
"learning_rate": 2.4323444096949647e-06,
"loss": 1.0531,
"step": 3110
},
{
"epoch": 0.13975363941769317,
"grad_norm": 5.86400842666626,
"learning_rate": 2.409861937130248e-06,
"loss": 1.1093,
"step": 3120
},
{
"epoch": 0.14020156774916012,
"grad_norm": 3.7916102409362793,
"learning_rate": 2.3874345523923327e-06,
"loss": 1.1048,
"step": 3130
},
{
"epoch": 0.1406494960806271,
"grad_norm": 4.009166717529297,
"learning_rate": 2.3650631773831644e-06,
"loss": 1.0198,
"step": 3140
},
{
"epoch": 0.14109742441209405,
"grad_norm": 4.695572853088379,
"learning_rate": 2.3427487317023477e-06,
"loss": 1.1909,
"step": 3150
},
{
"epoch": 0.14109742441209405,
"eval_loss": 1.1985480785369873,
"eval_runtime": 51.6619,
"eval_samples_per_second": 9.678,
"eval_steps_per_second": 9.678,
"step": 3150
},
{
"epoch": 0.14154535274356103,
"grad_norm": 5.317529201507568,
"learning_rate": 2.320492132609344e-06,
"loss": 1.084,
"step": 3160
},
{
"epoch": 0.14199328107502798,
"grad_norm": 3.3507909774780273,
"learning_rate": 2.2982942949857705e-06,
"loss": 1.0169,
"step": 3170
},
{
"epoch": 0.14244120940649496,
"grad_norm": 5.125346660614014,
"learning_rate": 2.276156131297787e-06,
"loss": 1.0202,
"step": 3180
},
{
"epoch": 0.1428891377379619,
"grad_norm": 6.09945821762085,
"learning_rate": 2.254078551558594e-06,
"loss": 1.1235,
"step": 3190
},
{
"epoch": 0.1433370660694289,
"grad_norm": 6.263647079467773,
"learning_rate": 2.2320624632910232e-06,
"loss": 1.1284,
"step": 3200
},
{
"epoch": 0.14378499440089584,
"grad_norm": 6.879512310028076,
"learning_rate": 2.210108771490233e-06,
"loss": 1.0602,
"step": 3210
},
{
"epoch": 0.14423292273236282,
"grad_norm": 3.726658582687378,
"learning_rate": 2.1882183785865047e-06,
"loss": 1.1038,
"step": 3220
},
{
"epoch": 0.14468085106382977,
"grad_norm": 5.486456394195557,
"learning_rate": 2.166392184408152e-06,
"loss": 1.1794,
"step": 3230
},
{
"epoch": 0.14512877939529675,
"grad_norm": 4.750957012176514,
"learning_rate": 2.1446310861445306e-06,
"loss": 0.9833,
"step": 3240
},
{
"epoch": 0.1455767077267637,
"grad_norm": 3.6656692028045654,
"learning_rate": 2.1229359783091576e-06,
"loss": 1.0272,
"step": 3250
},
{
"epoch": 0.14602463605823068,
"grad_norm": 3.691014528274536,
"learning_rate": 2.1013077527029428e-06,
"loss": 1.0861,
"step": 3260
},
{
"epoch": 0.14647256438969763,
"grad_norm": 5.651008605957031,
"learning_rate": 2.079747298377528e-06,
"loss": 1.096,
"step": 3270
},
{
"epoch": 0.14692049272116461,
"grad_norm": 4.2657318115234375,
"learning_rate": 2.058255501598745e-06,
"loss": 1.0871,
"step": 3280
},
{
"epoch": 0.14736842105263157,
"grad_norm": 3.884568452835083,
"learning_rate": 2.0368332458101814e-06,
"loss": 1.0087,
"step": 3290
},
{
"epoch": 0.14781634938409854,
"grad_norm": 3.191197395324707,
"learning_rate": 2.015481411596869e-06,
"loss": 1.1387,
"step": 3300
},
{
"epoch": 0.14781634938409854,
"eval_loss": 1.1979233026504517,
"eval_runtime": 51.7549,
"eval_samples_per_second": 9.661,
"eval_steps_per_second": 9.661,
"step": 3300
},
{
"epoch": 0.14826427771556552,
"grad_norm": 6.709813594818115,
"learning_rate": 1.9942008766490793e-06,
"loss": 1.0685,
"step": 3310
},
{
"epoch": 0.14871220604703247,
"grad_norm": 3.687634229660034,
"learning_rate": 1.9729925157262554e-06,
"loss": 1.1542,
"step": 3320
},
{
"epoch": 0.14916013437849945,
"grad_norm": 3.637235403060913,
"learning_rate": 1.9518572006210484e-06,
"loss": 1.1365,
"step": 3330
},
{
"epoch": 0.1496080627099664,
"grad_norm": 3.113184690475464,
"learning_rate": 1.9307958001234794e-06,
"loss": 1.0218,
"step": 3340
},
{
"epoch": 0.15005599104143338,
"grad_norm": 4.447634220123291,
"learning_rate": 1.9098091799852347e-06,
"loss": 1.222,
"step": 3350
},
{
"epoch": 0.15050391937290034,
"grad_norm": 3.8236501216888428,
"learning_rate": 1.8888982028840636e-06,
"loss": 1.2012,
"step": 3360
},
{
"epoch": 0.15095184770436731,
"grad_norm": 5.108892440795898,
"learning_rate": 1.8680637283883355e-06,
"loss": 1.0181,
"step": 3370
},
{
"epoch": 0.15139977603583427,
"grad_norm": 3.81886887550354,
"learning_rate": 1.8473066129216927e-06,
"loss": 1.125,
"step": 3380
},
{
"epoch": 0.15184770436730124,
"grad_norm": 4.7799835205078125,
"learning_rate": 1.8266277097278527e-06,
"loss": 1.1038,
"step": 3390
},
{
"epoch": 0.1522956326987682,
"grad_norm": 6.478558540344238,
"learning_rate": 1.8060278688355313e-06,
"loss": 0.9218,
"step": 3400
},
{
"epoch": 0.15274356103023518,
"grad_norm": 4.482583522796631,
"learning_rate": 1.7855079370235043e-06,
"loss": 1.0629,
"step": 3410
},
{
"epoch": 0.15319148936170213,
"grad_norm": 2.6053950786590576,
"learning_rate": 1.7650687577857972e-06,
"loss": 1.1975,
"step": 3420
},
{
"epoch": 0.1536394176931691,
"grad_norm": 4.930041313171387,
"learning_rate": 1.7447111712970138e-06,
"loss": 1.0566,
"step": 3430
},
{
"epoch": 0.15408734602463606,
"grad_norm": 4.492660045623779,
"learning_rate": 1.7244360143778004e-06,
"loss": 1.1441,
"step": 3440
},
{
"epoch": 0.15453527435610304,
"grad_norm": 4.847555637359619,
"learning_rate": 1.704244120460443e-06,
"loss": 1.231,
"step": 3450
},
{
"epoch": 0.15453527435610304,
"eval_loss": 1.198148488998413,
"eval_runtime": 51.6757,
"eval_samples_per_second": 9.676,
"eval_steps_per_second": 9.676,
"step": 3450
},
{
"epoch": 0.15498320268757,
"grad_norm": 5.320653438568115,
"learning_rate": 1.6841363195546162e-06,
"loss": 0.996,
"step": 3460
},
{
"epoch": 0.15543113101903697,
"grad_norm": 4.333999156951904,
"learning_rate": 1.6641134382132576e-06,
"loss": 1.2536,
"step": 3470
},
{
"epoch": 0.15587905935050392,
"grad_norm": 6.867399215698242,
"learning_rate": 1.6441762994985947e-06,
"loss": 1.1461,
"step": 3480
},
{
"epoch": 0.1563269876819709,
"grad_norm": 3.2110917568206787,
"learning_rate": 1.6243257229483141e-06,
"loss": 1.1086,
"step": 3490
},
{
"epoch": 0.15677491601343785,
"grad_norm": 3.345970630645752,
"learning_rate": 1.6045625245418648e-06,
"loss": 0.9485,
"step": 3500
},
{
"epoch": 0.15722284434490483,
"grad_norm": 4.890392780303955,
"learning_rate": 1.584887516666928e-06,
"loss": 1.0968,
"step": 3510
},
{
"epoch": 0.15767077267637178,
"grad_norm": 5.448171615600586,
"learning_rate": 1.565301508086015e-06,
"loss": 1.1305,
"step": 3520
},
{
"epoch": 0.15811870100783876,
"grad_norm": 7.16267728805542,
"learning_rate": 1.5458053039032263e-06,
"loss": 1.2279,
"step": 3530
},
{
"epoch": 0.1585666293393057,
"grad_norm": 5.2700018882751465,
"learning_rate": 1.5263997055311536e-06,
"loss": 1.0474,
"step": 3540
},
{
"epoch": 0.1590145576707727,
"grad_norm": 5.955024719238281,
"learning_rate": 1.5070855106579404e-06,
"loss": 1.1283,
"step": 3550
},
{
"epoch": 0.15946248600223964,
"grad_norm": 2.882784366607666,
"learning_rate": 1.4878635132144885e-06,
"loss": 0.9112,
"step": 3560
},
{
"epoch": 0.15991041433370662,
"grad_norm": 4.2263875007629395,
"learning_rate": 1.4687345033418258e-06,
"loss": 1.1554,
"step": 3570
},
{
"epoch": 0.16035834266517357,
"grad_norm": 4.622799396514893,
"learning_rate": 1.4496992673586262e-06,
"loss": 1.3423,
"step": 3580
},
{
"epoch": 0.16080627099664055,
"grad_norm": 5.2950897216796875,
"learning_rate": 1.4307585877288822e-06,
"loss": 1.0494,
"step": 3590
},
{
"epoch": 0.1612541993281075,
"grad_norm": 5.289889335632324,
"learning_rate": 1.4119132430297496e-06,
"loss": 1.1448,
"step": 3600
},
{
"epoch": 0.1612541993281075,
"eval_loss": 1.1965739727020264,
"eval_runtime": 51.7182,
"eval_samples_per_second": 9.668,
"eval_steps_per_second": 9.668,
"step": 3600
},
{
"epoch": 0.16170212765957448,
"grad_norm": 6.415092468261719,
"learning_rate": 1.3931640079195365e-06,
"loss": 1.0204,
"step": 3610
},
{
"epoch": 0.16215005599104143,
"grad_norm": 3.348160743713379,
"learning_rate": 1.3745116531058645e-06,
"loss": 1.1308,
"step": 3620
},
{
"epoch": 0.1625979843225084,
"grad_norm": 6.698293209075928,
"learning_rate": 1.3559569453139797e-06,
"loss": 0.9401,
"step": 3630
},
{
"epoch": 0.16304591265397536,
"grad_norm": 3.5045154094696045,
"learning_rate": 1.3375006472552483e-06,
"loss": 1.152,
"step": 3640
},
{
"epoch": 0.16349384098544234,
"grad_norm": 4.656421661376953,
"learning_rate": 1.3191435175957945e-06,
"loss": 1.1775,
"step": 3650
},
{
"epoch": 0.1639417693169093,
"grad_norm": 8.8998384475708,
"learning_rate": 1.3008863109253174e-06,
"loss": 1.0061,
"step": 3660
},
{
"epoch": 0.16438969764837627,
"grad_norm": 3.5046370029449463,
"learning_rate": 1.282729777726078e-06,
"loss": 1.1871,
"step": 3670
},
{
"epoch": 0.16483762597984322,
"grad_norm": 4.024252891540527,
"learning_rate": 1.2646746643420392e-06,
"loss": 1.2593,
"step": 3680
},
{
"epoch": 0.1652855543113102,
"grad_norm": 4.861652851104736,
"learning_rate": 1.2467217129481952e-06,
"loss": 1.1068,
"step": 3690
},
{
"epoch": 0.16573348264277715,
"grad_norm": 6.007284641265869,
"learning_rate": 1.2288716615200617e-06,
"loss": 1.0237,
"step": 3700
},
{
"epoch": 0.16618141097424413,
"grad_norm": 4.506286144256592,
"learning_rate": 1.2111252438033404e-06,
"loss": 1.0827,
"step": 3710
},
{
"epoch": 0.16662933930571108,
"grad_norm": 7.5774102210998535,
"learning_rate": 1.1934831892837524e-06,
"loss": 1.2481,
"step": 3720
},
{
"epoch": 0.16707726763717806,
"grad_norm": 4.199349880218506,
"learning_rate": 1.1759462231570618e-06,
"loss": 1.1948,
"step": 3730
},
{
"epoch": 0.167525195968645,
"grad_norm": 3.675760269165039,
"learning_rate": 1.1585150662992578e-06,
"loss": 0.8945,
"step": 3740
},
{
"epoch": 0.167973124300112,
"grad_norm": 4.647981643676758,
"learning_rate": 1.1411904352369262e-06,
"loss": 1.0746,
"step": 3750
},
{
"epoch": 0.167973124300112,
"eval_loss": 1.1958056688308716,
"eval_runtime": 51.7591,
"eval_samples_per_second": 9.66,
"eval_steps_per_second": 9.66,
"step": 3750
},
{
"epoch": 0.16842105263157894,
"grad_norm": 2.354313611984253,
"learning_rate": 1.1239730421177952e-06,
"loss": 1.0362,
"step": 3760
},
{
"epoch": 0.16886898096304592,
"grad_norm": 4.00113582611084,
"learning_rate": 1.1068635946814569e-06,
"loss": 1.0924,
"step": 3770
},
{
"epoch": 0.16931690929451287,
"grad_norm": 3.765235185623169,
"learning_rate": 1.0898627962302831e-06,
"loss": 1.3452,
"step": 3780
},
{
"epoch": 0.16976483762597985,
"grad_norm": 3.814605236053467,
"learning_rate": 1.072971345600513e-06,
"loss": 1.0048,
"step": 3790
},
{
"epoch": 0.1702127659574468,
"grad_norm": 3.447803020477295,
"learning_rate": 1.056189937133522e-06,
"loss": 1.149,
"step": 3800
},
{
"epoch": 0.17066069428891378,
"grad_norm": 7.1337714195251465,
"learning_rate": 1.0395192606472822e-06,
"loss": 1.1497,
"step": 3810
},
{
"epoch": 0.17110862262038073,
"grad_norm": 5.239931583404541,
"learning_rate": 1.0229600014080101e-06,
"loss": 0.9874,
"step": 3820
},
{
"epoch": 0.1715565509518477,
"grad_norm": 3.4100687503814697,
"learning_rate": 1.006512840101995e-06,
"loss": 1.0393,
"step": 3830
},
{
"epoch": 0.17200447928331467,
"grad_norm": 4.527777671813965,
"learning_rate": 9.90178452807619e-07,
"loss": 0.968,
"step": 3840
},
{
"epoch": 0.17245240761478164,
"grad_norm": 3.7964625358581543,
"learning_rate": 9.739575109675674e-07,
"loss": 1.1207,
"step": 3850
},
{
"epoch": 0.1729003359462486,
"grad_norm": 4.329505920410156,
"learning_rate": 9.578506813612243e-07,
"loss": 1.0924,
"step": 3860
},
{
"epoch": 0.17334826427771557,
"grad_norm": 3.9827823638916016,
"learning_rate": 9.418586260772695e-07,
"loss": 1.0937,
"step": 3870
},
{
"epoch": 0.17379619260918253,
"grad_norm": 4.150352954864502,
"learning_rate": 9.259820024864594e-07,
"loss": 1.2071,
"step": 3880
},
{
"epoch": 0.1742441209406495,
"grad_norm": 2.648918867111206,
"learning_rate": 9.102214632146059e-07,
"loss": 1.1754,
"step": 3890
},
{
"epoch": 0.17469204927211646,
"grad_norm": 5.348718166351318,
"learning_rate": 8.94577656115746e-07,
"loss": 1.1031,
"step": 3900
},
{
"epoch": 0.17469204927211646,
"eval_loss": 1.1968835592269897,
"eval_runtime": 51.6518,
"eval_samples_per_second": 9.68,
"eval_steps_per_second": 9.68,
"step": 3900
},
{
"epoch": 0.17513997760358344,
"grad_norm": 6.799318313598633,
"learning_rate": 8.790512242455198e-07,
"loss": 1.1188,
"step": 3910
},
{
"epoch": 0.1755879059350504,
"grad_norm": 4.05487060546875,
"learning_rate": 8.636428058347274e-07,
"loss": 1.3045,
"step": 3920
},
{
"epoch": 0.17603583426651737,
"grad_norm": 4.513579845428467,
"learning_rate": 8.483530342630993e-07,
"loss": 1.2577,
"step": 3930
},
{
"epoch": 0.17648376259798432,
"grad_norm": 7.971194267272949,
"learning_rate": 8.331825380332599e-07,
"loss": 1.1376,
"step": 3940
},
{
"epoch": 0.1769316909294513,
"grad_norm": 3.740802764892578,
"learning_rate": 8.181319407448884e-07,
"loss": 1.1413,
"step": 3950
},
{
"epoch": 0.17737961926091825,
"grad_norm": 3.431658983230591,
"learning_rate": 8.032018610690914e-07,
"loss": 1.0802,
"step": 3960
},
{
"epoch": 0.17782754759238523,
"grad_norm": 3.8207449913024902,
"learning_rate": 7.883929127229665e-07,
"loss": 1.173,
"step": 3970
},
{
"epoch": 0.17827547592385218,
"grad_norm": 3.088942289352417,
"learning_rate": 7.737057044443793e-07,
"loss": 1.1144,
"step": 3980
},
{
"epoch": 0.17872340425531916,
"grad_norm": 3.705589532852173,
"learning_rate": 7.591408399669337e-07,
"loss": 1.2676,
"step": 3990
},
{
"epoch": 0.1791713325867861,
"grad_norm": 4.925235271453857,
"learning_rate": 7.446989179951632e-07,
"loss": 1.0197,
"step": 4000
},
{
"epoch": 0.1796192609182531,
"grad_norm": 4.373708248138428,
"learning_rate": 7.303805321799146e-07,
"loss": 1.0041,
"step": 4010
},
{
"epoch": 0.18006718924972004,
"grad_norm": 4.23321008682251,
"learning_rate": 7.161862710939476e-07,
"loss": 1.0504,
"step": 4020
},
{
"epoch": 0.18051511758118702,
"grad_norm": 6.634941101074219,
"learning_rate": 7.021167182077403e-07,
"loss": 1.062,
"step": 4030
},
{
"epoch": 0.18096304591265397,
"grad_norm": 12.015007972717285,
"learning_rate": 6.881724518655049e-07,
"loss": 1.3095,
"step": 4040
},
{
"epoch": 0.18141097424412095,
"grad_norm": 5.376244068145752,
"learning_rate": 6.743540452614152e-07,
"loss": 1.0552,
"step": 4050
},
{
"epoch": 0.18141097424412095,
"eval_loss": 1.1952238082885742,
"eval_runtime": 51.6946,
"eval_samples_per_second": 9.672,
"eval_steps_per_second": 9.672,
"step": 4050
},
{
"epoch": 0.1818589025755879,
"grad_norm": 5.1148858070373535,
"learning_rate": 6.606620664160438e-07,
"loss": 1.0796,
"step": 4060
},
{
"epoch": 0.18230683090705488,
"grad_norm": 3.497487783432007,
"learning_rate": 6.470970781530139e-07,
"loss": 1.0996,
"step": 4070
},
{
"epoch": 0.18275475923852183,
"grad_norm": 4.02069616317749,
"learning_rate": 6.336596380758604e-07,
"loss": 1.18,
"step": 4080
},
{
"epoch": 0.1832026875699888,
"grad_norm": 4.936882495880127,
"learning_rate": 6.203502985451152e-07,
"loss": 1.1434,
"step": 4090
},
{
"epoch": 0.18365061590145576,
"grad_norm": 3.6114046573638916,
"learning_rate": 6.071696066555978e-07,
"loss": 1.1957,
"step": 4100
},
{
"epoch": 0.18409854423292274,
"grad_norm": 3.0989315509796143,
"learning_rate": 5.941181042139258e-07,
"loss": 1.1672,
"step": 4110
},
{
"epoch": 0.1845464725643897,
"grad_norm": 3.9395434856414795,
"learning_rate": 5.811963277162466e-07,
"loss": 1.3213,
"step": 4120
},
{
"epoch": 0.18499440089585667,
"grad_norm": 3.7421300411224365,
"learning_rate": 5.684048083261789e-07,
"loss": 0.9563,
"step": 4130
},
{
"epoch": 0.18544232922732362,
"grad_norm": 3.190976858139038,
"learning_rate": 5.557440718529848e-07,
"loss": 1.1234,
"step": 4140
},
{
"epoch": 0.1858902575587906,
"grad_norm": 3.461064100265503,
"learning_rate": 5.432146387299522e-07,
"loss": 1.0016,
"step": 4150
},
{
"epoch": 0.18633818589025755,
"grad_norm": 6.645826816558838,
"learning_rate": 5.308170239930022e-07,
"loss": 1.1967,
"step": 4160
},
{
"epoch": 0.18678611422172453,
"grad_norm": 4.823378562927246,
"learning_rate": 5.185517372595187e-07,
"loss": 1.032,
"step": 4170
},
{
"epoch": 0.18723404255319148,
"grad_norm": 3.5760250091552734,
"learning_rate": 5.064192827073995e-07,
"loss": 1.1513,
"step": 4180
},
{
"epoch": 0.18768197088465846,
"grad_norm": 3.162781000137329,
"learning_rate": 4.944201590543308e-07,
"loss": 0.9593,
"step": 4190
},
{
"epoch": 0.1881298992161254,
"grad_norm": 8.633989334106445,
"learning_rate": 4.825548595372898e-07,
"loss": 1.2696,
"step": 4200
},
{
"epoch": 0.1881298992161254,
"eval_loss": 1.1959577798843384,
"eval_runtime": 51.6407,
"eval_samples_per_second": 9.682,
"eval_steps_per_second": 9.682,
"step": 4200
},
{
"epoch": 0.1885778275475924,
"grad_norm": 4.277423858642578,
"learning_rate": 4.7082387189226646e-07,
"loss": 1.0834,
"step": 4210
},
{
"epoch": 0.18902575587905934,
"grad_norm": 3.7345645427703857,
"learning_rate": 4.5922767833421454e-07,
"loss": 1.255,
"step": 4220
},
{
"epoch": 0.18947368421052632,
"grad_norm": 5.163575172424316,
"learning_rate": 4.477667555372326e-07,
"loss": 1.1317,
"step": 4230
},
{
"epoch": 0.18992161254199327,
"grad_norm": 5.2220892906188965,
"learning_rate": 4.364415746149678e-07,
"loss": 1.0966,
"step": 4240
},
{
"epoch": 0.19036954087346025,
"grad_norm": 5.796306610107422,
"learning_rate": 4.2525260110124964e-07,
"loss": 1.0268,
"step": 4250
},
{
"epoch": 0.1908174692049272,
"grad_norm": 4.295403003692627,
"learning_rate": 4.1420029493095623e-07,
"loss": 1.0465,
"step": 4260
},
{
"epoch": 0.19126539753639418,
"grad_norm": 5.671868324279785,
"learning_rate": 4.032851104211036e-07,
"loss": 1.2124,
"step": 4270
},
{
"epoch": 0.19171332586786113,
"grad_norm": 4.053644180297852,
"learning_rate": 3.925074962521762e-07,
"loss": 1.0574,
"step": 4280
},
{
"epoch": 0.1921612541993281,
"grad_norm": 3.7694053649902344,
"learning_rate": 3.818678954496787e-07,
"loss": 1.0604,
"step": 4290
},
{
"epoch": 0.19260918253079506,
"grad_norm": 4.982527256011963,
"learning_rate": 3.713667453659287e-07,
"loss": 1.1518,
"step": 4300
},
{
"epoch": 0.19305711086226204,
"grad_norm": 5.036848545074463,
"learning_rate": 3.6100447766207473e-07,
"loss": 1.0251,
"step": 4310
},
{
"epoch": 0.193505039193729,
"grad_norm": 5.744006633758545,
"learning_rate": 3.5078151829035693e-07,
"loss": 1.0103,
"step": 4320
},
{
"epoch": 0.19395296752519597,
"grad_norm": 3.843419075012207,
"learning_rate": 3.4069828747659405e-07,
"loss": 1.0053,
"step": 4330
},
{
"epoch": 0.19440089585666293,
"grad_norm": 4.357511043548584,
"learning_rate": 3.3075519970291144e-07,
"loss": 1.202,
"step": 4340
},
{
"epoch": 0.1948488241881299,
"grad_norm": 6.164062976837158,
"learning_rate": 3.209526636907036e-07,
"loss": 1.1136,
"step": 4350
},
{
"epoch": 0.1948488241881299,
"eval_loss": 1.1951868534088135,
"eval_runtime": 51.6432,
"eval_samples_per_second": 9.682,
"eval_steps_per_second": 9.682,
"step": 4350
},
{
"epoch": 0.19529675251959686,
"grad_norm": 3.893348217010498,
"learning_rate": 3.1129108238383095e-07,
"loss": 1.2238,
"step": 4360
},
{
"epoch": 0.19574468085106383,
"grad_norm": 3.704392433166504,
"learning_rate": 3.017708529320604e-07,
"loss": 1.0766,
"step": 4370
},
{
"epoch": 0.19619260918253079,
"grad_norm": 4.406269073486328,
"learning_rate": 2.923923666747357e-07,
"loss": 0.9588,
"step": 4380
},
{
"epoch": 0.19664053751399777,
"grad_norm": 6.578729152679443,
"learning_rate": 2.8315600912469477e-07,
"loss": 1.1622,
"step": 4390
},
{
"epoch": 0.19708846584546472,
"grad_norm": 4.1804094314575195,
"learning_rate": 2.740621599524189e-07,
"loss": 1.1999,
"step": 4400
},
{
"epoch": 0.1975363941769317,
"grad_norm": 6.192513465881348,
"learning_rate": 2.651111929704303e-07,
"loss": 1.1274,
"step": 4410
},
{
"epoch": 0.19798432250839865,
"grad_norm": 4.356874942779541,
"learning_rate": 2.563034761179223e-07,
"loss": 1.0262,
"step": 4420
},
{
"epoch": 0.19843225083986563,
"grad_norm": 4.435469627380371,
"learning_rate": 2.476393714456384e-07,
"loss": 1.1814,
"step": 4430
},
{
"epoch": 0.19888017917133258,
"grad_norm": 3.9173505306243896,
"learning_rate": 2.391192351009855e-07,
"loss": 0.7984,
"step": 4440
},
{
"epoch": 0.19932810750279956,
"grad_norm": 6.546506881713867,
"learning_rate": 2.3074341731339837e-07,
"loss": 1.168,
"step": 4450
},
{
"epoch": 0.1997760358342665,
"grad_norm": 6.1646223068237305,
"learning_rate": 2.225122623799407e-07,
"loss": 1.2589,
"step": 4460
},
{
"epoch": 0.2002239641657335,
"grad_norm": 3.210203170776367,
"learning_rate": 2.1442610865115135e-07,
"loss": 1.0636,
"step": 4470
},
{
"epoch": 0.20067189249720044,
"grad_norm": 5.133816242218018,
"learning_rate": 2.0648528851714077e-07,
"loss": 1.0195,
"step": 4480
},
{
"epoch": 0.20111982082866742,
"grad_norm": 4.449398517608643,
"learning_rate": 1.9869012839392064e-07,
"loss": 1.1007,
"step": 4490
},
{
"epoch": 0.20156774916013437,
"grad_norm": 4.8083977699279785,
"learning_rate": 1.9104094870999264e-07,
"loss": 1.1975,
"step": 4500
},
{
"epoch": 0.20156774916013437,
"eval_loss": 1.1950809955596924,
"eval_runtime": 51.7311,
"eval_samples_per_second": 9.665,
"eval_steps_per_second": 9.665,
"step": 4500
},
{
"epoch": 0.20201567749160135,
"grad_norm": 4.709386348724365,
"learning_rate": 1.8353806389317428e-07,
"loss": 0.9829,
"step": 4510
},
{
"epoch": 0.2024636058230683,
"grad_norm": 5.23099946975708,
"learning_rate": 1.761817823576731e-07,
"loss": 1.1149,
"step": 4520
},
{
"epoch": 0.20291153415453528,
"grad_norm": 3.4107179641723633,
"learning_rate": 1.6897240649141125e-07,
"loss": 0.9822,
"step": 4530
},
{
"epoch": 0.20335946248600223,
"grad_norm": 3.951052188873291,
"learning_rate": 1.619102326435923e-07,
"loss": 1.2333,
"step": 4540
},
{
"epoch": 0.2038073908174692,
"grad_norm": 4.30809211730957,
"learning_rate": 1.5499555111252285e-07,
"loss": 1.0641,
"step": 4550
},
{
"epoch": 0.20425531914893616,
"grad_norm": 4.1274189949035645,
"learning_rate": 1.4822864613367766e-07,
"loss": 1.0962,
"step": 4560
},
{
"epoch": 0.20470324748040314,
"grad_norm": 6.046044826507568,
"learning_rate": 1.4160979586801724e-07,
"loss": 1.0241,
"step": 4570
},
{
"epoch": 0.2051511758118701,
"grad_norm": 4.066288471221924,
"learning_rate": 1.3513927239055036e-07,
"loss": 0.9061,
"step": 4580
},
{
"epoch": 0.20559910414333707,
"grad_norm": 3.9250218868255615,
"learning_rate": 1.2881734167915425e-07,
"loss": 1.1666,
"step": 4590
},
{
"epoch": 0.20604703247480402,
"grad_norm": 4.965548515319824,
"learning_rate": 1.2264426360363956e-07,
"loss": 0.8048,
"step": 4600
},
{
"epoch": 0.206494960806271,
"grad_norm": 5.192389965057373,
"learning_rate": 1.1662029191506775e-07,
"loss": 0.9869,
"step": 4610
},
{
"epoch": 0.20694288913773795,
"grad_norm": 4.953862190246582,
"learning_rate": 1.107456742353201e-07,
"loss": 1.0042,
"step": 4620
},
{
"epoch": 0.20739081746920493,
"grad_norm": 4.955436706542969,
"learning_rate": 1.0502065204692062e-07,
"loss": 1.101,
"step": 4630
},
{
"epoch": 0.20783874580067188,
"grad_norm": 2.5195674896240234,
"learning_rate": 9.94454606831076e-08,
"loss": 0.9542,
"step": 4640
},
{
"epoch": 0.20828667413213886,
"grad_norm": 4.142997741699219,
"learning_rate": 9.402032931816144e-08,
"loss": 1.1318,
"step": 4650
},
{
"epoch": 0.20828667413213886,
"eval_loss": 1.1947814226150513,
"eval_runtime": 51.8063,
"eval_samples_per_second": 9.651,
"eval_steps_per_second": 9.651,
"step": 4650
},
{
"epoch": 0.2087346024636058,
"grad_norm": 4.046876907348633,
"learning_rate": 8.874548095798464e-08,
"loss": 1.1393,
"step": 4660
},
{
"epoch": 0.2091825307950728,
"grad_norm": 4.740685939788818,
"learning_rate": 8.362113243093245e-08,
"loss": 1.0529,
"step": 4670
},
{
"epoch": 0.20963045912653974,
"grad_norm": 6.356805324554443,
"learning_rate": 7.864749437890173e-08,
"loss": 1.2791,
"step": 4680
},
{
"epoch": 0.21007838745800672,
"grad_norm": 4.329228401184082,
"learning_rate": 7.382477124867282e-08,
"loss": 1.2672,
"step": 4690
},
{
"epoch": 0.21052631578947367,
"grad_norm": 5.217611312866211,
"learning_rate": 6.915316128350461e-08,
"loss": 0.9357,
"step": 4700
},
{
"epoch": 0.21097424412094065,
"grad_norm": 5.418657302856445,
"learning_rate": 6.463285651498563e-08,
"loss": 1.011,
"step": 4710
},
{
"epoch": 0.2114221724524076,
"grad_norm": 6.056429386138916,
"learning_rate": 6.026404275513875e-08,
"loss": 1.4377,
"step": 4720
},
{
"epoch": 0.21187010078387458,
"grad_norm": 3.5456736087799072,
"learning_rate": 5.604689958878723e-08,
"loss": 1.1192,
"step": 4730
},
{
"epoch": 0.21231802911534153,
"grad_norm": 5.697049140930176,
"learning_rate": 5.198160036616898e-08,
"loss": 1.0392,
"step": 4740
},
{
"epoch": 0.2127659574468085,
"grad_norm": 4.248316764831543,
"learning_rate": 4.8068312195811847e-08,
"loss": 1.0041,
"step": 4750
},
{
"epoch": 0.21321388577827546,
"grad_norm": 3.3937604427337646,
"learning_rate": 4.4307195937666194e-08,
"loss": 0.9791,
"step": 4760
},
{
"epoch": 0.21366181410974244,
"grad_norm": 3.097196340560913,
"learning_rate": 4.069840619648935e-08,
"loss": 1.1306,
"step": 4770
},
{
"epoch": 0.2141097424412094,
"grad_norm": 5.534854888916016,
"learning_rate": 3.72420913154932e-08,
"loss": 1.104,
"step": 4780
},
{
"epoch": 0.21455767077267637,
"grad_norm": 5.693947792053223,
"learning_rate": 3.3938393370244876e-08,
"loss": 1.1541,
"step": 4790
},
{
"epoch": 0.21500559910414332,
"grad_norm": 4.025967597961426,
"learning_rate": 3.078744816282731e-08,
"loss": 1.1515,
"step": 4800
},
{
"epoch": 0.21500559910414332,
"eval_loss": 1.1954809427261353,
"eval_runtime": 51.6284,
"eval_samples_per_second": 9.685,
"eval_steps_per_second": 9.685,
"step": 4800
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.204448348803072e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}