{
"best_metric": 1.9684966802597046,
"best_model_checkpoint": "./output/checkpoint-4950",
"epoch": 0.17762945419313167,
"eval_steps": 150,
"global_step": 4950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003588473822083468,
"grad_norm": 6.717320442199707,
"learning_rate": 5.500000000000001e-06,
"loss": 2.1469,
"step": 10
},
{
"epoch": 0.0007176947644166936,
"grad_norm": 8.888245582580566,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.0737,
"step": 20
},
{
"epoch": 0.0010765421466250404,
"grad_norm": 6.897828102111816,
"learning_rate": 1.65e-05,
"loss": 2.3197,
"step": 30
},
{
"epoch": 0.0014353895288333873,
"grad_norm": 5.266988754272461,
"learning_rate": 2.2000000000000003e-05,
"loss": 2.1048,
"step": 40
},
{
"epoch": 0.0017942369110417339,
"grad_norm": 5.98182487487793,
"learning_rate": 2.75e-05,
"loss": 2.1213,
"step": 50
},
{
"epoch": 0.002153084293250081,
"grad_norm": 5.487156867980957,
"learning_rate": 3.3e-05,
"loss": 2.0035,
"step": 60
},
{
"epoch": 0.0025119316754584277,
"grad_norm": 7.080353260040283,
"learning_rate": 3.85e-05,
"loss": 1.9454,
"step": 70
},
{
"epoch": 0.0028707790576667745,
"grad_norm": 6.8405046463012695,
"learning_rate": 4.4000000000000006e-05,
"loss": 2.0665,
"step": 80
},
{
"epoch": 0.003229626439875121,
"grad_norm": 5.570849895477295,
"learning_rate": 4.9500000000000004e-05,
"loss": 2.0471,
"step": 90
},
{
"epoch": 0.0035884738220834677,
"grad_norm": 8.464041709899902,
"learning_rate": 5.5e-05,
"loss": 2.2792,
"step": 100
},
{
"epoch": 0.0039473212042918145,
"grad_norm": 6.397585868835449,
"learning_rate": 5.4999434791355066e-05,
"loss": 2.145,
"step": 110
},
{
"epoch": 0.004306168586500162,
"grad_norm": 6.912049293518066,
"learning_rate": 5.4997739188653784e-05,
"loss": 2.2255,
"step": 120
},
{
"epoch": 0.004665015968708508,
"grad_norm": 5.9162139892578125,
"learning_rate": 5.4994913261595724e-05,
"loss": 2.1511,
"step": 130
},
{
"epoch": 0.005023863350916855,
"grad_norm": 6.609055995941162,
"learning_rate": 5.49909571263437e-05,
"loss": 2.2134,
"step": 140
},
{
"epoch": 0.005382710733125202,
"grad_norm": 5.2473835945129395,
"learning_rate": 5.498587094551892e-05,
"loss": 2.043,
"step": 150
},
{
"epoch": 0.005382710733125202,
"eval_loss": 2.19863224029541,
"eval_runtime": 54.8159,
"eval_samples_per_second": 9.121,
"eval_steps_per_second": 9.121,
"step": 150
},
{
"epoch": 0.005741558115333549,
"grad_norm": 7.073841094970703,
"learning_rate": 5.497965492819436e-05,
"loss": 2.2452,
"step": 160
},
{
"epoch": 0.006100405497541895,
"grad_norm": 6.833587646484375,
"learning_rate": 5.4972309329886156e-05,
"loss": 2.0677,
"step": 170
},
{
"epoch": 0.006459252879750242,
"grad_norm": 5.268569469451904,
"learning_rate": 5.496383445254307e-05,
"loss": 2.131,
"step": 180
},
{
"epoch": 0.006818100261958589,
"grad_norm": 6.808563709259033,
"learning_rate": 5.495423064453413e-05,
"loss": 2.206,
"step": 190
},
{
"epoch": 0.007176947644166935,
"grad_norm": 5.996222019195557,
"learning_rate": 5.4943498300634254e-05,
"loss": 2.2165,
"step": 200
},
{
"epoch": 0.007535795026375283,
"grad_norm": 6.295897006988525,
"learning_rate": 5.493163786200807e-05,
"loss": 2.0522,
"step": 210
},
{
"epoch": 0.007894642408583629,
"grad_norm": 8.73654556274414,
"learning_rate": 5.491864981619175e-05,
"loss": 2.1393,
"step": 220
},
{
"epoch": 0.008253489790791976,
"grad_norm": 6.038145065307617,
"learning_rate": 5.4904534697073e-05,
"loss": 2.0871,
"step": 230
},
{
"epoch": 0.008612337173000324,
"grad_norm": 7.730770111083984,
"learning_rate": 5.488929308486908e-05,
"loss": 2.0365,
"step": 240
},
{
"epoch": 0.008971184555208669,
"grad_norm": 6.322446823120117,
"learning_rate": 5.487292560610295e-05,
"loss": 1.9943,
"step": 250
},
{
"epoch": 0.009330031937417016,
"grad_norm": 5.963322639465332,
"learning_rate": 5.485543293357758e-05,
"loss": 2.2698,
"step": 260
},
{
"epoch": 0.009688879319625364,
"grad_norm": 7.290161609649658,
"learning_rate": 5.483681578634821e-05,
"loss": 2.1537,
"step": 270
},
{
"epoch": 0.01004772670183371,
"grad_norm": 6.640771389007568,
"learning_rate": 5.481707492969285e-05,
"loss": 2.3031,
"step": 280
},
{
"epoch": 0.010406574084042056,
"grad_norm": 5.640336036682129,
"learning_rate": 5.479621117508079e-05,
"loss": 2.2397,
"step": 290
},
{
"epoch": 0.010765421466250404,
"grad_norm": 6.066910743713379,
"learning_rate": 5.477422538013927e-05,
"loss": 1.8947,
"step": 300
},
{
"epoch": 0.010765421466250404,
"eval_loss": 2.1935410499572754,
"eval_runtime": 56.6355,
"eval_samples_per_second": 8.828,
"eval_steps_per_second": 8.828,
"step": 300
},
{
"epoch": 0.01112426884845875,
"grad_norm": 5.537936687469482,
"learning_rate": 5.475111844861821e-05,
"loss": 2.3336,
"step": 310
},
{
"epoch": 0.011483116230667098,
"grad_norm": 4.925268173217773,
"learning_rate": 5.4726891330353056e-05,
"loss": 2.0345,
"step": 320
},
{
"epoch": 0.011841963612875444,
"grad_norm": 7.5510430335998535,
"learning_rate": 5.4701545021225746e-05,
"loss": 2.2388,
"step": 330
},
{
"epoch": 0.01220081099508379,
"grad_norm": 6.070944309234619,
"learning_rate": 5.4675080563123786e-05,
"loss": 2.1464,
"step": 340
},
{
"epoch": 0.012559658377292138,
"grad_norm": 5.479851245880127,
"learning_rate": 5.4647499043897386e-05,
"loss": 2.1711,
"step": 350
},
{
"epoch": 0.012918505759500484,
"grad_norm": 5.476010799407959,
"learning_rate": 5.461880159731476e-05,
"loss": 2.1122,
"step": 360
},
{
"epoch": 0.01327735314170883,
"grad_norm": 5.9396820068359375,
"learning_rate": 5.4588989403015564e-05,
"loss": 2.1398,
"step": 370
},
{
"epoch": 0.013636200523917178,
"grad_norm": 5.315182209014893,
"learning_rate": 5.4558063686462315e-05,
"loss": 1.9519,
"step": 380
},
{
"epoch": 0.013995047906125525,
"grad_norm": 6.243162631988525,
"learning_rate": 5.4526025718890104e-05,
"loss": 2.2456,
"step": 390
},
{
"epoch": 0.01435389528833387,
"grad_norm": 7.608538627624512,
"learning_rate": 5.44928768172543e-05,
"loss": 2.0085,
"step": 400
},
{
"epoch": 0.014712742670542218,
"grad_norm": 6.5568623542785645,
"learning_rate": 5.44586183441764e-05,
"loss": 1.9001,
"step": 410
},
{
"epoch": 0.015071590052750565,
"grad_norm": 6.803008079528809,
"learning_rate": 5.442325170788806e-05,
"loss": 2.0497,
"step": 420
},
{
"epoch": 0.015430437434958913,
"grad_norm": 6.07423734664917,
"learning_rate": 5.438677836217317e-05,
"loss": 2.157,
"step": 430
},
{
"epoch": 0.015789284817167258,
"grad_norm": 6.988314628601074,
"learning_rate": 5.434919980630811e-05,
"loss": 2.1349,
"step": 440
},
{
"epoch": 0.016148132199375605,
"grad_norm": 5.507582664489746,
"learning_rate": 5.431051758500015e-05,
"loss": 1.9094,
"step": 450
},
{
"epoch": 0.016148132199375605,
"eval_loss": 2.174536943435669,
"eval_runtime": 55.7176,
"eval_samples_per_second": 8.974,
"eval_steps_per_second": 8.974,
"step": 450
},
{
"epoch": 0.016506979581583953,
"grad_norm": 8.126635551452637,
"learning_rate": 5.427073328832388e-05,
"loss": 1.9681,
"step": 460
},
{
"epoch": 0.0168658269637923,
"grad_norm": 5.529940128326416,
"learning_rate": 5.422984855165592e-05,
"loss": 2.2576,
"step": 470
},
{
"epoch": 0.017224674346000647,
"grad_norm": 8.195639610290527,
"learning_rate": 5.418786505560766e-05,
"loss": 1.9816,
"step": 480
},
{
"epoch": 0.017583521728208994,
"grad_norm": 6.6597795486450195,
"learning_rate": 5.414478452595617e-05,
"loss": 2.1214,
"step": 490
},
{
"epoch": 0.017942369110417338,
"grad_norm": 7.02098274230957,
"learning_rate": 5.4100608733573315e-05,
"loss": 2.166,
"step": 500
},
{
"epoch": 0.018301216492625685,
"grad_norm": 5.826364517211914,
"learning_rate": 5.4055339494352874e-05,
"loss": 2.0209,
"step": 510
},
{
"epoch": 0.018660063874834033,
"grad_norm": 6.55180025100708,
"learning_rate": 5.400897866913597e-05,
"loss": 2.0571,
"step": 520
},
{
"epoch": 0.01901891125704238,
"grad_norm": 5.248165607452393,
"learning_rate": 5.3961528163634546e-05,
"loss": 2.152,
"step": 530
},
{
"epoch": 0.019377758639250727,
"grad_norm": 6.659270286560059,
"learning_rate": 5.391298992835303e-05,
"loss": 2.2554,
"step": 540
},
{
"epoch": 0.019736606021459074,
"grad_norm": 6.1025919914245605,
"learning_rate": 5.386336595850817e-05,
"loss": 2.0898,
"step": 550
},
{
"epoch": 0.02009545340366742,
"grad_norm": 7.94935417175293,
"learning_rate": 5.3812658293946995e-05,
"loss": 2.124,
"step": 560
},
{
"epoch": 0.020454300785875765,
"grad_norm": 5.731258869171143,
"learning_rate": 5.376086901906299e-05,
"loss": 2.1947,
"step": 570
},
{
"epoch": 0.020813148168084113,
"grad_norm": 5.822957515716553,
"learning_rate": 5.37080002627104e-05,
"loss": 1.8958,
"step": 580
},
{
"epoch": 0.02117199555029246,
"grad_norm": 6.069146156311035,
"learning_rate": 5.365405419811673e-05,
"loss": 2.1358,
"step": 590
},
{
"epoch": 0.021530842932500807,
"grad_norm": 6.830340385437012,
"learning_rate": 5.359903304279339e-05,
"loss": 2.0506,
"step": 600
},
{
"epoch": 0.021530842932500807,
"eval_loss": 2.1694562435150146,
"eval_runtime": 54.4455,
"eval_samples_per_second": 9.183,
"eval_steps_per_second": 9.183,
"step": 600
},
{
"epoch": 0.021889690314709154,
"grad_norm": 6.6071085929870605,
"learning_rate": 5.354293905844459e-05,
"loss": 2.1761,
"step": 610
},
{
"epoch": 0.0222485376969175,
"grad_norm": 7.093519687652588,
"learning_rate": 5.3485774550874306e-05,
"loss": 1.9589,
"step": 620
},
{
"epoch": 0.02260738507912585,
"grad_norm": 6.186059951782227,
"learning_rate": 5.3427541869891556e-05,
"loss": 2.1178,
"step": 630
},
{
"epoch": 0.022966232461334196,
"grad_norm": 6.744318962097168,
"learning_rate": 5.336824340921377e-05,
"loss": 2.1708,
"step": 640
},
{
"epoch": 0.02332507984354254,
"grad_norm": 5.510601997375488,
"learning_rate": 5.330788160636841e-05,
"loss": 2.0606,
"step": 650
},
{
"epoch": 0.023683927225750887,
"grad_norm": 4.821707725524902,
"learning_rate": 5.3246458942592776e-05,
"loss": 2.0368,
"step": 660
},
{
"epoch": 0.024042774607959234,
"grad_norm": 6.990603446960449,
"learning_rate": 5.318397794273199e-05,
"loss": 2.0595,
"step": 670
},
{
"epoch": 0.02440162199016758,
"grad_norm": 6.442526817321777,
"learning_rate": 5.312044117513524e-05,
"loss": 2.052,
"step": 680
},
{
"epoch": 0.02476046937237593,
"grad_norm": 6.1826934814453125,
"learning_rate": 5.305585125155018e-05,
"loss": 2.1528,
"step": 690
},
{
"epoch": 0.025119316754584276,
"grad_norm": 6.038548469543457,
"learning_rate": 5.29902108270156e-05,
"loss": 2.2279,
"step": 700
},
{
"epoch": 0.025478164136792623,
"grad_norm": 6.814423561096191,
"learning_rate": 5.2923522599752245e-05,
"loss": 2.0293,
"step": 710
},
{
"epoch": 0.025837011519000967,
"grad_norm": 7.148327350616455,
"learning_rate": 5.2855789311051945e-05,
"loss": 2.1998,
"step": 720
},
{
"epoch": 0.026195858901209314,
"grad_norm": 6.155678749084473,
"learning_rate": 5.27870137451649e-05,
"loss": 2.2251,
"step": 730
},
{
"epoch": 0.02655470628341766,
"grad_norm": 4.626000881195068,
"learning_rate": 5.2717198729185245e-05,
"loss": 2.112,
"step": 740
},
{
"epoch": 0.02691355366562601,
"grad_norm": 6.421958923339844,
"learning_rate": 5.264634713293485e-05,
"loss": 2.2201,
"step": 750
},
{
"epoch": 0.02691355366562601,
"eval_loss": 2.1646759510040283,
"eval_runtime": 54.8939,
"eval_samples_per_second": 9.108,
"eval_steps_per_second": 9.108,
"step": 750
},
{
"epoch": 0.027272401047834356,
"grad_norm": 6.699018478393555,
"learning_rate": 5.2574461868845316e-05,
"loss": 1.9635,
"step": 760
},
{
"epoch": 0.027631248430042703,
"grad_norm": 6.737663269042969,
"learning_rate": 5.2501545891838315e-05,
"loss": 2.0685,
"step": 770
},
{
"epoch": 0.02799009581225105,
"grad_norm": 5.519904136657715,
"learning_rate": 5.242760219920405e-05,
"loss": 2.2582,
"step": 780
},
{
"epoch": 0.028348943194459398,
"grad_norm": 6.503427505493164,
"learning_rate": 5.235263383047812e-05,
"loss": 2.1068,
"step": 790
},
{
"epoch": 0.02870779057666774,
"grad_norm": 7.244829177856445,
"learning_rate": 5.2276643867316525e-05,
"loss": 2.0203,
"step": 800
},
{
"epoch": 0.02906663795887609,
"grad_norm": 5.9835638999938965,
"learning_rate": 5.219963543336902e-05,
"loss": 2.1161,
"step": 810
},
{
"epoch": 0.029425485341084436,
"grad_norm": 5.395669460296631,
"learning_rate": 5.212161169415071e-05,
"loss": 2.0434,
"step": 820
},
{
"epoch": 0.029784332723292783,
"grad_norm": 6.662552356719971,
"learning_rate": 5.204257585691191e-05,
"loss": 1.9083,
"step": 830
},
{
"epoch": 0.03014318010550113,
"grad_norm": 6.782001495361328,
"learning_rate": 5.196253117050633e-05,
"loss": 2.0935,
"step": 840
},
{
"epoch": 0.030502027487709478,
"grad_norm": 6.106321334838867,
"learning_rate": 5.188148092525751e-05,
"loss": 2.1237,
"step": 850
},
{
"epoch": 0.030860874869917825,
"grad_norm": 7.272143840789795,
"learning_rate": 5.179942845282357e-05,
"loss": 2.0028,
"step": 860
},
{
"epoch": 0.031219722252126172,
"grad_norm": 9.065749168395996,
"learning_rate": 5.17163771260603e-05,
"loss": 2.0158,
"step": 870
},
{
"epoch": 0.031578569634334516,
"grad_norm": 6.9140706062316895,
"learning_rate": 5.163233035888244e-05,
"loss": 2.0894,
"step": 880
},
{
"epoch": 0.03193741701654287,
"grad_norm": 6.950376033782959,
"learning_rate": 5.154729160612338e-05,
"loss": 2.0526,
"step": 890
},
{
"epoch": 0.03229626439875121,
"grad_norm": 5.954463005065918,
"learning_rate": 5.146126436339321e-05,
"loss": 2.2572,
"step": 900
},
{
"epoch": 0.03229626439875121,
"eval_loss": 2.160017490386963,
"eval_runtime": 55.1945,
"eval_samples_per_second": 9.059,
"eval_steps_per_second": 9.059,
"step": 900
},
{
"epoch": 0.032655111780959555,
"grad_norm": 6.65114164352417,
"learning_rate": 5.137425216693491e-05,
"loss": 2.3123,
"step": 910
},
{
"epoch": 0.033013959163167905,
"grad_norm": 6.134308338165283,
"learning_rate": 5.128625859347907e-05,
"loss": 2.0594,
"step": 920
},
{
"epoch": 0.03337280654537625,
"grad_norm": 6.7907586097717285,
"learning_rate": 5.1197287260096865e-05,
"loss": 2.1689,
"step": 930
},
{
"epoch": 0.0337316539275846,
"grad_norm": 7.259077072143555,
"learning_rate": 5.110734182405132e-05,
"loss": 2.1629,
"step": 940
},
{
"epoch": 0.034090501309792944,
"grad_norm": 6.0704264640808105,
"learning_rate": 5.1016425982647025e-05,
"loss": 2.0007,
"step": 950
},
{
"epoch": 0.034449348692001294,
"grad_norm": 7.288009166717529,
"learning_rate": 5.092454347307812e-05,
"loss": 2.1205,
"step": 960
},
{
"epoch": 0.03480819607420964,
"grad_norm": 6.185722827911377,
"learning_rate": 5.08316980722747e-05,
"loss": 2.2027,
"step": 970
},
{
"epoch": 0.03516704345641799,
"grad_norm": 5.903477191925049,
"learning_rate": 5.0737893596747534e-05,
"loss": 2.1436,
"step": 980
},
{
"epoch": 0.03552589083862633,
"grad_norm": 5.645431995391846,
"learning_rate": 5.064313390243121e-05,
"loss": 2.1647,
"step": 990
},
{
"epoch": 0.035884738220834676,
"grad_norm": 6.282730579376221,
"learning_rate": 5.054742288452562e-05,
"loss": 2.2418,
"step": 1000
},
{
"epoch": 0.03624358560304303,
"grad_norm": 4.841719627380371,
"learning_rate": 5.0450764477335825e-05,
"loss": 2.1825,
"step": 1010
},
{
"epoch": 0.03660243298525137,
"grad_norm": 5.957107067108154,
"learning_rate": 5.035316265411036e-05,
"loss": 1.9629,
"step": 1020
},
{
"epoch": 0.03696128036745972,
"grad_norm": 6.100106239318848,
"learning_rate": 5.02546214268779e-05,
"loss": 2.1342,
"step": 1030
},
{
"epoch": 0.037320127749668065,
"grad_norm": 6.175196170806885,
"learning_rate": 5.0155144846282345e-05,
"loss": 2.2566,
"step": 1040
},
{
"epoch": 0.037678975131876416,
"grad_norm": 7.361276626586914,
"learning_rate": 5.005473700141629e-05,
"loss": 2.094,
"step": 1050
},
{
"epoch": 0.037678975131876416,
"eval_loss": 2.147434711456299,
"eval_runtime": 54.7074,
"eval_samples_per_second": 9.14,
"eval_steps_per_second": 9.14,
"step": 1050
},
{
"epoch": 0.03803782251408476,
"grad_norm": 5.384734153747559,
"learning_rate": 4.995340201965296e-05,
"loss": 2.0639,
"step": 1060
},
{
"epoch": 0.038396669896293104,
"grad_norm": 6.4175872802734375,
"learning_rate": 4.985114406647658e-05,
"loss": 2.0119,
"step": 1070
},
{
"epoch": 0.038755517278501454,
"grad_norm": 7.1519646644592285,
"learning_rate": 4.9747967345311055e-05,
"loss": 1.9558,
"step": 1080
},
{
"epoch": 0.0391143646607098,
"grad_norm": 6.3451151847839355,
"learning_rate": 4.9643876097347296e-05,
"loss": 2.0597,
"step": 1090
},
{
"epoch": 0.03947321204291815,
"grad_norm": 6.151363849639893,
"learning_rate": 4.953887460136881e-05,
"loss": 2.1947,
"step": 1100
},
{
"epoch": 0.03983205942512649,
"grad_norm": 5.386812210083008,
"learning_rate": 4.943296717357583e-05,
"loss": 1.9995,
"step": 1110
},
{
"epoch": 0.04019090680733484,
"grad_norm": 6.169729709625244,
"learning_rate": 4.93261581674079e-05,
"loss": 2.0852,
"step": 1120
},
{
"epoch": 0.04054975418954319,
"grad_norm": 5.84644079208374,
"learning_rate": 4.921845197336491e-05,
"loss": 2.0196,
"step": 1130
},
{
"epoch": 0.04090860157175153,
"grad_norm": 6.577886581420898,
"learning_rate": 4.910985301882667e-05,
"loss": 1.9489,
"step": 1140
},
{
"epoch": 0.04126744895395988,
"grad_norm": 7.484111309051514,
"learning_rate": 4.9000365767870824e-05,
"loss": 2.3324,
"step": 1150
},
{
"epoch": 0.041626296336168225,
"grad_norm": 5.723704814910889,
"learning_rate": 4.8889994721089426e-05,
"loss": 2.0919,
"step": 1160
},
{
"epoch": 0.041985143718376576,
"grad_norm": 7.106482982635498,
"learning_rate": 4.877874441540394e-05,
"loss": 2.2332,
"step": 1170
},
{
"epoch": 0.04234399110058492,
"grad_norm": 6.120841979980469,
"learning_rate": 4.866661942387867e-05,
"loss": 2.0377,
"step": 1180
},
{
"epoch": 0.04270283848279327,
"grad_norm": 6.847580432891846,
"learning_rate": 4.855362435553285e-05,
"loss": 2.0172,
"step": 1190
},
{
"epoch": 0.043061685865001614,
"grad_norm": 5.777744770050049,
"learning_rate": 4.84397638551512e-05,
"loss": 2.0351,
"step": 1200
},
{
"epoch": 0.043061685865001614,
"eval_loss": 2.1424458026885986,
"eval_runtime": 53.9494,
"eval_samples_per_second": 9.268,
"eval_steps_per_second": 9.268,
"step": 1200
},
{
"epoch": 0.043420533247209965,
"grad_norm": 7.368004322052002,
"learning_rate": 4.83250426030929e-05,
"loss": 2.209,
"step": 1210
},
{
"epoch": 0.04377938062941831,
"grad_norm": 5.8113017082214355,
"learning_rate": 4.82094653150993e-05,
"loss": 1.626,
"step": 1220
},
{
"epoch": 0.04413822801162665,
"grad_norm": 6.2266435623168945,
"learning_rate": 4.8093036742100026e-05,
"loss": 1.9042,
"step": 1230
},
{
"epoch": 0.044497075393835,
"grad_norm": 6.476573467254639,
"learning_rate": 4.79757616700177e-05,
"loss": 2.1182,
"step": 1240
},
{
"epoch": 0.04485592277604335,
"grad_norm": 16.597238540649414,
"learning_rate": 4.7857644919571176e-05,
"loss": 2.0605,
"step": 1250
},
{
"epoch": 0.0452147701582517,
"grad_norm": 6.337398052215576,
"learning_rate": 4.773869134607747e-05,
"loss": 2.0734,
"step": 1260
},
{
"epoch": 0.04557361754046004,
"grad_norm": 8.511456489562988,
"learning_rate": 4.761890583925204e-05,
"loss": 1.9335,
"step": 1270
},
{
"epoch": 0.04593246492266839,
"grad_norm": 7.134552478790283,
"learning_rate": 4.749829332300792e-05,
"loss": 2.0861,
"step": 1280
},
{
"epoch": 0.046291312304876736,
"grad_norm": 8.12893295288086,
"learning_rate": 4.737685875525327e-05,
"loss": 2.3456,
"step": 1290
},
{
"epoch": 0.04665015968708508,
"grad_norm": 5.985340118408203,
"learning_rate": 4.725460712768751e-05,
"loss": 1.9234,
"step": 1300
},
{
"epoch": 0.04700900706929343,
"grad_norm": 5.3571600914001465,
"learning_rate": 4.7131543465596236e-05,
"loss": 2.0328,
"step": 1310
},
{
"epoch": 0.047367854451501774,
"grad_norm": 6.484771728515625,
"learning_rate": 4.700767282764459e-05,
"loss": 2.3337,
"step": 1320
},
{
"epoch": 0.047726701833710125,
"grad_norm": 6.518951416015625,
"learning_rate": 4.688300030566933e-05,
"loss": 2.4661,
"step": 1330
},
{
"epoch": 0.04808554921591847,
"grad_norm": 5.992385387420654,
"learning_rate": 4.6757531024469514e-05,
"loss": 2.2451,
"step": 1340
},
{
"epoch": 0.04844439659812682,
"grad_norm": 4.671659469604492,
"learning_rate": 4.663127014159588e-05,
"loss": 1.9866,
"step": 1350
},
{
"epoch": 0.04844439659812682,
"eval_loss": 2.134829044342041,
"eval_runtime": 54.778,
"eval_samples_per_second": 9.128,
"eval_steps_per_second": 9.128,
"step": 1350
},
{
"epoch": 0.04880324398033516,
"grad_norm": 6.9112372398376465,
"learning_rate": 4.650422284713878e-05,
"loss": 2.0803,
"step": 1360
},
{
"epoch": 0.04916209136254351,
"grad_norm": 5.6216912269592285,
"learning_rate": 4.637639436351489e-05,
"loss": 2.2036,
"step": 1370
},
{
"epoch": 0.04952093874475186,
"grad_norm": 6.844085216522217,
"learning_rate": 4.624778994525249e-05,
"loss": 2.131,
"step": 1380
},
{
"epoch": 0.0498797861269602,
"grad_norm": 5.771690368652344,
"learning_rate": 4.6118414878775514e-05,
"loss": 2.2257,
"step": 1390
},
{
"epoch": 0.05023863350916855,
"grad_norm": 7.582961559295654,
"learning_rate": 4.5988274482186214e-05,
"loss": 2.1688,
"step": 1400
},
{
"epoch": 0.050597480891376896,
"grad_norm": 7.324771881103516,
"learning_rate": 4.5857374105046574e-05,
"loss": 1.9221,
"step": 1410
},
{
"epoch": 0.05095632827358525,
"grad_norm": 5.763311386108398,
"learning_rate": 4.572571912815838e-05,
"loss": 2.0738,
"step": 1420
},
{
"epoch": 0.05131517565579359,
"grad_norm": 6.5076584815979,
"learning_rate": 4.55933149633421e-05,
"loss": 2.1497,
"step": 1430
},
{
"epoch": 0.051674023038001934,
"grad_norm": 5.565882682800293,
"learning_rate": 4.5460167053214335e-05,
"loss": 2.102,
"step": 1440
},
{
"epoch": 0.052032870420210285,
"grad_norm": 6.655306339263916,
"learning_rate": 4.532628087096419e-05,
"loss": 2.0995,
"step": 1450
},
{
"epoch": 0.05239171780241863,
"grad_norm": 6.302664279937744,
"learning_rate": 4.5191661920128194e-05,
"loss": 2.024,
"step": 1460
},
{
"epoch": 0.05275056518462698,
"grad_norm": 8.187132835388184,
"learning_rate": 4.5056315734364154e-05,
"loss": 2.0004,
"step": 1470
},
{
"epoch": 0.05310941256683532,
"grad_norm": 6.062857627868652,
"learning_rate": 4.492024787722368e-05,
"loss": 2.0406,
"step": 1480
},
{
"epoch": 0.053468259949043674,
"grad_norm": 6.693349838256836,
"learning_rate": 4.47834639419234e-05,
"loss": 2.2014,
"step": 1490
},
{
"epoch": 0.05382710733125202,
"grad_norm": 4.937839031219482,
"learning_rate": 4.464596955111518e-05,
"loss": 2.1102,
"step": 1500
},
{
"epoch": 0.05382710733125202,
"eval_loss": 2.113711357116699,
"eval_runtime": 54.1712,
"eval_samples_per_second": 9.23,
"eval_steps_per_second": 9.23,
"step": 1500
},
{
"epoch": 0.05418595471346037,
"grad_norm": 6.417968273162842,
"learning_rate": 4.450777035665487e-05,
"loss": 2.0496,
"step": 1510
},
{
"epoch": 0.05454480209566871,
"grad_norm": 6.756476402282715,
"learning_rate": 4.436887203937009e-05,
"loss": 1.8097,
"step": 1520
},
{
"epoch": 0.054903649477877056,
"grad_norm": 6.680830955505371,
"learning_rate": 4.422928030882661e-05,
"loss": 1.9655,
"step": 1530
},
{
"epoch": 0.05526249686008541,
"grad_norm": 6.578978061676025,
"learning_rate": 4.4089000903093746e-05,
"loss": 2.0555,
"step": 1540
},
{
"epoch": 0.05562134424229375,
"grad_norm": 6.425287246704102,
"learning_rate": 4.394803958850844e-05,
"loss": 2.0528,
"step": 1550
},
{
"epoch": 0.0559801916245021,
"grad_norm": 6.3165974617004395,
"learning_rate": 4.380640215943821e-05,
"loss": 2.0266,
"step": 1560
},
{
"epoch": 0.056339039006710445,
"grad_norm": 5.450393199920654,
"learning_rate": 4.366409443804301e-05,
"loss": 2.1695,
"step": 1570
},
{
"epoch": 0.056697886388918796,
"grad_norm": 5.4890031814575195,
"learning_rate": 4.352112227403589e-05,
"loss": 1.8961,
"step": 1580
},
{
"epoch": 0.05705673377112714,
"grad_norm": 5.995747089385986,
"learning_rate": 4.337749154444254e-05,
"loss": 1.8865,
"step": 1590
},
{
"epoch": 0.05741558115333548,
"grad_norm": 6.454551696777344,
"learning_rate": 4.3233208153359665e-05,
"loss": 2.0315,
"step": 1600
},
{
"epoch": 0.057774428535543834,
"grad_norm": 8.371834754943848,
"learning_rate": 4.308827803171238e-05,
"loss": 2.1006,
"step": 1610
},
{
"epoch": 0.05813327591775218,
"grad_norm": 5.792433261871338,
"learning_rate": 4.294270713701031e-05,
"loss": 2.0556,
"step": 1620
},
{
"epoch": 0.05849212329996053,
"grad_norm": 5.559154033660889,
"learning_rate": 4.2796501453102784e-05,
"loss": 2.2598,
"step": 1630
},
{
"epoch": 0.05885097068216887,
"grad_norm": 7.165154933929443,
"learning_rate": 4.264966698993282e-05,
"loss": 1.8554,
"step": 1640
},
{
"epoch": 0.05920981806437722,
"grad_norm": 6.646023750305176,
"learning_rate": 4.2502209783290085e-05,
"loss": 2.0373,
"step": 1650
},
{
"epoch": 0.05920981806437722,
"eval_loss": 2.1128716468811035,
"eval_runtime": 54.1565,
"eval_samples_per_second": 9.233,
"eval_steps_per_second": 9.233,
"step": 1650
},
{
"epoch": 0.05956866544658557,
"grad_norm": 6.937278747558594,
"learning_rate": 4.235413589456281e-05,
"loss": 2.0333,
"step": 1660
},
{
"epoch": 0.05992751282879391,
"grad_norm": 6.656656742095947,
"learning_rate": 4.2205451410488565e-05,
"loss": 1.9593,
"step": 1670
},
{
"epoch": 0.06028636021100226,
"grad_norm": 5.974368095397949,
"learning_rate": 4.205616244290416e-05,
"loss": 2.009,
"step": 1680
},
{
"epoch": 0.060645207593210605,
"grad_norm": 5.182796478271484,
"learning_rate": 4.1906275128494296e-05,
"loss": 2.003,
"step": 1690
},
{
"epoch": 0.061004054975418956,
"grad_norm": 5.904313564300537,
"learning_rate": 4.175579562853945e-05,
"loss": 2.2,
"step": 1700
},
{
"epoch": 0.0613629023576273,
"grad_norm": 6.160686492919922,
"learning_rate": 4.160473012866242e-05,
"loss": 2.061,
"step": 1710
},
{
"epoch": 0.06172174973983565,
"grad_norm": 7.759803771972656,
"learning_rate": 4.145308483857426e-05,
"loss": 2.084,
"step": 1720
},
{
"epoch": 0.062080597122043994,
"grad_norm": 5.889579772949219,
"learning_rate": 4.1300865991818885e-05,
"loss": 2.1036,
"step": 1730
},
{
"epoch": 0.062439444504252345,
"grad_norm": 6.886099815368652,
"learning_rate": 4.114807984551688e-05,
"loss": 1.9959,
"step": 1740
},
{
"epoch": 0.06279829188646069,
"grad_norm": 6.403357982635498,
"learning_rate": 4.0994732680108296e-05,
"loss": 2.2174,
"step": 1750
},
{
"epoch": 0.06315713926866903,
"grad_norm": 5.627391338348389,
"learning_rate": 4.084083079909448e-05,
"loss": 1.9977,
"step": 1760
},
{
"epoch": 0.06351598665087738,
"grad_norm": 6.172947883605957,
"learning_rate": 4.068638052877899e-05,
"loss": 2.031,
"step": 1770
},
{
"epoch": 0.06387483403308573,
"grad_norm": 6.240361213684082,
"learning_rate": 4.0531388218007466e-05,
"loss": 1.892,
"step": 1780
},
{
"epoch": 0.06423368141529408,
"grad_norm": 6.1612467765808105,
"learning_rate": 4.037586023790676e-05,
"loss": 2.0618,
"step": 1790
},
{
"epoch": 0.06459252879750242,
"grad_norm": 5.873776435852051,
"learning_rate": 4.0219802981622975e-05,
"loss": 2.0004,
"step": 1800
},
{
"epoch": 0.06459252879750242,
"eval_loss": 2.101851463317871,
"eval_runtime": 55.5537,
"eval_samples_per_second": 9.0,
"eval_steps_per_second": 9.0,
"step": 1800
},
{
"epoch": 0.06495137617971077,
"grad_norm": 7.73934268951416,
"learning_rate": 4.006322286405867e-05,
"loss": 1.9404,
"step": 1810
},
{
"epoch": 0.06531022356191911,
"grad_norm": 7.713481903076172,
"learning_rate": 3.99061263216092e-05,
"loss": 2.1536,
"step": 1820
},
{
"epoch": 0.06566907094412747,
"grad_norm": 5.538480758666992,
"learning_rate": 3.974851981189813e-05,
"loss": 2.1028,
"step": 1830
},
{
"epoch": 0.06602791832633581,
"grad_norm": 4.73122501373291,
"learning_rate": 3.9590409813511765e-05,
"loss": 2.1097,
"step": 1840
},
{
"epoch": 0.06638676570854415,
"grad_norm": 6.530364036560059,
"learning_rate": 3.943180282573285e-05,
"loss": 1.9667,
"step": 1850
},
{
"epoch": 0.0667456130907525,
"grad_norm": 6.589163303375244,
"learning_rate": 3.927270536827346e-05,
"loss": 2.0643,
"step": 1860
},
{
"epoch": 0.06710446047296086,
"grad_norm": 7.016654968261719,
"learning_rate": 3.91131239810069e-05,
"loss": 1.8271,
"step": 1870
},
{
"epoch": 0.0674633078551692,
"grad_norm": 6.05830717086792,
"learning_rate": 3.895306522369898e-05,
"loss": 2.0217,
"step": 1880
},
{
"epoch": 0.06782215523737754,
"grad_norm": 7.59095573425293,
"learning_rate": 3.87925356757383e-05,
"loss": 2.1178,
"step": 1890
},
{
"epoch": 0.06818100261958589,
"grad_norm": 6.172999382019043,
"learning_rate": 3.863154193586583e-05,
"loss": 2.0382,
"step": 1900
},
{
"epoch": 0.06853985000179423,
"grad_norm": 5.4832763671875,
"learning_rate": 3.847009062190365e-05,
"loss": 2.0855,
"step": 1910
},
{
"epoch": 0.06889869738400259,
"grad_norm": 7.419639587402344,
"learning_rate": 3.83081883704829e-05,
"loss": 2.0993,
"step": 1920
},
{
"epoch": 0.06925754476621093,
"grad_norm": 6.414045333862305,
"learning_rate": 3.814584183677102e-05,
"loss": 1.7863,
"step": 1930
},
{
"epoch": 0.06961639214841928,
"grad_norm": 6.358382701873779,
"learning_rate": 3.7983057694198145e-05,
"loss": 1.9655,
"step": 1940
},
{
"epoch": 0.06997523953062762,
"grad_norm": 6.980108261108398,
"learning_rate": 3.781984263418279e-05,
"loss": 1.8557,
"step": 1950
},
{
"epoch": 0.06997523953062762,
"eval_loss": 2.0883588790893555,
"eval_runtime": 56.8739,
"eval_samples_per_second": 8.791,
"eval_steps_per_second": 8.791,
"step": 1950
},
{
"epoch": 0.07033408691283598,
"grad_norm": 6.167099952697754,
"learning_rate": 3.76562033658568e-05,
"loss": 2.1926,
"step": 1960
},
{
"epoch": 0.07069293429504432,
"grad_norm": 5.376810550689697,
"learning_rate": 3.749214661578957e-05,
"loss": 2.1606,
"step": 1970
},
{
"epoch": 0.07105178167725267,
"grad_norm": 6.163499355316162,
"learning_rate": 3.732767912771153e-05,
"loss": 2.2241,
"step": 1980
},
{
"epoch": 0.07141062905946101,
"grad_norm": 5.99036979675293,
"learning_rate": 3.716280766223693e-05,
"loss": 2.1552,
"step": 1990
},
{
"epoch": 0.07176947644166935,
"grad_norm": 6.162662982940674,
"learning_rate": 3.699753899658596e-05,
"loss": 1.9543,
"step": 2000
},
{
"epoch": 0.07212832382387771,
"grad_norm": 6.522611618041992,
"learning_rate": 3.683187992430616e-05,
"loss": 2.0151,
"step": 2010
},
{
"epoch": 0.07248717120608605,
"grad_norm": 7.548040390014648,
"learning_rate": 3.666583725499315e-05,
"loss": 1.9932,
"step": 2020
},
{
"epoch": 0.0728460185882944,
"grad_norm": 5.687351226806641,
"learning_rate": 3.6499417814010715e-05,
"loss": 2.0136,
"step": 2030
},
{
"epoch": 0.07320486597050274,
"grad_norm": 5.9614715576171875,
"learning_rate": 3.6332628442210255e-05,
"loss": 2.127,
"step": 2040
},
{
"epoch": 0.07356371335271109,
"grad_norm": 5.873690128326416,
"learning_rate": 3.616547599564958e-05,
"loss": 1.9534,
"step": 2050
},
{
"epoch": 0.07392256073491944,
"grad_norm": 4.788761615753174,
"learning_rate": 3.599796734531105e-05,
"loss": 2.145,
"step": 2060
},
{
"epoch": 0.07428140811712779,
"grad_norm": 6.241523742675781,
"learning_rate": 3.5830109376819235e-05,
"loss": 2.1061,
"step": 2070
},
{
"epoch": 0.07464025549933613,
"grad_norm": 7.868603229522705,
"learning_rate": 3.566190899015774e-05,
"loss": 2.0651,
"step": 2080
},
{
"epoch": 0.07499910288154447,
"grad_norm": 6.114261627197266,
"learning_rate": 3.5493373099385677e-05,
"loss": 1.905,
"step": 2090
},
{
"epoch": 0.07535795026375283,
"grad_norm": 6.0507588386535645,
"learning_rate": 3.5324508632353394e-05,
"loss": 1.9759,
"step": 2100
},
{
"epoch": 0.07535795026375283,
"eval_loss": 2.0796425342559814,
"eval_runtime": 55.915,
"eval_samples_per_second": 8.942,
"eval_steps_per_second": 8.942,
"step": 2100
},
{
"epoch": 0.07571679764596118,
"grad_norm": 6.28771448135376,
"learning_rate": 3.515532253041774e-05,
"loss": 1.9569,
"step": 2110
},
{
"epoch": 0.07607564502816952,
"grad_norm": 5.322301864624023,
"learning_rate": 3.498582174815671e-05,
"loss": 2.061,
"step": 2120
},
{
"epoch": 0.07643449241037786,
"grad_norm": 5.690662384033203,
"learning_rate": 3.481601325308357e-05,
"loss": 1.7273,
"step": 2130
},
{
"epoch": 0.07679333979258621,
"grad_norm": 6.411470890045166,
"learning_rate": 3.4645904025360455e-05,
"loss": 1.8976,
"step": 2140
},
{
"epoch": 0.07715218717479456,
"grad_norm": 7.68689489364624,
"learning_rate": 3.447550105751145e-05,
"loss": 2.3546,
"step": 2150
},
{
"epoch": 0.07751103455700291,
"grad_norm": 6.074337005615234,
"learning_rate": 3.4304811354135145e-05,
"loss": 2.1717,
"step": 2160
},
{
"epoch": 0.07786988193921125,
"grad_norm": 6.254448413848877,
"learning_rate": 3.4133841931616696e-05,
"loss": 2.1495,
"step": 2170
},
{
"epoch": 0.0782287293214196,
"grad_norm": 6.312126636505127,
"learning_rate": 3.396259981783942e-05,
"loss": 1.8669,
"step": 2180
},
{
"epoch": 0.07858757670362795,
"grad_norm": 6.91135311126709,
"learning_rate": 3.37910920518959e-05,
"loss": 1.9932,
"step": 2190
},
{
"epoch": 0.0789464240858363,
"grad_norm": 5.6363935470581055,
"learning_rate": 3.3619325683798646e-05,
"loss": 2.0576,
"step": 2200
},
{
"epoch": 0.07930527146804464,
"grad_norm": 5.843607425689697,
"learning_rate": 3.3447307774190296e-05,
"loss": 1.8834,
"step": 2210
},
{
"epoch": 0.07966411885025299,
"grad_norm": 5.396796226501465,
"learning_rate": 3.327504539405335e-05,
"loss": 2.0703,
"step": 2220
},
{
"epoch": 0.08002296623246133,
"grad_norm": 5.016238212585449,
"learning_rate": 3.3102545624419583e-05,
"loss": 2.0865,
"step": 2230
},
{
"epoch": 0.08038181361466969,
"grad_norm": 6.115816116333008,
"learning_rate": 3.292981555607884e-05,
"loss": 2.2753,
"step": 2240
},
{
"epoch": 0.08074066099687803,
"grad_norm": 8.006583213806152,
"learning_rate": 3.2756862289287746e-05,
"loss": 2.0162,
"step": 2250
},
{
"epoch": 0.08074066099687803,
"eval_loss": 2.0666754245758057,
"eval_runtime": 55.4275,
"eval_samples_per_second": 9.021,
"eval_steps_per_second": 9.021,
"step": 2250
},
{
"epoch": 0.08109950837908637,
"grad_norm": 7.408049583435059,
"learning_rate": 3.258369293347764e-05,
"loss": 1.822,
"step": 2260
},
{
"epoch": 0.08145835576129472,
"grad_norm": 6.273202896118164,
"learning_rate": 3.241031460696251e-05,
"loss": 2.0655,
"step": 2270
},
{
"epoch": 0.08181720314350306,
"grad_norm": 6.0729756355285645,
"learning_rate": 3.223673443664627e-05,
"loss": 1.8968,
"step": 2280
},
{
"epoch": 0.08217605052571142,
"grad_norm": 5.854898452758789,
"learning_rate": 3.206295955772987e-05,
"loss": 2.0447,
"step": 2290
},
{
"epoch": 0.08253489790791976,
"grad_norm": 6.057408809661865,
"learning_rate": 3.188899711341793e-05,
"loss": 1.9488,
"step": 2300
},
{
"epoch": 0.0828937452901281,
"grad_norm": 7.64241886138916,
"learning_rate": 3.171485425462518e-05,
"loss": 2.0667,
"step": 2310
},
{
"epoch": 0.08325259267233645,
"grad_norm": 5.485935688018799,
"learning_rate": 3.15405381396825e-05,
"loss": 1.9668,
"step": 2320
},
{
"epoch": 0.08361144005454481,
"grad_norm": 6.850480556488037,
"learning_rate": 3.136605593404258e-05,
"loss": 2.0711,
"step": 2330
},
{
"epoch": 0.08397028743675315,
"grad_norm": 5.64699649810791,
"learning_rate": 3.119141480998553e-05,
"loss": 2.0573,
"step": 2340
},
{
"epoch": 0.0843291348189615,
"grad_norm": 7.39286994934082,
"learning_rate": 3.101662194632392e-05,
"loss": 1.8136,
"step": 2350
},
{
"epoch": 0.08468798220116984,
"grad_norm": 6.044868469238281,
"learning_rate": 3.0841684528107766e-05,
"loss": 1.6417,
"step": 2360
},
{
"epoch": 0.08504682958337818,
"grad_norm": 5.207494258880615,
"learning_rate": 3.066660974632914e-05,
"loss": 1.8696,
"step": 2370
},
{
"epoch": 0.08540567696558654,
"grad_norm": 4.362065315246582,
"learning_rate": 3.0491404797626605e-05,
"loss": 1.8506,
"step": 2380
},
{
"epoch": 0.08576452434779488,
"grad_norm": 7.511015892028809,
"learning_rate": 3.031607688398936e-05,
"loss": 2.084,
"step": 2390
},
{
"epoch": 0.08612337173000323,
"grad_norm": 6.082335472106934,
"learning_rate": 3.0140633212461248e-05,
"loss": 1.8544,
"step": 2400
},
{
"epoch": 0.08612337173000323,
"eval_loss": 2.056474208831787,
"eval_runtime": 55.8391,
"eval_samples_per_second": 8.954,
"eval_steps_per_second": 8.954,
"step": 2400
},
{
"epoch": 0.08648221911221157,
"grad_norm": 6.311079502105713,
"learning_rate": 2.9965080994844422e-05,
"loss": 2.0455,
"step": 2410
},
{
"epoch": 0.08684106649441993,
"grad_norm": 5.970705986022949,
"learning_rate": 2.978942744740296e-05,
"loss": 1.9218,
"step": 2420
},
{
"epoch": 0.08719991387662827,
"grad_norm": 6.782005786895752,
"learning_rate": 2.961367979056621e-05,
"loss": 2.0712,
"step": 2430
},
{
"epoch": 0.08755876125883662,
"grad_norm": 6.76627254486084,
"learning_rate": 2.9437845248631984e-05,
"loss": 1.984,
"step": 2440
},
{
"epoch": 0.08791760864104496,
"grad_norm": 6.093632698059082,
"learning_rate": 2.926193104946961e-05,
"loss": 2.0155,
"step": 2450
},
{
"epoch": 0.0882764560232533,
"grad_norm": 4.814499378204346,
"learning_rate": 2.90859444242228e-05,
"loss": 2.0989,
"step": 2460
},
{
"epoch": 0.08863530340546166,
"grad_norm": 7.10281229019165,
"learning_rate": 2.8909892607012427e-05,
"loss": 1.9328,
"step": 2470
},
{
"epoch": 0.08899415078767,
"grad_norm": 8.456049919128418,
"learning_rate": 2.8733782834639165e-05,
"loss": 1.8714,
"step": 2480
},
{
"epoch": 0.08935299816987835,
"grad_norm": 5.278759479522705,
"learning_rate": 2.8557622346285957e-05,
"loss": 1.9494,
"step": 2490
},
{
"epoch": 0.0897118455520867,
"grad_norm": 5.60235071182251,
"learning_rate": 2.8381418383220526e-05,
"loss": 2.1887,
"step": 2500
},
{
"epoch": 0.09007069293429504,
"grad_norm": 5.99770975112915,
"learning_rate": 2.8205178188497627e-05,
"loss": 2.0496,
"step": 2510
},
{
"epoch": 0.0904295403165034,
"grad_norm": 6.4976396560668945,
"learning_rate": 2.8028909006661396e-05,
"loss": 2.0247,
"step": 2520
},
{
"epoch": 0.09078838769871174,
"grad_norm": 6.767058849334717,
"learning_rate": 2.78526180834475e-05,
"loss": 2.051,
"step": 2530
},
{
"epoch": 0.09114723508092008,
"grad_norm": 7.581692218780518,
"learning_rate": 2.7676312665485307e-05,
"loss": 2.0146,
"step": 2540
},
{
"epoch": 0.09150608246312843,
"grad_norm": 5.49759578704834,
"learning_rate": 2.75e-05,
"loss": 2.164,
"step": 2550
},
{
"epoch": 0.09150608246312843,
"eval_loss": 2.0493671894073486,
"eval_runtime": 55.2033,
"eval_samples_per_second": 9.057,
"eval_steps_per_second": 9.057,
"step": 2550
},
{
"epoch": 0.09186492984533678,
"grad_norm": 6.024051189422607,
"learning_rate": 2.7323687334514695e-05,
"loss": 2.0757,
"step": 2560
},
{
"epoch": 0.09222377722754513,
"grad_norm": 6.39860725402832,
"learning_rate": 2.71473819165525e-05,
"loss": 1.979,
"step": 2570
},
{
"epoch": 0.09258262460975347,
"grad_norm": 6.058781147003174,
"learning_rate": 2.6971090993338606e-05,
"loss": 2.2082,
"step": 2580
},
{
"epoch": 0.09294147199196182,
"grad_norm": 5.549355506896973,
"learning_rate": 2.679482181150238e-05,
"loss": 2.1154,
"step": 2590
},
{
"epoch": 0.09330031937417016,
"grad_norm": 5.480435371398926,
"learning_rate": 2.6618581616779483e-05,
"loss": 2.0381,
"step": 2600
},
{
"epoch": 0.09365916675637852,
"grad_norm": 4.98717737197876,
"learning_rate": 2.644237765371404e-05,
"loss": 1.9317,
"step": 2610
},
{
"epoch": 0.09401801413858686,
"grad_norm": 7.349699020385742,
"learning_rate": 2.626621716536085e-05,
"loss": 2.135,
"step": 2620
},
{
"epoch": 0.0943768615207952,
"grad_norm": 6.156417369842529,
"learning_rate": 2.6090107392987575e-05,
"loss": 1.8111,
"step": 2630
},
{
"epoch": 0.09473570890300355,
"grad_norm": 5.92335844039917,
"learning_rate": 2.591405557577721e-05,
"loss": 2.0825,
"step": 2640
},
{
"epoch": 0.09509455628521189,
"grad_norm": 6.328557014465332,
"learning_rate": 2.5738068950530398e-05,
"loss": 2.0139,
"step": 2650
},
{
"epoch": 0.09545340366742025,
"grad_norm": 5.462120056152344,
"learning_rate": 2.5562154751368014e-05,
"loss": 2.0133,
"step": 2660
},
{
"epoch": 0.0958122510496286,
"grad_norm": 5.9423933029174805,
"learning_rate": 2.5386320209433798e-05,
"loss": 2.0737,
"step": 2670
},
{
"epoch": 0.09617109843183694,
"grad_norm": 5.581554889678955,
"learning_rate": 2.5210572552597046e-05,
"loss": 2.1384,
"step": 2680
},
{
"epoch": 0.09652994581404528,
"grad_norm": 5.9095048904418945,
"learning_rate": 2.5034919005155583e-05,
"loss": 1.6144,
"step": 2690
},
{
"epoch": 0.09688879319625364,
"grad_norm": 4.735531330108643,
"learning_rate": 2.4859366787538754e-05,
"loss": 1.8778,
"step": 2700
},
{
"epoch": 0.09688879319625364,
"eval_loss": 2.0385708808898926,
"eval_runtime": 55.0586,
"eval_samples_per_second": 9.081,
"eval_steps_per_second": 9.081,
"step": 2700
},
{
"epoch": 0.09724764057846198,
"grad_norm": 8.026204109191895,
"learning_rate": 2.468392311601064e-05,
"loss": 1.7923,
"step": 2710
},
{
"epoch": 0.09760648796067033,
"grad_norm": 4.735470771789551,
"learning_rate": 2.4508595202373404e-05,
"loss": 1.9045,
"step": 2720
},
{
"epoch": 0.09796533534287867,
"grad_norm": 5.316372871398926,
"learning_rate": 2.433339025367087e-05,
"loss": 1.8494,
"step": 2730
},
{
"epoch": 0.09832418272508701,
"grad_norm": 6.198282241821289,
"learning_rate": 2.415831547189224e-05,
"loss": 1.8475,
"step": 2740
},
{
"epoch": 0.09868303010729537,
"grad_norm": 6.189024448394775,
"learning_rate": 2.3983378053676083e-05,
"loss": 1.8453,
"step": 2750
},
{
"epoch": 0.09904187748950372,
"grad_norm": 6.285246849060059,
"learning_rate": 2.3808585190014484e-05,
"loss": 2.0555,
"step": 2760
},
{
"epoch": 0.09940072487171206,
"grad_norm": 5.848416328430176,
"learning_rate": 2.3633944065957427e-05,
"loss": 1.8966,
"step": 2770
},
{
"epoch": 0.0997595722539204,
"grad_norm": 4.883105278015137,
"learning_rate": 2.345946186031751e-05,
"loss": 1.9917,
"step": 2780
},
{
"epoch": 0.10011841963612876,
"grad_norm": 7.421256065368652,
"learning_rate": 2.328514574537481e-05,
"loss": 2.087,
"step": 2790
},
{
"epoch": 0.1004772670183371,
"grad_norm": 7.284880638122559,
"learning_rate": 2.311100288658208e-05,
"loss": 2.2515,
"step": 2800
},
{
"epoch": 0.10083611440054545,
"grad_norm": 6.9479241371154785,
"learning_rate": 2.2937040442270142e-05,
"loss": 2.1164,
"step": 2810
},
{
"epoch": 0.10119496178275379,
"grad_norm": 5.4754838943481445,
"learning_rate": 2.2763265563353733e-05,
"loss": 2.0222,
"step": 2820
},
{
"epoch": 0.10155380916496214,
"grad_norm": 5.791376113891602,
"learning_rate": 2.2589685393037495e-05,
"loss": 1.8716,
"step": 2830
},
{
"epoch": 0.1019126565471705,
"grad_norm": 6.261075496673584,
"learning_rate": 2.241630706652236e-05,
"loss": 1.8274,
"step": 2840
},
{
"epoch": 0.10227150392937884,
"grad_norm": 6.414833068847656,
"learning_rate": 2.2243137710712266e-05,
"loss": 2.1204,
"step": 2850
},
{
"epoch": 0.10227150392937884,
"eval_loss": 2.02933406829834,
"eval_runtime": 56.1943,
"eval_samples_per_second": 8.898,
"eval_steps_per_second": 8.898,
"step": 2850
},
{
"epoch": 0.10263035131158718,
"grad_norm": 4.790386199951172,
"learning_rate": 2.2070184443921156e-05,
"loss": 1.7387,
"step": 2860
},
{
"epoch": 0.10298919869379553,
"grad_norm": 5.4755964279174805,
"learning_rate": 2.1897454375580425e-05,
"loss": 2.0086,
"step": 2870
},
{
"epoch": 0.10334804607600387,
"grad_norm": 7.731913089752197,
"learning_rate": 2.1724954605946642e-05,
"loss": 1.927,
"step": 2880
},
{
"epoch": 0.10370689345821223,
"grad_norm": 5.966332912445068,
"learning_rate": 2.1552692225809706e-05,
"loss": 1.8911,
"step": 2890
},
{
"epoch": 0.10406574084042057,
"grad_norm": 5.681090354919434,
"learning_rate": 2.1380674316201356e-05,
"loss": 2.1041,
"step": 2900
},
{
"epoch": 0.10442458822262891,
"grad_norm": 5.348470687866211,
"learning_rate": 2.1208907948104105e-05,
"loss": 2.0891,
"step": 2910
},
{
"epoch": 0.10478343560483726,
"grad_norm": 5.933499336242676,
"learning_rate": 2.1037400182160584e-05,
"loss": 2.1604,
"step": 2920
},
{
"epoch": 0.10514228298704562,
"grad_norm": 6.975137233734131,
"learning_rate": 2.0866158068383306e-05,
"loss": 1.9219,
"step": 2930
},
{
"epoch": 0.10550113036925396,
"grad_norm": 5.7571702003479,
"learning_rate": 2.069518864586486e-05,
"loss": 2.1009,
"step": 2940
},
{
"epoch": 0.1058599777514623,
"grad_norm": 6.262353897094727,
"learning_rate": 2.052449894248855e-05,
"loss": 1.9981,
"step": 2950
},
{
"epoch": 0.10621882513367065,
"grad_norm": 6.145860195159912,
"learning_rate": 2.035409597463955e-05,
"loss": 1.9854,
"step": 2960
},
{
"epoch": 0.10657767251587899,
"grad_norm": 8.0071382522583,
"learning_rate": 2.0183986746916438e-05,
"loss": 2.0659,
"step": 2970
},
{
"epoch": 0.10693651989808735,
"grad_norm": 6.31511926651001,
"learning_rate": 2.0014178251843294e-05,
"loss": 2.0654,
"step": 2980
},
{
"epoch": 0.10729536728029569,
"grad_norm": 4.879230976104736,
"learning_rate": 1.9844677469582266e-05,
"loss": 2.2041,
"step": 2990
},
{
"epoch": 0.10765421466250404,
"grad_norm": 7.417413234710693,
"learning_rate": 1.967549136764661e-05,
"loss": 1.7006,
"step": 3000
},
{
"epoch": 0.10765421466250404,
"eval_loss": 2.021873712539673,
"eval_runtime": 56.6447,
"eval_samples_per_second": 8.827,
"eval_steps_per_second": 8.827,
"step": 3000
},
{
"epoch": 0.10801306204471238,
"grad_norm": 6.9786696434021,
"learning_rate": 1.950662690061433e-05,
"loss": 1.8899,
"step": 3010
},
{
"epoch": 0.10837190942692074,
"grad_norm": 6.570639133453369,
"learning_rate": 1.9338091009842258e-05,
"loss": 1.9747,
"step": 3020
},
{
"epoch": 0.10873075680912908,
"grad_norm": 6.848568439483643,
"learning_rate": 1.916989062318077e-05,
"loss": 1.8844,
"step": 3030
},
{
"epoch": 0.10908960419133742,
"grad_norm": 5.3527631759643555,
"learning_rate": 1.900203265468895e-05,
"loss": 2.0005,
"step": 3040
},
{
"epoch": 0.10944845157354577,
"grad_norm": 6.555069446563721,
"learning_rate": 1.8834524004350432e-05,
"loss": 2.0391,
"step": 3050
},
{
"epoch": 0.10980729895575411,
"grad_norm": 4.19492244720459,
"learning_rate": 1.8667371557789747e-05,
"loss": 2.0223,
"step": 3060
},
{
"epoch": 0.11016614633796247,
"grad_norm": 7.25844144821167,
"learning_rate": 1.8500582185989287e-05,
"loss": 1.7081,
"step": 3070
},
{
"epoch": 0.11052499372017081,
"grad_norm": 5.693355560302734,
"learning_rate": 1.8334162745006857e-05,
"loss": 1.8408,
"step": 3080
},
{
"epoch": 0.11088384110237916,
"grad_norm": 5.997495174407959,
"learning_rate": 1.8168120075693843e-05,
"loss": 1.9224,
"step": 3090
},
{
"epoch": 0.1112426884845875,
"grad_norm": 5.106387615203857,
"learning_rate": 1.8002461003414043e-05,
"loss": 2.3499,
"step": 3100
},
{
"epoch": 0.11160153586679585,
"grad_norm": 6.284130573272705,
"learning_rate": 1.7837192337763072e-05,
"loss": 2.0203,
"step": 3110
},
{
"epoch": 0.1119603832490042,
"grad_norm": 6.099617004394531,
"learning_rate": 1.7672320872288483e-05,
"loss": 2.0099,
"step": 3120
},
{
"epoch": 0.11231923063121255,
"grad_norm": 6.717756748199463,
"learning_rate": 1.750785338421044e-05,
"loss": 2.1629,
"step": 3130
},
{
"epoch": 0.11267807801342089,
"grad_norm": 7.799689769744873,
"learning_rate": 1.7343796634143204e-05,
"loss": 2.1267,
"step": 3140
},
{
"epoch": 0.11303692539562923,
"grad_norm": 7.066042900085449,
"learning_rate": 1.7180157365817214e-05,
"loss": 2.1079,
"step": 3150
},
{
"epoch": 0.11303692539562923,
"eval_loss": 2.0140340328216553,
"eval_runtime": 54.1066,
"eval_samples_per_second": 9.241,
"eval_steps_per_second": 9.241,
"step": 3150
},
{
"epoch": 0.11339577277783759,
"grad_norm": 5.944588661193848,
"learning_rate": 1.7016942305801853e-05,
"loss": 2.0258,
"step": 3160
},
{
"epoch": 0.11375462016004594,
"grad_norm": 6.430848598480225,
"learning_rate": 1.6854158163228982e-05,
"loss": 2.0832,
"step": 3170
},
{
"epoch": 0.11411346754225428,
"grad_norm": 5.41016149520874,
"learning_rate": 1.6691811629517104e-05,
"loss": 1.7936,
"step": 3180
},
{
"epoch": 0.11447231492446262,
"grad_norm": 5.203214168548584,
"learning_rate": 1.6529909378096355e-05,
"loss": 2.0471,
"step": 3190
},
{
"epoch": 0.11483116230667097,
"grad_norm": 7.248607635498047,
"learning_rate": 1.636845806413417e-05,
"loss": 2.0012,
"step": 3200
},
{
"epoch": 0.11519000968887932,
"grad_norm": 7.401021480560303,
"learning_rate": 1.6207464324261707e-05,
"loss": 2.0099,
"step": 3210
},
{
"epoch": 0.11554885707108767,
"grad_norm": 6.161318302154541,
"learning_rate": 1.6046934776301034e-05,
"loss": 1.9928,
"step": 3220
},
{
"epoch": 0.11590770445329601,
"grad_norm": 6.958371162414551,
"learning_rate": 1.588687601899311e-05,
"loss": 1.8316,
"step": 3230
},
{
"epoch": 0.11626655183550436,
"grad_norm": 6.18934440612793,
"learning_rate": 1.5727294631726555e-05,
"loss": 1.987,
"step": 3240
},
{
"epoch": 0.11662539921771271,
"grad_norm": 6.302839756011963,
"learning_rate": 1.5568197174267155e-05,
"loss": 1.9393,
"step": 3250
},
{
"epoch": 0.11698424659992106,
"grad_norm": 6.34058952331543,
"learning_rate": 1.5409590186488247e-05,
"loss": 1.7194,
"step": 3260
},
{
"epoch": 0.1173430939821294,
"grad_norm": 5.862635612487793,
"learning_rate": 1.5251480188101872e-05,
"loss": 1.7798,
"step": 3270
},
{
"epoch": 0.11770194136433774,
"grad_norm": 6.237195014953613,
"learning_rate": 1.5093873678390796e-05,
"loss": 2.0145,
"step": 3280
},
{
"epoch": 0.11806078874654609,
"grad_norm": 7.536279678344727,
"learning_rate": 1.4936777135941329e-05,
"loss": 1.927,
"step": 3290
},
{
"epoch": 0.11841963612875445,
"grad_norm": 4.670105457305908,
"learning_rate": 1.4780197018377037e-05,
"loss": 1.9629,
"step": 3300
},
{
"epoch": 0.11841963612875445,
"eval_loss": 2.0052058696746826,
"eval_runtime": 54.4162,
"eval_samples_per_second": 9.188,
"eval_steps_per_second": 9.188,
"step": 3300
},
{
"epoch": 0.11877848351096279,
"grad_norm": 5.815291881561279,
"learning_rate": 1.4624139762093247e-05,
"loss": 1.902,
"step": 3310
},
{
"epoch": 0.11913733089317113,
"grad_norm": 9.564926147460938,
"learning_rate": 1.4468611781992537e-05,
"loss": 1.7636,
"step": 3320
},
{
"epoch": 0.11949617827537948,
"grad_norm": 6.171423435211182,
"learning_rate": 1.4313619471221022e-05,
"loss": 2.1226,
"step": 3330
},
{
"epoch": 0.11985502565758782,
"grad_norm": 7.247731685638428,
"learning_rate": 1.4159169200905515e-05,
"loss": 1.9989,
"step": 3340
},
{
"epoch": 0.12021387303979618,
"grad_norm": 6.01906156539917,
"learning_rate": 1.4005267319891719e-05,
"loss": 1.8619,
"step": 3350
},
{
"epoch": 0.12057272042200452,
"grad_norm": 5.43905782699585,
"learning_rate": 1.3851920154483133e-05,
"loss": 2.022,
"step": 3360
},
{
"epoch": 0.12093156780421287,
"grad_norm": 5.442080020904541,
"learning_rate": 1.3699134008181126e-05,
"loss": 1.8867,
"step": 3370
},
{
"epoch": 0.12129041518642121,
"grad_norm": 6.583103179931641,
"learning_rate": 1.3546915161425745e-05,
"loss": 1.6851,
"step": 3380
},
{
"epoch": 0.12164926256862957,
"grad_norm": 5.341058731079102,
"learning_rate": 1.3395269871337586e-05,
"loss": 1.8617,
"step": 3390
},
{
"epoch": 0.12200810995083791,
"grad_norm": 6.2624993324279785,
"learning_rate": 1.3244204371460562e-05,
"loss": 1.8916,
"step": 3400
},
{
"epoch": 0.12236695733304626,
"grad_norm": 5.550163745880127,
"learning_rate": 1.3093724871505698e-05,
"loss": 1.8684,
"step": 3410
},
{
"epoch": 0.1227258047152546,
"grad_norm": 6.564154624938965,
"learning_rate": 1.2943837557095845e-05,
"loss": 2.0437,
"step": 3420
},
{
"epoch": 0.12308465209746294,
"grad_norm": 5.661564826965332,
"learning_rate": 1.2794548589511433e-05,
"loss": 1.8696,
"step": 3430
},
{
"epoch": 0.1234434994796713,
"grad_norm": 5.914830207824707,
"learning_rate": 1.2645864105437201e-05,
"loss": 1.8871,
"step": 3440
},
{
"epoch": 0.12380234686187964,
"grad_norm": 5.326396942138672,
"learning_rate": 1.2497790216709914e-05,
"loss": 1.8341,
"step": 3450
},
{
"epoch": 0.12380234686187964,
"eval_loss": 2.000833034515381,
"eval_runtime": 54.9976,
"eval_samples_per_second": 9.091,
"eval_steps_per_second": 9.091,
"step": 3450
},
{
"epoch": 0.12416119424408799,
"grad_norm": 6.134740829467773,
"learning_rate": 1.2350333010067184e-05,
"loss": 1.7077,
"step": 3460
},
{
"epoch": 0.12452004162629633,
"grad_norm": 5.5340576171875,
"learning_rate": 1.2203498546897221e-05,
"loss": 1.9318,
"step": 3470
},
{
"epoch": 0.12487888900850469,
"grad_norm": 6.9045305252075195,
"learning_rate": 1.2057292862989693e-05,
"loss": 2.0016,
"step": 3480
},
{
"epoch": 0.12523773639071303,
"grad_norm": 7.031048774719238,
"learning_rate": 1.1911721968287635e-05,
"loss": 1.875,
"step": 3490
},
{
"epoch": 0.12559658377292138,
"grad_norm": 5.962172508239746,
"learning_rate": 1.176679184664034e-05,
"loss": 1.9473,
"step": 3500
},
{
"epoch": 0.12595543115512972,
"grad_norm": 5.502182483673096,
"learning_rate": 1.1622508455557471e-05,
"loss": 1.9898,
"step": 3510
},
{
"epoch": 0.12631427853733806,
"grad_norm": 4.957609176635742,
"learning_rate": 1.1478877725964109e-05,
"loss": 1.9896,
"step": 3520
},
{
"epoch": 0.1266731259195464,
"grad_norm": 5.988749980926514,
"learning_rate": 1.1335905561956992e-05,
"loss": 1.8328,
"step": 3530
},
{
"epoch": 0.12703197330175475,
"grad_norm": 7.231197357177734,
"learning_rate": 1.1193597840561793e-05,
"loss": 1.8092,
"step": 3540
},
{
"epoch": 0.12739082068396312,
"grad_norm": 5.365965366363525,
"learning_rate": 1.1051960411491561e-05,
"loss": 1.6869,
"step": 3550
},
{
"epoch": 0.12774966806617147,
"grad_norm": 5.000406265258789,
"learning_rate": 1.0910999096906248e-05,
"loss": 1.7699,
"step": 3560
},
{
"epoch": 0.1281085154483798,
"grad_norm": 5.213188648223877,
"learning_rate": 1.0770719691173388e-05,
"loss": 2.0603,
"step": 3570
},
{
"epoch": 0.12846736283058816,
"grad_norm": 6.383294582366943,
"learning_rate": 1.0631127960629924e-05,
"loss": 1.9855,
"step": 3580
},
{
"epoch": 0.1288262102127965,
"grad_norm": 5.159536838531494,
"learning_rate": 1.0492229643345136e-05,
"loss": 1.9706,
"step": 3590
},
{
"epoch": 0.12918505759500484,
"grad_norm": 6.980235576629639,
"learning_rate": 1.0354030448884829e-05,
"loss": 2.0075,
"step": 3600
},
{
"epoch": 0.12918505759500484,
"eval_loss": 1.9919238090515137,
"eval_runtime": 56.2773,
"eval_samples_per_second": 8.885,
"eval_steps_per_second": 8.885,
"step": 3600
},
{
"epoch": 0.1295439049772132,
"grad_norm": 5.887603759765625,
"learning_rate": 1.02165360580766e-05,
"loss": 1.9446,
"step": 3610
},
{
"epoch": 0.12990275235942153,
"grad_norm": 5.93571662902832,
"learning_rate": 1.0079752122776338e-05,
"loss": 1.9175,
"step": 3620
},
{
"epoch": 0.13026159974162987,
"grad_norm": 5.173825740814209,
"learning_rate": 9.94368426563585e-06,
"loss": 1.7861,
"step": 3630
},
{
"epoch": 0.13062044712383822,
"grad_norm": 5.444214344024658,
"learning_rate": 9.80833807987182e-06,
"loss": 1.9604,
"step": 3640
},
{
"epoch": 0.1309792945060466,
"grad_norm": 6.3942718505859375,
"learning_rate": 9.673719129035826e-06,
"loss": 1.8913,
"step": 3650
},
{
"epoch": 0.13133814188825493,
"grad_norm": 6.097336769104004,
"learning_rate": 9.53983294678566e-06,
"loss": 1.9104,
"step": 3660
},
{
"epoch": 0.13169698927046328,
"grad_norm": 6.532045364379883,
"learning_rate": 9.406685036657904e-06,
"loss": 2.192,
"step": 3670
},
{
"epoch": 0.13205583665267162,
"grad_norm": 5.635133743286133,
"learning_rate": 9.27428087184162e-06,
"loss": 1.9662,
"step": 3680
},
{
"epoch": 0.13241468403487996,
"grad_norm": 5.020458698272705,
"learning_rate": 9.142625894953431e-06,
"loss": 1.8653,
"step": 3690
},
{
"epoch": 0.1327735314170883,
"grad_norm": 6.398831367492676,
"learning_rate": 9.011725517813786e-06,
"loss": 1.9715,
"step": 3700
},
{
"epoch": 0.13313237879929665,
"grad_norm": 5.7541890144348145,
"learning_rate": 8.881585121224496e-06,
"loss": 1.7682,
"step": 3710
},
{
"epoch": 0.133491226181505,
"grad_norm": 6.622452259063721,
"learning_rate": 8.752210054747517e-06,
"loss": 1.9944,
"step": 3720
},
{
"epoch": 0.13385007356371334,
"grad_norm": 5.588860034942627,
"learning_rate": 8.623605636485119e-06,
"loss": 2.1929,
"step": 3730
},
{
"epoch": 0.1342089209459217,
"grad_norm": 5.577978134155273,
"learning_rate": 8.495777152861222e-06,
"loss": 1.9228,
"step": 3740
},
{
"epoch": 0.13456776832813006,
"grad_norm": 5.694477081298828,
"learning_rate": 8.368729858404125e-06,
"loss": 1.9947,
"step": 3750
},
{
"epoch": 0.13456776832813006,
"eval_loss": 1.9859551191329956,
"eval_runtime": 55.4111,
"eval_samples_per_second": 9.023,
"eval_steps_per_second": 9.023,
"step": 3750
},
{
"epoch": 0.1349266157103384,
"grad_norm": 7.570437431335449,
"learning_rate": 8.242468975530497e-06,
"loss": 1.8791,
"step": 3760
},
{
"epoch": 0.13528546309254674,
"grad_norm": 7.051590442657471,
"learning_rate": 8.116999694330684e-06,
"loss": 1.9133,
"step": 3770
},
{
"epoch": 0.1356443104747551,
"grad_norm": 8.620963096618652,
"learning_rate": 7.99232717235541e-06,
"loss": 1.9702,
"step": 3780
},
{
"epoch": 0.13600315785696343,
"grad_norm": 5.9947686195373535,
"learning_rate": 7.86845653440376e-06,
"loss": 1.9478,
"step": 3790
},
{
"epoch": 0.13636200523917177,
"grad_norm": 7.256514072418213,
"learning_rate": 7.745392872312495e-06,
"loss": 1.9925,
"step": 3800
},
{
"epoch": 0.13672085262138012,
"grad_norm": 6.057373523712158,
"learning_rate": 7.623141244746736e-06,
"loss": 1.9152,
"step": 3810
},
{
"epoch": 0.13707970000358846,
"grad_norm": 6.098325729370117,
"learning_rate": 7.5017066769920735e-06,
"loss": 1.7821,
"step": 3820
},
{
"epoch": 0.13743854738579683,
"grad_norm": 6.742500305175781,
"learning_rate": 7.381094160747963e-06,
"loss": 2.0775,
"step": 3830
},
{
"epoch": 0.13779739476800518,
"grad_norm": 6.884401798248291,
"learning_rate": 7.261308653922539e-06,
"loss": 2.111,
"step": 3840
},
{
"epoch": 0.13815624215021352,
"grad_norm": 5.880162239074707,
"learning_rate": 7.1423550804288275e-06,
"loss": 1.9414,
"step": 3850
},
{
"epoch": 0.13851508953242186,
"grad_norm": 5.843265056610107,
"learning_rate": 7.024238329982311e-06,
"loss": 1.982,
"step": 3860
},
{
"epoch": 0.1388739369146302,
"grad_norm": 6.20538854598999,
"learning_rate": 6.906963257899975e-06,
"loss": 2.0604,
"step": 3870
},
{
"epoch": 0.13923278429683855,
"grad_norm": 5.832340240478516,
"learning_rate": 6.7905346849007014e-06,
"loss": 1.86,
"step": 3880
},
{
"epoch": 0.1395916316790469,
"grad_norm": 5.763581275939941,
"learning_rate": 6.674957396907109e-06,
"loss": 1.8716,
"step": 3890
},
{
"epoch": 0.13995047906125524,
"grad_norm": 5.205382823944092,
"learning_rate": 6.560236144848803e-06,
"loss": 1.9625,
"step": 3900
},
{
"epoch": 0.13995047906125524,
"eval_loss": 1.9828507900238037,
"eval_runtime": 54.4769,
"eval_samples_per_second": 9.178,
"eval_steps_per_second": 9.178,
"step": 3900
},
{
"epoch": 0.14030932644346358,
"grad_norm": 6.159921169281006,
"learning_rate": 6.4463756444671446e-06,
"loss": 2.0608,
"step": 3910
},
{
"epoch": 0.14066817382567195,
"grad_norm": 7.304522514343262,
"learning_rate": 6.333380576121334e-06,
"loss": 2.1806,
"step": 3920
},
{
"epoch": 0.1410270212078803,
"grad_norm": 6.259190559387207,
"learning_rate": 6.221255584596061e-06,
"loss": 1.9558,
"step": 3930
},
{
"epoch": 0.14138586859008864,
"grad_norm": 5.298434734344482,
"learning_rate": 6.110005278910572e-06,
"loss": 1.9614,
"step": 3940
},
{
"epoch": 0.141744715972297,
"grad_norm": 5.2134528160095215,
"learning_rate": 5.999634232129181e-06,
"loss": 1.7626,
"step": 3950
},
{
"epoch": 0.14210356335450533,
"grad_norm": 6.282635688781738,
"learning_rate": 5.890146981173336e-06,
"loss": 1.801,
"step": 3960
},
{
"epoch": 0.14246241073671367,
"grad_norm": 5.433168888092041,
"learning_rate": 5.781548026635087e-06,
"loss": 2.0156,
"step": 3970
},
{
"epoch": 0.14282125811892202,
"grad_norm": 7.175068378448486,
"learning_rate": 5.673841832592114e-06,
"loss": 2.1676,
"step": 3980
},
{
"epoch": 0.14318010550113036,
"grad_norm": 5.729061603546143,
"learning_rate": 5.56703282642418e-06,
"loss": 2.0456,
"step": 3990
},
{
"epoch": 0.1435389528833387,
"grad_norm": 5.6335883140563965,
"learning_rate": 5.461125398631196e-06,
"loss": 1.8919,
"step": 4000
},
{
"epoch": 0.14389780026554708,
"grad_norm": 6.064164161682129,
"learning_rate": 5.356123902652707e-06,
"loss": 1.9008,
"step": 4010
},
{
"epoch": 0.14425664764775542,
"grad_norm": 5.617476463317871,
"learning_rate": 5.252032654688949e-06,
"loss": 1.7857,
"step": 4020
},
{
"epoch": 0.14461549502996376,
"grad_norm": 5.734108924865723,
"learning_rate": 5.148855933523428e-06,
"loss": 1.8976,
"step": 4030
},
{
"epoch": 0.1449743424121721,
"grad_norm": 5.556881427764893,
"learning_rate": 5.046597980347035e-06,
"loss": 2.1003,
"step": 4040
},
{
"epoch": 0.14533318979438045,
"grad_norm": 6.115296840667725,
"learning_rate": 4.945262998583711e-06,
"loss": 1.9994,
"step": 4050
},
{
"epoch": 0.14533318979438045,
"eval_loss": 1.977493405342102,
"eval_runtime": 53.8847,
"eval_samples_per_second": 9.279,
"eval_steps_per_second": 9.279,
"step": 4050
},
{
"epoch": 0.1456920371765888,
"grad_norm": 5.492610454559326,
"learning_rate": 4.844855153717654e-06,
"loss": 1.785,
"step": 4060
},
{
"epoch": 0.14605088455879714,
"grad_norm": 6.202512264251709,
"learning_rate": 4.745378573122101e-06,
"loss": 1.9527,
"step": 4070
},
{
"epoch": 0.14640973194100548,
"grad_norm": 5.397292137145996,
"learning_rate": 4.646837345889642e-06,
"loss": 1.8806,
"step": 4080
},
{
"epoch": 0.14676857932321383,
"grad_norm": 4.79435396194458,
"learning_rate": 4.5492355226641775e-06,
"loss": 1.6986,
"step": 4090
},
{
"epoch": 0.14712742670542217,
"grad_norm": 5.705773830413818,
"learning_rate": 4.452577115474384e-06,
"loss": 1.9535,
"step": 4100
},
{
"epoch": 0.14748627408763054,
"grad_norm": 6.980686187744141,
"learning_rate": 4.3568660975687884e-06,
"loss": 2.0289,
"step": 4110
},
{
"epoch": 0.14784512146983889,
"grad_norm": 5.97441291809082,
"learning_rate": 4.262106403252474e-06,
"loss": 1.8012,
"step": 4120
},
{
"epoch": 0.14820396885204723,
"grad_norm": 6.008128643035889,
"learning_rate": 4.168301927725312e-06,
"loss": 1.7863,
"step": 4130
},
{
"epoch": 0.14856281623425557,
"grad_norm": 6.05222749710083,
"learning_rate": 4.075456526921887e-06,
"loss": 1.6858,
"step": 4140
},
{
"epoch": 0.14892166361646392,
"grad_norm": 4.7681803703308105,
"learning_rate": 3.983574017352983e-06,
"loss": 1.8194,
"step": 4150
},
{
"epoch": 0.14928051099867226,
"grad_norm": 6.2553887367248535,
"learning_rate": 3.8926581759486824e-06,
"loss": 1.9261,
"step": 4160
},
{
"epoch": 0.1496393583808806,
"grad_norm": 6.245451927185059,
"learning_rate": 3.8027127399031364e-06,
"loss": 2.0298,
"step": 4170
},
{
"epoch": 0.14999820576308895,
"grad_norm": 5.130878448486328,
"learning_rate": 3.7137414065209284e-06,
"loss": 2.0474,
"step": 4180
},
{
"epoch": 0.1503570531452973,
"grad_norm": 5.6020684242248535,
"learning_rate": 3.6257478330650916e-06,
"loss": 1.975,
"step": 4190
},
{
"epoch": 0.15071590052750566,
"grad_norm": 6.997092247009277,
"learning_rate": 3.5387356366067913e-06,
"loss": 1.8301,
"step": 4200
},
{
"epoch": 0.15071590052750566,
"eval_loss": 1.9740864038467407,
"eval_runtime": 54.8117,
"eval_samples_per_second": 9.122,
"eval_steps_per_second": 9.122,
"step": 4200
},
{
"epoch": 0.151074747909714,
"grad_norm": 5.829411029815674,
"learning_rate": 3.45270839387662e-06,
"loss": 1.8474,
"step": 4210
},
{
"epoch": 0.15143359529192235,
"grad_norm": 5.0621018409729,
"learning_rate": 3.3676696411175727e-06,
"loss": 1.8278,
"step": 4220
},
{
"epoch": 0.1517924426741307,
"grad_norm": 5.297969818115234,
"learning_rate": 3.283622873939705e-06,
"loss": 1.9351,
"step": 4230
},
{
"epoch": 0.15215129005633904,
"grad_norm": 7.4726738929748535,
"learning_rate": 3.2005715471764303e-06,
"loss": 2.0655,
"step": 4240
},
{
"epoch": 0.15251013743854738,
"grad_norm": 7.288102149963379,
"learning_rate": 3.118519074742497e-06,
"loss": 2.1868,
"step": 4250
},
{
"epoch": 0.15286898482075573,
"grad_norm": 6.620118618011475,
"learning_rate": 3.037468829493679e-06,
"loss": 2.0158,
"step": 4260
},
{
"epoch": 0.15322783220296407,
"grad_norm": 5.841970443725586,
"learning_rate": 2.9574241430880926e-06,
"loss": 1.891,
"step": 4270
},
{
"epoch": 0.15358667958517241,
"grad_norm": 6.563481330871582,
"learning_rate": 2.878388305849292e-06,
"loss": 2.0859,
"step": 4280
},
{
"epoch": 0.15394552696738079,
"grad_norm": 6.011387348175049,
"learning_rate": 2.8003645666309768e-06,
"loss": 1.9052,
"step": 4290
},
{
"epoch": 0.15430437434958913,
"grad_norm": 5.620529651641846,
"learning_rate": 2.7233561326834765e-06,
"loss": 1.8405,
"step": 4300
},
{
"epoch": 0.15466322173179747,
"grad_norm": 7.352434158325195,
"learning_rate": 2.647366169521881e-06,
"loss": 1.9094,
"step": 4310
},
{
"epoch": 0.15502206911400582,
"grad_norm": 6.008353233337402,
"learning_rate": 2.5723978007959507e-06,
"loss": 1.8437,
"step": 4320
},
{
"epoch": 0.15538091649621416,
"grad_norm": 6.1527419090271,
"learning_rate": 2.4984541081616895e-06,
"loss": 1.9673,
"step": 4330
},
{
"epoch": 0.1557397638784225,
"grad_norm": 5.206054210662842,
"learning_rate": 2.4255381311546833e-06,
"loss": 1.8418,
"step": 4340
},
{
"epoch": 0.15609861126063085,
"grad_norm": 5.852407455444336,
"learning_rate": 2.3536528670651595e-06,
"loss": 2.0213,
"step": 4350
},
{
"epoch": 0.15609861126063085,
"eval_loss": 1.9717787504196167,
"eval_runtime": 55.4488,
"eval_samples_per_second": 9.017,
"eval_steps_per_second": 9.017,
"step": 4350
},
{
"epoch": 0.1564574586428392,
"grad_norm": 6.1430439949035645,
"learning_rate": 2.2828012708147603e-06,
"loss": 1.7364,
"step": 4360
},
{
"epoch": 0.15681630602504754,
"grad_norm": 7.5634989738464355,
"learning_rate": 2.2129862548351094e-06,
"loss": 1.8513,
"step": 4370
},
{
"epoch": 0.1571751534072559,
"grad_norm": 4.393237590789795,
"learning_rate": 2.1442106889480615e-06,
"loss": 2.0098,
"step": 4380
},
{
"epoch": 0.15753400078946425,
"grad_norm": 7.880530834197998,
"learning_rate": 2.0764774002477615e-06,
"loss": 1.878,
"step": 4390
},
{
"epoch": 0.1578928481716726,
"grad_norm": 5.541848659515381,
"learning_rate": 2.009789172984405e-06,
"loss": 1.7038,
"step": 4400
},
{
"epoch": 0.15825169555388094,
"grad_norm": 6.225683212280273,
"learning_rate": 1.9441487484498223e-06,
"loss": 1.8292,
"step": 4410
},
{
"epoch": 0.15861054293608928,
"grad_norm": 5.8407464027404785,
"learning_rate": 1.8795588248647634e-06,
"loss": 2.0944,
"step": 4420
},
{
"epoch": 0.15896939031829763,
"grad_norm": 5.6365580558776855,
"learning_rate": 1.8160220572680145e-06,
"loss": 1.8074,
"step": 4430
},
{
"epoch": 0.15932823770050597,
"grad_norm": 5.789924621582031,
"learning_rate": 1.753541057407227e-06,
"loss": 1.9745,
"step": 4440
},
{
"epoch": 0.15968708508271431,
"grad_norm": 6.211676120758057,
"learning_rate": 1.692118393631588e-06,
"loss": 1.7479,
"step": 4450
},
{
"epoch": 0.16004593246492266,
"grad_norm": 5.150852680206299,
"learning_rate": 1.6317565907862317e-06,
"loss": 1.8298,
"step": 4460
},
{
"epoch": 0.160404779847131,
"grad_norm": 7.136476039886475,
"learning_rate": 1.5724581301084432e-06,
"loss": 1.946,
"step": 4470
},
{
"epoch": 0.16076362722933937,
"grad_norm": 5.5826311111450195,
"learning_rate": 1.5142254491256988e-06,
"loss": 1.9713,
"step": 4480
},
{
"epoch": 0.16112247461154772,
"grad_norm": 7.025698661804199,
"learning_rate": 1.4570609415554178e-06,
"loss": 1.9835,
"step": 4490
},
{
"epoch": 0.16148132199375606,
"grad_norm": 5.846466541290283,
"learning_rate": 1.4009669572066124e-06,
"loss": 2.0159,
"step": 4500
},
{
"epoch": 0.16148132199375606,
"eval_loss": 1.9697834253311157,
"eval_runtime": 53.8246,
"eval_samples_per_second": 9.289,
"eval_steps_per_second": 9.289,
"step": 4500
},
{
"epoch": 0.1618401693759644,
"grad_norm": 5.875072002410889,
"learning_rate": 1.345945801883278e-06,
"loss": 2.0738,
"step": 4510
},
{
"epoch": 0.16219901675817275,
"grad_norm": 7.465427398681641,
"learning_rate": 1.2919997372896026e-06,
"loss": 1.8473,
"step": 4520
},
{
"epoch": 0.1625578641403811,
"grad_norm": 6.052853584289551,
"learning_rate": 1.2391309809370159e-06,
"loss": 1.983,
"step": 4530
},
{
"epoch": 0.16291671152258944,
"grad_norm": 5.640964031219482,
"learning_rate": 1.18734170605301e-06,
"loss": 1.7979,
"step": 4540
},
{
"epoch": 0.16327555890479778,
"grad_norm": 6.585112571716309,
"learning_rate": 1.136634041491834e-06,
"loss": 1.7018,
"step": 4550
},
{
"epoch": 0.16363440628700612,
"grad_norm": 5.19002628326416,
"learning_rate": 1.0870100716469694e-06,
"loss": 1.9632,
"step": 4560
},
{
"epoch": 0.1639932536692145,
"grad_norm": 5.477431774139404,
"learning_rate": 1.0384718363654598e-06,
"loss": 1.8951,
"step": 4570
},
{
"epoch": 0.16435210105142284,
"grad_norm": 6.346002101898193,
"learning_rate": 9.910213308640359e-07,
"loss": 1.6237,
"step": 4580
},
{
"epoch": 0.16471094843363118,
"grad_norm": 6.293008327484131,
"learning_rate": 9.446605056471311e-07,
"loss": 1.9506,
"step": 4590
},
{
"epoch": 0.16506979581583953,
"grad_norm": 6.646719932556152,
"learning_rate": 8.993912664266901e-07,
"loss": 1.9005,
"step": 4600
},
{
"epoch": 0.16542864319804787,
"grad_norm": 6.372305393218994,
"learning_rate": 8.5521547404383e-07,
"loss": 1.9383,
"step": 4610
},
{
"epoch": 0.1657874905802562,
"grad_norm": 5.362992286682129,
"learning_rate": 8.121349443923473e-07,
"loss": 1.8118,
"step": 4620
},
{
"epoch": 0.16614633796246456,
"grad_norm": 5.154087066650391,
"learning_rate": 7.701514483440844e-07,
"loss": 2.0736,
"step": 4630
},
{
"epoch": 0.1665051853446729,
"grad_norm": 5.9843926429748535,
"learning_rate": 7.292667116761223e-07,
"loss": 2.1853,
"step": 4640
},
{
"epoch": 0.16686403272688125,
"grad_norm": 6.360889911651611,
"learning_rate": 6.894824149998505e-07,
"loss": 1.9762,
"step": 4650
},
{
"epoch": 0.16686403272688125,
"eval_loss": 1.9689069986343384,
"eval_runtime": 55.2702,
"eval_samples_per_second": 9.046,
"eval_steps_per_second": 9.046,
"step": 4650
},
{
"epoch": 0.16722288010908962,
"grad_norm": 5.460212707519531,
"learning_rate": 6.508001936918873e-07,
"loss": 1.8351,
"step": 4660
},
{
"epoch": 0.16758172749129796,
"grad_norm": 4.702284812927246,
"learning_rate": 6.132216378268379e-07,
"loss": 1.8153,
"step": 4670
},
{
"epoch": 0.1679405748735063,
"grad_norm": 5.906696796417236,
"learning_rate": 5.767482921119461e-07,
"loss": 1.9498,
"step": 4680
},
{
"epoch": 0.16829942225571465,
"grad_norm": 6.808102607727051,
"learning_rate": 5.413816558236007e-07,
"loss": 2.1874,
"step": 4690
},
{
"epoch": 0.168658269637923,
"grad_norm": 5.334954738616943,
"learning_rate": 5.071231827457004e-07,
"loss": 1.9131,
"step": 4700
},
{
"epoch": 0.16901711702013134,
"grad_norm": 6.644302845001221,
"learning_rate": 4.739742811098946e-07,
"loss": 1.8804,
"step": 4710
},
{
"epoch": 0.16937596440233968,
"grad_norm": 6.628371715545654,
"learning_rate": 4.4193631353768414e-07,
"loss": 2.2605,
"step": 4720
},
{
"epoch": 0.16973481178454802,
"grad_norm": 7.336843490600586,
"learning_rate": 4.1101059698443965e-07,
"loss": 1.8735,
"step": 4730
},
{
"epoch": 0.17009365916675637,
"grad_norm": 6.385683059692383,
"learning_rate": 3.8119840268523914e-07,
"loss": 1.7274,
"step": 4740
},
{
"epoch": 0.17045250654896474,
"grad_norm": 6.481418132781982,
"learning_rate": 3.525009561026202e-07,
"loss": 2.012,
"step": 4750
},
{
"epoch": 0.17081135393117308,
"grad_norm": 5.986774921417236,
"learning_rate": 3.2491943687621873e-07,
"loss": 2.0702,
"step": 4760
},
{
"epoch": 0.17117020131338143,
"grad_norm": 6.33309268951416,
"learning_rate": 2.984549787742552e-07,
"loss": 1.8632,
"step": 4770
},
{
"epoch": 0.17152904869558977,
"grad_norm": 5.965672969818115,
"learning_rate": 2.731086696469501e-07,
"loss": 1.7732,
"step": 4780
},
{
"epoch": 0.1718878960777981,
"grad_norm": 6.534940242767334,
"learning_rate": 2.4888155138179576e-07,
"loss": 2.2091,
"step": 4790
},
{
"epoch": 0.17224674346000646,
"grad_norm": 5.720672130584717,
"learning_rate": 2.2577461986073356e-07,
"loss": 1.9488,
"step": 4800
},
{
"epoch": 0.17224674346000646,
"eval_loss": 1.968702793121338,
"eval_runtime": 54.4379,
"eval_samples_per_second": 9.185,
"eval_steps_per_second": 9.185,
"step": 4800
},
{
"epoch": 0.1726055908422148,
"grad_norm": 6.99819278717041,
"learning_rate": 2.0378882491921159e-07,
"loss": 1.8417,
"step": 4810
},
{
"epoch": 0.17296443822442314,
"grad_norm": 5.92847204208374,
"learning_rate": 1.8292507030715362e-07,
"loss": 1.8105,
"step": 4820
},
{
"epoch": 0.1733232856066315,
"grad_norm": 6.517411231994629,
"learning_rate": 1.6318421365179055e-07,
"loss": 1.8031,
"step": 4830
},
{
"epoch": 0.17368213298883986,
"grad_norm": 5.3509345054626465,
"learning_rate": 1.4456706642242134e-07,
"loss": 1.9194,
"step": 4840
},
{
"epoch": 0.1740409803710482,
"grad_norm": 5.721834182739258,
"learning_rate": 1.2707439389704867e-07,
"loss": 1.9414,
"step": 4850
},
{
"epoch": 0.17439982775325655,
"grad_norm": 5.240856647491455,
"learning_rate": 1.1070691513092563e-07,
"loss": 2.1085,
"step": 4860
},
{
"epoch": 0.1747586751354649,
"grad_norm": 5.592405796051025,
"learning_rate": 9.546530292699863e-08,
"loss": 1.9062,
"step": 4870
},
{
"epoch": 0.17511752251767324,
"grad_norm": 5.133212089538574,
"learning_rate": 8.135018380824921e-08,
"loss": 1.9475,
"step": 4880
},
{
"epoch": 0.17547636989988158,
"grad_norm": 5.703231334686279,
"learning_rate": 6.836213799193497e-08,
"loss": 2.0048,
"step": 4890
},
{
"epoch": 0.17583521728208992,
"grad_norm": 6.332388401031494,
"learning_rate": 5.6501699365750784e-08,
"loss": 1.6435,
"step": 4900
},
{
"epoch": 0.17619406466429827,
"grad_norm": 5.111960411071777,
"learning_rate": 4.5769355465876964e-08,
"loss": 1.8596,
"step": 4910
},
{
"epoch": 0.1765529120465066,
"grad_norm": 6.382414817810059,
"learning_rate": 3.616554745692946e-08,
"loss": 1.9124,
"step": 4920
},
{
"epoch": 0.17691175942871495,
"grad_norm": 7.501326084136963,
"learning_rate": 2.7690670113848792e-08,
"loss": 1.9002,
"step": 4930
},
{
"epoch": 0.17727060681092333,
"grad_norm": 6.54906702041626,
"learning_rate": 2.034507180563916e-08,
"loss": 1.8475,
"step": 4940
},
{
"epoch": 0.17762945419313167,
"grad_norm": 5.6887078285217285,
"learning_rate": 1.4129054481082926e-08,
"loss": 1.7603,
"step": 4950
},
{
"epoch": 0.17762945419313167,
"eval_loss": 1.9684966802597046,
"eval_runtime": 53.5323,
"eval_samples_per_second": 9.34,
"eval_steps_per_second": 9.34,
"step": 4950
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.9553961729366426e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}