{
"best_metric": 1.4924039840698242,
"best_model_checkpoint": "./output/checkpoint-3900",
"epoch": 0.34580599397056216,
"eval_steps": 150,
"global_step": 3900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008866820358219542,
"grad_norm": 12.221611976623535,
"learning_rate": 1.0000000000000004e-05,
"loss": 1.5409,
"step": 10
},
{
"epoch": 0.0017733640716439084,
"grad_norm": 10.172511100769043,
"learning_rate": 2.000000000000001e-05,
"loss": 1.5034,
"step": 20
},
{
"epoch": 0.0026600461074658627,
"grad_norm": 7.491011619567871,
"learning_rate": 3.0000000000000008e-05,
"loss": 1.5275,
"step": 30
},
{
"epoch": 0.003546728143287817,
"grad_norm": 8.695981979370117,
"learning_rate": 4.000000000000002e-05,
"loss": 1.6105,
"step": 40
},
{
"epoch": 0.004433410179109771,
"grad_norm": 9.496545791625977,
"learning_rate": 5.0000000000000016e-05,
"loss": 1.5488,
"step": 50
},
{
"epoch": 0.005320092214931725,
"grad_norm": 8.968175888061523,
"learning_rate": 6.0000000000000015e-05,
"loss": 1.5128,
"step": 60
},
{
"epoch": 0.00620677425075368,
"grad_norm": 9.554357528686523,
"learning_rate": 7.000000000000002e-05,
"loss": 1.5392,
"step": 70
},
{
"epoch": 0.007093456286575634,
"grad_norm": 10.577646255493164,
"learning_rate": 8.000000000000003e-05,
"loss": 1.7024,
"step": 80
},
{
"epoch": 0.007980138322397589,
"grad_norm": 7.967463493347168,
"learning_rate": 9.000000000000003e-05,
"loss": 1.6036,
"step": 90
},
{
"epoch": 0.008866820358219542,
"grad_norm": 10.403460502624512,
"learning_rate": 0.00010000000000000003,
"loss": 1.7425,
"step": 100
},
{
"epoch": 0.009753502394041496,
"grad_norm": 7.898177623748779,
"learning_rate": 9.999897234791833e-05,
"loss": 1.6833,
"step": 110
},
{
"epoch": 0.01064018442986345,
"grad_norm": 9.69663143157959,
"learning_rate": 9.9995889433916e-05,
"loss": 1.5979,
"step": 120
},
{
"epoch": 0.011526866465685405,
"grad_norm": 10.984465599060059,
"learning_rate": 9.999075138471954e-05,
"loss": 1.8234,
"step": 130
},
{
"epoch": 0.01241354850150736,
"grad_norm": 7.519535541534424,
"learning_rate": 9.998355841153402e-05,
"loss": 1.5598,
"step": 140
},
{
"epoch": 0.013300230537329314,
"grad_norm": 10.765931129455566,
"learning_rate": 9.997431081003442e-05,
"loss": 1.6027,
"step": 150
},
{
"epoch": 0.013300230537329314,
"eval_loss": 1.7432395219802856,
"eval_runtime": 59.4356,
"eval_samples_per_second": 8.412,
"eval_steps_per_second": 8.412,
"step": 150
},
{
"epoch": 0.014186912573151267,
"grad_norm": 7.04581356048584,
"learning_rate": 9.996300896035342e-05,
"loss": 1.625,
"step": 160
},
{
"epoch": 0.015073594608973222,
"grad_norm": 10.549965858459473,
"learning_rate": 9.994965332706576e-05,
"loss": 1.7696,
"step": 170
},
{
"epoch": 0.015960276644795178,
"grad_norm": 17.5423641204834,
"learning_rate": 9.993424445916925e-05,
"loss": 1.6977,
"step": 180
},
{
"epoch": 0.01684695868061713,
"grad_norm": 15.295858383178711,
"learning_rate": 9.991678299006208e-05,
"loss": 1.7496,
"step": 190
},
{
"epoch": 0.017733640716439084,
"grad_norm": 14.050594329833984,
"learning_rate": 9.989726963751685e-05,
"loss": 1.6698,
"step": 200
},
{
"epoch": 0.01862032275226104,
"grad_norm": 9.636832237243652,
"learning_rate": 9.987570520365106e-05,
"loss": 1.7278,
"step": 210
},
{
"epoch": 0.019507004788082993,
"grad_norm": 10.37330150604248,
"learning_rate": 9.985209057489412e-05,
"loss": 1.6985,
"step": 220
},
{
"epoch": 0.02039368682390495,
"grad_norm": 8.965282440185547,
"learning_rate": 9.982642672195095e-05,
"loss": 1.652,
"step": 230
},
{
"epoch": 0.0212803688597269,
"grad_norm": 11.881516456604004,
"learning_rate": 9.979871469976199e-05,
"loss": 1.6963,
"step": 240
},
{
"epoch": 0.022167050895548854,
"grad_norm": 8.806026458740234,
"learning_rate": 9.976895564745994e-05,
"loss": 1.6385,
"step": 250
},
{
"epoch": 0.02305373293137081,
"grad_norm": 8.204315185546875,
"learning_rate": 9.97371507883229e-05,
"loss": 1.8237,
"step": 260
},
{
"epoch": 0.023940414967192764,
"grad_norm": 7.4131879806518555,
"learning_rate": 9.970330142972404e-05,
"loss": 1.6853,
"step": 270
},
{
"epoch": 0.02482709700301472,
"grad_norm": 8.07067584991455,
"learning_rate": 9.966740896307794e-05,
"loss": 1.6961,
"step": 280
},
{
"epoch": 0.025713779038836673,
"grad_norm": 10.194974899291992,
"learning_rate": 9.962947486378328e-05,
"loss": 1.6454,
"step": 290
},
{
"epoch": 0.02660046107465863,
"grad_norm": 9.990214347839355,
"learning_rate": 9.958950069116233e-05,
"loss": 1.7213,
"step": 300
},
{
"epoch": 0.02660046107465863,
"eval_loss": 1.710451364517212,
"eval_runtime": 59.6513,
"eval_samples_per_second": 8.382,
"eval_steps_per_second": 8.382,
"step": 300
},
{
"epoch": 0.02748714311048058,
"grad_norm": 9.65303897857666,
"learning_rate": 9.954748808839677e-05,
"loss": 1.6927,
"step": 310
},
{
"epoch": 0.028373825146302534,
"grad_norm": 11.48330307006836,
"learning_rate": 9.950343878246013e-05,
"loss": 1.6804,
"step": 320
},
{
"epoch": 0.02926050718212449,
"grad_norm": 8.232380867004395,
"learning_rate": 9.945735458404684e-05,
"loss": 1.6682,
"step": 330
},
{
"epoch": 0.030147189217946443,
"grad_norm": 8.22367000579834,
"learning_rate": 9.940923738749781e-05,
"loss": 1.705,
"step": 340
},
{
"epoch": 0.0310338712537684,
"grad_norm": 8.59586238861084,
"learning_rate": 9.935908917072255e-05,
"loss": 1.7646,
"step": 350
},
{
"epoch": 0.031920553289590356,
"grad_norm": 11.29736328125,
"learning_rate": 9.930691199511777e-05,
"loss": 1.5999,
"step": 360
},
{
"epoch": 0.032807235325412305,
"grad_norm": 7.492193222045898,
"learning_rate": 9.925270800548287e-05,
"loss": 1.7898,
"step": 370
},
{
"epoch": 0.03369391736123426,
"grad_norm": 9.845335006713867,
"learning_rate": 9.919647942993151e-05,
"loss": 1.7107,
"step": 380
},
{
"epoch": 0.03458059939705622,
"grad_norm": 6.399631500244141,
"learning_rate": 9.913822857980023e-05,
"loss": 1.8382,
"step": 390
},
{
"epoch": 0.03546728143287817,
"grad_norm": 14.003453254699707,
"learning_rate": 9.90779578495533e-05,
"loss": 1.5307,
"step": 400
},
{
"epoch": 0.03635396346870012,
"grad_norm": 7.186993598937988,
"learning_rate": 9.90156697166844e-05,
"loss": 1.6269,
"step": 410
},
{
"epoch": 0.03724064550452208,
"grad_norm": 10.279706001281738,
"learning_rate": 9.895136674161468e-05,
"loss": 1.7234,
"step": 420
},
{
"epoch": 0.038127327540344036,
"grad_norm": 15.368351936340332,
"learning_rate": 9.888505156758762e-05,
"loss": 1.6987,
"step": 430
},
{
"epoch": 0.039014009576165985,
"grad_norm": 12.989547729492188,
"learning_rate": 9.881672692056024e-05,
"loss": 1.7605,
"step": 440
},
{
"epoch": 0.03990069161198794,
"grad_norm": 10.625384330749512,
"learning_rate": 9.87463956090912e-05,
"loss": 1.7047,
"step": 450
},
{
"epoch": 0.03990069161198794,
"eval_loss": 1.7071540355682373,
"eval_runtime": 59.4581,
"eval_samples_per_second": 8.409,
"eval_steps_per_second": 8.409,
"step": 450
},
{
"epoch": 0.0407873736478099,
"grad_norm": 13.621452331542969,
"learning_rate": 9.867406052422526e-05,
"loss": 1.6615,
"step": 460
},
{
"epoch": 0.04167405568363185,
"grad_norm": 11.978480339050293,
"learning_rate": 9.859972463937443e-05,
"loss": 1.712,
"step": 470
},
{
"epoch": 0.0425607377194538,
"grad_norm": 8.805001258850098,
"learning_rate": 9.852339101019577e-05,
"loss": 1.6285,
"step": 480
},
{
"epoch": 0.04344741975527576,
"grad_norm": 6.610208034515381,
"learning_rate": 9.84450627744658e-05,
"loss": 1.6863,
"step": 490
},
{
"epoch": 0.04433410179109771,
"grad_norm": 8.630401611328125,
"learning_rate": 9.83647431519515e-05,
"loss": 1.7646,
"step": 500
},
{
"epoch": 0.045220783826919665,
"grad_norm": 9.726890563964844,
"learning_rate": 9.828243544427798e-05,
"loss": 1.7782,
"step": 510
},
{
"epoch": 0.04610746586274162,
"grad_norm": 9.516313552856445,
"learning_rate": 9.81981430347927e-05,
"loss": 1.7227,
"step": 520
},
{
"epoch": 0.04699414789856358,
"grad_norm": 8.171401023864746,
"learning_rate": 9.811186938842648e-05,
"loss": 1.6726,
"step": 530
},
{
"epoch": 0.04788082993438553,
"grad_norm": 9.201889991760254,
"learning_rate": 9.8023618051551e-05,
"loss": 1.4651,
"step": 540
},
{
"epoch": 0.04876751197020748,
"grad_norm": 8.409120559692383,
"learning_rate": 9.793339265183306e-05,
"loss": 1.5729,
"step": 550
},
{
"epoch": 0.04965419400602944,
"grad_norm": 8.624395370483398,
"learning_rate": 9.784119689808547e-05,
"loss": 1.7279,
"step": 560
},
{
"epoch": 0.05054087604185139,
"grad_norm": 9.849170684814453,
"learning_rate": 9.774703458011455e-05,
"loss": 1.7016,
"step": 570
},
{
"epoch": 0.051427558077673345,
"grad_norm": 10.26559829711914,
"learning_rate": 9.765090956856439e-05,
"loss": 1.6263,
"step": 580
},
{
"epoch": 0.0523142401134953,
"grad_norm": 8.199737548828125,
"learning_rate": 9.755282581475771e-05,
"loss": 1.6802,
"step": 590
},
{
"epoch": 0.05320092214931726,
"grad_norm": 8.513672828674316,
"learning_rate": 9.745278735053346e-05,
"loss": 1.8307,
"step": 600
},
{
"epoch": 0.05320092214931726,
"eval_loss": 1.7056752443313599,
"eval_runtime": 59.3846,
"eval_samples_per_second": 8.42,
"eval_steps_per_second": 8.42,
"step": 600
},
{
"epoch": 0.05408760418513921,
"grad_norm": 6.43255615234375,
"learning_rate": 9.73507982880811e-05,
"loss": 1.6386,
"step": 610
},
{
"epoch": 0.05497428622096116,
"grad_norm": 7.917320728302002,
"learning_rate": 9.724686281977149e-05,
"loss": 1.4825,
"step": 620
},
{
"epoch": 0.05586096825678312,
"grad_norm": 7.494490146636963,
"learning_rate": 9.714098521798468e-05,
"loss": 1.6621,
"step": 630
},
{
"epoch": 0.05674765029260507,
"grad_norm": 7.454612731933594,
"learning_rate": 9.703316983493417e-05,
"loss": 1.6406,
"step": 640
},
{
"epoch": 0.057634332328427025,
"grad_norm": 8.165127754211426,
"learning_rate": 9.692342110248805e-05,
"loss": 1.7403,
"step": 650
},
{
"epoch": 0.05852101436424898,
"grad_norm": 7.547806739807129,
"learning_rate": 9.68117435319869e-05,
"loss": 1.8081,
"step": 660
},
{
"epoch": 0.05940769640007094,
"grad_norm": 6.669929504394531,
"learning_rate": 9.669814171405819e-05,
"loss": 1.5977,
"step": 670
},
{
"epoch": 0.06029437843589289,
"grad_norm": 10.013628005981445,
"learning_rate": 9.658262031842773e-05,
"loss": 1.7354,
"step": 680
},
{
"epoch": 0.06118106047171484,
"grad_norm": 7.262237071990967,
"learning_rate": 9.646518409372763e-05,
"loss": 1.73,
"step": 690
},
{
"epoch": 0.0620677425075368,
"grad_norm": 10.55190658569336,
"learning_rate": 9.634583786730112e-05,
"loss": 1.5776,
"step": 700
},
{
"epoch": 0.06295442454335876,
"grad_norm": 6.680520534515381,
"learning_rate": 9.62245865450041e-05,
"loss": 1.6498,
"step": 710
},
{
"epoch": 0.06384110657918071,
"grad_norm": 8.451497077941895,
"learning_rate": 9.610143511100356e-05,
"loss": 1.5667,
"step": 720
},
{
"epoch": 0.06472778861500265,
"grad_norm": 6.46457576751709,
"learning_rate": 9.597638862757256e-05,
"loss": 1.6396,
"step": 730
},
{
"epoch": 0.06561447065082461,
"grad_norm": 7.483574867248535,
"learning_rate": 9.58494522348823e-05,
"loss": 1.7103,
"step": 740
},
{
"epoch": 0.06650115268664657,
"grad_norm": 7.17427921295166,
"learning_rate": 9.572063115079066e-05,
"loss": 1.7465,
"step": 750
},
{
"epoch": 0.06650115268664657,
"eval_loss": 1.7024177312850952,
"eval_runtime": 59.3886,
"eval_samples_per_second": 8.419,
"eval_steps_per_second": 8.419,
"step": 750
},
{
"epoch": 0.06738783472246852,
"grad_norm": 10.202959060668945,
"learning_rate": 9.558993067062788e-05,
"loss": 1.5595,
"step": 760
},
{
"epoch": 0.06827451675829048,
"grad_norm": 8.259611129760742,
"learning_rate": 9.545735616697878e-05,
"loss": 1.6273,
"step": 770
},
{
"epoch": 0.06916119879411244,
"grad_norm": 10.105010032653809,
"learning_rate": 9.532291308946193e-05,
"loss": 1.6441,
"step": 780
},
{
"epoch": 0.07004788082993439,
"grad_norm": 11.567327499389648,
"learning_rate": 9.518660696450571e-05,
"loss": 1.7721,
"step": 790
},
{
"epoch": 0.07093456286575633,
"grad_norm": 7.6176838874816895,
"learning_rate": 9.504844339512098e-05,
"loss": 1.5159,
"step": 800
},
{
"epoch": 0.07182124490157829,
"grad_norm": 8.693132400512695,
"learning_rate": 9.490842806067098e-05,
"loss": 1.7085,
"step": 810
},
{
"epoch": 0.07270792693740025,
"grad_norm": 8.85315227508545,
"learning_rate": 9.476656671663768e-05,
"loss": 1.6169,
"step": 820
},
{
"epoch": 0.0735946089732222,
"grad_norm": 8.727082252502441,
"learning_rate": 9.462286519438532e-05,
"loss": 1.5426,
"step": 830
},
{
"epoch": 0.07448129100904416,
"grad_norm": 9.754756927490234,
"learning_rate": 9.447732940092063e-05,
"loss": 1.5564,
"step": 840
},
{
"epoch": 0.07536797304486612,
"grad_norm": 9.421085357666016,
"learning_rate": 9.432996531865004e-05,
"loss": 1.6749,
"step": 850
},
{
"epoch": 0.07625465508068807,
"grad_norm": 8.565560340881348,
"learning_rate": 9.418077900513379e-05,
"loss": 1.5869,
"step": 860
},
{
"epoch": 0.07714133711651001,
"grad_norm": 6.376341342926025,
"learning_rate": 9.402977659283693e-05,
"loss": 1.6504,
"step": 870
},
{
"epoch": 0.07802801915233197,
"grad_norm": 8.475419044494629,
"learning_rate": 9.387696428887718e-05,
"loss": 1.6349,
"step": 880
},
{
"epoch": 0.07891470118815393,
"grad_norm": 8.961620330810547,
"learning_rate": 9.372234837476981e-05,
"loss": 1.6964,
"step": 890
},
{
"epoch": 0.07980138322397588,
"grad_norm": 10.981645584106445,
"learning_rate": 9.35659352061695e-05,
"loss": 1.8223,
"step": 900
},
{
"epoch": 0.07980138322397588,
"eval_loss": 1.7124860286712646,
"eval_runtime": 59.2388,
"eval_samples_per_second": 8.44,
"eval_steps_per_second": 8.44,
"step": 900
},
{
"epoch": 0.08068806525979784,
"grad_norm": 7.960820198059082,
"learning_rate": 9.340773121260896e-05,
"loss": 1.7361,
"step": 910
},
{
"epoch": 0.0815747472956198,
"grad_norm": 7.7724785804748535,
"learning_rate": 9.32477428972347e-05,
"loss": 1.7118,
"step": 920
},
{
"epoch": 0.08246142933144175,
"grad_norm": 12.896247863769531,
"learning_rate": 9.308597683653978e-05,
"loss": 1.6849,
"step": 930
},
{
"epoch": 0.0833481113672637,
"grad_norm": 10.572025299072266,
"learning_rate": 9.292243968009333e-05,
"loss": 1.7373,
"step": 940
},
{
"epoch": 0.08423479340308565,
"grad_norm": 15.746474266052246,
"learning_rate": 9.275713815026734e-05,
"loss": 1.5829,
"step": 950
},
{
"epoch": 0.0851214754389076,
"grad_norm": 8.268464088439941,
"learning_rate": 9.259007904196024e-05,
"loss": 1.659,
"step": 960
},
{
"epoch": 0.08600815747472956,
"grad_norm": 6.80485200881958,
"learning_rate": 9.242126922231766e-05,
"loss": 1.5132,
"step": 970
},
{
"epoch": 0.08689483951055152,
"grad_norm": 9.059598922729492,
"learning_rate": 9.225071563045009e-05,
"loss": 1.5896,
"step": 980
},
{
"epoch": 0.08778152154637348,
"grad_norm": 6.6695170402526855,
"learning_rate": 9.207842527714768e-05,
"loss": 1.7439,
"step": 990
},
{
"epoch": 0.08866820358219542,
"grad_norm": 8.449771881103516,
"learning_rate": 9.190440524459205e-05,
"loss": 1.9,
"step": 1000
},
{
"epoch": 0.08955488561801737,
"grad_norm": 45.434688568115234,
"learning_rate": 9.172866268606516e-05,
"loss": 1.6689,
"step": 1010
},
{
"epoch": 0.09044156765383933,
"grad_norm": 7.402377605438232,
"learning_rate": 9.155120482565522e-05,
"loss": 1.7466,
"step": 1020
},
{
"epoch": 0.09132824968966129,
"grad_norm": 8.143401145935059,
"learning_rate": 9.137203895795986e-05,
"loss": 1.5923,
"step": 1030
},
{
"epoch": 0.09221493172548324,
"grad_norm": 7.547423839569092,
"learning_rate": 9.11911724477861e-05,
"loss": 1.6487,
"step": 1040
},
{
"epoch": 0.0931016137613052,
"grad_norm": 6.291411399841309,
"learning_rate": 9.100861272984782e-05,
"loss": 1.7453,
"step": 1050
},
{
"epoch": 0.0931016137613052,
"eval_loss": 1.6903132200241089,
"eval_runtime": 59.3722,
"eval_samples_per_second": 8.421,
"eval_steps_per_second": 8.421,
"step": 1050
},
{
"epoch": 0.09398829579712716,
"grad_norm": 6.51155948638916,
"learning_rate": 9.082436730845996e-05,
"loss": 1.5121,
"step": 1060
},
{
"epoch": 0.0948749778329491,
"grad_norm": 7.315041542053223,
"learning_rate": 9.063844375723016e-05,
"loss": 1.45,
"step": 1070
},
{
"epoch": 0.09576165986877105,
"grad_norm": 9.287749290466309,
"learning_rate": 9.045084971874741e-05,
"loss": 1.6892,
"step": 1080
},
{
"epoch": 0.09664834190459301,
"grad_norm": 7.6157097816467285,
"learning_rate": 9.026159290426783e-05,
"loss": 1.832,
"step": 1090
},
{
"epoch": 0.09753502394041497,
"grad_norm": 6.081124782562256,
"learning_rate": 9.007068109339786e-05,
"loss": 1.6911,
"step": 1100
},
{
"epoch": 0.09842170597623692,
"grad_norm": 7.2468671798706055,
"learning_rate": 8.987812213377425e-05,
"loss": 1.6959,
"step": 1110
},
{
"epoch": 0.09930838801205888,
"grad_norm": 7.454516887664795,
"learning_rate": 8.968392394074165e-05,
"loss": 1.5169,
"step": 1120
},
{
"epoch": 0.10019507004788084,
"grad_norm": 10.253645896911621,
"learning_rate": 8.948809449702714e-05,
"loss": 1.6779,
"step": 1130
},
{
"epoch": 0.10108175208370278,
"grad_norm": 8.075345993041992,
"learning_rate": 8.929064185241216e-05,
"loss": 1.6622,
"step": 1140
},
{
"epoch": 0.10196843411952473,
"grad_norm": 11.007535934448242,
"learning_rate": 8.909157412340152e-05,
"loss": 1.7568,
"step": 1150
},
{
"epoch": 0.10285511615534669,
"grad_norm": 8.019722938537598,
"learning_rate": 8.889089949288989e-05,
"loss": 1.6177,
"step": 1160
},
{
"epoch": 0.10374179819116865,
"grad_norm": 8.618474960327148,
"learning_rate": 8.868862620982537e-05,
"loss": 1.5605,
"step": 1170
},
{
"epoch": 0.1046284802269906,
"grad_norm": 8.008125305175781,
"learning_rate": 8.848476258887034e-05,
"loss": 1.5995,
"step": 1180
},
{
"epoch": 0.10551516226281256,
"grad_norm": 11.63944149017334,
"learning_rate": 8.827931701005976e-05,
"loss": 1.5778,
"step": 1190
},
{
"epoch": 0.10640184429863452,
"grad_norm": 9.485556602478027,
"learning_rate": 8.807229791845674e-05,
"loss": 1.547,
"step": 1200
},
{
"epoch": 0.10640184429863452,
"eval_loss": 1.6847599744796753,
"eval_runtime": 59.4403,
"eval_samples_per_second": 8.412,
"eval_steps_per_second": 8.412,
"step": 1200
},
{
"epoch": 0.10728852633445646,
"grad_norm": 8.38836669921875,
"learning_rate": 8.786371382380529e-05,
"loss": 1.6206,
"step": 1210
},
{
"epoch": 0.10817520837027841,
"grad_norm": 6.360514163970947,
"learning_rate": 8.765357330018059e-05,
"loss": 1.5409,
"step": 1220
},
{
"epoch": 0.10906189040610037,
"grad_norm": 8.692633628845215,
"learning_rate": 8.744188498563644e-05,
"loss": 1.549,
"step": 1230
},
{
"epoch": 0.10994857244192233,
"grad_norm": 6.637635707855225,
"learning_rate": 8.722865758185038e-05,
"loss": 1.6354,
"step": 1240
},
{
"epoch": 0.11083525447774428,
"grad_norm": 9.428290367126465,
"learning_rate": 8.70138998537658e-05,
"loss": 1.84,
"step": 1250
},
{
"epoch": 0.11172193651356624,
"grad_norm": 7.926419734954834,
"learning_rate": 8.679762062923178e-05,
"loss": 1.6915,
"step": 1260
},
{
"epoch": 0.1126086185493882,
"grad_norm": 9.914402961730957,
"learning_rate": 8.65798287986401e-05,
"loss": 1.6808,
"step": 1270
},
{
"epoch": 0.11349530058521014,
"grad_norm": 11.937222480773926,
"learning_rate": 8.636053331455989e-05,
"loss": 1.6807,
"step": 1280
},
{
"epoch": 0.1143819826210321,
"grad_norm": 9.676164627075195,
"learning_rate": 8.613974319136961e-05,
"loss": 1.6505,
"step": 1290
},
{
"epoch": 0.11526866465685405,
"grad_norm": 8.247169494628906,
"learning_rate": 8.59174675048864e-05,
"loss": 1.6287,
"step": 1300
},
{
"epoch": 0.116155346692676,
"grad_norm": 7.142345905303955,
"learning_rate": 8.569371539199318e-05,
"loss": 1.6104,
"step": 1310
},
{
"epoch": 0.11704202872849796,
"grad_norm": 7.208193778991699,
"learning_rate": 8.546849605026292e-05,
"loss": 1.5853,
"step": 1320
},
{
"epoch": 0.11792871076431992,
"grad_norm": 7.444331645965576,
"learning_rate": 8.524181873758061e-05,
"loss": 1.5583,
"step": 1330
},
{
"epoch": 0.11881539280014188,
"grad_norm": 9.676839828491211,
"learning_rate": 8.501369277176277e-05,
"loss": 1.7519,
"step": 1340
},
{
"epoch": 0.11970207483596382,
"grad_norm": 8.323569297790527,
"learning_rate": 8.478412753017435e-05,
"loss": 1.6618,
"step": 1350
},
{
"epoch": 0.11970207483596382,
"eval_loss": 1.6926313638687134,
"eval_runtime": 59.2541,
"eval_samples_per_second": 8.438,
"eval_steps_per_second": 8.438,
"step": 1350
},
{
"epoch": 0.12058875687178577,
"grad_norm": 9.658554077148438,
"learning_rate": 8.455313244934327e-05,
"loss": 1.8027,
"step": 1360
},
{
"epoch": 0.12147543890760773,
"grad_norm": 10.467687606811523,
"learning_rate": 8.432071702457255e-05,
"loss": 1.6109,
"step": 1370
},
{
"epoch": 0.12236212094342969,
"grad_norm": 8.080743789672852,
"learning_rate": 8.408689080955001e-05,
"loss": 1.5724,
"step": 1380
},
{
"epoch": 0.12324880297925164,
"grad_norm": 7.568329334259033,
"learning_rate": 8.38516634159555e-05,
"loss": 1.5831,
"step": 1390
},
{
"epoch": 0.1241354850150736,
"grad_norm": 6.801577091217041,
"learning_rate": 8.361504451306586e-05,
"loss": 1.6606,
"step": 1400
},
{
"epoch": 0.12502216705089556,
"grad_norm": 8.583120346069336,
"learning_rate": 8.337704382735743e-05,
"loss": 1.8597,
"step": 1410
},
{
"epoch": 0.1259088490867175,
"grad_norm": 9.193146705627441,
"learning_rate": 8.313767114210618e-05,
"loss": 1.6648,
"step": 1420
},
{
"epoch": 0.12679553112253947,
"grad_norm": 7.931524753570557,
"learning_rate": 8.289693629698566e-05,
"loss": 1.6386,
"step": 1430
},
{
"epoch": 0.12768221315836142,
"grad_norm": 8.280115127563477,
"learning_rate": 8.265484918766245e-05,
"loss": 1.5829,
"step": 1440
},
{
"epoch": 0.12856889519418335,
"grad_norm": 7.822704315185547,
"learning_rate": 8.241141976538945e-05,
"loss": 1.7449,
"step": 1450
},
{
"epoch": 0.1294555772300053,
"grad_norm": 6.792067050933838,
"learning_rate": 8.216665803659673e-05,
"loss": 1.8019,
"step": 1460
},
{
"epoch": 0.13034225926582726,
"grad_norm": 6.622004508972168,
"learning_rate": 8.19205740624803e-05,
"loss": 1.6184,
"step": 1470
},
{
"epoch": 0.13122894130164922,
"grad_norm": 8.733943939208984,
"learning_rate": 8.167317795858853e-05,
"loss": 1.7708,
"step": 1480
},
{
"epoch": 0.13211562333747118,
"grad_norm": 12.745617866516113,
"learning_rate": 8.142447989440621e-05,
"loss": 1.4564,
"step": 1490
},
{
"epoch": 0.13300230537329313,
"grad_norm": 16.666255950927734,
"learning_rate": 8.117449009293671e-05,
"loss": 1.7387,
"step": 1500
},
{
"epoch": 0.13300230537329313,
"eval_loss": 1.6664111614227295,
"eval_runtime": 59.3291,
"eval_samples_per_second": 8.428,
"eval_steps_per_second": 8.428,
"step": 1500
},
{
"epoch": 0.1338889874091151,
"grad_norm": 9.83285140991211,
"learning_rate": 8.09232188302816e-05,
"loss": 1.545,
"step": 1510
},
{
"epoch": 0.13477566944493705,
"grad_norm": 12.203434944152832,
"learning_rate": 8.067067643521836e-05,
"loss": 1.8037,
"step": 1520
},
{
"epoch": 0.135662351480759,
"grad_norm": 6.172882556915283,
"learning_rate": 8.041687328877568e-05,
"loss": 1.5307,
"step": 1530
},
{
"epoch": 0.13654903351658096,
"grad_norm": 8.186355590820312,
"learning_rate": 8.016181982380684e-05,
"loss": 1.4897,
"step": 1540
},
{
"epoch": 0.13743571555240292,
"grad_norm": 6.983214378356934,
"learning_rate": 7.990552652456082e-05,
"loss": 1.5217,
"step": 1550
},
{
"epoch": 0.13832239758822487,
"grad_norm": 7.637870788574219,
"learning_rate": 7.964800392625131e-05,
"loss": 1.4999,
"step": 1560
},
{
"epoch": 0.13920907962404683,
"grad_norm": 6.830051422119141,
"learning_rate": 7.938926261462369e-05,
"loss": 1.6,
"step": 1570
},
{
"epoch": 0.14009576165986878,
"grad_norm": 8.118239402770996,
"learning_rate": 7.912931322551983e-05,
"loss": 1.5677,
"step": 1580
},
{
"epoch": 0.1409824436956907,
"grad_norm": 6.605192184448242,
"learning_rate": 7.8868166444441e-05,
"loss": 1.701,
"step": 1590
},
{
"epoch": 0.14186912573151267,
"grad_norm": 8.868054389953613,
"learning_rate": 7.86058330061085e-05,
"loss": 1.5491,
"step": 1600
},
{
"epoch": 0.14275580776733462,
"grad_norm": 9.033772468566895,
"learning_rate": 7.834232369402252e-05,
"loss": 1.6596,
"step": 1610
},
{
"epoch": 0.14364248980315658,
"grad_norm": 10.658461570739746,
"learning_rate": 7.807764934001877e-05,
"loss": 1.4879,
"step": 1620
},
{
"epoch": 0.14452917183897854,
"grad_norm": 8.768953323364258,
"learning_rate": 7.781182082382326e-05,
"loss": 1.6105,
"step": 1630
},
{
"epoch": 0.1454158538748005,
"grad_norm": 6.057528018951416,
"learning_rate": 7.754484907260515e-05,
"loss": 1.3959,
"step": 1640
},
{
"epoch": 0.14630253591062245,
"grad_norm": 8.636338233947754,
"learning_rate": 7.727674506052746e-05,
"loss": 1.4808,
"step": 1650
},
{
"epoch": 0.14630253591062245,
"eval_loss": 1.6922165155410767,
"eval_runtime": 59.3594,
"eval_samples_per_second": 8.423,
"eval_steps_per_second": 8.423,
"step": 1650
},
{
"epoch": 0.1471892179464444,
"grad_norm": 8.238250732421875,
"learning_rate": 7.700751980829603e-05,
"loss": 1.6605,
"step": 1660
},
{
"epoch": 0.14807589998226636,
"grad_norm": 7.141147136688232,
"learning_rate": 7.67371843827065e-05,
"loss": 1.7022,
"step": 1670
},
{
"epoch": 0.14896258201808832,
"grad_norm": 7.410867691040039,
"learning_rate": 7.64657498961894e-05,
"loss": 1.676,
"step": 1680
},
{
"epoch": 0.14984926405391027,
"grad_norm": 7.417080402374268,
"learning_rate": 7.61932275063533e-05,
"loss": 1.6914,
"step": 1690
},
{
"epoch": 0.15073594608973223,
"grad_norm": 29.780086517333984,
"learning_rate": 7.591962841552628e-05,
"loss": 1.7629,
"step": 1700
},
{
"epoch": 0.1516226281255542,
"grad_norm": 7.910691738128662,
"learning_rate": 7.564496387029534e-05,
"loss": 1.4673,
"step": 1710
},
{
"epoch": 0.15250931016137614,
"grad_norm": 8.581075668334961,
"learning_rate": 7.536924516104414e-05,
"loss": 1.588,
"step": 1720
},
{
"epoch": 0.15339599219719807,
"grad_norm": 9.336210250854492,
"learning_rate": 7.50924836214889e-05,
"loss": 1.7385,
"step": 1730
},
{
"epoch": 0.15428267423302003,
"grad_norm": 7.002266883850098,
"learning_rate": 7.481469062821253e-05,
"loss": 1.6644,
"step": 1740
},
{
"epoch": 0.15516935626884198,
"grad_norm": 13.515497207641602,
"learning_rate": 7.453587760019692e-05,
"loss": 1.7763,
"step": 1750
},
{
"epoch": 0.15605603830466394,
"grad_norm": 8.133156776428223,
"learning_rate": 7.425605599835362e-05,
"loss": 1.5641,
"step": 1760
},
{
"epoch": 0.1569427203404859,
"grad_norm": 6.456133842468262,
"learning_rate": 7.397523732505273e-05,
"loss": 1.7309,
"step": 1770
},
{
"epoch": 0.15782940237630785,
"grad_norm": 14.745148658752441,
"learning_rate": 7.369343312364996e-05,
"loss": 1.6363,
"step": 1780
},
{
"epoch": 0.1587160844121298,
"grad_norm": 7.444967746734619,
"learning_rate": 7.341065497801231e-05,
"loss": 1.581,
"step": 1790
},
{
"epoch": 0.15960276644795177,
"grad_norm": 7.5540595054626465,
"learning_rate": 7.31269145120418e-05,
"loss": 1.7818,
"step": 1800
},
{
"epoch": 0.15960276644795177,
"eval_loss": 1.6818472146987915,
"eval_runtime": 59.3071,
"eval_samples_per_second": 8.431,
"eval_steps_per_second": 8.431,
"step": 1800
},
{
"epoch": 0.16048944848377372,
"grad_norm": 9.014618873596191,
"learning_rate": 7.284222338919761e-05,
"loss": 1.7272,
"step": 1810
},
{
"epoch": 0.16137613051959568,
"grad_norm": 7.285683631896973,
"learning_rate": 7.255659331201675e-05,
"loss": 1.7136,
"step": 1820
},
{
"epoch": 0.16226281255541763,
"grad_norm": 8.174327850341797,
"learning_rate": 7.227003602163298e-05,
"loss": 1.6277,
"step": 1830
},
{
"epoch": 0.1631494945912396,
"grad_norm": 8.313567161560059,
"learning_rate": 7.198256329729413e-05,
"loss": 1.5496,
"step": 1840
},
{
"epoch": 0.16403617662706155,
"grad_norm": 6.585691452026367,
"learning_rate": 7.169418695587792e-05,
"loss": 1.4631,
"step": 1850
},
{
"epoch": 0.1649228586628835,
"grad_norm": 6.994291305541992,
"learning_rate": 7.140491885140631e-05,
"loss": 1.6103,
"step": 1860
},
{
"epoch": 0.16580954069870543,
"grad_norm": 10.888928413391113,
"learning_rate": 7.111477087455802e-05,
"loss": 1.5637,
"step": 1870
},
{
"epoch": 0.1666962227345274,
"grad_norm": 9.979802131652832,
"learning_rate": 7.082375495217998e-05,
"loss": 1.7041,
"step": 1880
},
{
"epoch": 0.16758290477034934,
"grad_norm": 7.943565368652344,
"learning_rate": 7.053188304679693e-05,
"loss": 1.5222,
"step": 1890
},
{
"epoch": 0.1684695868061713,
"grad_norm": 7.496694087982178,
"learning_rate": 7.02391671561197e-05,
"loss": 1.4436,
"step": 1900
},
{
"epoch": 0.16935626884199326,
"grad_norm": 10.703848838806152,
"learning_rate": 6.994561931255211e-05,
"loss": 1.6628,
"step": 1910
},
{
"epoch": 0.1702429508778152,
"grad_norm": 9.519083976745605,
"learning_rate": 6.96512515826962e-05,
"loss": 1.5567,
"step": 1920
},
{
"epoch": 0.17112963291363717,
"grad_norm": 9.946447372436523,
"learning_rate": 6.935607606685643e-05,
"loss": 1.5579,
"step": 1930
},
{
"epoch": 0.17201631494945913,
"grad_norm": 10.239102363586426,
"learning_rate": 6.90601048985421e-05,
"loss": 1.8462,
"step": 1940
},
{
"epoch": 0.17290299698528108,
"grad_norm": 7.735941410064697,
"learning_rate": 6.876335024396873e-05,
"loss": 1.7393,
"step": 1950
},
{
"epoch": 0.17290299698528108,
"eval_loss": 1.6459492444992065,
"eval_runtime": 59.3323,
"eval_samples_per_second": 8.427,
"eval_steps_per_second": 8.427,
"step": 1950
},
{
"epoch": 0.17378967902110304,
"grad_norm": 7.9266228675842285,
"learning_rate": 6.846582430155784e-05,
"loss": 1.6871,
"step": 1960
},
{
"epoch": 0.174676361056925,
"grad_norm": 7.106112003326416,
"learning_rate": 6.81675393014356e-05,
"loss": 1.4537,
"step": 1970
},
{
"epoch": 0.17556304309274695,
"grad_norm": 7.142345428466797,
"learning_rate": 6.786850750493007e-05,
"loss": 1.7198,
"step": 1980
},
{
"epoch": 0.1764497251285689,
"grad_norm": 9.443406105041504,
"learning_rate": 6.756874120406716e-05,
"loss": 1.8101,
"step": 1990
},
{
"epoch": 0.17733640716439084,
"grad_norm": 9.909912109375,
"learning_rate": 6.72682527210654e-05,
"loss": 1.5669,
"step": 2000
},
{
"epoch": 0.1782230892002128,
"grad_norm": 7.394705772399902,
"learning_rate": 6.696705440782941e-05,
"loss": 1.7875,
"step": 2010
},
{
"epoch": 0.17910977123603475,
"grad_norm": 6.871555805206299,
"learning_rate": 6.66651586454421e-05,
"loss": 1.6649,
"step": 2020
},
{
"epoch": 0.1799964532718567,
"grad_norm": 8.4517183303833,
"learning_rate": 6.636257784365587e-05,
"loss": 1.5413,
"step": 2030
},
{
"epoch": 0.18088313530767866,
"grad_norm": 7.050330638885498,
"learning_rate": 6.60593244403823e-05,
"loss": 1.591,
"step": 2040
},
{
"epoch": 0.18176981734350062,
"grad_norm": 8.818882942199707,
"learning_rate": 6.575541090118106e-05,
"loss": 1.5542,
"step": 2050
},
{
"epoch": 0.18265649937932257,
"grad_norm": 9.987334251403809,
"learning_rate": 6.54508497187474e-05,
"loss": 1.6636,
"step": 2060
},
{
"epoch": 0.18354318141514453,
"grad_norm": 9.73635196685791,
"learning_rate": 6.514565341239862e-05,
"loss": 1.8135,
"step": 2070
},
{
"epoch": 0.18442986345096649,
"grad_norm": 6.916482925415039,
"learning_rate": 6.483983452755954e-05,
"loss": 1.7043,
"step": 2080
},
{
"epoch": 0.18531654548678844,
"grad_norm": 13.21588134765625,
"learning_rate": 6.45334056352467e-05,
"loss": 1.5767,
"step": 2090
},
{
"epoch": 0.1862032275226104,
"grad_norm": 7.8419904708862305,
"learning_rate": 6.422637933155163e-05,
"loss": 1.5591,
"step": 2100
},
{
"epoch": 0.1862032275226104,
"eval_loss": 1.6301392316818237,
"eval_runtime": 59.4431,
"eval_samples_per_second": 8.411,
"eval_steps_per_second": 8.411,
"step": 2100
},
{
"epoch": 0.18708990955843235,
"grad_norm": 8.962486267089844,
"learning_rate": 6.391876823712319e-05,
"loss": 1.4843,
"step": 2110
},
{
"epoch": 0.1879765915942543,
"grad_norm": 10.67493724822998,
"learning_rate": 6.361058499664857e-05,
"loss": 1.6638,
"step": 2120
},
{
"epoch": 0.18886327363007627,
"grad_norm": 8.06369686126709,
"learning_rate": 6.330184227833377e-05,
"loss": 1.6439,
"step": 2130
},
{
"epoch": 0.1897499556658982,
"grad_norm": 9.005534172058105,
"learning_rate": 6.299255277338267e-05,
"loss": 1.5289,
"step": 2140
},
{
"epoch": 0.19063663770172015,
"grad_norm": 9.255204200744629,
"learning_rate": 6.268272919547539e-05,
"loss": 1.46,
"step": 2150
},
{
"epoch": 0.1915233197375421,
"grad_norm": 7.344980239868164,
"learning_rate": 6.237238428024573e-05,
"loss": 1.4932,
"step": 2160
},
{
"epoch": 0.19241000177336406,
"grad_norm": 8.692234992980957,
"learning_rate": 6.206153078475765e-05,
"loss": 1.6582,
"step": 2170
},
{
"epoch": 0.19329668380918602,
"grad_norm": 7.381601333618164,
"learning_rate": 6.175018148698078e-05,
"loss": 1.5007,
"step": 2180
},
{
"epoch": 0.19418336584500798,
"grad_norm": 7.794239044189453,
"learning_rate": 6.143834918526529e-05,
"loss": 1.6501,
"step": 2190
},
{
"epoch": 0.19507004788082993,
"grad_norm": 8.13096809387207,
"learning_rate": 6.112604669781574e-05,
"loss": 1.6862,
"step": 2200
},
{
"epoch": 0.1959567299166519,
"grad_norm": 6.846219539642334,
"learning_rate": 6.081328686216419e-05,
"loss": 1.5702,
"step": 2210
},
{
"epoch": 0.19684341195247385,
"grad_norm": 8.771533966064453,
"learning_rate": 6.0500082534642485e-05,
"loss": 1.6259,
"step": 2220
},
{
"epoch": 0.1977300939882958,
"grad_norm": 6.50418758392334,
"learning_rate": 6.01864465898538e-05,
"loss": 1.6948,
"step": 2230
},
{
"epoch": 0.19861677602411776,
"grad_norm": 8.83719539642334,
"learning_rate": 5.987239192014337e-05,
"loss": 1.643,
"step": 2240
},
{
"epoch": 0.19950345805993971,
"grad_norm": 7.24541711807251,
"learning_rate": 5.955793143506864e-05,
"loss": 1.624,
"step": 2250
},
{
"epoch": 0.19950345805993971,
"eval_loss": 1.5997846126556396,
"eval_runtime": 59.4561,
"eval_samples_per_second": 8.41,
"eval_steps_per_second": 8.41,
"step": 2250
},
{
"epoch": 0.20039014009576167,
"grad_norm": 13.114813804626465,
"learning_rate": 5.9243078060868454e-05,
"loss": 1.5787,
"step": 2260
},
{
"epoch": 0.20127682213158363,
"grad_norm": 6.7087321281433105,
"learning_rate": 5.8927844739931854e-05,
"loss": 1.3785,
"step": 2270
},
{
"epoch": 0.20216350416740556,
"grad_norm": 6.644030570983887,
"learning_rate": 5.8612244430265966e-05,
"loss": 1.5126,
"step": 2280
},
{
"epoch": 0.2030501862032275,
"grad_norm": 10.291509628295898,
"learning_rate": 5.829629010496342e-05,
"loss": 1.4863,
"step": 2290
},
{
"epoch": 0.20393686823904947,
"grad_norm": 6.426754951477051,
"learning_rate": 5.797999475166898e-05,
"loss": 1.5586,
"step": 2300
},
{
"epoch": 0.20482355027487142,
"grad_norm": 9.044095039367676,
"learning_rate": 5.766337137204581e-05,
"loss": 1.5063,
"step": 2310
},
{
"epoch": 0.20571023231069338,
"grad_norm": 8.852991104125977,
"learning_rate": 5.734643298124092e-05,
"loss": 1.7211,
"step": 2320
},
{
"epoch": 0.20659691434651534,
"grad_norm": 73.65837860107422,
"learning_rate": 5.702919260735016e-05,
"loss": 1.5191,
"step": 2330
},
{
"epoch": 0.2074835963823373,
"grad_norm": 8.413342475891113,
"learning_rate": 5.671166329088279e-05,
"loss": 1.5013,
"step": 2340
},
{
"epoch": 0.20837027841815925,
"grad_norm": 6.938820838928223,
"learning_rate": 5.639385808422532e-05,
"loss": 1.5099,
"step": 2350
},
{
"epoch": 0.2092569604539812,
"grad_norm": 7.757599353790283,
"learning_rate": 5.6075790051105044e-05,
"loss": 1.5848,
"step": 2360
},
{
"epoch": 0.21014364248980316,
"grad_norm": 7.502821445465088,
"learning_rate": 5.5757472266052994e-05,
"loss": 1.7166,
"step": 2370
},
{
"epoch": 0.21103032452562512,
"grad_norm": 11.332352638244629,
"learning_rate": 5.543891781386657e-05,
"loss": 1.671,
"step": 2380
},
{
"epoch": 0.21191700656144707,
"grad_norm": 7.515905380249023,
"learning_rate": 5.512013978907158e-05,
"loss": 1.6298,
"step": 2390
},
{
"epoch": 0.21280368859726903,
"grad_norm": 6.094747543334961,
"learning_rate": 5.4801151295384105e-05,
"loss": 1.5135,
"step": 2400
},
{
"epoch": 0.21280368859726903,
"eval_loss": 1.5888803005218506,
"eval_runtime": 59.4453,
"eval_samples_per_second": 8.411,
"eval_steps_per_second": 8.411,
"step": 2400
},
{
"epoch": 0.213690370633091,
"grad_norm": 7.49708366394043,
"learning_rate": 5.448196544517169e-05,
"loss": 1.5031,
"step": 2410
},
{
"epoch": 0.21457705266891292,
"grad_norm": 8.41457748413086,
"learning_rate": 5.4162595358914485e-05,
"loss": 1.5116,
"step": 2420
},
{
"epoch": 0.21546373470473487,
"grad_norm": 7.308359146118164,
"learning_rate": 5.3843054164665855e-05,
"loss": 1.4185,
"step": 2430
},
{
"epoch": 0.21635041674055683,
"grad_norm": 13.086946487426758,
"learning_rate": 5.352335499751271e-05,
"loss": 1.6723,
"step": 2440
},
{
"epoch": 0.21723709877637878,
"grad_norm": 7.7518157958984375,
"learning_rate": 5.3203510999035666e-05,
"loss": 1.4357,
"step": 2450
},
{
"epoch": 0.21812378081220074,
"grad_norm": 7.657406806945801,
"learning_rate": 5.2883535316768745e-05,
"loss": 1.5464,
"step": 2460
},
{
"epoch": 0.2190104628480227,
"grad_norm": 6.197967529296875,
"learning_rate": 5.2563441103658975e-05,
"loss": 1.3296,
"step": 2470
},
{
"epoch": 0.21989714488384465,
"grad_norm": 10.529012680053711,
"learning_rate": 5.224324151752577e-05,
"loss": 1.6508,
"step": 2480
},
{
"epoch": 0.2207838269196666,
"grad_norm": 7.516609191894531,
"learning_rate": 5.1922949720519935e-05,
"loss": 1.5441,
"step": 2490
},
{
"epoch": 0.22167050895548857,
"grad_norm": 5.759303569793701,
"learning_rate": 5.160257887858279e-05,
"loss": 1.5121,
"step": 2500
},
{
"epoch": 0.22255719099131052,
"grad_norm": 9.412184715270996,
"learning_rate": 5.1282142160904794e-05,
"loss": 1.5882,
"step": 2510
},
{
"epoch": 0.22344387302713248,
"grad_norm": 6.849535942077637,
"learning_rate": 5.096165273938437e-05,
"loss": 1.4593,
"step": 2520
},
{
"epoch": 0.22433055506295443,
"grad_norm": 6.8701300621032715,
"learning_rate": 5.064112378808638e-05,
"loss": 1.6232,
"step": 2530
},
{
"epoch": 0.2252172370987764,
"grad_norm": 10.939626693725586,
"learning_rate": 5.032056848270057e-05,
"loss": 1.605,
"step": 2540
},
{
"epoch": 0.22610391913459835,
"grad_norm": 9.180508613586426,
"learning_rate": 5.0000000000000016e-05,
"loss": 1.572,
"step": 2550
},
{
"epoch": 0.22610391913459835,
"eval_loss": 1.5821589231491089,
"eval_runtime": 59.3605,
"eval_samples_per_second": 8.423,
"eval_steps_per_second": 8.423,
"step": 2550
},
{
"epoch": 0.22699060117042028,
"grad_norm": 7.859452247619629,
"learning_rate": 4.967943151729946e-05,
"loss": 1.4249,
"step": 2560
},
{
"epoch": 0.22787728320624223,
"grad_norm": 8.19686222076416,
"learning_rate": 4.935887621191365e-05,
"loss": 1.5235,
"step": 2570
},
{
"epoch": 0.2287639652420642,
"grad_norm": 6.515259265899658,
"learning_rate": 4.903834726061566e-05,
"loss": 1.6435,
"step": 2580
},
{
"epoch": 0.22965064727788614,
"grad_norm": 7.381898403167725,
"learning_rate": 4.8717857839095245e-05,
"loss": 1.5029,
"step": 2590
},
{
"epoch": 0.2305373293137081,
"grad_norm": 7.736898422241211,
"learning_rate": 4.8397421121417256e-05,
"loss": 1.4735,
"step": 2600
},
{
"epoch": 0.23142401134953006,
"grad_norm": 5.957932949066162,
"learning_rate": 4.807705027948009e-05,
"loss": 1.4774,
"step": 2610
},
{
"epoch": 0.232310693385352,
"grad_norm": 6.916577339172363,
"learning_rate": 4.7756758482474285e-05,
"loss": 1.5927,
"step": 2620
},
{
"epoch": 0.23319737542117397,
"grad_norm": 11.942724227905273,
"learning_rate": 4.7436558896341064e-05,
"loss": 1.4889,
"step": 2630
},
{
"epoch": 0.23408405745699593,
"grad_norm": 6.2363057136535645,
"learning_rate": 4.71164646832313e-05,
"loss": 1.5574,
"step": 2640
},
{
"epoch": 0.23497073949281788,
"grad_norm": 7.307931423187256,
"learning_rate": 4.679648900096437e-05,
"loss": 1.4904,
"step": 2650
},
{
"epoch": 0.23585742152863984,
"grad_norm": 6.648471355438232,
"learning_rate": 4.6476645002487314e-05,
"loss": 1.5208,
"step": 2660
},
{
"epoch": 0.2367441035644618,
"grad_norm": 7.0004167556762695,
"learning_rate": 4.615694583533419e-05,
"loss": 1.5019,
"step": 2670
},
{
"epoch": 0.23763078560028375,
"grad_norm": 7.872572422027588,
"learning_rate": 4.583740464108555e-05,
"loss": 1.5381,
"step": 2680
},
{
"epoch": 0.23851746763610568,
"grad_norm": 8.477245330810547,
"learning_rate": 4.5518034554828346e-05,
"loss": 1.5339,
"step": 2690
},
{
"epoch": 0.23940414967192764,
"grad_norm": 9.522911071777344,
"learning_rate": 4.519884870461593e-05,
"loss": 1.5553,
"step": 2700
},
{
"epoch": 0.23940414967192764,
"eval_loss": 1.5684587955474854,
"eval_runtime": 59.3936,
"eval_samples_per_second": 8.418,
"eval_steps_per_second": 8.418,
"step": 2700
},
{
"epoch": 0.2402908317077496,
"grad_norm": 6.57583475112915,
"learning_rate": 4.487986021092845e-05,
"loss": 1.584,
"step": 2710
},
{
"epoch": 0.24117751374357155,
"grad_norm": 10.243136405944824,
"learning_rate": 4.456108218613348e-05,
"loss": 1.5983,
"step": 2720
},
{
"epoch": 0.2420641957793935,
"grad_norm": 7.099559307098389,
"learning_rate": 4.424252773394705e-05,
"loss": 1.5817,
"step": 2730
},
{
"epoch": 0.24295087781521546,
"grad_norm": 7.89496374130249,
"learning_rate": 4.3924209948894995e-05,
"loss": 1.6713,
"step": 2740
},
{
"epoch": 0.24383755985103742,
"grad_norm": 7.425642490386963,
"learning_rate": 4.360614191577471e-05,
"loss": 1.4357,
"step": 2750
},
{
"epoch": 0.24472424188685937,
"grad_norm": 7.661828994750977,
"learning_rate": 4.3288336709117256e-05,
"loss": 1.4964,
"step": 2760
},
{
"epoch": 0.24561092392268133,
"grad_norm": 9.707741737365723,
"learning_rate": 4.297080739264988e-05,
"loss": 1.3392,
"step": 2770
},
{
"epoch": 0.24649760595850329,
"grad_norm": 6.905391693115234,
"learning_rate": 4.2653567018759114e-05,
"loss": 1.4212,
"step": 2780
},
{
"epoch": 0.24738428799432524,
"grad_norm": 7.846536636352539,
"learning_rate": 4.233662862795421e-05,
"loss": 1.6355,
"step": 2790
},
{
"epoch": 0.2482709700301472,
"grad_norm": 6.390925884246826,
"learning_rate": 4.202000524833106e-05,
"loss": 1.4986,
"step": 2800
},
{
"epoch": 0.24915765206596915,
"grad_norm": 7.581704616546631,
"learning_rate": 4.170370989503664e-05,
"loss": 1.6026,
"step": 2810
},
{
"epoch": 0.2500443341017911,
"grad_norm": 4.918117523193359,
"learning_rate": 4.138775556973407e-05,
"loss": 1.4725,
"step": 2820
},
{
"epoch": 0.25093101613761304,
"grad_norm": 12.692071914672852,
"learning_rate": 4.1072155260068185e-05,
"loss": 1.6113,
"step": 2830
},
{
"epoch": 0.251817698173435,
"grad_norm": 6.600620746612549,
"learning_rate": 4.075692193913158e-05,
"loss": 1.5967,
"step": 2840
},
{
"epoch": 0.25270438020925695,
"grad_norm": 6.879825115203857,
"learning_rate": 4.0442068564931405e-05,
"loss": 1.4693,
"step": 2850
},
{
"epoch": 0.25270438020925695,
"eval_loss": 1.5608752965927124,
"eval_runtime": 59.3283,
"eval_samples_per_second": 8.428,
"eval_steps_per_second": 8.428,
"step": 2850
},
{
"epoch": 0.25359106224507894,
"grad_norm": 7.351998329162598,
"learning_rate": 4.012760807985666e-05,
"loss": 1.5118,
"step": 2860
},
{
"epoch": 0.25447774428090086,
"grad_norm": 9.371225357055664,
"learning_rate": 3.9813553410146234e-05,
"loss": 1.5299,
"step": 2870
},
{
"epoch": 0.25536442631672285,
"grad_norm": 7.500007152557373,
"learning_rate": 3.949991746535754e-05,
"loss": 1.5855,
"step": 2880
},
{
"epoch": 0.2562511083525448,
"grad_norm": 7.6006903648376465,
"learning_rate": 3.918671313783584e-05,
"loss": 1.5459,
"step": 2890
},
{
"epoch": 0.2571377903883667,
"grad_norm": 6.81592321395874,
"learning_rate": 3.8873953302184295e-05,
"loss": 1.361,
"step": 2900
},
{
"epoch": 0.2580244724241887,
"grad_norm": 6.851174831390381,
"learning_rate": 3.856165081473475e-05,
"loss": 1.2751,
"step": 2910
},
{
"epoch": 0.2589111544600106,
"grad_norm": 8.746306419372559,
"learning_rate": 3.824981851301925e-05,
"loss": 1.3964,
"step": 2920
},
{
"epoch": 0.2597978364958326,
"grad_norm": 8.92397689819336,
"learning_rate": 3.7938469215242386e-05,
"loss": 1.5833,
"step": 2930
},
{
"epoch": 0.26068451853165453,
"grad_norm": 12.532337188720703,
"learning_rate": 3.762761571975431e-05,
"loss": 1.754,
"step": 2940
},
{
"epoch": 0.2615712005674765,
"grad_norm": 7.304866313934326,
"learning_rate": 3.731727080452465e-05,
"loss": 1.5328,
"step": 2950
},
{
"epoch": 0.26245788260329844,
"grad_norm": 7.864557266235352,
"learning_rate": 3.700744722661737e-05,
"loss": 1.5286,
"step": 2960
},
{
"epoch": 0.2633445646391204,
"grad_norm": 6.201906204223633,
"learning_rate": 3.669815772166626e-05,
"loss": 1.5775,
"step": 2970
},
{
"epoch": 0.26423124667494235,
"grad_norm": 8.181777954101562,
"learning_rate": 3.6389415003351454e-05,
"loss": 1.6203,
"step": 2980
},
{
"epoch": 0.26511792871076434,
"grad_norm": 8.13985824584961,
"learning_rate": 3.608123176287686e-05,
"loss": 1.4212,
"step": 2990
},
{
"epoch": 0.26600461074658627,
"grad_norm": 7.873915672302246,
"learning_rate": 3.577362066844839e-05,
"loss": 1.4327,
"step": 3000
},
{
"epoch": 0.26600461074658627,
"eval_loss": 1.5497733354568481,
"eval_runtime": 59.3515,
"eval_samples_per_second": 8.424,
"eval_steps_per_second": 8.424,
"step": 3000
},
{
"epoch": 0.26689129278240825,
"grad_norm": 9.156728744506836,
"learning_rate": 3.546659436475333e-05,
"loss": 1.4905,
"step": 3010
},
{
"epoch": 0.2677779748182302,
"grad_norm": 9.707823753356934,
"learning_rate": 3.516016547244048e-05,
"loss": 1.537,
"step": 3020
},
{
"epoch": 0.26866465685405216,
"grad_norm": 7.352388858795166,
"learning_rate": 3.485434658760141e-05,
"loss": 1.4339,
"step": 3030
},
{
"epoch": 0.2695513388898741,
"grad_norm": 6.827575206756592,
"learning_rate": 3.454915028125264e-05,
"loss": 1.4186,
"step": 3040
},
{
"epoch": 0.270438020925696,
"grad_norm": 10.17138671875,
"learning_rate": 3.424458909881898e-05,
"loss": 1.7247,
"step": 3050
},
{
"epoch": 0.271324702961518,
"grad_norm": 6.4703850746154785,
"learning_rate": 3.394067555961773e-05,
"loss": 1.5231,
"step": 3060
},
{
"epoch": 0.27221138499733993,
"grad_norm": 10.700502395629883,
"learning_rate": 3.363742215634417e-05,
"loss": 1.6463,
"step": 3070
},
{
"epoch": 0.2730980670331619,
"grad_norm": 7.374842643737793,
"learning_rate": 3.333484135455793e-05,
"loss": 1.4507,
"step": 3080
},
{
"epoch": 0.27398474906898385,
"grad_norm": 20.006195068359375,
"learning_rate": 3.303294559217064e-05,
"loss": 1.5481,
"step": 3090
},
{
"epoch": 0.27487143110480583,
"grad_norm": 9.058706283569336,
"learning_rate": 3.273174727893464e-05,
"loss": 1.4567,
"step": 3100
},
{
"epoch": 0.27575811314062776,
"grad_norm": 6.159706115722656,
"learning_rate": 3.243125879593287e-05,
"loss": 1.6317,
"step": 3110
},
{
"epoch": 0.27664479517644974,
"grad_norm": 7.272984981536865,
"learning_rate": 3.213149249506998e-05,
"loss": 1.4127,
"step": 3120
},
{
"epoch": 0.27753147721227167,
"grad_norm": 7.003290176391602,
"learning_rate": 3.183246069856444e-05,
"loss": 1.4704,
"step": 3130
},
{
"epoch": 0.27841815924809366,
"grad_norm": 9.395560264587402,
"learning_rate": 3.15341756984422e-05,
"loss": 1.4897,
"step": 3140
},
{
"epoch": 0.2793048412839156,
"grad_norm": 9.366192817687988,
"learning_rate": 3.123664975603131e-05,
"loss": 1.4859,
"step": 3150
},
{
"epoch": 0.2793048412839156,
"eval_loss": 1.5420976877212524,
"eval_runtime": 59.3868,
"eval_samples_per_second": 8.419,
"eval_steps_per_second": 8.419,
"step": 3150
},
{
"epoch": 0.28019152331973757,
"grad_norm": 9.653646469116211,
"learning_rate": 3.093989510145792e-05,
"loss": 1.5305,
"step": 3160
},
{
"epoch": 0.2810782053555595,
"grad_norm": 7.124954700469971,
"learning_rate": 3.0643923933143614e-05,
"loss": 1.5319,
"step": 3170
},
{
"epoch": 0.2819648873913814,
"grad_norm": 7.334105968475342,
"learning_rate": 3.0348748417303834e-05,
"loss": 1.4719,
"step": 3180
},
{
"epoch": 0.2828515694272034,
"grad_norm": 8.093483924865723,
"learning_rate": 3.005438068744793e-05,
"loss": 1.6516,
"step": 3190
},
{
"epoch": 0.28373825146302534,
"grad_norm": 7.165219306945801,
"learning_rate": 2.9760832843880317e-05,
"loss": 1.5383,
"step": 3200
},
{
"epoch": 0.2846249334988473,
"grad_norm": 8.381077766418457,
"learning_rate": 2.9468116953203113e-05,
"loss": 1.5084,
"step": 3210
},
{
"epoch": 0.28551161553466925,
"grad_norm": 8.025052070617676,
"learning_rate": 2.917624504782007e-05,
"loss": 1.4437,
"step": 3220
},
{
"epoch": 0.28639829757049123,
"grad_norm": 6.2781453132629395,
"learning_rate": 2.8885229125442027e-05,
"loss": 1.6153,
"step": 3230
},
{
"epoch": 0.28728497960631316,
"grad_norm": 7.848852157592773,
"learning_rate": 2.8595081148593748e-05,
"loss": 1.4542,
"step": 3240
},
{
"epoch": 0.28817166164213515,
"grad_norm": 9.784353256225586,
"learning_rate": 2.8305813044122107e-05,
"loss": 1.4358,
"step": 3250
},
{
"epoch": 0.2890583436779571,
"grad_norm": 8.469407081604004,
"learning_rate": 2.8017436702705908e-05,
"loss": 1.4224,
"step": 3260
},
{
"epoch": 0.28994502571377906,
"grad_norm": 7.443441390991211,
"learning_rate": 2.7729963978367048e-05,
"loss": 1.5289,
"step": 3270
},
{
"epoch": 0.290831707749601,
"grad_norm": 9.007468223571777,
"learning_rate": 2.7443406687983272e-05,
"loss": 1.4351,
"step": 3280
},
{
"epoch": 0.29171838978542297,
"grad_norm": 9.351861953735352,
"learning_rate": 2.715777661080242e-05,
"loss": 1.5687,
"step": 3290
},
{
"epoch": 0.2926050718212449,
"grad_norm": 7.077907562255859,
"learning_rate": 2.6873085487958257e-05,
"loss": 1.7491,
"step": 3300
},
{
"epoch": 0.2926050718212449,
"eval_loss": 1.5311921834945679,
"eval_runtime": 59.4179,
"eval_samples_per_second": 8.415,
"eval_steps_per_second": 8.415,
"step": 3300
},
{
"epoch": 0.29349175385706683,
"grad_norm": 5.730030059814453,
"learning_rate": 2.6589345021987728e-05,
"loss": 1.5217,
"step": 3310
},
{
"epoch": 0.2943784358928888,
"grad_norm": 6.85732889175415,
"learning_rate": 2.6306566876350076e-05,
"loss": 1.4184,
"step": 3320
},
{
"epoch": 0.29526511792871074,
"grad_norm": 9.774615287780762,
"learning_rate": 2.602476267494732e-05,
"loss": 1.4287,
"step": 3330
},
{
"epoch": 0.2961517999645327,
"grad_norm": 6.502627372741699,
"learning_rate": 2.5743944001646398e-05,
"loss": 1.6562,
"step": 3340
},
{
"epoch": 0.29703848200035465,
"grad_norm": 10.487425804138184,
"learning_rate": 2.546412239980313e-05,
"loss": 1.5361,
"step": 3350
},
{
"epoch": 0.29792516403617664,
"grad_norm": 6.752458095550537,
"learning_rate": 2.518530937178752e-05,
"loss": 1.473,
"step": 3360
},
{
"epoch": 0.29881184607199857,
"grad_norm": 9.102508544921875,
"learning_rate": 2.4907516378511142e-05,
"loss": 1.626,
"step": 3370
},
{
"epoch": 0.29969852810782055,
"grad_norm": 11.789603233337402,
"learning_rate": 2.4630754838955908e-05,
"loss": 1.5676,
"step": 3380
},
{
"epoch": 0.3005852101436425,
"grad_norm": 4.690525531768799,
"learning_rate": 2.4355036129704707e-05,
"loss": 1.3247,
"step": 3390
},
{
"epoch": 0.30147189217946446,
"grad_norm": 8.3900785446167,
"learning_rate": 2.4080371584473755e-05,
"loss": 1.5134,
"step": 3400
},
{
"epoch": 0.3023585742152864,
"grad_norm": 6.29799747467041,
"learning_rate": 2.380677249364673e-05,
"loss": 1.4641,
"step": 3410
},
{
"epoch": 0.3032452562511084,
"grad_norm": 6.937891960144043,
"learning_rate": 2.3534250103810636e-05,
"loss": 1.5212,
"step": 3420
},
{
"epoch": 0.3041319382869303,
"grad_norm": 8.222491264343262,
"learning_rate": 2.326281561729352e-05,
"loss": 1.452,
"step": 3430
},
{
"epoch": 0.3050186203227523,
"grad_norm": 7.146228313446045,
"learning_rate": 2.299248019170401e-05,
"loss": 1.4393,
"step": 3440
},
{
"epoch": 0.3059053023585742,
"grad_norm": 8.970130920410156,
"learning_rate": 2.2723254939472577e-05,
"loss": 1.5834,
"step": 3450
},
{
"epoch": 0.3059053023585742,
"eval_loss": 1.5162127017974854,
"eval_runtime": 59.4395,
"eval_samples_per_second": 8.412,
"eval_steps_per_second": 8.412,
"step": 3450
},
{
"epoch": 0.30679198439439614,
"grad_norm": 8.676182746887207,
"learning_rate": 2.2455150927394888e-05,
"loss": 1.4935,
"step": 3460
},
{
"epoch": 0.30767866643021813,
"grad_norm": 7.3444719314575195,
"learning_rate": 2.2188179176176773e-05,
"loss": 1.4348,
"step": 3470
},
{
"epoch": 0.30856534846604006,
"grad_norm": 8.14013957977295,
"learning_rate": 2.1922350659981268e-05,
"loss": 1.486,
"step": 3480
},
{
"epoch": 0.30945203050186204,
"grad_norm": 8.495216369628906,
"learning_rate": 2.1657676305977525e-05,
"loss": 1.4254,
"step": 3490
},
{
"epoch": 0.31033871253768397,
"grad_norm": 6.765787601470947,
"learning_rate": 2.1394166993891536e-05,
"loss": 1.4532,
"step": 3500
},
{
"epoch": 0.31122539457350595,
"grad_norm": 7.284378528594971,
"learning_rate": 2.1131833555559044e-05,
"loss": 1.4648,
"step": 3510
},
{
"epoch": 0.3121120766093279,
"grad_norm": 5.678081512451172,
"learning_rate": 2.0870686774480203e-05,
"loss": 1.4927,
"step": 3520
},
{
"epoch": 0.31299875864514987,
"grad_norm": 7.111896514892578,
"learning_rate": 2.0610737385376356e-05,
"loss": 1.3698,
"step": 3530
},
{
"epoch": 0.3138854406809718,
"grad_norm": 8.32049560546875,
"learning_rate": 2.035199607374872e-05,
"loss": 1.3701,
"step": 3540
},
{
"epoch": 0.3147721227167938,
"grad_norm": 5.531152248382568,
"learning_rate": 2.009447347543921e-05,
"loss": 1.4623,
"step": 3550
},
{
"epoch": 0.3156588047526157,
"grad_norm": 11.240205764770508,
"learning_rate": 1.983818017619318e-05,
"loss": 1.4952,
"step": 3560
},
{
"epoch": 0.3165454867884377,
"grad_norm": 9.017026901245117,
"learning_rate": 1.9583126711224347e-05,
"loss": 1.3575,
"step": 3570
},
{
"epoch": 0.3174321688242596,
"grad_norm": 7.459463596343994,
"learning_rate": 1.9329323564781685e-05,
"loss": 1.5883,
"step": 3580
},
{
"epoch": 0.31831885086008155,
"grad_norm": 8.614239692687988,
"learning_rate": 1.907678116971843e-05,
"loss": 1.3795,
"step": 3590
},
{
"epoch": 0.31920553289590353,
"grad_norm": 7.115685939788818,
"learning_rate": 1.882550990706333e-05,
"loss": 1.4851,
"step": 3600
},
{
"epoch": 0.31920553289590353,
"eval_loss": 1.507421612739563,
"eval_runtime": 59.4282,
"eval_samples_per_second": 8.414,
"eval_steps_per_second": 8.414,
"step": 3600
},
{
"epoch": 0.32009221493172546,
"grad_norm": 7.0822882652282715,
"learning_rate": 1.8575520105593824e-05,
"loss": 1.4626,
"step": 3610
},
{
"epoch": 0.32097889696754744,
"grad_norm": 8.020169258117676,
"learning_rate": 1.8326822041411528e-05,
"loss": 1.4378,
"step": 3620
},
{
"epoch": 0.3218655790033694,
"grad_norm": 7.411886692047119,
"learning_rate": 1.8079425937519732e-05,
"loss": 1.4582,
"step": 3630
},
{
"epoch": 0.32275226103919136,
"grad_norm": 8.050488471984863,
"learning_rate": 1.7833341963403314e-05,
"loss": 1.5404,
"step": 3640
},
{
"epoch": 0.3236389430750133,
"grad_norm": 7.69305944442749,
"learning_rate": 1.7588580234610594e-05,
"loss": 1.3396,
"step": 3650
},
{
"epoch": 0.32452562511083527,
"grad_norm": 10.572087287902832,
"learning_rate": 1.7345150812337567e-05,
"loss": 1.5023,
"step": 3660
},
{
"epoch": 0.3254123071466572,
"grad_norm": 9.625089645385742,
"learning_rate": 1.7103063703014376e-05,
"loss": 1.3099,
"step": 3670
},
{
"epoch": 0.3262989891824792,
"grad_norm": 7.519534587860107,
"learning_rate": 1.686232885789386e-05,
"loss": 1.4512,
"step": 3680
},
{
"epoch": 0.3271856712183011,
"grad_norm": 8.976761817932129,
"learning_rate": 1.6622956172642604e-05,
"loss": 1.5594,
"step": 3690
},
{
"epoch": 0.3280723532541231,
"grad_norm": 6.362197399139404,
"learning_rate": 1.638495548693416e-05,
"loss": 1.3491,
"step": 3700
},
{
"epoch": 0.328959035289945,
"grad_norm": 7.0223469734191895,
"learning_rate": 1.6148336584044543e-05,
"loss": 1.5594,
"step": 3710
},
{
"epoch": 0.329845717325767,
"grad_norm": 7.8357625007629395,
"learning_rate": 1.5913109190450035e-05,
"loss": 1.6264,
"step": 3720
},
{
"epoch": 0.33073239936158894,
"grad_norm": 7.040388107299805,
"learning_rate": 1.5679282975427494e-05,
"loss": 1.4702,
"step": 3730
},
{
"epoch": 0.33161908139741086,
"grad_norm": 8.075628280639648,
"learning_rate": 1.5446867550656772e-05,
"loss": 1.5274,
"step": 3740
},
{
"epoch": 0.33250576343323285,
"grad_norm": 10.086247444152832,
"learning_rate": 1.5215872469825685e-05,
"loss": 1.5678,
"step": 3750
},
{
"epoch": 0.33250576343323285,
"eval_loss": 1.499360203742981,
"eval_runtime": 59.3955,
"eval_samples_per_second": 8.418,
"eval_steps_per_second": 8.418,
"step": 3750
},
{
"epoch": 0.3333924454690548,
"grad_norm": 6.615363597869873,
"learning_rate": 1.4986307228237271e-05,
"loss": 1.4275,
"step": 3760
},
{
"epoch": 0.33427912750487676,
"grad_norm": 10.221020698547363,
"learning_rate": 1.4758181262419428e-05,
"loss": 1.5383,
"step": 3770
},
{
"epoch": 0.3351658095406987,
"grad_norm": 7.772680759429932,
"learning_rate": 1.4531503949737111e-05,
"loss": 1.4759,
"step": 3780
},
{
"epoch": 0.3360524915765207,
"grad_norm": 12.315176963806152,
"learning_rate": 1.4306284608006841e-05,
"loss": 1.6371,
"step": 3790
},
{
"epoch": 0.3369391736123426,
"grad_norm": 5.77493953704834,
"learning_rate": 1.408253249511363e-05,
"loss": 1.5273,
"step": 3800
},
{
"epoch": 0.3378258556481646,
"grad_norm": 7.274715423583984,
"learning_rate": 1.3860256808630431e-05,
"loss": 1.5488,
"step": 3810
},
{
"epoch": 0.3387125376839865,
"grad_norm": 5.848362922668457,
"learning_rate": 1.3639466685440138e-05,
"loss": 1.3308,
"step": 3820
},
{
"epoch": 0.3395992197198085,
"grad_norm": 7.360718727111816,
"learning_rate": 1.3420171201359936e-05,
"loss": 1.3528,
"step": 3830
},
{
"epoch": 0.3404859017556304,
"grad_norm": 7.868961334228516,
"learning_rate": 1.3202379370768256e-05,
"loss": 1.4614,
"step": 3840
},
{
"epoch": 0.3413725837914524,
"grad_norm": 9.383559226989746,
"learning_rate": 1.2986100146234235e-05,
"loss": 1.4874,
"step": 3850
},
{
"epoch": 0.34225926582727434,
"grad_norm": 11.13818645477295,
"learning_rate": 1.277134241814966e-05,
"loss": 1.3849,
"step": 3860
},
{
"epoch": 0.34314594786309627,
"grad_norm": 10.342170715332031,
"learning_rate": 1.2558115014363595e-05,
"loss": 1.3115,
"step": 3870
},
{
"epoch": 0.34403262989891825,
"grad_norm": 6.494439125061035,
"learning_rate": 1.2346426699819462e-05,
"loss": 1.4533,
"step": 3880
},
{
"epoch": 0.3449193119347402,
"grad_norm": 9.364208221435547,
"learning_rate": 1.2136286176194748e-05,
"loss": 1.4229,
"step": 3890
},
{
"epoch": 0.34580599397056216,
"grad_norm": 6.627419471740723,
"learning_rate": 1.1927702081543282e-05,
"loss": 1.2769,
"step": 3900
},
{
"epoch": 0.34580599397056216,
"eval_loss": 1.4924039840698242,
"eval_runtime": 59.4072,
"eval_samples_per_second": 8.416,
"eval_steps_per_second": 8.416,
"step": 3900
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3131363193066783e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}