v-r-5000 / trainer_state.json
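The JSON below is the Trainer state saved alongside this checkpoint: "log_history" holds one entry per logging interval (every 10 steps here), each with "epoch", "grad_norm", "learning_rate", "loss", and "step", while "global_step" and "epoch" at the top record how far training had progressed. A minimal sketch, assuming the file has been downloaded locally as trainer_state.json and that matplotlib is installed, of how the logged loss curve could be read back and plotted:

# Illustrative usage sketch, not part of the uploaded checkpoint.
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the entries that carry a training loss.
logged = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logged]
losses = [e["loss"] for e in logged]

plt.plot(steps, losses, label="training loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.title(f"{state['global_step']} steps, epoch {state['epoch']:.3f}")
plt.show()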
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21892377074302727,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00043784754148605456,
"grad_norm": 11.96724490271355,
"learning_rate": 7.519033870117711e-06,
"loss": 0.682,
"step": 10
},
{
"epoch": 0.0008756950829721091,
"grad_norm": 16.377854891876865,
"learning_rate": 9.782488603436574e-06,
"loss": 0.5773,
"step": 20
},
{
"epoch": 0.0013135426244581636,
"grad_norm": 19.097945880780852,
"learning_rate": 1.1106524744507912e-05,
"loss": 0.4648,
"step": 30
},
{
"epoch": 0.0017513901659442182,
"grad_norm": 17.780105205349074,
"learning_rate": 1.2045943336755435e-05,
"loss": 0.5231,
"step": 40
},
{
"epoch": 0.002189237707430273,
"grad_norm": 11.341588174257303,
"learning_rate": 1.2774613006916558e-05,
"loss": 0.373,
"step": 50
},
{
"epoch": 0.002627085248916327,
"grad_norm": 12.468061614573642,
"learning_rate": 1.3369979477826773e-05,
"loss": 0.4499,
"step": 60
},
{
"epoch": 0.003064932790402382,
"grad_norm": 14.844241870962383,
"learning_rate": 1.3873354656555003e-05,
"loss": 0.4481,
"step": 70
},
{
"epoch": 0.0035027803318884365,
"grad_norm": 13.17618773461328,
"learning_rate": 1.4268322372265782e-05,
"loss": 0.4385,
"step": 80
},
{
"epoch": 0.003940627873374491,
"grad_norm": 13.959456456486652,
"learning_rate": 1.4657529486032087e-05,
"loss": 0.5157,
"step": 90
},
{
"epoch": 0.004378475414860546,
"grad_norm": 9.088640068563528,
"learning_rate": 1.5005248620577926e-05,
"loss": 0.5372,
"step": 100
},
{
"epoch": 0.0048163229563466005,
"grad_norm": 8.985973448159925,
"learning_rate": 1.531947884589086e-05,
"loss": 0.3981,
"step": 110
},
{
"epoch": 0.005254170497832654,
"grad_norm": 16.03083595499672,
"learning_rate": 1.5606107901730336e-05,
"loss": 0.4008,
"step": 120
},
{
"epoch": 0.005692018039318709,
"grad_norm": 23.2563822200454,
"learning_rate": 1.586959551766198e-05,
"loss": 0.4434,
"step": 130
},
{
"epoch": 0.006129865580804764,
"grad_norm": 13.106170808319108,
"learning_rate": 1.611340086727408e-05,
"loss": 0.4264,
"step": 140
},
{
"epoch": 0.006567713122290818,
"grad_norm": 10.892983486327262,
"learning_rate": 1.634026115826661e-05,
"loss": 0.538,
"step": 150
},
{
"epoch": 0.007005560663776873,
"grad_norm": 15.273372562202105,
"learning_rate": 1.6531777105584646e-05,
"loss": 0.4921,
"step": 160
},
{
"epoch": 0.007443408205262928,
"grad_norm": 12.078764233569435,
"learning_rate": 1.6732175860784077e-05,
"loss": 0.4219,
"step": 170
},
{
"epoch": 0.007881255746748982,
"grad_norm": 18.9218120999161,
"learning_rate": 1.692098421935095e-05,
"loss": 0.51,
"step": 180
},
{
"epoch": 0.008319103288235036,
"grad_norm": 11.062242407951706,
"learning_rate": 1.7099469894607657e-05,
"loss": 0.4021,
"step": 190
},
{
"epoch": 0.008756950829721092,
"grad_norm": 10.4039117059008,
"learning_rate": 1.726870335389679e-05,
"loss": 0.414,
"step": 200
},
{
"epoch": 0.009194798371207145,
"grad_norm": 13.360369800315105,
"learning_rate": 1.742959672866302e-05,
"loss": 0.3934,
"step": 210
},
{
"epoch": 0.009632645912693201,
"grad_norm": 17.885935179522868,
"learning_rate": 1.7582933579209726e-05,
"loss": 0.4299,
"step": 220
},
{
"epoch": 0.010070493454179255,
"grad_norm": 15.613293466188615,
"learning_rate": 1.7729391978127236e-05,
"loss": 0.4348,
"step": 230
},
{
"epoch": 0.010508340995665309,
"grad_norm": 8.671536275760825,
"learning_rate": 1.7869562635049198e-05,
"loss": 0.4612,
"step": 240
},
{
"epoch": 0.010946188537151364,
"grad_norm": 16.1819762006921,
"learning_rate": 1.8003963288492603e-05,
"loss": 0.3756,
"step": 250
},
{
"epoch": 0.011384036078637418,
"grad_norm": 13.36774911152187,
"learning_rate": 1.813305025098084e-05,
"loss": 0.4427,
"step": 260
},
{
"epoch": 0.011821883620123473,
"grad_norm": 10.629955258208,
"learning_rate": 1.8257227757513754e-05,
"loss": 0.4213,
"step": 270
},
{
"epoch": 0.012259731161609527,
"grad_norm": 19.332750684355226,
"learning_rate": 1.8376855600592943e-05,
"loss": 0.346,
"step": 280
},
{
"epoch": 0.012697578703095583,
"grad_norm": 16.066310801594668,
"learning_rate": 1.8492255415374714e-05,
"loss": 0.4522,
"step": 290
},
{
"epoch": 0.013135426244581637,
"grad_norm": 14.844884825041587,
"learning_rate": 1.860371589158547e-05,
"loss": 0.4505,
"step": 300
},
{
"epoch": 0.01357327378606769,
"grad_norm": 16.539413080837956,
"learning_rate": 1.8711497124872535e-05,
"loss": 0.4296,
"step": 310
},
{
"epoch": 0.014011121327553746,
"grad_norm": 11.178304803994907,
"learning_rate": 1.8815834272664066e-05,
"loss": 0.4404,
"step": 320
},
{
"epoch": 0.0144489688690398,
"grad_norm": 9.171001542204067,
"learning_rate": 1.8916940643811347e-05,
"loss": 0.3785,
"step": 330
},
{
"epoch": 0.014886816410525855,
"grad_norm": 18.255820822315396,
"learning_rate": 1.9015010324094007e-05,
"loss": 0.4524,
"step": 340
},
{
"epoch": 0.015324663952011909,
"grad_norm": 13.044007567571079,
"learning_rate": 1.911022041882251e-05,
"loss": 0.4876,
"step": 350
},
{
"epoch": 0.015762511493497965,
"grad_norm": 13.758756866189971,
"learning_rate": 1.9202732977654023e-05,
"loss": 0.5539,
"step": 360
},
{
"epoch": 0.01620035903498402,
"grad_norm": 15.903726373816992,
"learning_rate": 1.929269665417383e-05,
"loss": 0.4976,
"step": 370
},
{
"epoch": 0.016638206576470072,
"grad_norm": 15.248268904693534,
"learning_rate": 1.938024814292676e-05,
"loss": 0.5259,
"step": 380
},
{
"epoch": 0.017076054117956128,
"grad_norm": 10.722522838992456,
"learning_rate": 1.9465513428778125e-05,
"loss": 0.489,
"step": 390
},
{
"epoch": 0.017513901659442183,
"grad_norm": 16.61965558187522,
"learning_rate": 1.9548608877267744e-05,
"loss": 0.439,
"step": 400
},
{
"epoch": 0.017951749200928235,
"grad_norm": 19.548934962148113,
"learning_rate": 1.9629642189639832e-05,
"loss": 0.4282,
"step": 410
},
{
"epoch": 0.01838959674241429,
"grad_norm": 15.936747454659427,
"learning_rate": 1.9708713242215694e-05,
"loss": 0.3974,
"step": 420
},
{
"epoch": 0.018827444283900346,
"grad_norm": 17.53178788446236,
"learning_rate": 1.9785914826520243e-05,
"loss": 0.4517,
"step": 430
},
{
"epoch": 0.019265291825386402,
"grad_norm": 15.309413053796602,
"learning_rate": 1.9861333303919378e-05,
"loss": 0.4155,
"step": 440
},
{
"epoch": 0.019703139366872454,
"grad_norm": 13.248113533897241,
"learning_rate": 1.9935049186350462e-05,
"loss": 0.4589,
"step": 450
},
{
"epoch": 0.02014098690835851,
"grad_norm": 18.437663863148952,
"learning_rate": 2e-05,
"loss": 0.388,
"step": 460
},
{
"epoch": 0.020578834449844565,
"grad_norm": 14.089094860711976,
"learning_rate": 1.9995577276044317e-05,
"loss": 0.4637,
"step": 470
},
{
"epoch": 0.021016681991330617,
"grad_norm": 50.22724968906943,
"learning_rate": 1.9991154552088635e-05,
"loss": 0.3967,
"step": 480
},
{
"epoch": 0.021454529532816673,
"grad_norm": 9.448811402831021,
"learning_rate": 1.998673182813295e-05,
"loss": 0.405,
"step": 490
},
{
"epoch": 0.021892377074302728,
"grad_norm": 12.511886438347087,
"learning_rate": 1.9982309104177265e-05,
"loss": 0.4245,
"step": 500
},
{
"epoch": 0.022330224615788784,
"grad_norm": 26.23559407530874,
"learning_rate": 1.997788638022158e-05,
"loss": 0.4486,
"step": 510
},
{
"epoch": 0.022768072157274836,
"grad_norm": 10.162620448000466,
"learning_rate": 1.9973463656265898e-05,
"loss": 0.4898,
"step": 520
},
{
"epoch": 0.02320591969876089,
"grad_norm": 10.505426367087482,
"learning_rate": 1.996904093231021e-05,
"loss": 0.4011,
"step": 530
},
{
"epoch": 0.023643767240246947,
"grad_norm": 7.19153886182727,
"learning_rate": 1.9964618208354528e-05,
"loss": 0.4566,
"step": 540
},
{
"epoch": 0.024081614781733,
"grad_norm": 16.625591066795565,
"learning_rate": 1.9960195484398842e-05,
"loss": 0.3865,
"step": 550
},
{
"epoch": 0.024519462323219055,
"grad_norm": 21.196277881857508,
"learning_rate": 1.995577276044316e-05,
"loss": 0.3797,
"step": 560
},
{
"epoch": 0.02495730986470511,
"grad_norm": 19.530616395972775,
"learning_rate": 1.9951350036487472e-05,
"loss": 0.4596,
"step": 570
},
{
"epoch": 0.025395157406191166,
"grad_norm": 10.52123656050975,
"learning_rate": 1.994692731253179e-05,
"loss": 0.3836,
"step": 580
},
{
"epoch": 0.025833004947677218,
"grad_norm": 14.373796230168502,
"learning_rate": 1.9942504588576105e-05,
"loss": 0.4916,
"step": 590
},
{
"epoch": 0.026270852489163273,
"grad_norm": 11.25970447321859,
"learning_rate": 1.9938081864620424e-05,
"loss": 0.3832,
"step": 600
},
{
"epoch": 0.02670870003064933,
"grad_norm": 16.21224392326194,
"learning_rate": 1.9933659140664735e-05,
"loss": 0.4749,
"step": 610
},
{
"epoch": 0.02714654757213538,
"grad_norm": 14.71066366579976,
"learning_rate": 1.9929236416709053e-05,
"loss": 0.3882,
"step": 620
},
{
"epoch": 0.027584395113621436,
"grad_norm": 11.37877349304637,
"learning_rate": 1.992481369275337e-05,
"loss": 0.4396,
"step": 630
},
{
"epoch": 0.028022242655107492,
"grad_norm": 16.754438580812046,
"learning_rate": 1.9920390968797683e-05,
"loss": 0.4539,
"step": 640
},
{
"epoch": 0.028460090196593547,
"grad_norm": 11.479142887670093,
"learning_rate": 1.9915968244841998e-05,
"loss": 0.451,
"step": 650
},
{
"epoch": 0.0288979377380796,
"grad_norm": 8.382864831937072,
"learning_rate": 1.9911545520886316e-05,
"loss": 0.4716,
"step": 660
},
{
"epoch": 0.029335785279565655,
"grad_norm": 19.710550432188437,
"learning_rate": 1.990712279693063e-05,
"loss": 0.3545,
"step": 670
},
{
"epoch": 0.02977363282105171,
"grad_norm": 17.903211831860908,
"learning_rate": 1.9902700072974946e-05,
"loss": 0.4335,
"step": 680
},
{
"epoch": 0.030211480362537766,
"grad_norm": 16.390743229733264,
"learning_rate": 1.989827734901926e-05,
"loss": 0.3806,
"step": 690
},
{
"epoch": 0.030649327904023818,
"grad_norm": 14.980896037745874,
"learning_rate": 1.989385462506358e-05,
"loss": 0.3767,
"step": 700
},
{
"epoch": 0.031087175445509874,
"grad_norm": 11.397175060743534,
"learning_rate": 1.9889431901107894e-05,
"loss": 0.4746,
"step": 710
},
{
"epoch": 0.03152502298699593,
"grad_norm": 19.04061884539536,
"learning_rate": 1.988500917715221e-05,
"loss": 0.413,
"step": 720
},
{
"epoch": 0.031962870528481985,
"grad_norm": 13.947751231087285,
"learning_rate": 1.9880586453196527e-05,
"loss": 0.3386,
"step": 730
},
{
"epoch": 0.03240071806996804,
"grad_norm": 41.64839133720008,
"learning_rate": 1.987616372924084e-05,
"loss": 0.4444,
"step": 740
},
{
"epoch": 0.03283856561145409,
"grad_norm": 13.185903732811664,
"learning_rate": 1.9871741005285157e-05,
"loss": 0.4383,
"step": 750
},
{
"epoch": 0.033276413152940144,
"grad_norm": 15.456378713797612,
"learning_rate": 1.9867318281329472e-05,
"loss": 0.4087,
"step": 760
},
{
"epoch": 0.0337142606944262,
"grad_norm": 13.751468649371544,
"learning_rate": 1.986289555737379e-05,
"loss": 0.4245,
"step": 770
},
{
"epoch": 0.034152108235912255,
"grad_norm": 15.975026190651212,
"learning_rate": 1.9858472833418102e-05,
"loss": 0.4263,
"step": 780
},
{
"epoch": 0.03458995577739831,
"grad_norm": 11.294733923572386,
"learning_rate": 1.985405010946242e-05,
"loss": 0.5273,
"step": 790
},
{
"epoch": 0.03502780331888437,
"grad_norm": 22.907563592109977,
"learning_rate": 1.9849627385506735e-05,
"loss": 0.3766,
"step": 800
},
{
"epoch": 0.03546565086037042,
"grad_norm": 19.465013466925168,
"learning_rate": 1.9845204661551053e-05,
"loss": 0.3739,
"step": 810
},
{
"epoch": 0.03590349840185647,
"grad_norm": 13.553846560729426,
"learning_rate": 1.9840781937595365e-05,
"loss": 0.481,
"step": 820
},
{
"epoch": 0.036341345943342526,
"grad_norm": 22.910094642393723,
"learning_rate": 1.9836359213639683e-05,
"loss": 0.4659,
"step": 830
},
{
"epoch": 0.03677919348482858,
"grad_norm": 16.26462964503755,
"learning_rate": 1.9831936489683998e-05,
"loss": 0.381,
"step": 840
},
{
"epoch": 0.03721704102631464,
"grad_norm": 9.94980804411553,
"learning_rate": 1.9827513765728313e-05,
"loss": 0.4003,
"step": 850
},
{
"epoch": 0.03765488856780069,
"grad_norm": 13.328592093222,
"learning_rate": 1.9823091041772628e-05,
"loss": 0.3755,
"step": 860
},
{
"epoch": 0.03809273610928675,
"grad_norm": 18.322055654010875,
"learning_rate": 1.9818668317816946e-05,
"loss": 0.494,
"step": 870
},
{
"epoch": 0.038530583650772804,
"grad_norm": 12.090527797691786,
"learning_rate": 1.981424559386126e-05,
"loss": 0.4751,
"step": 880
},
{
"epoch": 0.03896843119225885,
"grad_norm": 19.070570141131398,
"learning_rate": 1.9809822869905576e-05,
"loss": 0.4483,
"step": 890
},
{
"epoch": 0.03940627873374491,
"grad_norm": 18.183543820420173,
"learning_rate": 1.980540014594989e-05,
"loss": 0.3881,
"step": 900
},
{
"epoch": 0.039844126275230964,
"grad_norm": 11.47870821074828,
"learning_rate": 1.980097742199421e-05,
"loss": 0.465,
"step": 910
},
{
"epoch": 0.04028197381671702,
"grad_norm": 4.662342565910864,
"learning_rate": 1.9796554698038524e-05,
"loss": 0.3553,
"step": 920
},
{
"epoch": 0.040719821358203075,
"grad_norm": 11.599443461780693,
"learning_rate": 1.979213197408284e-05,
"loss": 0.4284,
"step": 930
},
{
"epoch": 0.04115766889968913,
"grad_norm": 20.899400532717515,
"learning_rate": 1.9787709250127154e-05,
"loss": 0.3849,
"step": 940
},
{
"epoch": 0.041595516441175186,
"grad_norm": 13.83963592226824,
"learning_rate": 1.978328652617147e-05,
"loss": 0.4228,
"step": 950
},
{
"epoch": 0.042033363982661234,
"grad_norm": 10.129058777896109,
"learning_rate": 1.9778863802215787e-05,
"loss": 0.4608,
"step": 960
},
{
"epoch": 0.04247121152414729,
"grad_norm": 13.525629087473678,
"learning_rate": 1.9774441078260102e-05,
"loss": 0.4168,
"step": 970
},
{
"epoch": 0.042909059065633345,
"grad_norm": 12.286627992129988,
"learning_rate": 1.9770018354304417e-05,
"loss": 0.3782,
"step": 980
},
{
"epoch": 0.0433469066071194,
"grad_norm": 15.201916543698117,
"learning_rate": 1.976559563034873e-05,
"loss": 0.4614,
"step": 990
},
{
"epoch": 0.043784754148605456,
"grad_norm": 12.218728770361992,
"learning_rate": 1.976117290639305e-05,
"loss": 0.3442,
"step": 1000
},
{
"epoch": 0.04422260169009151,
"grad_norm": 15.316130787659379,
"learning_rate": 1.9756750182437365e-05,
"loss": 0.424,
"step": 1010
},
{
"epoch": 0.04466044923157757,
"grad_norm": 18.53714384430074,
"learning_rate": 1.975276973087725e-05,
"loss": 0.44,
"step": 1020
},
{
"epoch": 0.045098296773063616,
"grad_norm": 10.148359314556405,
"learning_rate": 1.9748347006921564e-05,
"loss": 0.4016,
"step": 1030
},
{
"epoch": 0.04553614431454967,
"grad_norm": 10.835539686055602,
"learning_rate": 1.974392428296588e-05,
"loss": 0.3899,
"step": 1040
},
{
"epoch": 0.04597399185603573,
"grad_norm": 13.894807990285942,
"learning_rate": 1.9739501559010197e-05,
"loss": 0.4385,
"step": 1050
},
{
"epoch": 0.04641183939752178,
"grad_norm": 13.530858128888262,
"learning_rate": 1.9735078835054512e-05,
"loss": 0.4324,
"step": 1060
},
{
"epoch": 0.04684968693900784,
"grad_norm": 12.011714939023786,
"learning_rate": 1.9730656111098827e-05,
"loss": 0.4168,
"step": 1070
},
{
"epoch": 0.047287534480493894,
"grad_norm": 11.289218951662432,
"learning_rate": 1.9726233387143142e-05,
"loss": 0.4438,
"step": 1080
},
{
"epoch": 0.04772538202197995,
"grad_norm": 13.657029328270559,
"learning_rate": 1.972181066318746e-05,
"loss": 0.3996,
"step": 1090
},
{
"epoch": 0.048163229563466,
"grad_norm": 8.72748000086665,
"learning_rate": 1.9717387939231775e-05,
"loss": 0.4578,
"step": 1100
},
{
"epoch": 0.04860107710495205,
"grad_norm": 10.975240759500627,
"learning_rate": 1.971296521527609e-05,
"loss": 0.4165,
"step": 1110
},
{
"epoch": 0.04903892464643811,
"grad_norm": 11.29698373153932,
"learning_rate": 1.9708542491320405e-05,
"loss": 0.3545,
"step": 1120
},
{
"epoch": 0.049476772187924165,
"grad_norm": 20.506288633859523,
"learning_rate": 1.9704119767364723e-05,
"loss": 0.3568,
"step": 1130
},
{
"epoch": 0.04991461972941022,
"grad_norm": 11.698600515801893,
"learning_rate": 1.9699697043409035e-05,
"loss": 0.4479,
"step": 1140
},
{
"epoch": 0.050352467270896276,
"grad_norm": 13.862971410140013,
"learning_rate": 1.9695274319453353e-05,
"loss": 0.3974,
"step": 1150
},
{
"epoch": 0.05079031481238233,
"grad_norm": 12.469196053464175,
"learning_rate": 1.9690851595497668e-05,
"loss": 0.4402,
"step": 1160
},
{
"epoch": 0.05122816235386838,
"grad_norm": 11.3264424935876,
"learning_rate": 1.9686428871541986e-05,
"loss": 0.3643,
"step": 1170
},
{
"epoch": 0.051666009895354435,
"grad_norm": 19.974826992464948,
"learning_rate": 1.9682006147586298e-05,
"loss": 0.3762,
"step": 1180
},
{
"epoch": 0.05210385743684049,
"grad_norm": 10.918494229304311,
"learning_rate": 1.9677583423630616e-05,
"loss": 0.3545,
"step": 1190
},
{
"epoch": 0.052541704978326546,
"grad_norm": 11.568181801830196,
"learning_rate": 1.967316069967493e-05,
"loss": 0.4007,
"step": 1200
},
{
"epoch": 0.0529795525198126,
"grad_norm": 17.075179869568725,
"learning_rate": 1.966873797571925e-05,
"loss": 0.3989,
"step": 1210
},
{
"epoch": 0.05341740006129866,
"grad_norm": 15.560201574750726,
"learning_rate": 1.9664315251763564e-05,
"loss": 0.4096,
"step": 1220
},
{
"epoch": 0.05385524760278471,
"grad_norm": 11.91165895325058,
"learning_rate": 1.965989252780788e-05,
"loss": 0.436,
"step": 1230
},
{
"epoch": 0.05429309514427076,
"grad_norm": 11.127002664395162,
"learning_rate": 1.9655469803852194e-05,
"loss": 0.3146,
"step": 1240
},
{
"epoch": 0.05473094268575682,
"grad_norm": 17.968378053215634,
"learning_rate": 1.965104707989651e-05,
"loss": 0.444,
"step": 1250
},
{
"epoch": 0.05516879022724287,
"grad_norm": 15.278957202738837,
"learning_rate": 1.9646624355940827e-05,
"loss": 0.461,
"step": 1260
},
{
"epoch": 0.05560663776872893,
"grad_norm": 14.312647123288594,
"learning_rate": 1.9642201631985142e-05,
"loss": 0.398,
"step": 1270
},
{
"epoch": 0.056044485310214984,
"grad_norm": 12.143629725906578,
"learning_rate": 1.9637778908029457e-05,
"loss": 0.3818,
"step": 1280
},
{
"epoch": 0.05648233285170104,
"grad_norm": 10.960126916192301,
"learning_rate": 1.9633356184073772e-05,
"loss": 0.4273,
"step": 1290
},
{
"epoch": 0.056920180393187095,
"grad_norm": 14.260248079643242,
"learning_rate": 1.962893346011809e-05,
"loss": 0.469,
"step": 1300
},
{
"epoch": 0.05735802793467314,
"grad_norm": 9.509984040352219,
"learning_rate": 1.9624510736162405e-05,
"loss": 0.3732,
"step": 1310
},
{
"epoch": 0.0577958754761592,
"grad_norm": 12.88397674204099,
"learning_rate": 1.962008801220672e-05,
"loss": 0.4473,
"step": 1320
},
{
"epoch": 0.058233723017645254,
"grad_norm": 13.708354791471775,
"learning_rate": 1.9615665288251035e-05,
"loss": 0.4093,
"step": 1330
},
{
"epoch": 0.05867157055913131,
"grad_norm": 11.009628243610226,
"learning_rate": 1.9611242564295353e-05,
"loss": 0.4258,
"step": 1340
},
{
"epoch": 0.059109418100617366,
"grad_norm": 15.94608664981347,
"learning_rate": 1.9606819840339664e-05,
"loss": 0.4117,
"step": 1350
},
{
"epoch": 0.05954726564210342,
"grad_norm": 8.631681646096013,
"learning_rate": 1.9602397116383983e-05,
"loss": 0.4384,
"step": 1360
},
{
"epoch": 0.05998511318358948,
"grad_norm": 12.838858533847487,
"learning_rate": 1.9597974392428298e-05,
"loss": 0.3764,
"step": 1370
},
{
"epoch": 0.06042296072507553,
"grad_norm": 41.39191821545224,
"learning_rate": 1.9593551668472616e-05,
"loss": 0.3166,
"step": 1380
},
{
"epoch": 0.06086080826656158,
"grad_norm": 13.630700275122859,
"learning_rate": 1.9589128944516927e-05,
"loss": 0.4142,
"step": 1390
},
{
"epoch": 0.061298655808047636,
"grad_norm": 10.941493062958882,
"learning_rate": 1.9584706220561246e-05,
"loss": 0.393,
"step": 1400
},
{
"epoch": 0.06173650334953369,
"grad_norm": 12.209335407537546,
"learning_rate": 1.958028349660556e-05,
"loss": 0.3882,
"step": 1410
},
{
"epoch": 0.06217435089101975,
"grad_norm": 11.160854212292483,
"learning_rate": 1.957586077264988e-05,
"loss": 0.3571,
"step": 1420
},
{
"epoch": 0.0626121984325058,
"grad_norm": 15.758848096703037,
"learning_rate": 1.957143804869419e-05,
"loss": 0.4188,
"step": 1430
},
{
"epoch": 0.06305004597399186,
"grad_norm": 14.95914949335199,
"learning_rate": 1.956701532473851e-05,
"loss": 0.3811,
"step": 1440
},
{
"epoch": 0.06348789351547791,
"grad_norm": 12.602990028799342,
"learning_rate": 1.9562592600782824e-05,
"loss": 0.351,
"step": 1450
},
{
"epoch": 0.06392574105696397,
"grad_norm": 11.138632661742742,
"learning_rate": 1.955816987682714e-05,
"loss": 0.3217,
"step": 1460
},
{
"epoch": 0.06436358859845003,
"grad_norm": 10.073650366260493,
"learning_rate": 1.9553747152871457e-05,
"loss": 0.3915,
"step": 1470
},
{
"epoch": 0.06480143613993608,
"grad_norm": 11.607027332443641,
"learning_rate": 1.954932442891577e-05,
"loss": 0.4501,
"step": 1480
},
{
"epoch": 0.06523928368142212,
"grad_norm": 11.6963320729176,
"learning_rate": 1.9544901704960087e-05,
"loss": 0.3876,
"step": 1490
},
{
"epoch": 0.06567713122290818,
"grad_norm": 12.978277397409164,
"learning_rate": 1.95404789810044e-05,
"loss": 0.4058,
"step": 1500
},
{
"epoch": 0.06611497876439423,
"grad_norm": 10.230356140798722,
"learning_rate": 1.953605625704872e-05,
"loss": 0.3503,
"step": 1510
},
{
"epoch": 0.06655282630588029,
"grad_norm": 10.411279694304964,
"learning_rate": 1.953163353309303e-05,
"loss": 0.4545,
"step": 1520
},
{
"epoch": 0.06699067384736634,
"grad_norm": 12.48524397593703,
"learning_rate": 1.952721080913735e-05,
"loss": 0.4056,
"step": 1530
},
{
"epoch": 0.0674285213888524,
"grad_norm": 9.054498757739506,
"learning_rate": 1.9522788085181664e-05,
"loss": 0.422,
"step": 1540
},
{
"epoch": 0.06786636893033846,
"grad_norm": 12.468833215574444,
"learning_rate": 1.9518365361225983e-05,
"loss": 0.3668,
"step": 1550
},
{
"epoch": 0.06830421647182451,
"grad_norm": 7.37855108081838,
"learning_rate": 1.9513942637270294e-05,
"loss": 0.3418,
"step": 1560
},
{
"epoch": 0.06874206401331057,
"grad_norm": 18.087926793945318,
"learning_rate": 1.9509519913314612e-05,
"loss": 0.4362,
"step": 1570
},
{
"epoch": 0.06917991155479662,
"grad_norm": 9.463333040806022,
"learning_rate": 1.9505097189358927e-05,
"loss": 0.351,
"step": 1580
},
{
"epoch": 0.06961775909628268,
"grad_norm": 13.249219763473633,
"learning_rate": 1.9500674465403246e-05,
"loss": 0.3334,
"step": 1590
},
{
"epoch": 0.07005560663776873,
"grad_norm": 9.494154085119547,
"learning_rate": 1.9496251741447557e-05,
"loss": 0.3766,
"step": 1600
},
{
"epoch": 0.07049345417925479,
"grad_norm": 12.84431192695635,
"learning_rate": 1.9491829017491875e-05,
"loss": 0.3904,
"step": 1610
},
{
"epoch": 0.07093130172074084,
"grad_norm": 10.983543223419419,
"learning_rate": 1.948740629353619e-05,
"loss": 0.4181,
"step": 1620
},
{
"epoch": 0.07136914926222689,
"grad_norm": 17.280817340311735,
"learning_rate": 1.948298356958051e-05,
"loss": 0.3994,
"step": 1630
},
{
"epoch": 0.07180699680371294,
"grad_norm": 11.179902123311116,
"learning_rate": 1.947856084562482e-05,
"loss": 0.41,
"step": 1640
},
{
"epoch": 0.072244844345199,
"grad_norm": 7.022304150684218,
"learning_rate": 1.947413812166914e-05,
"loss": 0.3959,
"step": 1650
},
{
"epoch": 0.07268269188668505,
"grad_norm": 11.715877252052287,
"learning_rate": 1.9469715397713453e-05,
"loss": 0.3781,
"step": 1660
},
{
"epoch": 0.07312053942817111,
"grad_norm": 10.5240917837852,
"learning_rate": 1.9465292673757768e-05,
"loss": 0.4285,
"step": 1670
},
{
"epoch": 0.07355838696965716,
"grad_norm": 13.043269992216052,
"learning_rate": 1.9460869949802083e-05,
"loss": 0.3953,
"step": 1680
},
{
"epoch": 0.07399623451114322,
"grad_norm": 10.473522043776144,
"learning_rate": 1.94564472258464e-05,
"loss": 0.3795,
"step": 1690
},
{
"epoch": 0.07443408205262927,
"grad_norm": 12.81124231632124,
"learning_rate": 1.9452024501890716e-05,
"loss": 0.4329,
"step": 1700
},
{
"epoch": 0.07487192959411533,
"grad_norm": 9.570293367327814,
"learning_rate": 1.944760177793503e-05,
"loss": 0.4536,
"step": 1710
},
{
"epoch": 0.07530977713560139,
"grad_norm": 13.781425492257062,
"learning_rate": 1.944317905397935e-05,
"loss": 0.3918,
"step": 1720
},
{
"epoch": 0.07574762467708744,
"grad_norm": 13.427510367449685,
"learning_rate": 1.943875633002366e-05,
"loss": 0.3379,
"step": 1730
},
{
"epoch": 0.0761854722185735,
"grad_norm": 12.599200486748096,
"learning_rate": 1.943433360606798e-05,
"loss": 0.3939,
"step": 1740
},
{
"epoch": 0.07662331976005955,
"grad_norm": 14.384131674124605,
"learning_rate": 1.9429910882112294e-05,
"loss": 0.405,
"step": 1750
},
{
"epoch": 0.07706116730154561,
"grad_norm": 9.414388690536624,
"learning_rate": 1.9425488158156612e-05,
"loss": 0.4003,
"step": 1760
},
{
"epoch": 0.07749901484303165,
"grad_norm": 18.045324270495772,
"learning_rate": 1.9421065434200924e-05,
"loss": 0.4676,
"step": 1770
},
{
"epoch": 0.0779368623845177,
"grad_norm": 11.920491989530506,
"learning_rate": 1.9416642710245242e-05,
"loss": 0.3452,
"step": 1780
},
{
"epoch": 0.07837470992600376,
"grad_norm": 13.25199040272023,
"learning_rate": 1.9412219986289557e-05,
"loss": 0.3281,
"step": 1790
},
{
"epoch": 0.07881255746748982,
"grad_norm": 18.052479006801686,
"learning_rate": 1.9407797262333875e-05,
"loss": 0.393,
"step": 1800
},
{
"epoch": 0.07925040500897587,
"grad_norm": 9.88776702327391,
"learning_rate": 1.9403374538378187e-05,
"loss": 0.3531,
"step": 1810
},
{
"epoch": 0.07968825255046193,
"grad_norm": 8.54763684547313,
"learning_rate": 1.9398951814422505e-05,
"loss": 0.4018,
"step": 1820
},
{
"epoch": 0.08012610009194798,
"grad_norm": 12.109050524886657,
"learning_rate": 1.939452909046682e-05,
"loss": 0.4658,
"step": 1830
},
{
"epoch": 0.08056394763343404,
"grad_norm": 7.7518635631951485,
"learning_rate": 1.9390106366511138e-05,
"loss": 0.4297,
"step": 1840
},
{
"epoch": 0.0810017951749201,
"grad_norm": 14.594279048895539,
"learning_rate": 1.938568364255545e-05,
"loss": 0.355,
"step": 1850
},
{
"epoch": 0.08143964271640615,
"grad_norm": 11.417142166667903,
"learning_rate": 1.9381260918599768e-05,
"loss": 0.392,
"step": 1860
},
{
"epoch": 0.0818774902578922,
"grad_norm": 11.637581528522489,
"learning_rate": 1.9376838194644083e-05,
"loss": 0.3802,
"step": 1870
},
{
"epoch": 0.08231533779937826,
"grad_norm": 11.85655948956895,
"learning_rate": 1.9372415470688398e-05,
"loss": 0.3977,
"step": 1880
},
{
"epoch": 0.08275318534086432,
"grad_norm": 10.54522592721261,
"learning_rate": 1.9367992746732713e-05,
"loss": 0.3971,
"step": 1890
},
{
"epoch": 0.08319103288235037,
"grad_norm": 11.259013994047795,
"learning_rate": 1.936357002277703e-05,
"loss": 0.4662,
"step": 1900
},
{
"epoch": 0.08362888042383643,
"grad_norm": 10.413681904734188,
"learning_rate": 1.9359147298821346e-05,
"loss": 0.2992,
"step": 1910
},
{
"epoch": 0.08406672796532247,
"grad_norm": 16.01143047206335,
"learning_rate": 1.935472457486566e-05,
"loss": 0.4038,
"step": 1920
},
{
"epoch": 0.08450457550680852,
"grad_norm": 8.758483697657311,
"learning_rate": 1.9350301850909976e-05,
"loss": 0.3402,
"step": 1930
},
{
"epoch": 0.08494242304829458,
"grad_norm": 13.47768507552826,
"learning_rate": 1.934587912695429e-05,
"loss": 0.3971,
"step": 1940
},
{
"epoch": 0.08538027058978064,
"grad_norm": 14.13100123568219,
"learning_rate": 1.934145640299861e-05,
"loss": 0.3485,
"step": 1950
},
{
"epoch": 0.08581811813126669,
"grad_norm": 11.367957313924562,
"learning_rate": 1.9337033679042924e-05,
"loss": 0.3561,
"step": 1960
},
{
"epoch": 0.08625596567275275,
"grad_norm": 10.362775839894933,
"learning_rate": 1.933261095508724e-05,
"loss": 0.3037,
"step": 1970
},
{
"epoch": 0.0866938132142388,
"grad_norm": 14.232027197294531,
"learning_rate": 1.9328188231131554e-05,
"loss": 0.3772,
"step": 1980
},
{
"epoch": 0.08713166075572486,
"grad_norm": 7.416354083499904,
"learning_rate": 1.9323765507175872e-05,
"loss": 0.3444,
"step": 1990
},
{
"epoch": 0.08756950829721091,
"grad_norm": 11.816638685730455,
"learning_rate": 1.9319342783220187e-05,
"loss": 0.4283,
"step": 2000
},
{
"epoch": 0.08800735583869697,
"grad_norm": 7.26923187311744,
"learning_rate": 1.9314920059264505e-05,
"loss": 0.3865,
"step": 2010
},
{
"epoch": 0.08844520338018302,
"grad_norm": 15.434817185278508,
"learning_rate": 1.9310497335308817e-05,
"loss": 0.4109,
"step": 2020
},
{
"epoch": 0.08888305092166908,
"grad_norm": 10.681457596084588,
"learning_rate": 1.9306074611353135e-05,
"loss": 0.3436,
"step": 2030
},
{
"epoch": 0.08932089846315514,
"grad_norm": 11.348178640084303,
"learning_rate": 1.930165188739745e-05,
"loss": 0.3667,
"step": 2040
},
{
"epoch": 0.08975874600464119,
"grad_norm": 14.026197173621947,
"learning_rate": 1.9297229163441768e-05,
"loss": 0.4237,
"step": 2050
},
{
"epoch": 0.09019659354612723,
"grad_norm": 13.112088492075678,
"learning_rate": 1.929280643948608e-05,
"loss": 0.4262,
"step": 2060
},
{
"epoch": 0.09063444108761329,
"grad_norm": 9.622691088092534,
"learning_rate": 1.9288383715530398e-05,
"loss": 0.3801,
"step": 2070
},
{
"epoch": 0.09107228862909934,
"grad_norm": 9.697640310926039,
"learning_rate": 1.9283960991574713e-05,
"loss": 0.39,
"step": 2080
},
{
"epoch": 0.0915101361705854,
"grad_norm": 12.500317378783329,
"learning_rate": 1.9279538267619028e-05,
"loss": 0.4054,
"step": 2090
},
{
"epoch": 0.09194798371207145,
"grad_norm": 10.789036689302037,
"learning_rate": 1.9275115543663342e-05,
"loss": 0.4074,
"step": 2100
},
{
"epoch": 0.09238583125355751,
"grad_norm": 12.509375756268714,
"learning_rate": 1.927069281970766e-05,
"loss": 0.3667,
"step": 2110
},
{
"epoch": 0.09282367879504357,
"grad_norm": 9.389386341327063,
"learning_rate": 1.9266270095751976e-05,
"loss": 0.4499,
"step": 2120
},
{
"epoch": 0.09326152633652962,
"grad_norm": 15.390016743215236,
"learning_rate": 1.926184737179629e-05,
"loss": 0.4473,
"step": 2130
},
{
"epoch": 0.09369937387801568,
"grad_norm": 19.17657099807316,
"learning_rate": 1.9257424647840605e-05,
"loss": 0.3094,
"step": 2140
},
{
"epoch": 0.09413722141950173,
"grad_norm": 14.244069789562317,
"learning_rate": 1.925300192388492e-05,
"loss": 0.426,
"step": 2150
},
{
"epoch": 0.09457506896098779,
"grad_norm": 14.851204953612982,
"learning_rate": 1.924857919992924e-05,
"loss": 0.4369,
"step": 2160
},
{
"epoch": 0.09501291650247384,
"grad_norm": 8.66142168035377,
"learning_rate": 1.9244156475973554e-05,
"loss": 0.3666,
"step": 2170
},
{
"epoch": 0.0954507640439599,
"grad_norm": 10.22720519072457,
"learning_rate": 1.923973375201787e-05,
"loss": 0.4175,
"step": 2180
},
{
"epoch": 0.09588861158544595,
"grad_norm": 9.595268402718855,
"learning_rate": 1.9235311028062183e-05,
"loss": 0.4074,
"step": 2190
},
{
"epoch": 0.096326459126932,
"grad_norm": 11.423439263182056,
"learning_rate": 1.92308883041065e-05,
"loss": 0.3714,
"step": 2200
},
{
"epoch": 0.09676430666841805,
"grad_norm": 12.195829235128782,
"learning_rate": 1.9226465580150816e-05,
"loss": 0.3617,
"step": 2210
},
{
"epoch": 0.0972021542099041,
"grad_norm": 13.401702180614619,
"learning_rate": 1.922204285619513e-05,
"loss": 0.3476,
"step": 2220
},
{
"epoch": 0.09764000175139016,
"grad_norm": 14.400413274132806,
"learning_rate": 1.9217620132239446e-05,
"loss": 0.3654,
"step": 2230
},
{
"epoch": 0.09807784929287622,
"grad_norm": 13.728323756706255,
"learning_rate": 1.9213197408283765e-05,
"loss": 0.3755,
"step": 2240
},
{
"epoch": 0.09851569683436227,
"grad_norm": 9.537467954489292,
"learning_rate": 1.920877468432808e-05,
"loss": 0.4267,
"step": 2250
},
{
"epoch": 0.09895354437584833,
"grad_norm": 12.562863169113138,
"learning_rate": 1.9204351960372394e-05,
"loss": 0.3631,
"step": 2260
},
{
"epoch": 0.09939139191733438,
"grad_norm": 8.916130073539824,
"learning_rate": 1.919992923641671e-05,
"loss": 0.3976,
"step": 2270
},
{
"epoch": 0.09982923945882044,
"grad_norm": 10.835660087923376,
"learning_rate": 1.9195506512461028e-05,
"loss": 0.344,
"step": 2280
},
{
"epoch": 0.1002670870003065,
"grad_norm": 14.089519398644,
"learning_rate": 1.9191083788505342e-05,
"loss": 0.3957,
"step": 2290
},
{
"epoch": 0.10070493454179255,
"grad_norm": 13.631563897871652,
"learning_rate": 1.9186661064549657e-05,
"loss": 0.4412,
"step": 2300
},
{
"epoch": 0.1011427820832786,
"grad_norm": 11.687133130522081,
"learning_rate": 1.9182238340593972e-05,
"loss": 0.4246,
"step": 2310
},
{
"epoch": 0.10158062962476466,
"grad_norm": 11.012593429809936,
"learning_rate": 1.917781561663829e-05,
"loss": 0.3929,
"step": 2320
},
{
"epoch": 0.10201847716625072,
"grad_norm": 11.437466647406971,
"learning_rate": 1.9173392892682605e-05,
"loss": 0.4196,
"step": 2330
},
{
"epoch": 0.10245632470773676,
"grad_norm": 13.09895341492618,
"learning_rate": 1.916897016872692e-05,
"loss": 0.3513,
"step": 2340
},
{
"epoch": 0.10289417224922282,
"grad_norm": 12.293757546388127,
"learning_rate": 1.9164547444771235e-05,
"loss": 0.3992,
"step": 2350
},
{
"epoch": 0.10333201979070887,
"grad_norm": 10.261487153006069,
"learning_rate": 1.916012472081555e-05,
"loss": 0.3066,
"step": 2360
},
{
"epoch": 0.10376986733219493,
"grad_norm": 14.709144519485733,
"learning_rate": 1.915570199685987e-05,
"loss": 0.4397,
"step": 2370
},
{
"epoch": 0.10420771487368098,
"grad_norm": 10.834726410330754,
"learning_rate": 1.9151279272904183e-05,
"loss": 0.3525,
"step": 2380
},
{
"epoch": 0.10464556241516704,
"grad_norm": 14.674602673430272,
"learning_rate": 1.9146856548948498e-05,
"loss": 0.3694,
"step": 2390
},
{
"epoch": 0.10508340995665309,
"grad_norm": 16.41260572575191,
"learning_rate": 1.9142433824992813e-05,
"loss": 0.3565,
"step": 2400
},
{
"epoch": 0.10552125749813915,
"grad_norm": 10.31589706418058,
"learning_rate": 1.913801110103713e-05,
"loss": 0.4554,
"step": 2410
},
{
"epoch": 0.1059591050396252,
"grad_norm": 8.228303991240578,
"learning_rate": 1.9133588377081446e-05,
"loss": 0.4028,
"step": 2420
},
{
"epoch": 0.10639695258111126,
"grad_norm": 13.474103036444346,
"learning_rate": 1.912916565312576e-05,
"loss": 0.4172,
"step": 2430
},
{
"epoch": 0.10683480012259731,
"grad_norm": 11.893594097933823,
"learning_rate": 1.9124742929170076e-05,
"loss": 0.3997,
"step": 2440
},
{
"epoch": 0.10727264766408337,
"grad_norm": 15.503153818507785,
"learning_rate": 1.9120320205214394e-05,
"loss": 0.4038,
"step": 2450
},
{
"epoch": 0.10771049520556943,
"grad_norm": 10.232230065068523,
"learning_rate": 1.911589748125871e-05,
"loss": 0.4035,
"step": 2460
},
{
"epoch": 0.10814834274705548,
"grad_norm": 12.04140450127315,
"learning_rate": 1.9111474757303024e-05,
"loss": 0.3076,
"step": 2470
},
{
"epoch": 0.10858619028854152,
"grad_norm": 15.468189206217254,
"learning_rate": 1.910705203334734e-05,
"loss": 0.3361,
"step": 2480
},
{
"epoch": 0.10902403783002758,
"grad_norm": 17.902774600013995,
"learning_rate": 1.9102629309391657e-05,
"loss": 0.4081,
"step": 2490
},
{
"epoch": 0.10946188537151363,
"grad_norm": 24.91450836462123,
"learning_rate": 1.9098206585435972e-05,
"loss": 0.4367,
"step": 2500
},
{
"epoch": 0.10989973291299969,
"grad_norm": 11.744904407487343,
"learning_rate": 1.9094226133875857e-05,
"loss": 0.4292,
"step": 2510
},
{
"epoch": 0.11033758045448575,
"grad_norm": 13.899816366396042,
"learning_rate": 1.908980340992017e-05,
"loss": 0.4321,
"step": 2520
},
{
"epoch": 0.1107754279959718,
"grad_norm": 12.102738226657959,
"learning_rate": 1.9085380685964486e-05,
"loss": 0.3108,
"step": 2530
},
{
"epoch": 0.11121327553745786,
"grad_norm": 14.16084103947617,
"learning_rate": 1.9080957962008805e-05,
"loss": 0.335,
"step": 2540
},
{
"epoch": 0.11165112307894391,
"grad_norm": 11.115420213753518,
"learning_rate": 1.9076535238053116e-05,
"loss": 0.3912,
"step": 2550
},
{
"epoch": 0.11208897062042997,
"grad_norm": 13.023671826026302,
"learning_rate": 1.9072112514097434e-05,
"loss": 0.4168,
"step": 2560
},
{
"epoch": 0.11252681816191602,
"grad_norm": 11.952192442175816,
"learning_rate": 1.906768979014175e-05,
"loss": 0.4028,
"step": 2570
},
{
"epoch": 0.11296466570340208,
"grad_norm": 9.592598016519975,
"learning_rate": 1.9063267066186068e-05,
"loss": 0.4134,
"step": 2580
},
{
"epoch": 0.11340251324488813,
"grad_norm": 9.083352138488156,
"learning_rate": 1.905884434223038e-05,
"loss": 0.3328,
"step": 2590
},
{
"epoch": 0.11384036078637419,
"grad_norm": 18.60156824207177,
"learning_rate": 1.9054421618274697e-05,
"loss": 0.3479,
"step": 2600
},
{
"epoch": 0.11427820832786025,
"grad_norm": 9.845963292845576,
"learning_rate": 1.9049998894319012e-05,
"loss": 0.3555,
"step": 2610
},
{
"epoch": 0.11471605586934629,
"grad_norm": 10.182726906192356,
"learning_rate": 1.904557617036333e-05,
"loss": 0.4251,
"step": 2620
},
{
"epoch": 0.11515390341083234,
"grad_norm": 7.335163697208383,
"learning_rate": 1.9041153446407642e-05,
"loss": 0.3376,
"step": 2630
},
{
"epoch": 0.1155917509523184,
"grad_norm": 11.37657070843338,
"learning_rate": 1.903673072245196e-05,
"loss": 0.3645,
"step": 2640
},
{
"epoch": 0.11602959849380445,
"grad_norm": 10.877548094642906,
"learning_rate": 1.9032307998496275e-05,
"loss": 0.3726,
"step": 2650
},
{
"epoch": 0.11646744603529051,
"grad_norm": 15.333882252915895,
"learning_rate": 1.902788527454059e-05,
"loss": 0.385,
"step": 2660
},
{
"epoch": 0.11690529357677656,
"grad_norm": 12.823331692401915,
"learning_rate": 1.9023462550584905e-05,
"loss": 0.3943,
"step": 2670
},
{
"epoch": 0.11734314111826262,
"grad_norm": 12.692981757202359,
"learning_rate": 1.9019039826629223e-05,
"loss": 0.4218,
"step": 2680
},
{
"epoch": 0.11778098865974868,
"grad_norm": 12.550522897473236,
"learning_rate": 1.9014617102673538e-05,
"loss": 0.4077,
"step": 2690
},
{
"epoch": 0.11821883620123473,
"grad_norm": 11.727829165225376,
"learning_rate": 1.9010194378717853e-05,
"loss": 0.4516,
"step": 2700
},
{
"epoch": 0.11865668374272079,
"grad_norm": 9.239870866145635,
"learning_rate": 1.900577165476217e-05,
"loss": 0.3879,
"step": 2710
},
{
"epoch": 0.11909453128420684,
"grad_norm": 10.991158486727915,
"learning_rate": 1.9001348930806486e-05,
"loss": 0.4201,
"step": 2720
},
{
"epoch": 0.1195323788256929,
"grad_norm": 10.775808182785527,
"learning_rate": 1.89969262068508e-05,
"loss": 0.3612,
"step": 2730
},
{
"epoch": 0.11997022636717895,
"grad_norm": 8.786090643394553,
"learning_rate": 1.8992503482895116e-05,
"loss": 0.4029,
"step": 2740
},
{
"epoch": 0.12040807390866501,
"grad_norm": 10.644438789736052,
"learning_rate": 1.8988080758939434e-05,
"loss": 0.3642,
"step": 2750
},
{
"epoch": 0.12084592145015106,
"grad_norm": 10.429045582422866,
"learning_rate": 1.8983658034983746e-05,
"loss": 0.3972,
"step": 2760
},
{
"epoch": 0.1212837689916371,
"grad_norm": 19.068393684191452,
"learning_rate": 1.8979235311028064e-05,
"loss": 0.4214,
"step": 2770
},
{
"epoch": 0.12172161653312316,
"grad_norm": 6.510885660190795,
"learning_rate": 1.897481258707238e-05,
"loss": 0.3927,
"step": 2780
},
{
"epoch": 0.12215946407460922,
"grad_norm": 13.320869214905798,
"learning_rate": 1.8970389863116697e-05,
"loss": 0.3801,
"step": 2790
},
{
"epoch": 0.12259731161609527,
"grad_norm": 11.91416431821247,
"learning_rate": 1.896596713916101e-05,
"loss": 0.3617,
"step": 2800
},
{
"epoch": 0.12303515915758133,
"grad_norm": 16.39803737899181,
"learning_rate": 1.8961544415205327e-05,
"loss": 0.4252,
"step": 2810
},
{
"epoch": 0.12347300669906738,
"grad_norm": 18.444202304052418,
"learning_rate": 1.8957121691249642e-05,
"loss": 0.3753,
"step": 2820
},
{
"epoch": 0.12391085424055344,
"grad_norm": 11.295343530031559,
"learning_rate": 1.895269896729396e-05,
"loss": 0.3921,
"step": 2830
},
{
"epoch": 0.1243487017820395,
"grad_norm": 9.710371030089025,
"learning_rate": 1.8948276243338272e-05,
"loss": 0.3506,
"step": 2840
},
{
"epoch": 0.12478654932352555,
"grad_norm": 13.077018882435036,
"learning_rate": 1.894385351938259e-05,
"loss": 0.3025,
"step": 2850
},
{
"epoch": 0.1252243968650116,
"grad_norm": 9.815328591239517,
"learning_rate": 1.8939430795426905e-05,
"loss": 0.3919,
"step": 2860
},
{
"epoch": 0.12566224440649765,
"grad_norm": 10.956422665808212,
"learning_rate": 1.893500807147122e-05,
"loss": 0.3592,
"step": 2870
},
{
"epoch": 0.12610009194798372,
"grad_norm": 7.24660201847797,
"learning_rate": 1.8930585347515535e-05,
"loss": 0.4198,
"step": 2880
},
{
"epoch": 0.12653793948946976,
"grad_norm": 7.084606579805614,
"learning_rate": 1.8926162623559853e-05,
"loss": 0.4077,
"step": 2890
},
{
"epoch": 0.12697578703095583,
"grad_norm": 16.831228496029244,
"learning_rate": 1.8921739899604168e-05,
"loss": 0.3456,
"step": 2900
},
{
"epoch": 0.12741363457244187,
"grad_norm": 12.587652277854042,
"learning_rate": 1.8917317175648483e-05,
"loss": 0.4014,
"step": 2910
},
{
"epoch": 0.12785148211392794,
"grad_norm": 9.243117375162731,
"learning_rate": 1.8912894451692798e-05,
"loss": 0.3575,
"step": 2920
},
{
"epoch": 0.12828932965541398,
"grad_norm": 11.159631895058327,
"learning_rate": 1.8908471727737116e-05,
"loss": 0.3241,
"step": 2930
},
{
"epoch": 0.12872717719690005,
"grad_norm": 10.310646954134011,
"learning_rate": 1.890404900378143e-05,
"loss": 0.3682,
"step": 2940
},
{
"epoch": 0.1291650247383861,
"grad_norm": 10.173470601980204,
"learning_rate": 1.8899626279825746e-05,
"loss": 0.4251,
"step": 2950
},
{
"epoch": 0.12960287227987216,
"grad_norm": 12.609142050783468,
"learning_rate": 1.889520355587006e-05,
"loss": 0.394,
"step": 2960
},
{
"epoch": 0.1300407198213582,
"grad_norm": 9.635654645850565,
"learning_rate": 1.8890780831914376e-05,
"loss": 0.4089,
"step": 2970
},
{
"epoch": 0.13047856736284424,
"grad_norm": 11.436287904310273,
"learning_rate": 1.8886358107958694e-05,
"loss": 0.3826,
"step": 2980
},
{
"epoch": 0.1309164149043303,
"grad_norm": 16.392761382159563,
"learning_rate": 1.888193538400301e-05,
"loss": 0.3276,
"step": 2990
},
{
"epoch": 0.13135426244581636,
"grad_norm": 17.239395198967156,
"learning_rate": 1.8877512660047327e-05,
"loss": 0.4094,
"step": 3000
},
{
"epoch": 0.13179210998730242,
"grad_norm": 14.545360777292585,
"learning_rate": 1.887308993609164e-05,
"loss": 0.3779,
"step": 3010
},
{
"epoch": 0.13222995752878847,
"grad_norm": 14.366498738758244,
"learning_rate": 1.8868667212135957e-05,
"loss": 0.4288,
"step": 3020
},
{
"epoch": 0.13266780507027454,
"grad_norm": 17.039699201481028,
"learning_rate": 1.886424448818027e-05,
"loss": 0.36,
"step": 3030
},
{
"epoch": 0.13310565261176058,
"grad_norm": 9.226512886191754,
"learning_rate": 1.885982176422459e-05,
"loss": 0.3765,
"step": 3040
},
{
"epoch": 0.13354350015324665,
"grad_norm": 9.028526449867499,
"learning_rate": 1.88553990402689e-05,
"loss": 0.4395,
"step": 3050
},
{
"epoch": 0.1339813476947327,
"grad_norm": 13.6426618700454,
"learning_rate": 1.885097631631322e-05,
"loss": 0.3723,
"step": 3060
},
{
"epoch": 0.13441919523621876,
"grad_norm": 13.305857832591359,
"learning_rate": 1.8846553592357535e-05,
"loss": 0.399,
"step": 3070
},
{
"epoch": 0.1348570427777048,
"grad_norm": 7.874733814403194,
"learning_rate": 1.884213086840185e-05,
"loss": 0.4011,
"step": 3080
},
{
"epoch": 0.13529489031919087,
"grad_norm": 13.430051646641026,
"learning_rate": 1.8837708144446164e-05,
"loss": 0.3906,
"step": 3090
},
{
"epoch": 0.1357327378606769,
"grad_norm": 8.662733163489246,
"learning_rate": 1.8833285420490483e-05,
"loss": 0.4138,
"step": 3100
},
{
"epoch": 0.13617058540216298,
"grad_norm": 9.801119868851403,
"learning_rate": 1.8828862696534798e-05,
"loss": 0.3641,
"step": 3110
},
{
"epoch": 0.13660843294364902,
"grad_norm": 11.778191693148269,
"learning_rate": 1.8824439972579112e-05,
"loss": 0.3762,
"step": 3120
},
{
"epoch": 0.13704628048513506,
"grad_norm": 8.279726053764739,
"learning_rate": 1.8820017248623427e-05,
"loss": 0.3693,
"step": 3130
},
{
"epoch": 0.13748412802662113,
"grad_norm": 10.69124033655974,
"learning_rate": 1.8815594524667746e-05,
"loss": 0.3369,
"step": 3140
},
{
"epoch": 0.13792197556810717,
"grad_norm": 6.139622574926641,
"learning_rate": 1.881117180071206e-05,
"loss": 0.3526,
"step": 3150
},
{
"epoch": 0.13835982310959324,
"grad_norm": 8.884320160504993,
"learning_rate": 1.8806749076756375e-05,
"loss": 0.321,
"step": 3160
},
{
"epoch": 0.13879767065107929,
"grad_norm": 11.836052882069614,
"learning_rate": 1.880232635280069e-05,
"loss": 0.3988,
"step": 3170
},
{
"epoch": 0.13923551819256536,
"grad_norm": 9.920797262455972,
"learning_rate": 1.8797903628845005e-05,
"loss": 0.5179,
"step": 3180
},
{
"epoch": 0.1396733657340514,
"grad_norm": 14.813362193063755,
"learning_rate": 1.8793480904889324e-05,
"loss": 0.377,
"step": 3190
},
{
"epoch": 0.14011121327553747,
"grad_norm": 6.695253814831782,
"learning_rate": 1.878905818093364e-05,
"loss": 0.3863,
"step": 3200
},
{
"epoch": 0.1405490608170235,
"grad_norm": 11.085686646540799,
"learning_rate": 1.8784635456977953e-05,
"loss": 0.3708,
"step": 3210
},
{
"epoch": 0.14098690835850958,
"grad_norm": 13.830245136728134,
"learning_rate": 1.8780212733022268e-05,
"loss": 0.3151,
"step": 3220
},
{
"epoch": 0.14142475589999562,
"grad_norm": 12.120622926321499,
"learning_rate": 1.8775790009066586e-05,
"loss": 0.3459,
"step": 3230
},
{
"epoch": 0.1418626034414817,
"grad_norm": 16.30580487007426,
"learning_rate": 1.87713672851109e-05,
"loss": 0.3745,
"step": 3240
},
{
"epoch": 0.14230045098296773,
"grad_norm": 9.992654009584912,
"learning_rate": 1.876694456115522e-05,
"loss": 0.3993,
"step": 3250
},
{
"epoch": 0.14273829852445377,
"grad_norm": 8.666190158753487,
"learning_rate": 1.876252183719953e-05,
"loss": 0.3774,
"step": 3260
},
{
"epoch": 0.14317614606593984,
"grad_norm": 10.386340090996436,
"learning_rate": 1.875809911324385e-05,
"loss": 0.3806,
"step": 3270
},
{
"epoch": 0.14361399360742588,
"grad_norm": 22.19356554965066,
"learning_rate": 1.8753676389288164e-05,
"loss": 0.3824,
"step": 3280
},
{
"epoch": 0.14405184114891195,
"grad_norm": 9.21049973448166,
"learning_rate": 1.874925366533248e-05,
"loss": 0.3353,
"step": 3290
},
{
"epoch": 0.144489688690398,
"grad_norm": 12.072168142150097,
"learning_rate": 1.8744830941376794e-05,
"loss": 0.4011,
"step": 3300
},
{
"epoch": 0.14492753623188406,
"grad_norm": 12.59324716931968,
"learning_rate": 1.8740408217421112e-05,
"loss": 0.4021,
"step": 3310
},
{
"epoch": 0.1453653837733701,
"grad_norm": 10.05214029931775,
"learning_rate": 1.8735985493465427e-05,
"loss": 0.3553,
"step": 3320
},
{
"epoch": 0.14580323131485617,
"grad_norm": 11.803155664591488,
"learning_rate": 1.8731562769509742e-05,
"loss": 0.356,
"step": 3330
},
{
"epoch": 0.14624107885634222,
"grad_norm": 14.92368689915674,
"learning_rate": 1.8727140045554057e-05,
"loss": 0.3801,
"step": 3340
},
{
"epoch": 0.14667892639782829,
"grad_norm": 17.079776945779418,
"learning_rate": 1.8722717321598375e-05,
"loss": 0.3477,
"step": 3350
},
{
"epoch": 0.14711677393931433,
"grad_norm": 13.097782069556722,
"learning_rate": 1.871829459764269e-05,
"loss": 0.3671,
"step": 3360
},
{
"epoch": 0.1475546214808004,
"grad_norm": 8.15632623512624,
"learning_rate": 1.8713871873687005e-05,
"loss": 0.3389,
"step": 3370
},
{
"epoch": 0.14799246902228644,
"grad_norm": 13.960404123834712,
"learning_rate": 1.870944914973132e-05,
"loss": 0.3485,
"step": 3380
},
{
"epoch": 0.1484303165637725,
"grad_norm": 11.071464397882252,
"learning_rate": 1.8705026425775635e-05,
"loss": 0.3416,
"step": 3390
},
{
"epoch": 0.14886816410525855,
"grad_norm": 13.277501194270975,
"learning_rate": 1.8700603701819953e-05,
"loss": 0.3789,
"step": 3400
},
{
"epoch": 0.1493060116467446,
"grad_norm": 10.804325254909127,
"learning_rate": 1.8696180977864268e-05,
"loss": 0.4302,
"step": 3410
},
{
"epoch": 0.14974385918823066,
"grad_norm": 13.368363204432509,
"learning_rate": 1.8691758253908583e-05,
"loss": 0.3827,
"step": 3420
},
{
"epoch": 0.1501817067297167,
"grad_norm": 10.0824595511505,
"learning_rate": 1.8687335529952898e-05,
"loss": 0.3658,
"step": 3430
},
{
"epoch": 0.15061955427120277,
"grad_norm": 12.416220706039892,
"learning_rate": 1.8682912805997216e-05,
"loss": 0.3614,
"step": 3440
},
{
"epoch": 0.1510574018126888,
"grad_norm": 16.7125572642998,
"learning_rate": 1.867849008204153e-05,
"loss": 0.4461,
"step": 3450
},
{
"epoch": 0.15149524935417488,
"grad_norm": 12.811501366379437,
"learning_rate": 1.8674067358085846e-05,
"loss": 0.4033,
"step": 3460
},
{
"epoch": 0.15193309689566092,
"grad_norm": 17.173250485174975,
"learning_rate": 1.866964463413016e-05,
"loss": 0.3277,
"step": 3470
},
{
"epoch": 0.152370944437147,
"grad_norm": 8.979558831403725,
"learning_rate": 1.866522191017448e-05,
"loss": 0.3373,
"step": 3480
},
{
"epoch": 0.15280879197863304,
"grad_norm": 7.797438606883994,
"learning_rate": 1.8660799186218794e-05,
"loss": 0.3954,
"step": 3490
},
{
"epoch": 0.1532466395201191,
"grad_norm": 16.155288520109096,
"learning_rate": 1.865637646226311e-05,
"loss": 0.385,
"step": 3500
},
{
"epoch": 0.15368448706160515,
"grad_norm": 16.957081191616158,
"learning_rate": 1.8651953738307424e-05,
"loss": 0.3742,
"step": 3510
},
{
"epoch": 0.15412233460309122,
"grad_norm": 9.531468523365854,
"learning_rate": 1.8647531014351742e-05,
"loss": 0.3326,
"step": 3520
},
{
"epoch": 0.15456018214457726,
"grad_norm": 25.31643246845194,
"learning_rate": 1.8643108290396057e-05,
"loss": 0.39,
"step": 3530
},
{
"epoch": 0.1549980296860633,
"grad_norm": 10.078617054363939,
"learning_rate": 1.8638685566440372e-05,
"loss": 0.4025,
"step": 3540
},
{
"epoch": 0.15543587722754937,
"grad_norm": 11.02217779389865,
"learning_rate": 1.8634262842484687e-05,
"loss": 0.4015,
"step": 3550
},
{
"epoch": 0.1558737247690354,
"grad_norm": 14.098789980216388,
"learning_rate": 1.8629840118529005e-05,
"loss": 0.3344,
"step": 3560
},
{
"epoch": 0.15631157231052148,
"grad_norm": 15.721719692602761,
"learning_rate": 1.862541739457332e-05,
"loss": 0.3715,
"step": 3570
},
{
"epoch": 0.15674941985200752,
"grad_norm": 12.716040770267046,
"learning_rate": 1.8620994670617635e-05,
"loss": 0.3651,
"step": 3580
},
{
"epoch": 0.1571872673934936,
"grad_norm": 12.965209183423223,
"learning_rate": 1.861657194666195e-05,
"loss": 0.4063,
"step": 3590
},
{
"epoch": 0.15762511493497963,
"grad_norm": 8.666927431536255,
"learning_rate": 1.8612149222706265e-05,
"loss": 0.3274,
"step": 3600
},
{
"epoch": 0.1580629624764657,
"grad_norm": 14.516286713758953,
"learning_rate": 1.8607726498750583e-05,
"loss": 0.4267,
"step": 3610
},
{
"epoch": 0.15850081001795174,
"grad_norm": 9.395231262901326,
"learning_rate": 1.8603303774794898e-05,
"loss": 0.3378,
"step": 3620
},
{
"epoch": 0.1589386575594378,
"grad_norm": 15.580920245081602,
"learning_rate": 1.8598881050839213e-05,
"loss": 0.3727,
"step": 3630
},
{
"epoch": 0.15937650510092385,
"grad_norm": 9.421511818637732,
"learning_rate": 1.8594458326883528e-05,
"loss": 0.4091,
"step": 3640
},
{
"epoch": 0.15981435264240992,
"grad_norm": 10.903593860274887,
"learning_rate": 1.8590035602927846e-05,
"loss": 0.3852,
"step": 3650
},
{
"epoch": 0.16025220018389597,
"grad_norm": 15.079459239057043,
"learning_rate": 1.858561287897216e-05,
"loss": 0.3161,
"step": 3660
},
{
"epoch": 0.16069004772538203,
"grad_norm": 11.906906790652602,
"learning_rate": 1.8581632427412045e-05,
"loss": 0.4032,
"step": 3670
},
{
"epoch": 0.16112789526686808,
"grad_norm": 7.299516154937353,
"learning_rate": 1.857720970345636e-05,
"loss": 0.4608,
"step": 3680
},
{
"epoch": 0.16156574280835412,
"grad_norm": 10.405574742308731,
"learning_rate": 1.8572786979500675e-05,
"loss": 0.3277,
"step": 3690
},
{
"epoch": 0.1620035903498402,
"grad_norm": 12.45192353736054,
"learning_rate": 1.8568364255544993e-05,
"loss": 0.4101,
"step": 3700
},
{
"epoch": 0.16244143789132623,
"grad_norm": 17.784864115352764,
"learning_rate": 1.8563941531589308e-05,
"loss": 0.4311,
"step": 3710
},
{
"epoch": 0.1628792854328123,
"grad_norm": 12.704077503112067,
"learning_rate": 1.8559518807633623e-05,
"loss": 0.3409,
"step": 3720
},
{
"epoch": 0.16331713297429834,
"grad_norm": 26.63374521687372,
"learning_rate": 1.8555096083677938e-05,
"loss": 0.3658,
"step": 3730
},
{
"epoch": 0.1637549805157844,
"grad_norm": 10.121501930571688,
"learning_rate": 1.8550673359722256e-05,
"loss": 0.3402,
"step": 3740
},
{
"epoch": 0.16419282805727045,
"grad_norm": 14.094860227716598,
"learning_rate": 1.854625063576657e-05,
"loss": 0.4277,
"step": 3750
},
{
"epoch": 0.16463067559875652,
"grad_norm": 9.994141579233393,
"learning_rate": 1.8541827911810886e-05,
"loss": 0.4692,
"step": 3760
},
{
"epoch": 0.16506852314024256,
"grad_norm": 8.67508870280096,
"learning_rate": 1.85374051878552e-05,
"loss": 0.4016,
"step": 3770
},
{
"epoch": 0.16550637068172863,
"grad_norm": 14.502486443265836,
"learning_rate": 1.853298246389952e-05,
"loss": 0.4075,
"step": 3780
},
{
"epoch": 0.16594421822321467,
"grad_norm": 11.515960895499763,
"learning_rate": 1.852855973994383e-05,
"loss": 0.3435,
"step": 3790
},
{
"epoch": 0.16638206576470074,
"grad_norm": 9.745079168132195,
"learning_rate": 1.852413701598815e-05,
"loss": 0.3109,
"step": 3800
},
{
"epoch": 0.16681991330618678,
"grad_norm": 8.495004869159922,
"learning_rate": 1.8519714292032464e-05,
"loss": 0.3922,
"step": 3810
},
{
"epoch": 0.16725776084767285,
"grad_norm": 8.988712438049474,
"learning_rate": 1.8515291568076782e-05,
"loss": 0.4012,
"step": 3820
},
{
"epoch": 0.1676956083891589,
"grad_norm": 12.853240059584076,
"learning_rate": 1.8510868844121094e-05,
"loss": 0.4125,
"step": 3830
},
{
"epoch": 0.16813345593064494,
"grad_norm": 9.825049278388068,
"learning_rate": 1.8506446120165412e-05,
"loss": 0.3032,
"step": 3840
},
{
"epoch": 0.168571303472131,
"grad_norm": 7.994348437089401,
"learning_rate": 1.8502023396209727e-05,
"loss": 0.3636,
"step": 3850
},
{
"epoch": 0.16900915101361705,
"grad_norm": 12.765237118303892,
"learning_rate": 1.8497600672254045e-05,
"loss": 0.3478,
"step": 3860
},
{
"epoch": 0.16944699855510312,
"grad_norm": 12.546641498449894,
"learning_rate": 1.8493177948298357e-05,
"loss": 0.3,
"step": 3870
},
{
"epoch": 0.16988484609658916,
"grad_norm": 23.911174431535404,
"learning_rate": 1.8488755224342675e-05,
"loss": 0.451,
"step": 3880
},
{
"epoch": 0.17032269363807523,
"grad_norm": 10.520515381695157,
"learning_rate": 1.848433250038699e-05,
"loss": 0.358,
"step": 3890
},
{
"epoch": 0.17076054117956127,
"grad_norm": 11.224917445909368,
"learning_rate": 1.8479909776431305e-05,
"loss": 0.3162,
"step": 3900
},
{
"epoch": 0.17119838872104734,
"grad_norm": 18.902395009875733,
"learning_rate": 1.847548705247562e-05,
"loss": 0.4223,
"step": 3910
},
{
"epoch": 0.17163623626253338,
"grad_norm": 11.55198595552304,
"learning_rate": 1.8471064328519938e-05,
"loss": 0.3808,
"step": 3920
},
{
"epoch": 0.17207408380401945,
"grad_norm": 11.00362301896321,
"learning_rate": 1.8466641604564253e-05,
"loss": 0.3252,
"step": 3930
},
{
"epoch": 0.1725119313455055,
"grad_norm": 10.336995942120799,
"learning_rate": 1.8462218880608568e-05,
"loss": 0.364,
"step": 3940
},
{
"epoch": 0.17294977888699156,
"grad_norm": 10.710989141583203,
"learning_rate": 1.8457796156652883e-05,
"loss": 0.4231,
"step": 3950
},
{
"epoch": 0.1733876264284776,
"grad_norm": 12.229225448822383,
"learning_rate": 1.84533734326972e-05,
"loss": 0.4117,
"step": 3960
},
{
"epoch": 0.17382547396996365,
"grad_norm": 14.915171514555029,
"learning_rate": 1.8448950708741516e-05,
"loss": 0.4032,
"step": 3970
},
{
"epoch": 0.17426332151144971,
"grad_norm": 12.229642807866787,
"learning_rate": 1.844452798478583e-05,
"loss": 0.4361,
"step": 3980
},
{
"epoch": 0.17470116905293576,
"grad_norm": 7.010608264256624,
"learning_rate": 1.844010526083015e-05,
"loss": 0.3638,
"step": 3990
},
{
"epoch": 0.17513901659442183,
"grad_norm": 15.912932485500148,
"learning_rate": 1.843568253687446e-05,
"loss": 0.455,
"step": 4000
},
{
"epoch": 0.17557686413590787,
"grad_norm": 6.924969603544103,
"learning_rate": 1.843125981291878e-05,
"loss": 0.2625,
"step": 4010
},
{
"epoch": 0.17601471167739394,
"grad_norm": 15.139219047531501,
"learning_rate": 1.8426837088963094e-05,
"loss": 0.4141,
"step": 4020
},
{
"epoch": 0.17645255921887998,
"grad_norm": 11.17823518693945,
"learning_rate": 1.8422414365007412e-05,
"loss": 0.3224,
"step": 4030
},
{
"epoch": 0.17689040676036605,
"grad_norm": 15.7161064579388,
"learning_rate": 1.8417991641051723e-05,
"loss": 0.3664,
"step": 4040
},
{
"epoch": 0.1773282543018521,
"grad_norm": 13.792070427062033,
"learning_rate": 1.841356891709604e-05,
"loss": 0.3284,
"step": 4050
},
{
"epoch": 0.17776610184333816,
"grad_norm": 9.000617959741795,
"learning_rate": 1.8409146193140357e-05,
"loss": 0.344,
"step": 4060
},
{
"epoch": 0.1782039493848242,
"grad_norm": 12.402957661971133,
"learning_rate": 1.8404723469184675e-05,
"loss": 0.3871,
"step": 4070
},
{
"epoch": 0.17864179692631027,
"grad_norm": 12.032781330103441,
"learning_rate": 1.8400300745228986e-05,
"loss": 0.3787,
"step": 4080
},
{
"epoch": 0.1790796444677963,
"grad_norm": 11.222860194370364,
"learning_rate": 1.8395878021273305e-05,
"loss": 0.3077,
"step": 4090
},
{
"epoch": 0.17951749200928238,
"grad_norm": 13.7968119601476,
"learning_rate": 1.839145529731762e-05,
"loss": 0.4786,
"step": 4100
},
{
"epoch": 0.17995533955076842,
"grad_norm": 11.921299671774793,
"learning_rate": 1.8387032573361934e-05,
"loss": 0.4037,
"step": 4110
},
{
"epoch": 0.18039318709225446,
"grad_norm": 8.879545954875514,
"learning_rate": 1.838260984940625e-05,
"loss": 0.4584,
"step": 4120
},
{
"epoch": 0.18083103463374053,
"grad_norm": 14.982597428617236,
"learning_rate": 1.8378187125450568e-05,
"loss": 0.3815,
"step": 4130
},
{
"epoch": 0.18126888217522658,
"grad_norm": 9.147231103073068,
"learning_rate": 1.8373764401494882e-05,
"loss": 0.4014,
"step": 4140
},
{
"epoch": 0.18170672971671264,
"grad_norm": 12.294983152398926,
"learning_rate": 1.8369341677539197e-05,
"loss": 0.4019,
"step": 4150
},
{
"epoch": 0.1821445772581987,
"grad_norm": 9.927235418370907,
"learning_rate": 1.8364918953583512e-05,
"loss": 0.3566,
"step": 4160
},
{
"epoch": 0.18258242479968476,
"grad_norm": 11.734461472925389,
"learning_rate": 1.8360496229627827e-05,
"loss": 0.4055,
"step": 4170
},
{
"epoch": 0.1830202723411708,
"grad_norm": 17.077445949446012,
"learning_rate": 1.8356073505672145e-05,
"loss": 0.3583,
"step": 4180
},
{
"epoch": 0.18345811988265687,
"grad_norm": 12.654896604913636,
"learning_rate": 1.835165078171646e-05,
"loss": 0.3492,
"step": 4190
},
{
"epoch": 0.1838959674241429,
"grad_norm": 16.58090166573664,
"learning_rate": 1.8347228057760775e-05,
"loss": 0.4069,
"step": 4200
},
{
"epoch": 0.18433381496562898,
"grad_norm": 12.423854991570149,
"learning_rate": 1.834280533380509e-05,
"loss": 0.3649,
"step": 4210
},
{
"epoch": 0.18477166250711502,
"grad_norm": 8.687455211496507,
"learning_rate": 1.833838260984941e-05,
"loss": 0.3713,
"step": 4220
},
{
"epoch": 0.1852095100486011,
"grad_norm": 10.328403486459152,
"learning_rate": 1.8333959885893723e-05,
"loss": 0.3679,
"step": 4230
},
{
"epoch": 0.18564735759008713,
"grad_norm": 12.579913013442551,
"learning_rate": 1.832953716193804e-05,
"loss": 0.3538,
"step": 4240
},
{
"epoch": 0.18608520513157317,
"grad_norm": 10.7018660870028,
"learning_rate": 1.8325114437982353e-05,
"loss": 0.3401,
"step": 4250
},
{
"epoch": 0.18652305267305924,
"grad_norm": 7.833366421027678,
"learning_rate": 1.832069171402667e-05,
"loss": 0.4208,
"step": 4260
},
{
"epoch": 0.18696090021454528,
"grad_norm": 13.791401625254139,
"learning_rate": 1.8316268990070986e-05,
"loss": 0.3355,
"step": 4270
},
{
"epoch": 0.18739874775603135,
"grad_norm": 15.146634353101764,
"learning_rate": 1.8311846266115305e-05,
"loss": 0.4045,
"step": 4280
},
{
"epoch": 0.1878365952975174,
"grad_norm": 11.438151690151432,
"learning_rate": 1.8307423542159616e-05,
"loss": 0.392,
"step": 4290
},
{
"epoch": 0.18827444283900346,
"grad_norm": 12.907194015301421,
"learning_rate": 1.8303000818203934e-05,
"loss": 0.4075,
"step": 4300
},
{
"epoch": 0.1887122903804895,
"grad_norm": 13.004770430694967,
"learning_rate": 1.829857809424825e-05,
"loss": 0.3703,
"step": 4310
},
{
"epoch": 0.18915013792197558,
"grad_norm": 13.767940400122603,
"learning_rate": 1.8294155370292564e-05,
"loss": 0.3479,
"step": 4320
},
{
"epoch": 0.18958798546346162,
"grad_norm": 13.598058539546074,
"learning_rate": 1.828973264633688e-05,
"loss": 0.3426,
"step": 4330
},
{
"epoch": 0.1900258330049477,
"grad_norm": 15.83770263349553,
"learning_rate": 1.8285309922381197e-05,
"loss": 0.3301,
"step": 4340
},
{
"epoch": 0.19046368054643373,
"grad_norm": 9.739199794350526,
"learning_rate": 1.8280887198425512e-05,
"loss": 0.3444,
"step": 4350
},
{
"epoch": 0.1909015280879198,
"grad_norm": 11.924698121670366,
"learning_rate": 1.8276464474469827e-05,
"loss": 0.3052,
"step": 4360
},
{
"epoch": 0.19133937562940584,
"grad_norm": 12.922426444929348,
"learning_rate": 1.8272041750514142e-05,
"loss": 0.3838,
"step": 4370
},
{
"epoch": 0.1917772231708919,
"grad_norm": 10.139430182884723,
"learning_rate": 1.8267619026558457e-05,
"loss": 0.339,
"step": 4380
},
{
"epoch": 0.19221507071237795,
"grad_norm": 16.77456639919075,
"learning_rate": 1.8263196302602775e-05,
"loss": 0.3662,
"step": 4390
},
{
"epoch": 0.192652918253864,
"grad_norm": 16.923778423361814,
"learning_rate": 1.825877357864709e-05,
"loss": 0.4345,
"step": 4400
},
{
"epoch": 0.19309076579535006,
"grad_norm": 9.79791653342669,
"learning_rate": 1.8254350854691405e-05,
"loss": 0.374,
"step": 4410
},
{
"epoch": 0.1935286133368361,
"grad_norm": 14.223990277487626,
"learning_rate": 1.824992813073572e-05,
"loss": 0.3418,
"step": 4420
},
{
"epoch": 0.19396646087832217,
"grad_norm": 10.280657812067675,
"learning_rate": 1.8245505406780038e-05,
"loss": 0.4258,
"step": 4430
},
{
"epoch": 0.1944043084198082,
"grad_norm": 11.454773579977681,
"learning_rate": 1.8241082682824353e-05,
"loss": 0.3558,
"step": 4440
},
{
"epoch": 0.19484215596129428,
"grad_norm": 10.05797176474998,
"learning_rate": 1.8236659958868668e-05,
"loss": 0.4195,
"step": 4450
},
{
"epoch": 0.19528000350278032,
"grad_norm": 17.413968742737737,
"learning_rate": 1.8232237234912983e-05,
"loss": 0.4842,
"step": 4460
},
{
"epoch": 0.1957178510442664,
"grad_norm": 7.594947751878203,
"learning_rate": 1.82278145109573e-05,
"loss": 0.376,
"step": 4470
},
{
"epoch": 0.19615569858575244,
"grad_norm": 7.752555917921395,
"learning_rate": 1.8223391787001616e-05,
"loss": 0.3874,
"step": 4480
},
{
"epoch": 0.1965935461272385,
"grad_norm": 11.095188960924157,
"learning_rate": 1.821896906304593e-05,
"loss": 0.3199,
"step": 4490
},
{
"epoch": 0.19703139366872455,
"grad_norm": 8.85243989748995,
"learning_rate": 1.8214546339090246e-05,
"loss": 0.3514,
"step": 4500
},
{
"epoch": 0.19746924121021062,
"grad_norm": 10.110803517936658,
"learning_rate": 1.8210123615134564e-05,
"loss": 0.3794,
"step": 4510
},
{
"epoch": 0.19790708875169666,
"grad_norm": 10.373451209228024,
"learning_rate": 1.820570089117888e-05,
"loss": 0.408,
"step": 4520
},
{
"epoch": 0.19834493629318273,
"grad_norm": 13.162706254319437,
"learning_rate": 1.8201278167223194e-05,
"loss": 0.3425,
"step": 4530
},
{
"epoch": 0.19878278383466877,
"grad_norm": 15.282957890404685,
"learning_rate": 1.819685544326751e-05,
"loss": 0.3852,
"step": 4540
},
{
"epoch": 0.1992206313761548,
"grad_norm": 13.370558527120265,
"learning_rate": 1.8192432719311827e-05,
"loss": 0.3652,
"step": 4550
},
{
"epoch": 0.19965847891764088,
"grad_norm": 10.259778080672527,
"learning_rate": 1.8188009995356142e-05,
"loss": 0.356,
"step": 4560
},
{
"epoch": 0.20009632645912692,
"grad_norm": 10.736808667558975,
"learning_rate": 1.8183587271400457e-05,
"loss": 0.3329,
"step": 4570
},
{
"epoch": 0.200534174000613,
"grad_norm": 8.501933158502284,
"learning_rate": 1.817916454744477e-05,
"loss": 0.3397,
"step": 4580
},
{
"epoch": 0.20097202154209903,
"grad_norm": 28.24241657929224,
"learning_rate": 1.8174741823489087e-05,
"loss": 0.393,
"step": 4590
},
{
"epoch": 0.2014098690835851,
"grad_norm": 13.727765781245198,
"learning_rate": 1.8170319099533405e-05,
"loss": 0.306,
"step": 4600
},
{
"epoch": 0.20184771662507114,
"grad_norm": 9.98011595817274,
"learning_rate": 1.816589637557772e-05,
"loss": 0.3357,
"step": 4610
},
{
"epoch": 0.2022855641665572,
"grad_norm": 10.817739639396102,
"learning_rate": 1.8161473651622035e-05,
"loss": 0.3352,
"step": 4620
},
{
"epoch": 0.20272341170804326,
"grad_norm": 17.39150590199315,
"learning_rate": 1.815705092766635e-05,
"loss": 0.453,
"step": 4630
},
{
"epoch": 0.20316125924952932,
"grad_norm": 15.480462835918628,
"learning_rate": 1.8152628203710668e-05,
"loss": 0.3469,
"step": 4640
},
{
"epoch": 0.20359910679101537,
"grad_norm": 14.206320760863697,
"learning_rate": 1.8148205479754983e-05,
"loss": 0.3593,
"step": 4650
},
{
"epoch": 0.20403695433250144,
"grad_norm": 10.767887226596823,
"learning_rate": 1.8143782755799298e-05,
"loss": 0.3401,
"step": 4660
},
{
"epoch": 0.20447480187398748,
"grad_norm": 12.432796643275674,
"learning_rate": 1.8139360031843613e-05,
"loss": 0.3875,
"step": 4670
},
{
"epoch": 0.20491264941547352,
"grad_norm": 9.006109071644179,
"learning_rate": 1.813493730788793e-05,
"loss": 0.3813,
"step": 4680
},
{
"epoch": 0.2053504969569596,
"grad_norm": 15.616401442667728,
"learning_rate": 1.8130514583932246e-05,
"loss": 0.3465,
"step": 4690
},
{
"epoch": 0.20578834449844563,
"grad_norm": 10.59820948899193,
"learning_rate": 1.812609185997656e-05,
"loss": 0.382,
"step": 4700
},
{
"epoch": 0.2062261920399317,
"grad_norm": 14.208386612211477,
"learning_rate": 1.8121669136020875e-05,
"loss": 0.3476,
"step": 4710
},
{
"epoch": 0.20666403958141774,
"grad_norm": 12.589160774020666,
"learning_rate": 1.8117246412065194e-05,
"loss": 0.3421,
"step": 4720
},
{
"epoch": 0.2071018871229038,
"grad_norm": 7.568667332444265,
"learning_rate": 1.811282368810951e-05,
"loss": 0.3727,
"step": 4730
},
{
"epoch": 0.20753973466438985,
"grad_norm": 10.812577263881279,
"learning_rate": 1.8108400964153824e-05,
"loss": 0.3881,
"step": 4740
},
{
"epoch": 0.20797758220587592,
"grad_norm": 13.018455316703584,
"learning_rate": 1.810397824019814e-05,
"loss": 0.3337,
"step": 4750
},
{
"epoch": 0.20841542974736196,
"grad_norm": 13.969299442144436,
"learning_rate": 1.8099555516242457e-05,
"loss": 0.4362,
"step": 4760
},
{
"epoch": 0.20885327728884803,
"grad_norm": 14.510644097153572,
"learning_rate": 1.809513279228677e-05,
"loss": 0.3534,
"step": 4770
},
{
"epoch": 0.20929112483033407,
"grad_norm": 6.43215364830629,
"learning_rate": 1.8090710068331087e-05,
"loss": 0.3813,
"step": 4780
},
{
"epoch": 0.20972897237182014,
"grad_norm": 9.94692028441624,
"learning_rate": 1.80862873443754e-05,
"loss": 0.3787,
"step": 4790
},
{
"epoch": 0.21016681991330619,
"grad_norm": 14.658374754366635,
"learning_rate": 1.8081864620419716e-05,
"loss": 0.332,
"step": 4800
},
{
"epoch": 0.21060466745479225,
"grad_norm": 12.603661359425915,
"learning_rate": 1.8077441896464035e-05,
"loss": 0.5098,
"step": 4810
},
{
"epoch": 0.2110425149962783,
"grad_norm": 8.069510317555627,
"learning_rate": 1.807301917250835e-05,
"loss": 0.3345,
"step": 4820
},
{
"epoch": 0.21148036253776434,
"grad_norm": 15.50460058047792,
"learning_rate": 1.8068596448552664e-05,
"loss": 0.4389,
"step": 4830
},
{
"epoch": 0.2119182100792504,
"grad_norm": 7.479416895287082,
"learning_rate": 1.806417372459698e-05,
"loss": 0.4058,
"step": 4840
},
{
"epoch": 0.21235605762073645,
"grad_norm": 10.414700579430006,
"learning_rate": 1.8059751000641298e-05,
"loss": 0.3713,
"step": 4850
},
{
"epoch": 0.21279390516222252,
"grad_norm": 11.290754293107446,
"learning_rate": 1.8055328276685612e-05,
"loss": 0.3759,
"step": 4860
},
{
"epoch": 0.21323175270370856,
"grad_norm": 11.549866408859454,
"learning_rate": 1.8050905552729927e-05,
"loss": 0.3288,
"step": 4870
},
{
"epoch": 0.21366960024519463,
"grad_norm": 18.046144465733533,
"learning_rate": 1.8046482828774242e-05,
"loss": 0.2962,
"step": 4880
},
{
"epoch": 0.21410744778668067,
"grad_norm": 10.63709064776316,
"learning_rate": 1.804206010481856e-05,
"loss": 0.3126,
"step": 4890
},
{
"epoch": 0.21454529532816674,
"grad_norm": 13.401948678962873,
"learning_rate": 1.8037637380862875e-05,
"loss": 0.335,
"step": 4900
},
{
"epoch": 0.21498314286965278,
"grad_norm": 28.923645525898227,
"learning_rate": 1.803321465690719e-05,
"loss": 0.3323,
"step": 4910
},
{
"epoch": 0.21542099041113885,
"grad_norm": 12.023966063146357,
"learning_rate": 1.8028791932951505e-05,
"loss": 0.3436,
"step": 4920
},
{
"epoch": 0.2158588379526249,
"grad_norm": 16.66651723158919,
"learning_rate": 1.8024369208995823e-05,
"loss": 0.3434,
"step": 4930
},
{
"epoch": 0.21629668549411096,
"grad_norm": 11.447028188385977,
"learning_rate": 1.801994648504014e-05,
"loss": 0.3734,
"step": 4940
},
{
"epoch": 0.216734533035597,
"grad_norm": 9.454223770724575,
"learning_rate": 1.8015523761084453e-05,
"loss": 0.3928,
"step": 4950
},
{
"epoch": 0.21717238057708305,
"grad_norm": 8.902373316783942,
"learning_rate": 1.8011101037128768e-05,
"loss": 0.3856,
"step": 4960
},
{
"epoch": 0.21761022811856912,
"grad_norm": 18.191989619833368,
"learning_rate": 1.8006678313173086e-05,
"loss": 0.4122,
"step": 4970
},
{
"epoch": 0.21804807566005516,
"grad_norm": 10.46306034169101,
"learning_rate": 1.80022555892174e-05,
"loss": 0.3303,
"step": 4980
},
{
"epoch": 0.21848592320154123,
"grad_norm": 14.018809081253679,
"learning_rate": 1.7997832865261716e-05,
"loss": 0.3167,
"step": 4990
},
{
"epoch": 0.21892377074302727,
"grad_norm": 9.557186071382484,
"learning_rate": 1.799341014130603e-05,
"loss": 0.4131,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 45678,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}