{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21892377074302727, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00043784754148605456, "grad_norm": 11.96724490271355, "learning_rate": 7.519033870117711e-06, "loss": 0.682, "step": 10 }, { "epoch": 0.0008756950829721091, "grad_norm": 16.377854891876865, "learning_rate": 9.782488603436574e-06, "loss": 0.5773, "step": 20 }, { "epoch": 0.0013135426244581636, "grad_norm": 19.097945880780852, "learning_rate": 1.1106524744507912e-05, "loss": 0.4648, "step": 30 }, { "epoch": 0.0017513901659442182, "grad_norm": 17.780105205349074, "learning_rate": 1.2045943336755435e-05, "loss": 0.5231, "step": 40 }, { "epoch": 0.002189237707430273, "grad_norm": 11.341588174257303, "learning_rate": 1.2774613006916558e-05, "loss": 0.373, "step": 50 }, { "epoch": 0.002627085248916327, "grad_norm": 12.468061614573642, "learning_rate": 1.3369979477826773e-05, "loss": 0.4499, "step": 60 }, { "epoch": 0.003064932790402382, "grad_norm": 14.844241870962383, "learning_rate": 1.3873354656555003e-05, "loss": 0.4481, "step": 70 }, { "epoch": 0.0035027803318884365, "grad_norm": 13.17618773461328, "learning_rate": 1.4268322372265782e-05, "loss": 0.4385, "step": 80 }, { "epoch": 0.003940627873374491, "grad_norm": 13.959456456486652, "learning_rate": 1.4657529486032087e-05, "loss": 0.5157, "step": 90 }, { "epoch": 0.004378475414860546, "grad_norm": 9.088640068563528, "learning_rate": 1.5005248620577926e-05, "loss": 0.5372, "step": 100 }, { "epoch": 0.0048163229563466005, "grad_norm": 8.985973448159925, "learning_rate": 1.531947884589086e-05, "loss": 0.3981, "step": 110 }, { "epoch": 0.005254170497832654, "grad_norm": 16.03083595499672, "learning_rate": 1.5606107901730336e-05, "loss": 0.4008, "step": 120 }, { "epoch": 0.005692018039318709, "grad_norm": 23.2563822200454, "learning_rate": 1.586959551766198e-05, "loss": 0.4434, "step": 130 }, { "epoch": 0.006129865580804764, "grad_norm": 13.106170808319108, "learning_rate": 1.611340086727408e-05, "loss": 0.4264, "step": 140 }, { "epoch": 0.006567713122290818, "grad_norm": 10.892983486327262, "learning_rate": 1.634026115826661e-05, "loss": 0.538, "step": 150 }, { "epoch": 0.007005560663776873, "grad_norm": 15.273372562202105, "learning_rate": 1.6531777105584646e-05, "loss": 0.4921, "step": 160 }, { "epoch": 0.007443408205262928, "grad_norm": 12.078764233569435, "learning_rate": 1.6732175860784077e-05, "loss": 0.4219, "step": 170 }, { "epoch": 0.007881255746748982, "grad_norm": 18.9218120999161, "learning_rate": 1.692098421935095e-05, "loss": 0.51, "step": 180 }, { "epoch": 0.008319103288235036, "grad_norm": 11.062242407951706, "learning_rate": 1.7099469894607657e-05, "loss": 0.4021, "step": 190 }, { "epoch": 0.008756950829721092, "grad_norm": 10.4039117059008, "learning_rate": 1.726870335389679e-05, "loss": 0.414, "step": 200 }, { "epoch": 0.009194798371207145, "grad_norm": 13.360369800315105, "learning_rate": 1.742959672866302e-05, "loss": 0.3934, "step": 210 }, { "epoch": 0.009632645912693201, "grad_norm": 17.885935179522868, "learning_rate": 1.7582933579209726e-05, "loss": 0.4299, "step": 220 }, { "epoch": 0.010070493454179255, "grad_norm": 15.613293466188615, "learning_rate": 1.7729391978127236e-05, "loss": 0.4348, "step": 230 }, { "epoch": 0.010508340995665309, "grad_norm": 8.671536275760825, "learning_rate": 1.7869562635049198e-05, "loss": 0.4612, "step": 240 }, { "epoch": 0.010946188537151364, "grad_norm": 16.1819762006921, "learning_rate": 1.8003963288492603e-05, "loss": 0.3756, "step": 250 }, { "epoch": 0.011384036078637418, "grad_norm": 13.36774911152187, "learning_rate": 1.813305025098084e-05, "loss": 0.4427, "step": 260 }, { "epoch": 0.011821883620123473, "grad_norm": 10.629955258208, "learning_rate": 1.8257227757513754e-05, "loss": 0.4213, "step": 270 }, { "epoch": 0.012259731161609527, "grad_norm": 19.332750684355226, "learning_rate": 1.8376855600592943e-05, "loss": 0.346, "step": 280 }, { "epoch": 0.012697578703095583, "grad_norm": 16.066310801594668, "learning_rate": 1.8492255415374714e-05, "loss": 0.4522, "step": 290 }, { "epoch": 0.013135426244581637, "grad_norm": 14.844884825041587, "learning_rate": 1.860371589158547e-05, "loss": 0.4505, "step": 300 }, { "epoch": 0.01357327378606769, "grad_norm": 16.539413080837956, "learning_rate": 1.8711497124872535e-05, "loss": 0.4296, "step": 310 }, { "epoch": 0.014011121327553746, "grad_norm": 11.178304803994907, "learning_rate": 1.8815834272664066e-05, "loss": 0.4404, "step": 320 }, { "epoch": 0.0144489688690398, "grad_norm": 9.171001542204067, "learning_rate": 1.8916940643811347e-05, "loss": 0.3785, "step": 330 }, { "epoch": 0.014886816410525855, "grad_norm": 18.255820822315396, "learning_rate": 1.9015010324094007e-05, "loss": 0.4524, "step": 340 }, { "epoch": 0.015324663952011909, "grad_norm": 13.044007567571079, "learning_rate": 1.911022041882251e-05, "loss": 0.4876, "step": 350 }, { "epoch": 0.015762511493497965, "grad_norm": 13.758756866189971, "learning_rate": 1.9202732977654023e-05, "loss": 0.5539, "step": 360 }, { "epoch": 0.01620035903498402, "grad_norm": 15.903726373816992, "learning_rate": 1.929269665417383e-05, "loss": 0.4976, "step": 370 }, { "epoch": 0.016638206576470072, "grad_norm": 15.248268904693534, "learning_rate": 1.938024814292676e-05, "loss": 0.5259, "step": 380 }, { "epoch": 0.017076054117956128, "grad_norm": 10.722522838992456, "learning_rate": 1.9465513428778125e-05, "loss": 0.489, "step": 390 }, { "epoch": 0.017513901659442183, "grad_norm": 16.61965558187522, "learning_rate": 1.9548608877267744e-05, "loss": 0.439, "step": 400 }, { "epoch": 0.017951749200928235, "grad_norm": 19.548934962148113, "learning_rate": 1.9629642189639832e-05, "loss": 0.4282, "step": 410 }, { "epoch": 0.01838959674241429, "grad_norm": 15.936747454659427, "learning_rate": 1.9708713242215694e-05, "loss": 0.3974, "step": 420 }, { "epoch": 0.018827444283900346, "grad_norm": 17.53178788446236, "learning_rate": 1.9785914826520243e-05, "loss": 0.4517, "step": 430 }, { "epoch": 0.019265291825386402, "grad_norm": 15.309413053796602, "learning_rate": 1.9861333303919378e-05, "loss": 0.4155, "step": 440 }, { "epoch": 0.019703139366872454, "grad_norm": 13.248113533897241, "learning_rate": 1.9935049186350462e-05, "loss": 0.4589, "step": 450 }, { "epoch": 0.02014098690835851, "grad_norm": 18.437663863148952, "learning_rate": 2e-05, "loss": 0.388, "step": 460 }, { "epoch": 0.020578834449844565, "grad_norm": 14.089094860711976, "learning_rate": 1.9995577276044317e-05, "loss": 0.4637, "step": 470 }, { "epoch": 0.021016681991330617, "grad_norm": 50.22724968906943, "learning_rate": 1.9991154552088635e-05, "loss": 0.3967, "step": 480 }, { "epoch": 0.021454529532816673, "grad_norm": 9.448811402831021, "learning_rate": 1.998673182813295e-05, "loss": 0.405, "step": 490 }, { "epoch": 0.021892377074302728, "grad_norm": 12.511886438347087, "learning_rate": 1.9982309104177265e-05, "loss": 0.4245, "step": 500 }, { "epoch": 0.022330224615788784, "grad_norm": 26.23559407530874, "learning_rate": 1.997788638022158e-05, "loss": 0.4486, "step": 510 }, { "epoch": 0.022768072157274836, "grad_norm": 10.162620448000466, "learning_rate": 1.9973463656265898e-05, "loss": 0.4898, "step": 520 }, { "epoch": 0.02320591969876089, "grad_norm": 10.505426367087482, "learning_rate": 1.996904093231021e-05, "loss": 0.4011, "step": 530 }, { "epoch": 0.023643767240246947, "grad_norm": 7.19153886182727, "learning_rate": 1.9964618208354528e-05, "loss": 0.4566, "step": 540 }, { "epoch": 0.024081614781733, "grad_norm": 16.625591066795565, "learning_rate": 1.9960195484398842e-05, "loss": 0.3865, "step": 550 }, { "epoch": 0.024519462323219055, "grad_norm": 21.196277881857508, "learning_rate": 1.995577276044316e-05, "loss": 0.3797, "step": 560 }, { "epoch": 0.02495730986470511, "grad_norm": 19.530616395972775, "learning_rate": 1.9951350036487472e-05, "loss": 0.4596, "step": 570 }, { "epoch": 0.025395157406191166, "grad_norm": 10.52123656050975, "learning_rate": 1.994692731253179e-05, "loss": 0.3836, "step": 580 }, { "epoch": 0.025833004947677218, "grad_norm": 14.373796230168502, "learning_rate": 1.9942504588576105e-05, "loss": 0.4916, "step": 590 }, { "epoch": 0.026270852489163273, "grad_norm": 11.25970447321859, "learning_rate": 1.9938081864620424e-05, "loss": 0.3832, "step": 600 }, { "epoch": 0.02670870003064933, "grad_norm": 16.21224392326194, "learning_rate": 1.9933659140664735e-05, "loss": 0.4749, "step": 610 }, { "epoch": 0.02714654757213538, "grad_norm": 14.71066366579976, "learning_rate": 1.9929236416709053e-05, "loss": 0.3882, "step": 620 }, { "epoch": 0.027584395113621436, "grad_norm": 11.37877349304637, "learning_rate": 1.992481369275337e-05, "loss": 0.4396, "step": 630 }, { "epoch": 0.028022242655107492, "grad_norm": 16.754438580812046, "learning_rate": 1.9920390968797683e-05, "loss": 0.4539, "step": 640 }, { "epoch": 0.028460090196593547, "grad_norm": 11.479142887670093, "learning_rate": 1.9915968244841998e-05, "loss": 0.451, "step": 650 }, { "epoch": 0.0288979377380796, "grad_norm": 8.382864831937072, "learning_rate": 1.9911545520886316e-05, "loss": 0.4716, "step": 660 }, { "epoch": 0.029335785279565655, "grad_norm": 19.710550432188437, "learning_rate": 1.990712279693063e-05, "loss": 0.3545, "step": 670 }, { "epoch": 0.02977363282105171, "grad_norm": 17.903211831860908, "learning_rate": 1.9902700072974946e-05, "loss": 0.4335, "step": 680 }, { "epoch": 0.030211480362537766, "grad_norm": 16.390743229733264, "learning_rate": 1.989827734901926e-05, "loss": 0.3806, "step": 690 }, { "epoch": 0.030649327904023818, "grad_norm": 14.980896037745874, "learning_rate": 1.989385462506358e-05, "loss": 0.3767, "step": 700 }, { "epoch": 0.031087175445509874, "grad_norm": 11.397175060743534, "learning_rate": 1.9889431901107894e-05, "loss": 0.4746, "step": 710 }, { "epoch": 0.03152502298699593, "grad_norm": 19.04061884539536, "learning_rate": 1.988500917715221e-05, "loss": 0.413, "step": 720 }, { "epoch": 0.031962870528481985, "grad_norm": 13.947751231087285, "learning_rate": 1.9880586453196527e-05, "loss": 0.3386, "step": 730 }, { "epoch": 0.03240071806996804, "grad_norm": 41.64839133720008, "learning_rate": 1.987616372924084e-05, "loss": 0.4444, "step": 740 }, { "epoch": 0.03283856561145409, "grad_norm": 13.185903732811664, "learning_rate": 1.9871741005285157e-05, "loss": 0.4383, "step": 750 }, { "epoch": 0.033276413152940144, "grad_norm": 15.456378713797612, "learning_rate": 1.9867318281329472e-05, "loss": 0.4087, "step": 760 }, { "epoch": 0.0337142606944262, "grad_norm": 13.751468649371544, "learning_rate": 1.986289555737379e-05, "loss": 0.4245, "step": 770 }, { "epoch": 0.034152108235912255, "grad_norm": 15.975026190651212, "learning_rate": 1.9858472833418102e-05, "loss": 0.4263, "step": 780 }, { "epoch": 0.03458995577739831, "grad_norm": 11.294733923572386, "learning_rate": 1.985405010946242e-05, "loss": 0.5273, "step": 790 }, { "epoch": 0.03502780331888437, "grad_norm": 22.907563592109977, "learning_rate": 1.9849627385506735e-05, "loss": 0.3766, "step": 800 }, { "epoch": 0.03546565086037042, "grad_norm": 19.465013466925168, "learning_rate": 1.9845204661551053e-05, "loss": 0.3739, "step": 810 }, { "epoch": 0.03590349840185647, "grad_norm": 13.553846560729426, "learning_rate": 1.9840781937595365e-05, "loss": 0.481, "step": 820 }, { "epoch": 0.036341345943342526, "grad_norm": 22.910094642393723, "learning_rate": 1.9836359213639683e-05, "loss": 0.4659, "step": 830 }, { "epoch": 0.03677919348482858, "grad_norm": 16.26462964503755, "learning_rate": 1.9831936489683998e-05, "loss": 0.381, "step": 840 }, { "epoch": 0.03721704102631464, "grad_norm": 9.94980804411553, "learning_rate": 1.9827513765728313e-05, "loss": 0.4003, "step": 850 }, { "epoch": 0.03765488856780069, "grad_norm": 13.328592093222, "learning_rate": 1.9823091041772628e-05, "loss": 0.3755, "step": 860 }, { "epoch": 0.03809273610928675, "grad_norm": 18.322055654010875, "learning_rate": 1.9818668317816946e-05, "loss": 0.494, "step": 870 }, { "epoch": 0.038530583650772804, "grad_norm": 12.090527797691786, "learning_rate": 1.981424559386126e-05, "loss": 0.4751, "step": 880 }, { "epoch": 0.03896843119225885, "grad_norm": 19.070570141131398, "learning_rate": 1.9809822869905576e-05, "loss": 0.4483, "step": 890 }, { "epoch": 0.03940627873374491, "grad_norm": 18.183543820420173, "learning_rate": 1.980540014594989e-05, "loss": 0.3881, "step": 900 }, { "epoch": 0.039844126275230964, "grad_norm": 11.47870821074828, "learning_rate": 1.980097742199421e-05, "loss": 0.465, "step": 910 }, { "epoch": 0.04028197381671702, "grad_norm": 4.662342565910864, "learning_rate": 1.9796554698038524e-05, "loss": 0.3553, "step": 920 }, { "epoch": 0.040719821358203075, "grad_norm": 11.599443461780693, "learning_rate": 1.979213197408284e-05, "loss": 0.4284, "step": 930 }, { "epoch": 0.04115766889968913, "grad_norm": 20.899400532717515, "learning_rate": 1.9787709250127154e-05, "loss": 0.3849, "step": 940 }, { "epoch": 0.041595516441175186, "grad_norm": 13.83963592226824, "learning_rate": 1.978328652617147e-05, "loss": 0.4228, "step": 950 }, { "epoch": 0.042033363982661234, "grad_norm": 10.129058777896109, "learning_rate": 1.9778863802215787e-05, "loss": 0.4608, "step": 960 }, { "epoch": 0.04247121152414729, "grad_norm": 13.525629087473678, "learning_rate": 1.9774441078260102e-05, "loss": 0.4168, "step": 970 }, { "epoch": 0.042909059065633345, "grad_norm": 12.286627992129988, "learning_rate": 1.9770018354304417e-05, "loss": 0.3782, "step": 980 }, { "epoch": 0.0433469066071194, "grad_norm": 15.201916543698117, "learning_rate": 1.976559563034873e-05, "loss": 0.4614, "step": 990 }, { "epoch": 0.043784754148605456, "grad_norm": 12.218728770361992, "learning_rate": 1.976117290639305e-05, "loss": 0.3442, "step": 1000 }, { "epoch": 0.04422260169009151, "grad_norm": 15.316130787659379, "learning_rate": 1.9756750182437365e-05, "loss": 0.424, "step": 1010 }, { "epoch": 0.04466044923157757, "grad_norm": 18.53714384430074, "learning_rate": 1.975276973087725e-05, "loss": 0.44, "step": 1020 }, { "epoch": 0.045098296773063616, "grad_norm": 10.148359314556405, "learning_rate": 1.9748347006921564e-05, "loss": 0.4016, "step": 1030 }, { "epoch": 0.04553614431454967, "grad_norm": 10.835539686055602, "learning_rate": 1.974392428296588e-05, "loss": 0.3899, "step": 1040 }, { "epoch": 0.04597399185603573, "grad_norm": 13.894807990285942, "learning_rate": 1.9739501559010197e-05, "loss": 0.4385, "step": 1050 }, { "epoch": 0.04641183939752178, "grad_norm": 13.530858128888262, "learning_rate": 1.9735078835054512e-05, "loss": 0.4324, "step": 1060 }, { "epoch": 0.04684968693900784, "grad_norm": 12.011714939023786, "learning_rate": 1.9730656111098827e-05, "loss": 0.4168, "step": 1070 }, { "epoch": 0.047287534480493894, "grad_norm": 11.289218951662432, "learning_rate": 1.9726233387143142e-05, "loss": 0.4438, "step": 1080 }, { "epoch": 0.04772538202197995, "grad_norm": 13.657029328270559, "learning_rate": 1.972181066318746e-05, "loss": 0.3996, "step": 1090 }, { "epoch": 0.048163229563466, "grad_norm": 8.72748000086665, "learning_rate": 1.9717387939231775e-05, "loss": 0.4578, "step": 1100 }, { "epoch": 0.04860107710495205, "grad_norm": 10.975240759500627, "learning_rate": 1.971296521527609e-05, "loss": 0.4165, "step": 1110 }, { "epoch": 0.04903892464643811, "grad_norm": 11.29698373153932, "learning_rate": 1.9708542491320405e-05, "loss": 0.3545, "step": 1120 }, { "epoch": 0.049476772187924165, "grad_norm": 20.506288633859523, "learning_rate": 1.9704119767364723e-05, "loss": 0.3568, "step": 1130 }, { "epoch": 0.04991461972941022, "grad_norm": 11.698600515801893, "learning_rate": 1.9699697043409035e-05, "loss": 0.4479, "step": 1140 }, { "epoch": 0.050352467270896276, "grad_norm": 13.862971410140013, "learning_rate": 1.9695274319453353e-05, "loss": 0.3974, "step": 1150 }, { "epoch": 0.05079031481238233, "grad_norm": 12.469196053464175, "learning_rate": 1.9690851595497668e-05, "loss": 0.4402, "step": 1160 }, { "epoch": 0.05122816235386838, "grad_norm": 11.3264424935876, "learning_rate": 1.9686428871541986e-05, "loss": 0.3643, "step": 1170 }, { "epoch": 0.051666009895354435, "grad_norm": 19.974826992464948, "learning_rate": 1.9682006147586298e-05, "loss": 0.3762, "step": 1180 }, { "epoch": 0.05210385743684049, "grad_norm": 10.918494229304311, "learning_rate": 1.9677583423630616e-05, "loss": 0.3545, "step": 1190 }, { "epoch": 0.052541704978326546, "grad_norm": 11.568181801830196, "learning_rate": 1.967316069967493e-05, "loss": 0.4007, "step": 1200 }, { "epoch": 0.0529795525198126, "grad_norm": 17.075179869568725, "learning_rate": 1.966873797571925e-05, "loss": 0.3989, "step": 1210 }, { "epoch": 0.05341740006129866, "grad_norm": 15.560201574750726, "learning_rate": 1.9664315251763564e-05, "loss": 0.4096, "step": 1220 }, { "epoch": 0.05385524760278471, "grad_norm": 11.91165895325058, "learning_rate": 1.965989252780788e-05, "loss": 0.436, "step": 1230 }, { "epoch": 0.05429309514427076, "grad_norm": 11.127002664395162, "learning_rate": 1.9655469803852194e-05, "loss": 0.3146, "step": 1240 }, { "epoch": 0.05473094268575682, "grad_norm": 17.968378053215634, "learning_rate": 1.965104707989651e-05, "loss": 0.444, "step": 1250 }, { "epoch": 0.05516879022724287, "grad_norm": 15.278957202738837, "learning_rate": 1.9646624355940827e-05, "loss": 0.461, "step": 1260 }, { "epoch": 0.05560663776872893, "grad_norm": 14.312647123288594, "learning_rate": 1.9642201631985142e-05, "loss": 0.398, "step": 1270 }, { "epoch": 0.056044485310214984, "grad_norm": 12.143629725906578, "learning_rate": 1.9637778908029457e-05, "loss": 0.3818, "step": 1280 }, { "epoch": 0.05648233285170104, "grad_norm": 10.960126916192301, "learning_rate": 1.9633356184073772e-05, "loss": 0.4273, "step": 1290 }, { "epoch": 0.056920180393187095, "grad_norm": 14.260248079643242, "learning_rate": 1.962893346011809e-05, "loss": 0.469, "step": 1300 }, { "epoch": 0.05735802793467314, "grad_norm": 9.509984040352219, "learning_rate": 1.9624510736162405e-05, "loss": 0.3732, "step": 1310 }, { "epoch": 0.0577958754761592, "grad_norm": 12.88397674204099, "learning_rate": 1.962008801220672e-05, "loss": 0.4473, "step": 1320 }, { "epoch": 0.058233723017645254, "grad_norm": 13.708354791471775, "learning_rate": 1.9615665288251035e-05, "loss": 0.4093, "step": 1330 }, { "epoch": 0.05867157055913131, "grad_norm": 11.009628243610226, "learning_rate": 1.9611242564295353e-05, "loss": 0.4258, "step": 1340 }, { "epoch": 0.059109418100617366, "grad_norm": 15.94608664981347, "learning_rate": 1.9606819840339664e-05, "loss": 0.4117, "step": 1350 }, { "epoch": 0.05954726564210342, "grad_norm": 8.631681646096013, "learning_rate": 1.9602397116383983e-05, "loss": 0.4384, "step": 1360 }, { "epoch": 0.05998511318358948, "grad_norm": 12.838858533847487, "learning_rate": 1.9597974392428298e-05, "loss": 0.3764, "step": 1370 }, { "epoch": 0.06042296072507553, "grad_norm": 41.39191821545224, "learning_rate": 1.9593551668472616e-05, "loss": 0.3166, "step": 1380 }, { "epoch": 0.06086080826656158, "grad_norm": 13.630700275122859, "learning_rate": 1.9589128944516927e-05, "loss": 0.4142, "step": 1390 }, { "epoch": 0.061298655808047636, "grad_norm": 10.941493062958882, "learning_rate": 1.9584706220561246e-05, "loss": 0.393, "step": 1400 }, { "epoch": 0.06173650334953369, "grad_norm": 12.209335407537546, "learning_rate": 1.958028349660556e-05, "loss": 0.3882, "step": 1410 }, { "epoch": 0.06217435089101975, "grad_norm": 11.160854212292483, "learning_rate": 1.957586077264988e-05, "loss": 0.3571, "step": 1420 }, { "epoch": 0.0626121984325058, "grad_norm": 15.758848096703037, "learning_rate": 1.957143804869419e-05, "loss": 0.4188, "step": 1430 }, { "epoch": 0.06305004597399186, "grad_norm": 14.95914949335199, "learning_rate": 1.956701532473851e-05, "loss": 0.3811, "step": 1440 }, { "epoch": 0.06348789351547791, "grad_norm": 12.602990028799342, "learning_rate": 1.9562592600782824e-05, "loss": 0.351, "step": 1450 }, { "epoch": 0.06392574105696397, "grad_norm": 11.138632661742742, "learning_rate": 1.955816987682714e-05, "loss": 0.3217, "step": 1460 }, { "epoch": 0.06436358859845003, "grad_norm": 10.073650366260493, "learning_rate": 1.9553747152871457e-05, "loss": 0.3915, "step": 1470 }, { "epoch": 0.06480143613993608, "grad_norm": 11.607027332443641, "learning_rate": 1.954932442891577e-05, "loss": 0.4501, "step": 1480 }, { "epoch": 0.06523928368142212, "grad_norm": 11.6963320729176, "learning_rate": 1.9544901704960087e-05, "loss": 0.3876, "step": 1490 }, { "epoch": 0.06567713122290818, "grad_norm": 12.978277397409164, "learning_rate": 1.95404789810044e-05, "loss": 0.4058, "step": 1500 }, { "epoch": 0.06611497876439423, "grad_norm": 10.230356140798722, "learning_rate": 1.953605625704872e-05, "loss": 0.3503, "step": 1510 }, { "epoch": 0.06655282630588029, "grad_norm": 10.411279694304964, "learning_rate": 1.953163353309303e-05, "loss": 0.4545, "step": 1520 }, { "epoch": 0.06699067384736634, "grad_norm": 12.48524397593703, "learning_rate": 1.952721080913735e-05, "loss": 0.4056, "step": 1530 }, { "epoch": 0.0674285213888524, "grad_norm": 9.054498757739506, "learning_rate": 1.9522788085181664e-05, "loss": 0.422, "step": 1540 }, { "epoch": 0.06786636893033846, "grad_norm": 12.468833215574444, "learning_rate": 1.9518365361225983e-05, "loss": 0.3668, "step": 1550 }, { "epoch": 0.06830421647182451, "grad_norm": 7.37855108081838, "learning_rate": 1.9513942637270294e-05, "loss": 0.3418, "step": 1560 }, { "epoch": 0.06874206401331057, "grad_norm": 18.087926793945318, "learning_rate": 1.9509519913314612e-05, "loss": 0.4362, "step": 1570 }, { "epoch": 0.06917991155479662, "grad_norm": 9.463333040806022, "learning_rate": 1.9505097189358927e-05, "loss": 0.351, "step": 1580 }, { "epoch": 0.06961775909628268, "grad_norm": 13.249219763473633, "learning_rate": 1.9500674465403246e-05, "loss": 0.3334, "step": 1590 }, { "epoch": 0.07005560663776873, "grad_norm": 9.494154085119547, "learning_rate": 1.9496251741447557e-05, "loss": 0.3766, "step": 1600 }, { "epoch": 0.07049345417925479, "grad_norm": 12.84431192695635, "learning_rate": 1.9491829017491875e-05, "loss": 0.3904, "step": 1610 }, { "epoch": 0.07093130172074084, "grad_norm": 10.983543223419419, "learning_rate": 1.948740629353619e-05, "loss": 0.4181, "step": 1620 }, { "epoch": 0.07136914926222689, "grad_norm": 17.280817340311735, "learning_rate": 1.948298356958051e-05, "loss": 0.3994, "step": 1630 }, { "epoch": 0.07180699680371294, "grad_norm": 11.179902123311116, "learning_rate": 1.947856084562482e-05, "loss": 0.41, "step": 1640 }, { "epoch": 0.072244844345199, "grad_norm": 7.022304150684218, "learning_rate": 1.947413812166914e-05, "loss": 0.3959, "step": 1650 }, { "epoch": 0.07268269188668505, "grad_norm": 11.715877252052287, "learning_rate": 1.9469715397713453e-05, "loss": 0.3781, "step": 1660 }, { "epoch": 0.07312053942817111, "grad_norm": 10.5240917837852, "learning_rate": 1.9465292673757768e-05, "loss": 0.4285, "step": 1670 }, { "epoch": 0.07355838696965716, "grad_norm": 13.043269992216052, "learning_rate": 1.9460869949802083e-05, "loss": 0.3953, "step": 1680 }, { "epoch": 0.07399623451114322, "grad_norm": 10.473522043776144, "learning_rate": 1.94564472258464e-05, "loss": 0.3795, "step": 1690 }, { "epoch": 0.07443408205262927, "grad_norm": 12.81124231632124, "learning_rate": 1.9452024501890716e-05, "loss": 0.4329, "step": 1700 }, { "epoch": 0.07487192959411533, "grad_norm": 9.570293367327814, "learning_rate": 1.944760177793503e-05, "loss": 0.4536, "step": 1710 }, { "epoch": 0.07530977713560139, "grad_norm": 13.781425492257062, "learning_rate": 1.944317905397935e-05, "loss": 0.3918, "step": 1720 }, { "epoch": 0.07574762467708744, "grad_norm": 13.427510367449685, "learning_rate": 1.943875633002366e-05, "loss": 0.3379, "step": 1730 }, { "epoch": 0.0761854722185735, "grad_norm": 12.599200486748096, "learning_rate": 1.943433360606798e-05, "loss": 0.3939, "step": 1740 }, { "epoch": 0.07662331976005955, "grad_norm": 14.384131674124605, "learning_rate": 1.9429910882112294e-05, "loss": 0.405, "step": 1750 }, { "epoch": 0.07706116730154561, "grad_norm": 9.414388690536624, "learning_rate": 1.9425488158156612e-05, "loss": 0.4003, "step": 1760 }, { "epoch": 0.07749901484303165, "grad_norm": 18.045324270495772, "learning_rate": 1.9421065434200924e-05, "loss": 0.4676, "step": 1770 }, { "epoch": 0.0779368623845177, "grad_norm": 11.920491989530506, "learning_rate": 1.9416642710245242e-05, "loss": 0.3452, "step": 1780 }, { "epoch": 0.07837470992600376, "grad_norm": 13.25199040272023, "learning_rate": 1.9412219986289557e-05, "loss": 0.3281, "step": 1790 }, { "epoch": 0.07881255746748982, "grad_norm": 18.052479006801686, "learning_rate": 1.9407797262333875e-05, "loss": 0.393, "step": 1800 }, { "epoch": 0.07925040500897587, "grad_norm": 9.88776702327391, "learning_rate": 1.9403374538378187e-05, "loss": 0.3531, "step": 1810 }, { "epoch": 0.07968825255046193, "grad_norm": 8.54763684547313, "learning_rate": 1.9398951814422505e-05, "loss": 0.4018, "step": 1820 }, { "epoch": 0.08012610009194798, "grad_norm": 12.109050524886657, "learning_rate": 1.939452909046682e-05, "loss": 0.4658, "step": 1830 }, { "epoch": 0.08056394763343404, "grad_norm": 7.7518635631951485, "learning_rate": 1.9390106366511138e-05, "loss": 0.4297, "step": 1840 }, { "epoch": 0.0810017951749201, "grad_norm": 14.594279048895539, "learning_rate": 1.938568364255545e-05, "loss": 0.355, "step": 1850 }, { "epoch": 0.08143964271640615, "grad_norm": 11.417142166667903, "learning_rate": 1.9381260918599768e-05, "loss": 0.392, "step": 1860 }, { "epoch": 0.0818774902578922, "grad_norm": 11.637581528522489, "learning_rate": 1.9376838194644083e-05, "loss": 0.3802, "step": 1870 }, { "epoch": 0.08231533779937826, "grad_norm": 11.85655948956895, "learning_rate": 1.9372415470688398e-05, "loss": 0.3977, "step": 1880 }, { "epoch": 0.08275318534086432, "grad_norm": 10.54522592721261, "learning_rate": 1.9367992746732713e-05, "loss": 0.3971, "step": 1890 }, { "epoch": 0.08319103288235037, "grad_norm": 11.259013994047795, "learning_rate": 1.936357002277703e-05, "loss": 0.4662, "step": 1900 }, { "epoch": 0.08362888042383643, "grad_norm": 10.413681904734188, "learning_rate": 1.9359147298821346e-05, "loss": 0.2992, "step": 1910 }, { "epoch": 0.08406672796532247, "grad_norm": 16.01143047206335, "learning_rate": 1.935472457486566e-05, "loss": 0.4038, "step": 1920 }, { "epoch": 0.08450457550680852, "grad_norm": 8.758483697657311, "learning_rate": 1.9350301850909976e-05, "loss": 0.3402, "step": 1930 }, { "epoch": 0.08494242304829458, "grad_norm": 13.47768507552826, "learning_rate": 1.934587912695429e-05, "loss": 0.3971, "step": 1940 }, { "epoch": 0.08538027058978064, "grad_norm": 14.13100123568219, "learning_rate": 1.934145640299861e-05, "loss": 0.3485, "step": 1950 }, { "epoch": 0.08581811813126669, "grad_norm": 11.367957313924562, "learning_rate": 1.9337033679042924e-05, "loss": 0.3561, "step": 1960 }, { "epoch": 0.08625596567275275, "grad_norm": 10.362775839894933, "learning_rate": 1.933261095508724e-05, "loss": 0.3037, "step": 1970 }, { "epoch": 0.0866938132142388, "grad_norm": 14.232027197294531, "learning_rate": 1.9328188231131554e-05, "loss": 0.3772, "step": 1980 }, { "epoch": 0.08713166075572486, "grad_norm": 7.416354083499904, "learning_rate": 1.9323765507175872e-05, "loss": 0.3444, "step": 1990 }, { "epoch": 0.08756950829721091, "grad_norm": 11.816638685730455, "learning_rate": 1.9319342783220187e-05, "loss": 0.4283, "step": 2000 }, { "epoch": 0.08800735583869697, "grad_norm": 7.26923187311744, "learning_rate": 1.9314920059264505e-05, "loss": 0.3865, "step": 2010 }, { "epoch": 0.08844520338018302, "grad_norm": 15.434817185278508, "learning_rate": 1.9310497335308817e-05, "loss": 0.4109, "step": 2020 }, { "epoch": 0.08888305092166908, "grad_norm": 10.681457596084588, "learning_rate": 1.9306074611353135e-05, "loss": 0.3436, "step": 2030 }, { "epoch": 0.08932089846315514, "grad_norm": 11.348178640084303, "learning_rate": 1.930165188739745e-05, "loss": 0.3667, "step": 2040 }, { "epoch": 0.08975874600464119, "grad_norm": 14.026197173621947, "learning_rate": 1.9297229163441768e-05, "loss": 0.4237, "step": 2050 }, { "epoch": 0.09019659354612723, "grad_norm": 13.112088492075678, "learning_rate": 1.929280643948608e-05, "loss": 0.4262, "step": 2060 }, { "epoch": 0.09063444108761329, "grad_norm": 9.622691088092534, "learning_rate": 1.9288383715530398e-05, "loss": 0.3801, "step": 2070 }, { "epoch": 0.09107228862909934, "grad_norm": 9.697640310926039, "learning_rate": 1.9283960991574713e-05, "loss": 0.39, "step": 2080 }, { "epoch": 0.0915101361705854, "grad_norm": 12.500317378783329, "learning_rate": 1.9279538267619028e-05, "loss": 0.4054, "step": 2090 }, { "epoch": 0.09194798371207145, "grad_norm": 10.789036689302037, "learning_rate": 1.9275115543663342e-05, "loss": 0.4074, "step": 2100 }, { "epoch": 0.09238583125355751, "grad_norm": 12.509375756268714, "learning_rate": 1.927069281970766e-05, "loss": 0.3667, "step": 2110 }, { "epoch": 0.09282367879504357, "grad_norm": 9.389386341327063, "learning_rate": 1.9266270095751976e-05, "loss": 0.4499, "step": 2120 }, { "epoch": 0.09326152633652962, "grad_norm": 15.390016743215236, "learning_rate": 1.926184737179629e-05, "loss": 0.4473, "step": 2130 }, { "epoch": 0.09369937387801568, "grad_norm": 19.17657099807316, "learning_rate": 1.9257424647840605e-05, "loss": 0.3094, "step": 2140 }, { "epoch": 0.09413722141950173, "grad_norm": 14.244069789562317, "learning_rate": 1.925300192388492e-05, "loss": 0.426, "step": 2150 }, { "epoch": 0.09457506896098779, "grad_norm": 14.851204953612982, "learning_rate": 1.924857919992924e-05, "loss": 0.4369, "step": 2160 }, { "epoch": 0.09501291650247384, "grad_norm": 8.66142168035377, "learning_rate": 1.9244156475973554e-05, "loss": 0.3666, "step": 2170 }, { "epoch": 0.0954507640439599, "grad_norm": 10.22720519072457, "learning_rate": 1.923973375201787e-05, "loss": 0.4175, "step": 2180 }, { "epoch": 0.09588861158544595, "grad_norm": 9.595268402718855, "learning_rate": 1.9235311028062183e-05, "loss": 0.4074, "step": 2190 }, { "epoch": 0.096326459126932, "grad_norm": 11.423439263182056, "learning_rate": 1.92308883041065e-05, "loss": 0.3714, "step": 2200 }, { "epoch": 0.09676430666841805, "grad_norm": 12.195829235128782, "learning_rate": 1.9226465580150816e-05, "loss": 0.3617, "step": 2210 }, { "epoch": 0.0972021542099041, "grad_norm": 13.401702180614619, "learning_rate": 1.922204285619513e-05, "loss": 0.3476, "step": 2220 }, { "epoch": 0.09764000175139016, "grad_norm": 14.400413274132806, "learning_rate": 1.9217620132239446e-05, "loss": 0.3654, "step": 2230 }, { "epoch": 0.09807784929287622, "grad_norm": 13.728323756706255, "learning_rate": 1.9213197408283765e-05, "loss": 0.3755, "step": 2240 }, { "epoch": 0.09851569683436227, "grad_norm": 9.537467954489292, "learning_rate": 1.920877468432808e-05, "loss": 0.4267, "step": 2250 }, { "epoch": 0.09895354437584833, "grad_norm": 12.562863169113138, "learning_rate": 1.9204351960372394e-05, "loss": 0.3631, "step": 2260 }, { "epoch": 0.09939139191733438, "grad_norm": 8.916130073539824, "learning_rate": 1.919992923641671e-05, "loss": 0.3976, "step": 2270 }, { "epoch": 0.09982923945882044, "grad_norm": 10.835660087923376, "learning_rate": 1.9195506512461028e-05, "loss": 0.344, "step": 2280 }, { "epoch": 0.1002670870003065, "grad_norm": 14.089519398644, "learning_rate": 1.9191083788505342e-05, "loss": 0.3957, "step": 2290 }, { "epoch": 0.10070493454179255, "grad_norm": 13.631563897871652, "learning_rate": 1.9186661064549657e-05, "loss": 0.4412, "step": 2300 }, { "epoch": 0.1011427820832786, "grad_norm": 11.687133130522081, "learning_rate": 1.9182238340593972e-05, "loss": 0.4246, "step": 2310 }, { "epoch": 0.10158062962476466, "grad_norm": 11.012593429809936, "learning_rate": 1.917781561663829e-05, "loss": 0.3929, "step": 2320 }, { "epoch": 0.10201847716625072, "grad_norm": 11.437466647406971, "learning_rate": 1.9173392892682605e-05, "loss": 0.4196, "step": 2330 }, { "epoch": 0.10245632470773676, "grad_norm": 13.09895341492618, "learning_rate": 1.916897016872692e-05, "loss": 0.3513, "step": 2340 }, { "epoch": 0.10289417224922282, "grad_norm": 12.293757546388127, "learning_rate": 1.9164547444771235e-05, "loss": 0.3992, "step": 2350 }, { "epoch": 0.10333201979070887, "grad_norm": 10.261487153006069, "learning_rate": 1.916012472081555e-05, "loss": 0.3066, "step": 2360 }, { "epoch": 0.10376986733219493, "grad_norm": 14.709144519485733, "learning_rate": 1.915570199685987e-05, "loss": 0.4397, "step": 2370 }, { "epoch": 0.10420771487368098, "grad_norm": 10.834726410330754, "learning_rate": 1.9151279272904183e-05, "loss": 0.3525, "step": 2380 }, { "epoch": 0.10464556241516704, "grad_norm": 14.674602673430272, "learning_rate": 1.9146856548948498e-05, "loss": 0.3694, "step": 2390 }, { "epoch": 0.10508340995665309, "grad_norm": 16.41260572575191, "learning_rate": 1.9142433824992813e-05, "loss": 0.3565, "step": 2400 }, { "epoch": 0.10552125749813915, "grad_norm": 10.31589706418058, "learning_rate": 1.913801110103713e-05, "loss": 0.4554, "step": 2410 }, { "epoch": 0.1059591050396252, "grad_norm": 8.228303991240578, "learning_rate": 1.9133588377081446e-05, "loss": 0.4028, "step": 2420 }, { "epoch": 0.10639695258111126, "grad_norm": 13.474103036444346, "learning_rate": 1.912916565312576e-05, "loss": 0.4172, "step": 2430 }, { "epoch": 0.10683480012259731, "grad_norm": 11.893594097933823, "learning_rate": 1.9124742929170076e-05, "loss": 0.3997, "step": 2440 }, { "epoch": 0.10727264766408337, "grad_norm": 15.503153818507785, "learning_rate": 1.9120320205214394e-05, "loss": 0.4038, "step": 2450 }, { "epoch": 0.10771049520556943, "grad_norm": 10.232230065068523, "learning_rate": 1.911589748125871e-05, "loss": 0.4035, "step": 2460 }, { "epoch": 0.10814834274705548, "grad_norm": 12.04140450127315, "learning_rate": 1.9111474757303024e-05, "loss": 0.3076, "step": 2470 }, { "epoch": 0.10858619028854152, "grad_norm": 15.468189206217254, "learning_rate": 1.910705203334734e-05, "loss": 0.3361, "step": 2480 }, { "epoch": 0.10902403783002758, "grad_norm": 17.902774600013995, "learning_rate": 1.9102629309391657e-05, "loss": 0.4081, "step": 2490 }, { "epoch": 0.10946188537151363, "grad_norm": 24.91450836462123, "learning_rate": 1.9098206585435972e-05, "loss": 0.4367, "step": 2500 }, { "epoch": 0.10989973291299969, "grad_norm": 11.744904407487343, "learning_rate": 1.9094226133875857e-05, "loss": 0.4292, "step": 2510 }, { "epoch": 0.11033758045448575, "grad_norm": 13.899816366396042, "learning_rate": 1.908980340992017e-05, "loss": 0.4321, "step": 2520 }, { "epoch": 0.1107754279959718, "grad_norm": 12.102738226657959, "learning_rate": 1.9085380685964486e-05, "loss": 0.3108, "step": 2530 }, { "epoch": 0.11121327553745786, "grad_norm": 14.16084103947617, "learning_rate": 1.9080957962008805e-05, "loss": 0.335, "step": 2540 }, { "epoch": 0.11165112307894391, "grad_norm": 11.115420213753518, "learning_rate": 1.9076535238053116e-05, "loss": 0.3912, "step": 2550 }, { "epoch": 0.11208897062042997, "grad_norm": 13.023671826026302, "learning_rate": 1.9072112514097434e-05, "loss": 0.4168, "step": 2560 }, { "epoch": 0.11252681816191602, "grad_norm": 11.952192442175816, "learning_rate": 1.906768979014175e-05, "loss": 0.4028, "step": 2570 }, { "epoch": 0.11296466570340208, "grad_norm": 9.592598016519975, "learning_rate": 1.9063267066186068e-05, "loss": 0.4134, "step": 2580 }, { "epoch": 0.11340251324488813, "grad_norm": 9.083352138488156, "learning_rate": 1.905884434223038e-05, "loss": 0.3328, "step": 2590 }, { "epoch": 0.11384036078637419, "grad_norm": 18.60156824207177, "learning_rate": 1.9054421618274697e-05, "loss": 0.3479, "step": 2600 }, { "epoch": 0.11427820832786025, "grad_norm": 9.845963292845576, "learning_rate": 1.9049998894319012e-05, "loss": 0.3555, "step": 2610 }, { "epoch": 0.11471605586934629, "grad_norm": 10.182726906192356, "learning_rate": 1.904557617036333e-05, "loss": 0.4251, "step": 2620 }, { "epoch": 0.11515390341083234, "grad_norm": 7.335163697208383, "learning_rate": 1.9041153446407642e-05, "loss": 0.3376, "step": 2630 }, { "epoch": 0.1155917509523184, "grad_norm": 11.37657070843338, "learning_rate": 1.903673072245196e-05, "loss": 0.3645, "step": 2640 }, { "epoch": 0.11602959849380445, "grad_norm": 10.877548094642906, "learning_rate": 1.9032307998496275e-05, "loss": 0.3726, "step": 2650 }, { "epoch": 0.11646744603529051, "grad_norm": 15.333882252915895, "learning_rate": 1.902788527454059e-05, "loss": 0.385, "step": 2660 }, { "epoch": 0.11690529357677656, "grad_norm": 12.823331692401915, "learning_rate": 1.9023462550584905e-05, "loss": 0.3943, "step": 2670 }, { "epoch": 0.11734314111826262, "grad_norm": 12.692981757202359, "learning_rate": 1.9019039826629223e-05, "loss": 0.4218, "step": 2680 }, { "epoch": 0.11778098865974868, "grad_norm": 12.550522897473236, "learning_rate": 1.9014617102673538e-05, "loss": 0.4077, "step": 2690 }, { "epoch": 0.11821883620123473, "grad_norm": 11.727829165225376, "learning_rate": 1.9010194378717853e-05, "loss": 0.4516, "step": 2700 }, { "epoch": 0.11865668374272079, "grad_norm": 9.239870866145635, "learning_rate": 1.900577165476217e-05, "loss": 0.3879, "step": 2710 }, { "epoch": 0.11909453128420684, "grad_norm": 10.991158486727915, "learning_rate": 1.9001348930806486e-05, "loss": 0.4201, "step": 2720 }, { "epoch": 0.1195323788256929, "grad_norm": 10.775808182785527, "learning_rate": 1.89969262068508e-05, "loss": 0.3612, "step": 2730 }, { "epoch": 0.11997022636717895, "grad_norm": 8.786090643394553, "learning_rate": 1.8992503482895116e-05, "loss": 0.4029, "step": 2740 }, { "epoch": 0.12040807390866501, "grad_norm": 10.644438789736052, "learning_rate": 1.8988080758939434e-05, "loss": 0.3642, "step": 2750 }, { "epoch": 0.12084592145015106, "grad_norm": 10.429045582422866, "learning_rate": 1.8983658034983746e-05, "loss": 0.3972, "step": 2760 }, { "epoch": 0.1212837689916371, "grad_norm": 19.068393684191452, "learning_rate": 1.8979235311028064e-05, "loss": 0.4214, "step": 2770 }, { "epoch": 0.12172161653312316, "grad_norm": 6.510885660190795, "learning_rate": 1.897481258707238e-05, "loss": 0.3927, "step": 2780 }, { "epoch": 0.12215946407460922, "grad_norm": 13.320869214905798, "learning_rate": 1.8970389863116697e-05, "loss": 0.3801, "step": 2790 }, { "epoch": 0.12259731161609527, "grad_norm": 11.91416431821247, "learning_rate": 1.896596713916101e-05, "loss": 0.3617, "step": 2800 }, { "epoch": 0.12303515915758133, "grad_norm": 16.39803737899181, "learning_rate": 1.8961544415205327e-05, "loss": 0.4252, "step": 2810 }, { "epoch": 0.12347300669906738, "grad_norm": 18.444202304052418, "learning_rate": 1.8957121691249642e-05, "loss": 0.3753, "step": 2820 }, { "epoch": 0.12391085424055344, "grad_norm": 11.295343530031559, "learning_rate": 1.895269896729396e-05, "loss": 0.3921, "step": 2830 }, { "epoch": 0.1243487017820395, "grad_norm": 9.710371030089025, "learning_rate": 1.8948276243338272e-05, "loss": 0.3506, "step": 2840 }, { "epoch": 0.12478654932352555, "grad_norm": 13.077018882435036, "learning_rate": 1.894385351938259e-05, "loss": 0.3025, "step": 2850 }, { "epoch": 0.1252243968650116, "grad_norm": 9.815328591239517, "learning_rate": 1.8939430795426905e-05, "loss": 0.3919, "step": 2860 }, { "epoch": 0.12566224440649765, "grad_norm": 10.956422665808212, "learning_rate": 1.893500807147122e-05, "loss": 0.3592, "step": 2870 }, { "epoch": 0.12610009194798372, "grad_norm": 7.24660201847797, "learning_rate": 1.8930585347515535e-05, "loss": 0.4198, "step": 2880 }, { "epoch": 0.12653793948946976, "grad_norm": 7.084606579805614, "learning_rate": 1.8926162623559853e-05, "loss": 0.4077, "step": 2890 }, { "epoch": 0.12697578703095583, "grad_norm": 16.831228496029244, "learning_rate": 1.8921739899604168e-05, "loss": 0.3456, "step": 2900 }, { "epoch": 0.12741363457244187, "grad_norm": 12.587652277854042, "learning_rate": 1.8917317175648483e-05, "loss": 0.4014, "step": 2910 }, { "epoch": 0.12785148211392794, "grad_norm": 9.243117375162731, "learning_rate": 1.8912894451692798e-05, "loss": 0.3575, "step": 2920 }, { "epoch": 0.12828932965541398, "grad_norm": 11.159631895058327, "learning_rate": 1.8908471727737116e-05, "loss": 0.3241, "step": 2930 }, { "epoch": 0.12872717719690005, "grad_norm": 10.310646954134011, "learning_rate": 1.890404900378143e-05, "loss": 0.3682, "step": 2940 }, { "epoch": 0.1291650247383861, "grad_norm": 10.173470601980204, "learning_rate": 1.8899626279825746e-05, "loss": 0.4251, "step": 2950 }, { "epoch": 0.12960287227987216, "grad_norm": 12.609142050783468, "learning_rate": 1.889520355587006e-05, "loss": 0.394, "step": 2960 }, { "epoch": 0.1300407198213582, "grad_norm": 9.635654645850565, "learning_rate": 1.8890780831914376e-05, "loss": 0.4089, "step": 2970 }, { "epoch": 0.13047856736284424, "grad_norm": 11.436287904310273, "learning_rate": 1.8886358107958694e-05, "loss": 0.3826, "step": 2980 }, { "epoch": 0.1309164149043303, "grad_norm": 16.392761382159563, "learning_rate": 1.888193538400301e-05, "loss": 0.3276, "step": 2990 }, { "epoch": 0.13135426244581636, "grad_norm": 17.239395198967156, "learning_rate": 1.8877512660047327e-05, "loss": 0.4094, "step": 3000 }, { "epoch": 0.13179210998730242, "grad_norm": 14.545360777292585, "learning_rate": 1.887308993609164e-05, "loss": 0.3779, "step": 3010 }, { "epoch": 0.13222995752878847, "grad_norm": 14.366498738758244, "learning_rate": 1.8868667212135957e-05, "loss": 0.4288, "step": 3020 }, { "epoch": 0.13266780507027454, "grad_norm": 17.039699201481028, "learning_rate": 1.886424448818027e-05, "loss": 0.36, "step": 3030 }, { "epoch": 0.13310565261176058, "grad_norm": 9.226512886191754, "learning_rate": 1.885982176422459e-05, "loss": 0.3765, "step": 3040 }, { "epoch": 0.13354350015324665, "grad_norm": 9.028526449867499, "learning_rate": 1.88553990402689e-05, "loss": 0.4395, "step": 3050 }, { "epoch": 0.1339813476947327, "grad_norm": 13.6426618700454, "learning_rate": 1.885097631631322e-05, "loss": 0.3723, "step": 3060 }, { "epoch": 0.13441919523621876, "grad_norm": 13.305857832591359, "learning_rate": 1.8846553592357535e-05, "loss": 0.399, "step": 3070 }, { "epoch": 0.1348570427777048, "grad_norm": 7.874733814403194, "learning_rate": 1.884213086840185e-05, "loss": 0.4011, "step": 3080 }, { "epoch": 0.13529489031919087, "grad_norm": 13.430051646641026, "learning_rate": 1.8837708144446164e-05, "loss": 0.3906, "step": 3090 }, { "epoch": 0.1357327378606769, "grad_norm": 8.662733163489246, "learning_rate": 1.8833285420490483e-05, "loss": 0.4138, "step": 3100 }, { "epoch": 0.13617058540216298, "grad_norm": 9.801119868851403, "learning_rate": 1.8828862696534798e-05, "loss": 0.3641, "step": 3110 }, { "epoch": 0.13660843294364902, "grad_norm": 11.778191693148269, "learning_rate": 1.8824439972579112e-05, "loss": 0.3762, "step": 3120 }, { "epoch": 0.13704628048513506, "grad_norm": 8.279726053764739, "learning_rate": 1.8820017248623427e-05, "loss": 0.3693, "step": 3130 }, { "epoch": 0.13748412802662113, "grad_norm": 10.69124033655974, "learning_rate": 1.8815594524667746e-05, "loss": 0.3369, "step": 3140 }, { "epoch": 0.13792197556810717, "grad_norm": 6.139622574926641, "learning_rate": 1.881117180071206e-05, "loss": 0.3526, "step": 3150 }, { "epoch": 0.13835982310959324, "grad_norm": 8.884320160504993, "learning_rate": 1.8806749076756375e-05, "loss": 0.321, "step": 3160 }, { "epoch": 0.13879767065107929, "grad_norm": 11.836052882069614, "learning_rate": 1.880232635280069e-05, "loss": 0.3988, "step": 3170 }, { "epoch": 0.13923551819256536, "grad_norm": 9.920797262455972, "learning_rate": 1.8797903628845005e-05, "loss": 0.5179, "step": 3180 }, { "epoch": 0.1396733657340514, "grad_norm": 14.813362193063755, "learning_rate": 1.8793480904889324e-05, "loss": 0.377, "step": 3190 }, { "epoch": 0.14011121327553747, "grad_norm": 6.695253814831782, "learning_rate": 1.878905818093364e-05, "loss": 0.3863, "step": 3200 }, { "epoch": 0.1405490608170235, "grad_norm": 11.085686646540799, "learning_rate": 1.8784635456977953e-05, "loss": 0.3708, "step": 3210 }, { "epoch": 0.14098690835850958, "grad_norm": 13.830245136728134, "learning_rate": 1.8780212733022268e-05, "loss": 0.3151, "step": 3220 }, { "epoch": 0.14142475589999562, "grad_norm": 12.120622926321499, "learning_rate": 1.8775790009066586e-05, "loss": 0.3459, "step": 3230 }, { "epoch": 0.1418626034414817, "grad_norm": 16.30580487007426, "learning_rate": 1.87713672851109e-05, "loss": 0.3745, "step": 3240 }, { "epoch": 0.14230045098296773, "grad_norm": 9.992654009584912, "learning_rate": 1.876694456115522e-05, "loss": 0.3993, "step": 3250 }, { "epoch": 0.14273829852445377, "grad_norm": 8.666190158753487, "learning_rate": 1.876252183719953e-05, "loss": 0.3774, "step": 3260 }, { "epoch": 0.14317614606593984, "grad_norm": 10.386340090996436, "learning_rate": 1.875809911324385e-05, "loss": 0.3806, "step": 3270 }, { "epoch": 0.14361399360742588, "grad_norm": 22.19356554965066, "learning_rate": 1.8753676389288164e-05, "loss": 0.3824, "step": 3280 }, { "epoch": 0.14405184114891195, "grad_norm": 9.21049973448166, "learning_rate": 1.874925366533248e-05, "loss": 0.3353, "step": 3290 }, { "epoch": 0.144489688690398, "grad_norm": 12.072168142150097, "learning_rate": 1.8744830941376794e-05, "loss": 0.4011, "step": 3300 }, { "epoch": 0.14492753623188406, "grad_norm": 12.59324716931968, "learning_rate": 1.8740408217421112e-05, "loss": 0.4021, "step": 3310 }, { "epoch": 0.1453653837733701, "grad_norm": 10.05214029931775, "learning_rate": 1.8735985493465427e-05, "loss": 0.3553, "step": 3320 }, { "epoch": 0.14580323131485617, "grad_norm": 11.803155664591488, "learning_rate": 1.8731562769509742e-05, "loss": 0.356, "step": 3330 }, { "epoch": 0.14624107885634222, "grad_norm": 14.92368689915674, "learning_rate": 1.8727140045554057e-05, "loss": 0.3801, "step": 3340 }, { "epoch": 0.14667892639782829, "grad_norm": 17.079776945779418, "learning_rate": 1.8722717321598375e-05, "loss": 0.3477, "step": 3350 }, { "epoch": 0.14711677393931433, "grad_norm": 13.097782069556722, "learning_rate": 1.871829459764269e-05, "loss": 0.3671, "step": 3360 }, { "epoch": 0.1475546214808004, "grad_norm": 8.15632623512624, "learning_rate": 1.8713871873687005e-05, "loss": 0.3389, "step": 3370 }, { "epoch": 0.14799246902228644, "grad_norm": 13.960404123834712, "learning_rate": 1.870944914973132e-05, "loss": 0.3485, "step": 3380 }, { "epoch": 0.1484303165637725, "grad_norm": 11.071464397882252, "learning_rate": 1.8705026425775635e-05, "loss": 0.3416, "step": 3390 }, { "epoch": 0.14886816410525855, "grad_norm": 13.277501194270975, "learning_rate": 1.8700603701819953e-05, "loss": 0.3789, "step": 3400 }, { "epoch": 0.1493060116467446, "grad_norm": 10.804325254909127, "learning_rate": 1.8696180977864268e-05, "loss": 0.4302, "step": 3410 }, { "epoch": 0.14974385918823066, "grad_norm": 13.368363204432509, "learning_rate": 1.8691758253908583e-05, "loss": 0.3827, "step": 3420 }, { "epoch": 0.1501817067297167, "grad_norm": 10.0824595511505, "learning_rate": 1.8687335529952898e-05, "loss": 0.3658, "step": 3430 }, { "epoch": 0.15061955427120277, "grad_norm": 12.416220706039892, "learning_rate": 1.8682912805997216e-05, "loss": 0.3614, "step": 3440 }, { "epoch": 0.1510574018126888, "grad_norm": 16.7125572642998, "learning_rate": 1.867849008204153e-05, "loss": 0.4461, "step": 3450 }, { "epoch": 0.15149524935417488, "grad_norm": 12.811501366379437, "learning_rate": 1.8674067358085846e-05, "loss": 0.4033, "step": 3460 }, { "epoch": 0.15193309689566092, "grad_norm": 17.173250485174975, "learning_rate": 1.866964463413016e-05, "loss": 0.3277, "step": 3470 }, { "epoch": 0.152370944437147, "grad_norm": 8.979558831403725, "learning_rate": 1.866522191017448e-05, "loss": 0.3373, "step": 3480 }, { "epoch": 0.15280879197863304, "grad_norm": 7.797438606883994, "learning_rate": 1.8660799186218794e-05, "loss": 0.3954, "step": 3490 }, { "epoch": 0.1532466395201191, "grad_norm": 16.155288520109096, "learning_rate": 1.865637646226311e-05, "loss": 0.385, "step": 3500 }, { "epoch": 0.15368448706160515, "grad_norm": 16.957081191616158, "learning_rate": 1.8651953738307424e-05, "loss": 0.3742, "step": 3510 }, { "epoch": 0.15412233460309122, "grad_norm": 9.531468523365854, "learning_rate": 1.8647531014351742e-05, "loss": 0.3326, "step": 3520 }, { "epoch": 0.15456018214457726, "grad_norm": 25.31643246845194, "learning_rate": 1.8643108290396057e-05, "loss": 0.39, "step": 3530 }, { "epoch": 0.1549980296860633, "grad_norm": 10.078617054363939, "learning_rate": 1.8638685566440372e-05, "loss": 0.4025, "step": 3540 }, { "epoch": 0.15543587722754937, "grad_norm": 11.02217779389865, "learning_rate": 1.8634262842484687e-05, "loss": 0.4015, "step": 3550 }, { "epoch": 0.1558737247690354, "grad_norm": 14.098789980216388, "learning_rate": 1.8629840118529005e-05, "loss": 0.3344, "step": 3560 }, { "epoch": 0.15631157231052148, "grad_norm": 15.721719692602761, "learning_rate": 1.862541739457332e-05, "loss": 0.3715, "step": 3570 }, { "epoch": 0.15674941985200752, "grad_norm": 12.716040770267046, "learning_rate": 1.8620994670617635e-05, "loss": 0.3651, "step": 3580 }, { "epoch": 0.1571872673934936, "grad_norm": 12.965209183423223, "learning_rate": 1.861657194666195e-05, "loss": 0.4063, "step": 3590 }, { "epoch": 0.15762511493497963, "grad_norm": 8.666927431536255, "learning_rate": 1.8612149222706265e-05, "loss": 0.3274, "step": 3600 }, { "epoch": 0.1580629624764657, "grad_norm": 14.516286713758953, "learning_rate": 1.8607726498750583e-05, "loss": 0.4267, "step": 3610 }, { "epoch": 0.15850081001795174, "grad_norm": 9.395231262901326, "learning_rate": 1.8603303774794898e-05, "loss": 0.3378, "step": 3620 }, { "epoch": 0.1589386575594378, "grad_norm": 15.580920245081602, "learning_rate": 1.8598881050839213e-05, "loss": 0.3727, "step": 3630 }, { "epoch": 0.15937650510092385, "grad_norm": 9.421511818637732, "learning_rate": 1.8594458326883528e-05, "loss": 0.4091, "step": 3640 }, { "epoch": 0.15981435264240992, "grad_norm": 10.903593860274887, "learning_rate": 1.8590035602927846e-05, "loss": 0.3852, "step": 3650 }, { "epoch": 0.16025220018389597, "grad_norm": 15.079459239057043, "learning_rate": 1.858561287897216e-05, "loss": 0.3161, "step": 3660 }, { "epoch": 0.16069004772538203, "grad_norm": 11.906906790652602, "learning_rate": 1.8581632427412045e-05, "loss": 0.4032, "step": 3670 }, { "epoch": 0.16112789526686808, "grad_norm": 7.299516154937353, "learning_rate": 1.857720970345636e-05, "loss": 0.4608, "step": 3680 }, { "epoch": 0.16156574280835412, "grad_norm": 10.405574742308731, "learning_rate": 1.8572786979500675e-05, "loss": 0.3277, "step": 3690 }, { "epoch": 0.1620035903498402, "grad_norm": 12.45192353736054, "learning_rate": 1.8568364255544993e-05, "loss": 0.4101, "step": 3700 }, { "epoch": 0.16244143789132623, "grad_norm": 17.784864115352764, "learning_rate": 1.8563941531589308e-05, "loss": 0.4311, "step": 3710 }, { "epoch": 0.1628792854328123, "grad_norm": 12.704077503112067, "learning_rate": 1.8559518807633623e-05, "loss": 0.3409, "step": 3720 }, { "epoch": 0.16331713297429834, "grad_norm": 26.63374521687372, "learning_rate": 1.8555096083677938e-05, "loss": 0.3658, "step": 3730 }, { "epoch": 0.1637549805157844, "grad_norm": 10.121501930571688, "learning_rate": 1.8550673359722256e-05, "loss": 0.3402, "step": 3740 }, { "epoch": 0.16419282805727045, "grad_norm": 14.094860227716598, "learning_rate": 1.854625063576657e-05, "loss": 0.4277, "step": 3750 }, { "epoch": 0.16463067559875652, "grad_norm": 9.994141579233393, "learning_rate": 1.8541827911810886e-05, "loss": 0.4692, "step": 3760 }, { "epoch": 0.16506852314024256, "grad_norm": 8.67508870280096, "learning_rate": 1.85374051878552e-05, "loss": 0.4016, "step": 3770 }, { "epoch": 0.16550637068172863, "grad_norm": 14.502486443265836, "learning_rate": 1.853298246389952e-05, "loss": 0.4075, "step": 3780 }, { "epoch": 0.16594421822321467, "grad_norm": 11.515960895499763, "learning_rate": 1.852855973994383e-05, "loss": 0.3435, "step": 3790 }, { "epoch": 0.16638206576470074, "grad_norm": 9.745079168132195, "learning_rate": 1.852413701598815e-05, "loss": 0.3109, "step": 3800 }, { "epoch": 0.16681991330618678, "grad_norm": 8.495004869159922, "learning_rate": 1.8519714292032464e-05, "loss": 0.3922, "step": 3810 }, { "epoch": 0.16725776084767285, "grad_norm": 8.988712438049474, "learning_rate": 1.8515291568076782e-05, "loss": 0.4012, "step": 3820 }, { "epoch": 0.1676956083891589, "grad_norm": 12.853240059584076, "learning_rate": 1.8510868844121094e-05, "loss": 0.4125, "step": 3830 }, { "epoch": 0.16813345593064494, "grad_norm": 9.825049278388068, "learning_rate": 1.8506446120165412e-05, "loss": 0.3032, "step": 3840 }, { "epoch": 0.168571303472131, "grad_norm": 7.994348437089401, "learning_rate": 1.8502023396209727e-05, "loss": 0.3636, "step": 3850 }, { "epoch": 0.16900915101361705, "grad_norm": 12.765237118303892, "learning_rate": 1.8497600672254045e-05, "loss": 0.3478, "step": 3860 }, { "epoch": 0.16944699855510312, "grad_norm": 12.546641498449894, "learning_rate": 1.8493177948298357e-05, "loss": 0.3, "step": 3870 }, { "epoch": 0.16988484609658916, "grad_norm": 23.911174431535404, "learning_rate": 1.8488755224342675e-05, "loss": 0.451, "step": 3880 }, { "epoch": 0.17032269363807523, "grad_norm": 10.520515381695157, "learning_rate": 1.848433250038699e-05, "loss": 0.358, "step": 3890 }, { "epoch": 0.17076054117956127, "grad_norm": 11.224917445909368, "learning_rate": 1.8479909776431305e-05, "loss": 0.3162, "step": 3900 }, { "epoch": 0.17119838872104734, "grad_norm": 18.902395009875733, "learning_rate": 1.847548705247562e-05, "loss": 0.4223, "step": 3910 }, { "epoch": 0.17163623626253338, "grad_norm": 11.55198595552304, "learning_rate": 1.8471064328519938e-05, "loss": 0.3808, "step": 3920 }, { "epoch": 0.17207408380401945, "grad_norm": 11.00362301896321, "learning_rate": 1.8466641604564253e-05, "loss": 0.3252, "step": 3930 }, { "epoch": 0.1725119313455055, "grad_norm": 10.336995942120799, "learning_rate": 1.8462218880608568e-05, "loss": 0.364, "step": 3940 }, { "epoch": 0.17294977888699156, "grad_norm": 10.710989141583203, "learning_rate": 1.8457796156652883e-05, "loss": 0.4231, "step": 3950 }, { "epoch": 0.1733876264284776, "grad_norm": 12.229225448822383, "learning_rate": 1.84533734326972e-05, "loss": 0.4117, "step": 3960 }, { "epoch": 0.17382547396996365, "grad_norm": 14.915171514555029, "learning_rate": 1.8448950708741516e-05, "loss": 0.4032, "step": 3970 }, { "epoch": 0.17426332151144971, "grad_norm": 12.229642807866787, "learning_rate": 1.844452798478583e-05, "loss": 0.4361, "step": 3980 }, { "epoch": 0.17470116905293576, "grad_norm": 7.010608264256624, "learning_rate": 1.844010526083015e-05, "loss": 0.3638, "step": 3990 }, { "epoch": 0.17513901659442183, "grad_norm": 15.912932485500148, "learning_rate": 1.843568253687446e-05, "loss": 0.455, "step": 4000 }, { "epoch": 0.17557686413590787, "grad_norm": 6.924969603544103, "learning_rate": 1.843125981291878e-05, "loss": 0.2625, "step": 4010 }, { "epoch": 0.17601471167739394, "grad_norm": 15.139219047531501, "learning_rate": 1.8426837088963094e-05, "loss": 0.4141, "step": 4020 }, { "epoch": 0.17645255921887998, "grad_norm": 11.17823518693945, "learning_rate": 1.8422414365007412e-05, "loss": 0.3224, "step": 4030 }, { "epoch": 0.17689040676036605, "grad_norm": 15.7161064579388, "learning_rate": 1.8417991641051723e-05, "loss": 0.3664, "step": 4040 }, { "epoch": 0.1773282543018521, "grad_norm": 13.792070427062033, "learning_rate": 1.841356891709604e-05, "loss": 0.3284, "step": 4050 }, { "epoch": 0.17776610184333816, "grad_norm": 9.000617959741795, "learning_rate": 1.8409146193140357e-05, "loss": 0.344, "step": 4060 }, { "epoch": 0.1782039493848242, "grad_norm": 12.402957661971133, "learning_rate": 1.8404723469184675e-05, "loss": 0.3871, "step": 4070 }, { "epoch": 0.17864179692631027, "grad_norm": 12.032781330103441, "learning_rate": 1.8400300745228986e-05, "loss": 0.3787, "step": 4080 }, { "epoch": 0.1790796444677963, "grad_norm": 11.222860194370364, "learning_rate": 1.8395878021273305e-05, "loss": 0.3077, "step": 4090 }, { "epoch": 0.17951749200928238, "grad_norm": 13.7968119601476, "learning_rate": 1.839145529731762e-05, "loss": 0.4786, "step": 4100 }, { "epoch": 0.17995533955076842, "grad_norm": 11.921299671774793, "learning_rate": 1.8387032573361934e-05, "loss": 0.4037, "step": 4110 }, { "epoch": 0.18039318709225446, "grad_norm": 8.879545954875514, "learning_rate": 1.838260984940625e-05, "loss": 0.4584, "step": 4120 }, { "epoch": 0.18083103463374053, "grad_norm": 14.982597428617236, "learning_rate": 1.8378187125450568e-05, "loss": 0.3815, "step": 4130 }, { "epoch": 0.18126888217522658, "grad_norm": 9.147231103073068, "learning_rate": 1.8373764401494882e-05, "loss": 0.4014, "step": 4140 }, { "epoch": 0.18170672971671264, "grad_norm": 12.294983152398926, "learning_rate": 1.8369341677539197e-05, "loss": 0.4019, "step": 4150 }, { "epoch": 0.1821445772581987, "grad_norm": 9.927235418370907, "learning_rate": 1.8364918953583512e-05, "loss": 0.3566, "step": 4160 }, { "epoch": 0.18258242479968476, "grad_norm": 11.734461472925389, "learning_rate": 1.8360496229627827e-05, "loss": 0.4055, "step": 4170 }, { "epoch": 0.1830202723411708, "grad_norm": 17.077445949446012, "learning_rate": 1.8356073505672145e-05, "loss": 0.3583, "step": 4180 }, { "epoch": 0.18345811988265687, "grad_norm": 12.654896604913636, "learning_rate": 1.835165078171646e-05, "loss": 0.3492, "step": 4190 }, { "epoch": 0.1838959674241429, "grad_norm": 16.58090166573664, "learning_rate": 1.8347228057760775e-05, "loss": 0.4069, "step": 4200 }, { "epoch": 0.18433381496562898, "grad_norm": 12.423854991570149, "learning_rate": 1.834280533380509e-05, "loss": 0.3649, "step": 4210 }, { "epoch": 0.18477166250711502, "grad_norm": 8.687455211496507, "learning_rate": 1.833838260984941e-05, "loss": 0.3713, "step": 4220 }, { "epoch": 0.1852095100486011, "grad_norm": 10.328403486459152, "learning_rate": 1.8333959885893723e-05, "loss": 0.3679, "step": 4230 }, { "epoch": 0.18564735759008713, "grad_norm": 12.579913013442551, "learning_rate": 1.832953716193804e-05, "loss": 0.3538, "step": 4240 }, { "epoch": 0.18608520513157317, "grad_norm": 10.7018660870028, "learning_rate": 1.8325114437982353e-05, "loss": 0.3401, "step": 4250 }, { "epoch": 0.18652305267305924, "grad_norm": 7.833366421027678, "learning_rate": 1.832069171402667e-05, "loss": 0.4208, "step": 4260 }, { "epoch": 0.18696090021454528, "grad_norm": 13.791401625254139, "learning_rate": 1.8316268990070986e-05, "loss": 0.3355, "step": 4270 }, { "epoch": 0.18739874775603135, "grad_norm": 15.146634353101764, "learning_rate": 1.8311846266115305e-05, "loss": 0.4045, "step": 4280 }, { "epoch": 0.1878365952975174, "grad_norm": 11.438151690151432, "learning_rate": 1.8307423542159616e-05, "loss": 0.392, "step": 4290 }, { "epoch": 0.18827444283900346, "grad_norm": 12.907194015301421, "learning_rate": 1.8303000818203934e-05, "loss": 0.4075, "step": 4300 }, { "epoch": 0.1887122903804895, "grad_norm": 13.004770430694967, "learning_rate": 1.829857809424825e-05, "loss": 0.3703, "step": 4310 }, { "epoch": 0.18915013792197558, "grad_norm": 13.767940400122603, "learning_rate": 1.8294155370292564e-05, "loss": 0.3479, "step": 4320 }, { "epoch": 0.18958798546346162, "grad_norm": 13.598058539546074, "learning_rate": 1.828973264633688e-05, "loss": 0.3426, "step": 4330 }, { "epoch": 0.1900258330049477, "grad_norm": 15.83770263349553, "learning_rate": 1.8285309922381197e-05, "loss": 0.3301, "step": 4340 }, { "epoch": 0.19046368054643373, "grad_norm": 9.739199794350526, "learning_rate": 1.8280887198425512e-05, "loss": 0.3444, "step": 4350 }, { "epoch": 0.1909015280879198, "grad_norm": 11.924698121670366, "learning_rate": 1.8276464474469827e-05, "loss": 0.3052, "step": 4360 }, { "epoch": 0.19133937562940584, "grad_norm": 12.922426444929348, "learning_rate": 1.8272041750514142e-05, "loss": 0.3838, "step": 4370 }, { "epoch": 0.1917772231708919, "grad_norm": 10.139430182884723, "learning_rate": 1.8267619026558457e-05, "loss": 0.339, "step": 4380 }, { "epoch": 0.19221507071237795, "grad_norm": 16.77456639919075, "learning_rate": 1.8263196302602775e-05, "loss": 0.3662, "step": 4390 }, { "epoch": 0.192652918253864, "grad_norm": 16.923778423361814, "learning_rate": 1.825877357864709e-05, "loss": 0.4345, "step": 4400 }, { "epoch": 0.19309076579535006, "grad_norm": 9.79791653342669, "learning_rate": 1.8254350854691405e-05, "loss": 0.374, "step": 4410 }, { "epoch": 0.1935286133368361, "grad_norm": 14.223990277487626, "learning_rate": 1.824992813073572e-05, "loss": 0.3418, "step": 4420 }, { "epoch": 0.19396646087832217, "grad_norm": 10.280657812067675, "learning_rate": 1.8245505406780038e-05, "loss": 0.4258, "step": 4430 }, { "epoch": 0.1944043084198082, "grad_norm": 11.454773579977681, "learning_rate": 1.8241082682824353e-05, "loss": 0.3558, "step": 4440 }, { "epoch": 0.19484215596129428, "grad_norm": 10.05797176474998, "learning_rate": 1.8236659958868668e-05, "loss": 0.4195, "step": 4450 }, { "epoch": 0.19528000350278032, "grad_norm": 17.413968742737737, "learning_rate": 1.8232237234912983e-05, "loss": 0.4842, "step": 4460 }, { "epoch": 0.1957178510442664, "grad_norm": 7.594947751878203, "learning_rate": 1.82278145109573e-05, "loss": 0.376, "step": 4470 }, { "epoch": 0.19615569858575244, "grad_norm": 7.752555917921395, "learning_rate": 1.8223391787001616e-05, "loss": 0.3874, "step": 4480 }, { "epoch": 0.1965935461272385, "grad_norm": 11.095188960924157, "learning_rate": 1.821896906304593e-05, "loss": 0.3199, "step": 4490 }, { "epoch": 0.19703139366872455, "grad_norm": 8.85243989748995, "learning_rate": 1.8214546339090246e-05, "loss": 0.3514, "step": 4500 }, { "epoch": 0.19746924121021062, "grad_norm": 10.110803517936658, "learning_rate": 1.8210123615134564e-05, "loss": 0.3794, "step": 4510 }, { "epoch": 0.19790708875169666, "grad_norm": 10.373451209228024, "learning_rate": 1.820570089117888e-05, "loss": 0.408, "step": 4520 }, { "epoch": 0.19834493629318273, "grad_norm": 13.162706254319437, "learning_rate": 1.8201278167223194e-05, "loss": 0.3425, "step": 4530 }, { "epoch": 0.19878278383466877, "grad_norm": 15.282957890404685, "learning_rate": 1.819685544326751e-05, "loss": 0.3852, "step": 4540 }, { "epoch": 0.1992206313761548, "grad_norm": 13.370558527120265, "learning_rate": 1.8192432719311827e-05, "loss": 0.3652, "step": 4550 }, { "epoch": 0.19965847891764088, "grad_norm": 10.259778080672527, "learning_rate": 1.8188009995356142e-05, "loss": 0.356, "step": 4560 }, { "epoch": 0.20009632645912692, "grad_norm": 10.736808667558975, "learning_rate": 1.8183587271400457e-05, "loss": 0.3329, "step": 4570 }, { "epoch": 0.200534174000613, "grad_norm": 8.501933158502284, "learning_rate": 1.817916454744477e-05, "loss": 0.3397, "step": 4580 }, { "epoch": 0.20097202154209903, "grad_norm": 28.24241657929224, "learning_rate": 1.8174741823489087e-05, "loss": 0.393, "step": 4590 }, { "epoch": 0.2014098690835851, "grad_norm": 13.727765781245198, "learning_rate": 1.8170319099533405e-05, "loss": 0.306, "step": 4600 }, { "epoch": 0.20184771662507114, "grad_norm": 9.98011595817274, "learning_rate": 1.816589637557772e-05, "loss": 0.3357, "step": 4610 }, { "epoch": 0.2022855641665572, "grad_norm": 10.817739639396102, "learning_rate": 1.8161473651622035e-05, "loss": 0.3352, "step": 4620 }, { "epoch": 0.20272341170804326, "grad_norm": 17.39150590199315, "learning_rate": 1.815705092766635e-05, "loss": 0.453, "step": 4630 }, { "epoch": 0.20316125924952932, "grad_norm": 15.480462835918628, "learning_rate": 1.8152628203710668e-05, "loss": 0.3469, "step": 4640 }, { "epoch": 0.20359910679101537, "grad_norm": 14.206320760863697, "learning_rate": 1.8148205479754983e-05, "loss": 0.3593, "step": 4650 }, { "epoch": 0.20403695433250144, "grad_norm": 10.767887226596823, "learning_rate": 1.8143782755799298e-05, "loss": 0.3401, "step": 4660 }, { "epoch": 0.20447480187398748, "grad_norm": 12.432796643275674, "learning_rate": 1.8139360031843613e-05, "loss": 0.3875, "step": 4670 }, { "epoch": 0.20491264941547352, "grad_norm": 9.006109071644179, "learning_rate": 1.813493730788793e-05, "loss": 0.3813, "step": 4680 }, { "epoch": 0.2053504969569596, "grad_norm": 15.616401442667728, "learning_rate": 1.8130514583932246e-05, "loss": 0.3465, "step": 4690 }, { "epoch": 0.20578834449844563, "grad_norm": 10.59820948899193, "learning_rate": 1.812609185997656e-05, "loss": 0.382, "step": 4700 }, { "epoch": 0.2062261920399317, "grad_norm": 14.208386612211477, "learning_rate": 1.8121669136020875e-05, "loss": 0.3476, "step": 4710 }, { "epoch": 0.20666403958141774, "grad_norm": 12.589160774020666, "learning_rate": 1.8117246412065194e-05, "loss": 0.3421, "step": 4720 }, { "epoch": 0.2071018871229038, "grad_norm": 7.568667332444265, "learning_rate": 1.811282368810951e-05, "loss": 0.3727, "step": 4730 }, { "epoch": 0.20753973466438985, "grad_norm": 10.812577263881279, "learning_rate": 1.8108400964153824e-05, "loss": 0.3881, "step": 4740 }, { "epoch": 0.20797758220587592, "grad_norm": 13.018455316703584, "learning_rate": 1.810397824019814e-05, "loss": 0.3337, "step": 4750 }, { "epoch": 0.20841542974736196, "grad_norm": 13.969299442144436, "learning_rate": 1.8099555516242457e-05, "loss": 0.4362, "step": 4760 }, { "epoch": 0.20885327728884803, "grad_norm": 14.510644097153572, "learning_rate": 1.809513279228677e-05, "loss": 0.3534, "step": 4770 }, { "epoch": 0.20929112483033407, "grad_norm": 6.43215364830629, "learning_rate": 1.8090710068331087e-05, "loss": 0.3813, "step": 4780 }, { "epoch": 0.20972897237182014, "grad_norm": 9.94692028441624, "learning_rate": 1.80862873443754e-05, "loss": 0.3787, "step": 4790 }, { "epoch": 0.21016681991330619, "grad_norm": 14.658374754366635, "learning_rate": 1.8081864620419716e-05, "loss": 0.332, "step": 4800 }, { "epoch": 0.21060466745479225, "grad_norm": 12.603661359425915, "learning_rate": 1.8077441896464035e-05, "loss": 0.5098, "step": 4810 }, { "epoch": 0.2110425149962783, "grad_norm": 8.069510317555627, "learning_rate": 1.807301917250835e-05, "loss": 0.3345, "step": 4820 }, { "epoch": 0.21148036253776434, "grad_norm": 15.50460058047792, "learning_rate": 1.8068596448552664e-05, "loss": 0.4389, "step": 4830 }, { "epoch": 0.2119182100792504, "grad_norm": 7.479416895287082, "learning_rate": 1.806417372459698e-05, "loss": 0.4058, "step": 4840 }, { "epoch": 0.21235605762073645, "grad_norm": 10.414700579430006, "learning_rate": 1.8059751000641298e-05, "loss": 0.3713, "step": 4850 }, { "epoch": 0.21279390516222252, "grad_norm": 11.290754293107446, "learning_rate": 1.8055328276685612e-05, "loss": 0.3759, "step": 4860 }, { "epoch": 0.21323175270370856, "grad_norm": 11.549866408859454, "learning_rate": 1.8050905552729927e-05, "loss": 0.3288, "step": 4870 }, { "epoch": 0.21366960024519463, "grad_norm": 18.046144465733533, "learning_rate": 1.8046482828774242e-05, "loss": 0.2962, "step": 4880 }, { "epoch": 0.21410744778668067, "grad_norm": 10.63709064776316, "learning_rate": 1.804206010481856e-05, "loss": 0.3126, "step": 4890 }, { "epoch": 0.21454529532816674, "grad_norm": 13.401948678962873, "learning_rate": 1.8037637380862875e-05, "loss": 0.335, "step": 4900 }, { "epoch": 0.21498314286965278, "grad_norm": 28.923645525898227, "learning_rate": 1.803321465690719e-05, "loss": 0.3323, "step": 4910 }, { "epoch": 0.21542099041113885, "grad_norm": 12.023966063146357, "learning_rate": 1.8028791932951505e-05, "loss": 0.3436, "step": 4920 }, { "epoch": 0.2158588379526249, "grad_norm": 16.66651723158919, "learning_rate": 1.8024369208995823e-05, "loss": 0.3434, "step": 4930 }, { "epoch": 0.21629668549411096, "grad_norm": 11.447028188385977, "learning_rate": 1.801994648504014e-05, "loss": 0.3734, "step": 4940 }, { "epoch": 0.216734533035597, "grad_norm": 9.454223770724575, "learning_rate": 1.8015523761084453e-05, "loss": 0.3928, "step": 4950 }, { "epoch": 0.21717238057708305, "grad_norm": 8.902373316783942, "learning_rate": 1.8011101037128768e-05, "loss": 0.3856, "step": 4960 }, { "epoch": 0.21761022811856912, "grad_norm": 18.191989619833368, "learning_rate": 1.8006678313173086e-05, "loss": 0.4122, "step": 4970 }, { "epoch": 0.21804807566005516, "grad_norm": 10.46306034169101, "learning_rate": 1.80022555892174e-05, "loss": 0.3303, "step": 4980 }, { "epoch": 0.21848592320154123, "grad_norm": 14.018809081253679, "learning_rate": 1.7997832865261716e-05, "loss": 0.3167, "step": 4990 }, { "epoch": 0.21892377074302727, "grad_norm": 9.557186071382484, "learning_rate": 1.799341014130603e-05, "loss": 0.4131, "step": 5000 } ], "logging_steps": 10, "max_steps": 45678, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }