{ "best_metric": 11.018465042114258, "best_model_checkpoint": "miner_id_24/checkpoint-370", "epoch": 0.016722785925741793, "eval_steps": 5, "global_step": 370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.519671871822106e-05, "grad_norm": 0.741265594959259, "learning_rate": 2e-05, "loss": 44.3865, "step": 1 }, { "epoch": 4.519671871822106e-05, "eval_loss": 11.093368530273438, "eval_runtime": 175.5908, "eval_samples_per_second": 53.061, "eval_steps_per_second": 26.533, "step": 1 }, { "epoch": 9.039343743644212e-05, "grad_norm": 0.6581929326057434, "learning_rate": 4e-05, "loss": 44.3813, "step": 2 }, { "epoch": 0.00013559015615466317, "grad_norm": 0.6729432344436646, "learning_rate": 6e-05, "loss": 44.3393, "step": 3 }, { "epoch": 0.00018078687487288423, "grad_norm": 0.6872175335884094, "learning_rate": 8e-05, "loss": 44.3795, "step": 4 }, { "epoch": 0.00022598359359110527, "grad_norm": 0.704067051410675, "learning_rate": 0.0001, "loss": 44.389, "step": 5 }, { "epoch": 0.00022598359359110527, "eval_loss": 11.093063354492188, "eval_runtime": 176.3334, "eval_samples_per_second": 52.837, "eval_steps_per_second": 26.422, "step": 5 }, { "epoch": 0.00027118031230932634, "grad_norm": 0.6682418584823608, "learning_rate": 0.00012, "loss": 44.3529, "step": 6 }, { "epoch": 0.0003163770310275474, "grad_norm": 0.6353705525398254, "learning_rate": 0.00014, "loss": 44.4355, "step": 7 }, { "epoch": 0.00036157374974576847, "grad_norm": 0.6866922974586487, "learning_rate": 0.00016, "loss": 44.4036, "step": 8 }, { "epoch": 0.00040677046846398953, "grad_norm": 0.7315343618392944, "learning_rate": 0.00018, "loss": 44.3756, "step": 9 }, { "epoch": 0.00045196718718221055, "grad_norm": 0.6867555379867554, "learning_rate": 0.0002, "loss": 44.3678, "step": 10 }, { "epoch": 0.00045196718718221055, "eval_loss": 11.091917037963867, "eval_runtime": 176.1396, "eval_samples_per_second": 52.896, "eval_steps_per_second": 26.451, "step": 10 }, { "epoch": 0.0004971639059004316, "grad_norm": 0.7067858576774597, "learning_rate": 0.0001999979446958366, "loss": 44.3933, "step": 11 }, { "epoch": 0.0005423606246186527, "grad_norm": 0.7694055438041687, "learning_rate": 0.00019999177886783194, "loss": 44.3476, "step": 12 }, { "epoch": 0.0005875573433368737, "grad_norm": 0.6980550289154053, "learning_rate": 0.00019998150276943902, "loss": 44.3621, "step": 13 }, { "epoch": 0.0006327540620550948, "grad_norm": 0.7399426698684692, "learning_rate": 0.000199967116823068, "loss": 44.3727, "step": 14 }, { "epoch": 0.0006779507807733159, "grad_norm": 0.6623771786689758, "learning_rate": 0.0001999486216200688, "loss": 44.3563, "step": 15 }, { "epoch": 0.0006779507807733159, "eval_loss": 11.089905738830566, "eval_runtime": 176.0326, "eval_samples_per_second": 52.928, "eval_steps_per_second": 26.467, "step": 15 }, { "epoch": 0.0007231474994915369, "grad_norm": 0.6647756695747375, "learning_rate": 0.00019992601792070679, "loss": 44.3454, "step": 16 }, { "epoch": 0.000768344218209758, "grad_norm": 0.7416101694107056, "learning_rate": 0.00019989930665413147, "loss": 44.3249, "step": 17 }, { "epoch": 0.0008135409369279791, "grad_norm": 0.6351829171180725, "learning_rate": 0.00019986848891833845, "loss": 44.37, "step": 18 }, { "epoch": 0.0008587376556462001, "grad_norm": 0.6839431524276733, "learning_rate": 0.0001998335659801241, "loss": 44.3472, "step": 19 }, { "epoch": 0.0009039343743644211, "grad_norm": 0.6762228608131409, "learning_rate": 0.00019979453927503364, "loss": 44.3507, "step": 20 }, { "epoch": 0.0009039343743644211, "eval_loss": 11.087591171264648, "eval_runtime": 176.1534, "eval_samples_per_second": 52.891, "eval_steps_per_second": 26.449, "step": 20 }, { "epoch": 0.0009491310930826422, "grad_norm": 0.7993413209915161, "learning_rate": 0.00019975141040730207, "loss": 44.288, "step": 21 }, { "epoch": 0.0009943278118008632, "grad_norm": 0.6926490664482117, "learning_rate": 0.0001997041811497882, "loss": 44.3672, "step": 22 }, { "epoch": 0.0010395245305190844, "grad_norm": 0.7373084425926208, "learning_rate": 0.00019965285344390184, "loss": 44.3927, "step": 23 }, { "epoch": 0.0010847212492373054, "grad_norm": 0.6655643582344055, "learning_rate": 0.00019959742939952392, "loss": 44.3481, "step": 24 }, { "epoch": 0.0011299179679555265, "grad_norm": 0.7115928530693054, "learning_rate": 0.00019953791129491983, "loss": 44.3368, "step": 25 }, { "epoch": 0.0011299179679555265, "eval_loss": 11.085227012634277, "eval_runtime": 175.877, "eval_samples_per_second": 52.975, "eval_steps_per_second": 26.49, "step": 25 }, { "epoch": 0.0011751146866737475, "grad_norm": 0.7096830010414124, "learning_rate": 0.00019947430157664576, "loss": 44.3735, "step": 26 }, { "epoch": 0.0012203114053919684, "grad_norm": 0.6747312545776367, "learning_rate": 0.00019940660285944803, "loss": 44.3323, "step": 27 }, { "epoch": 0.0012655081241101896, "grad_norm": 0.7371957302093506, "learning_rate": 0.00019933481792615583, "loss": 44.2951, "step": 28 }, { "epoch": 0.0013107048428284106, "grad_norm": 0.7316697239875793, "learning_rate": 0.0001992589497275665, "loss": 44.3097, "step": 29 }, { "epoch": 0.0013559015615466317, "grad_norm": 0.6886783838272095, "learning_rate": 0.0001991790013823246, "loss": 44.3137, "step": 30 }, { "epoch": 0.0013559015615466317, "eval_loss": 11.082609176635742, "eval_runtime": 176.2695, "eval_samples_per_second": 52.857, "eval_steps_per_second": 26.431, "step": 30 }, { "epoch": 0.0014010982802648527, "grad_norm": 0.7027749419212341, "learning_rate": 0.00019909497617679348, "loss": 44.3391, "step": 31 }, { "epoch": 0.0014462949989830739, "grad_norm": 0.735598087310791, "learning_rate": 0.0001990068775649202, "loss": 44.3645, "step": 32 }, { "epoch": 0.0014914917177012948, "grad_norm": 0.7152600288391113, "learning_rate": 0.00019891470916809362, "loss": 44.3478, "step": 33 }, { "epoch": 0.001536688436419516, "grad_norm": 0.6983291506767273, "learning_rate": 0.00019881847477499557, "loss": 44.3252, "step": 34 }, { "epoch": 0.001581885155137737, "grad_norm": 0.6892045140266418, "learning_rate": 0.00019871817834144504, "loss": 44.2998, "step": 35 }, { "epoch": 0.001581885155137737, "eval_loss": 11.079712867736816, "eval_runtime": 176.1378, "eval_samples_per_second": 52.896, "eval_steps_per_second": 26.451, "step": 35 }, { "epoch": 0.0016270818738559581, "grad_norm": 0.7166262865066528, "learning_rate": 0.0001986138239902355, "loss": 44.3485, "step": 36 }, { "epoch": 0.001672278592574179, "grad_norm": 0.7545002102851868, "learning_rate": 0.0001985054160109657, "loss": 44.2613, "step": 37 }, { "epoch": 0.0017174753112924003, "grad_norm": 0.7944263219833374, "learning_rate": 0.00019839295885986296, "loss": 44.2665, "step": 38 }, { "epoch": 0.0017626720300106212, "grad_norm": 0.7216903567314148, "learning_rate": 0.0001982764571596004, "loss": 44.3546, "step": 39 }, { "epoch": 0.0018078687487288422, "grad_norm": 0.7492774128913879, "learning_rate": 0.00019815591569910654, "loss": 44.3223, "step": 40 }, { "epoch": 0.0018078687487288422, "eval_loss": 11.076553344726562, "eval_runtime": 176.1866, "eval_samples_per_second": 52.881, "eval_steps_per_second": 26.444, "step": 40 }, { "epoch": 0.0018530654674470634, "grad_norm": 0.8118460774421692, "learning_rate": 0.00019803133943336874, "loss": 44.3122, "step": 41 }, { "epoch": 0.0018982621861652843, "grad_norm": 0.7527559399604797, "learning_rate": 0.0001979027334832293, "loss": 44.3061, "step": 42 }, { "epoch": 0.0019434589048835055, "grad_norm": 0.7425262331962585, "learning_rate": 0.00019777010313517518, "loss": 44.2408, "step": 43 }, { "epoch": 0.0019886556236017264, "grad_norm": 0.753101646900177, "learning_rate": 0.00019763345384112043, "loss": 44.3362, "step": 44 }, { "epoch": 0.0020338523423199476, "grad_norm": 0.767737090587616, "learning_rate": 0.00019749279121818235, "loss": 44.2864, "step": 45 }, { "epoch": 0.0020338523423199476, "eval_loss": 11.072389602661133, "eval_runtime": 175.9667, "eval_samples_per_second": 52.948, "eval_steps_per_second": 26.477, "step": 45 }, { "epoch": 0.002079049061038169, "grad_norm": 0.7275786995887756, "learning_rate": 0.00019734812104845047, "loss": 44.3542, "step": 46 }, { "epoch": 0.0021242457797563895, "grad_norm": 0.6908650994300842, "learning_rate": 0.00019719944927874881, "loss": 44.3377, "step": 47 }, { "epoch": 0.0021694424984746107, "grad_norm": 0.7260599136352539, "learning_rate": 0.0001970467820203915, "loss": 44.2621, "step": 48 }, { "epoch": 0.002214639217192832, "grad_norm": 0.7138715982437134, "learning_rate": 0.00019689012554893154, "loss": 44.2338, "step": 49 }, { "epoch": 0.002259835935911053, "grad_norm": 0.7867954969406128, "learning_rate": 0.00019672948630390294, "loss": 44.3044, "step": 50 }, { "epoch": 0.002259835935911053, "eval_loss": 11.067892074584961, "eval_runtime": 176.5244, "eval_samples_per_second": 52.78, "eval_steps_per_second": 26.393, "step": 50 }, { "epoch": 0.002305032654629274, "grad_norm": 0.7787512540817261, "learning_rate": 0.00019656487088855592, "loss": 44.2918, "step": 51 }, { "epoch": 0.002350229373347495, "grad_norm": 0.7184544801712036, "learning_rate": 0.00019639628606958533, "loss": 44.2751, "step": 52 }, { "epoch": 0.002395426092065716, "grad_norm": 0.7348573803901672, "learning_rate": 0.0001962237387768529, "loss": 44.246, "step": 53 }, { "epoch": 0.002440622810783937, "grad_norm": 0.7713965773582458, "learning_rate": 0.00019604723610310194, "loss": 44.3292, "step": 54 }, { "epoch": 0.002485819529502158, "grad_norm": 0.8040369749069214, "learning_rate": 0.00019586678530366606, "loss": 44.2155, "step": 55 }, { "epoch": 0.002485819529502158, "eval_loss": 11.062650680541992, "eval_runtime": 176.278, "eval_samples_per_second": 52.854, "eval_steps_per_second": 26.43, "step": 55 }, { "epoch": 0.0025310162482203792, "grad_norm": 0.7459877133369446, "learning_rate": 0.00019568239379617088, "loss": 44.2188, "step": 56 }, { "epoch": 0.0025762129669386004, "grad_norm": 0.8008533716201782, "learning_rate": 0.00019549406916022905, "loss": 44.226, "step": 57 }, { "epoch": 0.002621409685656821, "grad_norm": 0.7918010354042053, "learning_rate": 0.00019530181913712872, "loss": 44.287, "step": 58 }, { "epoch": 0.0026666064043750423, "grad_norm": 0.7287217974662781, "learning_rate": 0.00019510565162951537, "loss": 44.2581, "step": 59 }, { "epoch": 0.0027118031230932635, "grad_norm": 0.7925474643707275, "learning_rate": 0.00019490557470106686, "loss": 44.2277, "step": 60 }, { "epoch": 0.0027118031230932635, "eval_loss": 11.05736255645752, "eval_runtime": 176.2465, "eval_samples_per_second": 52.863, "eval_steps_per_second": 26.435, "step": 60 }, { "epoch": 0.0027569998418114847, "grad_norm": 0.8553807735443115, "learning_rate": 0.00019470159657616215, "loss": 44.2439, "step": 61 }, { "epoch": 0.0028021965605297054, "grad_norm": 0.7586395740509033, "learning_rate": 0.00019449372563954293, "loss": 44.1943, "step": 62 }, { "epoch": 0.0028473932792479266, "grad_norm": 0.7628232836723328, "learning_rate": 0.0001942819704359693, "loss": 44.2594, "step": 63 }, { "epoch": 0.0028925899979661478, "grad_norm": 0.718551754951477, "learning_rate": 0.00019406633966986828, "loss": 44.2302, "step": 64 }, { "epoch": 0.002937786716684369, "grad_norm": 0.7625423073768616, "learning_rate": 0.00019384684220497605, "loss": 44.1989, "step": 65 }, { "epoch": 0.002937786716684369, "eval_loss": 11.051901817321777, "eval_runtime": 176.1759, "eval_samples_per_second": 52.885, "eval_steps_per_second": 26.445, "step": 65 }, { "epoch": 0.0029829834354025897, "grad_norm": 0.7891851663589478, "learning_rate": 0.00019362348706397373, "loss": 44.2199, "step": 66 }, { "epoch": 0.003028180154120811, "grad_norm": 0.6770808100700378, "learning_rate": 0.00019339628342811632, "loss": 44.1689, "step": 67 }, { "epoch": 0.003073376872839032, "grad_norm": 0.7498692870140076, "learning_rate": 0.0001931652406368554, "loss": 44.1741, "step": 68 }, { "epoch": 0.0031185735915572528, "grad_norm": 0.7661782503128052, "learning_rate": 0.0001929303681874552, "loss": 44.2123, "step": 69 }, { "epoch": 0.003163770310275474, "grad_norm": 0.6438837647438049, "learning_rate": 0.0001926916757346022, "loss": 44.1718, "step": 70 }, { "epoch": 0.003163770310275474, "eval_loss": 11.046669960021973, "eval_runtime": 176.3634, "eval_samples_per_second": 52.828, "eval_steps_per_second": 26.417, "step": 70 }, { "epoch": 0.003208967028993695, "grad_norm": 0.7522275447845459, "learning_rate": 0.00019244917309000817, "loss": 44.2246, "step": 71 }, { "epoch": 0.0032541637477119163, "grad_norm": 0.7135974168777466, "learning_rate": 0.00019220287022200707, "loss": 44.2111, "step": 72 }, { "epoch": 0.003299360466430137, "grad_norm": 0.7275662422180176, "learning_rate": 0.0001919527772551451, "loss": 44.1464, "step": 73 }, { "epoch": 0.003344557185148358, "grad_norm": 0.6742229461669922, "learning_rate": 0.00019169890446976454, "loss": 44.2105, "step": 74 }, { "epoch": 0.0033897539038665794, "grad_norm": 0.6085646152496338, "learning_rate": 0.00019144126230158127, "loss": 44.0926, "step": 75 }, { "epoch": 0.0033897539038665794, "eval_loss": 11.042237281799316, "eval_runtime": 176.114, "eval_samples_per_second": 52.903, "eval_steps_per_second": 26.454, "step": 75 }, { "epoch": 0.0034349506225848005, "grad_norm": 0.7245734333992004, "learning_rate": 0.0001911798613412557, "loss": 44.2154, "step": 76 }, { "epoch": 0.0034801473413030213, "grad_norm": 0.7311281561851501, "learning_rate": 0.0001909147123339575, "loss": 44.1687, "step": 77 }, { "epoch": 0.0035253440600212425, "grad_norm": 0.6399495601654053, "learning_rate": 0.0001906458261789238, "loss": 44.1596, "step": 78 }, { "epoch": 0.0035705407787394636, "grad_norm": 0.5650178790092468, "learning_rate": 0.00019037321392901136, "loss": 44.1466, "step": 79 }, { "epoch": 0.0036157374974576844, "grad_norm": 0.6039579510688782, "learning_rate": 0.0001900968867902419, "loss": 44.1955, "step": 80 }, { "epoch": 0.0036157374974576844, "eval_loss": 11.038910865783691, "eval_runtime": 176.3853, "eval_samples_per_second": 52.822, "eval_steps_per_second": 26.414, "step": 80 }, { "epoch": 0.0036609342161759055, "grad_norm": 0.7481367588043213, "learning_rate": 0.0001898168561213419, "loss": 44.2182, "step": 81 }, { "epoch": 0.0037061309348941267, "grad_norm": 0.628414511680603, "learning_rate": 0.0001895331334332753, "loss": 44.1519, "step": 82 }, { "epoch": 0.003751327653612348, "grad_norm": 0.658549964427948, "learning_rate": 0.0001892457303887706, "loss": 44.1364, "step": 83 }, { "epoch": 0.0037965243723305686, "grad_norm": 0.5245007276535034, "learning_rate": 0.0001889546588018412, "loss": 44.1079, "step": 84 }, { "epoch": 0.00384172109104879, "grad_norm": 0.5555324554443359, "learning_rate": 0.00018865993063730004, "loss": 44.1445, "step": 85 }, { "epoch": 0.00384172109104879, "eval_loss": 11.036417007446289, "eval_runtime": 176.131, "eval_samples_per_second": 52.898, "eval_steps_per_second": 26.452, "step": 85 }, { "epoch": 0.003886917809767011, "grad_norm": 0.43622660636901855, "learning_rate": 0.00018836155801026753, "loss": 44.1515, "step": 86 }, { "epoch": 0.003932114528485232, "grad_norm": 0.578544020652771, "learning_rate": 0.0001880595531856738, "loss": 44.0766, "step": 87 }, { "epoch": 0.003977311247203453, "grad_norm": 0.598685085773468, "learning_rate": 0.00018775392857775432, "loss": 44.1756, "step": 88 }, { "epoch": 0.004022507965921674, "grad_norm": 0.5733134150505066, "learning_rate": 0.00018744469674953956, "loss": 44.1756, "step": 89 }, { "epoch": 0.004067704684639895, "grad_norm": 0.5177151560783386, "learning_rate": 0.00018713187041233896, "loss": 44.173, "step": 90 }, { "epoch": 0.004067704684639895, "eval_loss": 11.034589767456055, "eval_runtime": 176.3402, "eval_samples_per_second": 52.835, "eval_steps_per_second": 26.421, "step": 90 }, { "epoch": 0.004112901403358116, "grad_norm": 0.5208268761634827, "learning_rate": 0.00018681546242521786, "loss": 44.1346, "step": 91 }, { "epoch": 0.004158098122076338, "grad_norm": 0.6029201149940491, "learning_rate": 0.00018649548579446936, "loss": 44.152, "step": 92 }, { "epoch": 0.004203294840794558, "grad_norm": 0.468414843082428, "learning_rate": 0.0001861719536730795, "loss": 44.117, "step": 93 }, { "epoch": 0.004248491559512779, "grad_norm": 0.3942670226097107, "learning_rate": 0.00018584487936018661, "loss": 44.137, "step": 94 }, { "epoch": 0.004293688278231, "grad_norm": 0.49822431802749634, "learning_rate": 0.00018551427630053463, "loss": 44.119, "step": 95 }, { "epoch": 0.004293688278231, "eval_loss": 11.03354549407959, "eval_runtime": 176.0642, "eval_samples_per_second": 52.918, "eval_steps_per_second": 26.462, "step": 95 }, { "epoch": 0.004338884996949221, "grad_norm": 0.5527846813201904, "learning_rate": 0.00018518015808392045, "loss": 44.0893, "step": 96 }, { "epoch": 0.004384081715667443, "grad_norm": 0.5725367665290833, "learning_rate": 0.00018484253844463526, "loss": 44.1162, "step": 97 }, { "epoch": 0.004429278434385664, "grad_norm": 0.49278348684310913, "learning_rate": 0.00018450143126090015, "loss": 44.1031, "step": 98 }, { "epoch": 0.004474475153103885, "grad_norm": 0.4361265301704407, "learning_rate": 0.00018415685055429533, "loss": 44.1386, "step": 99 }, { "epoch": 0.004519671871822106, "grad_norm": 0.397714763879776, "learning_rate": 0.00018380881048918405, "loss": 44.1072, "step": 100 }, { "epoch": 0.004519671871822106, "eval_loss": 11.032732963562012, "eval_runtime": 176.1844, "eval_samples_per_second": 52.882, "eval_steps_per_second": 26.444, "step": 100 }, { "epoch": 0.004564868590540326, "grad_norm": 0.46195968985557556, "learning_rate": 0.00018345732537213027, "loss": 44.1243, "step": 101 }, { "epoch": 0.004610065309258548, "grad_norm": 0.4918234348297119, "learning_rate": 0.00018310240965131041, "loss": 44.0833, "step": 102 }, { "epoch": 0.004655262027976769, "grad_norm": 0.39288461208343506, "learning_rate": 0.00018274407791591966, "loss": 44.0844, "step": 103 }, { "epoch": 0.00470045874669499, "grad_norm": 0.7819874882698059, "learning_rate": 0.00018238234489557215, "loss": 44.0727, "step": 104 }, { "epoch": 0.004745655465413211, "grad_norm": 0.4996788203716278, "learning_rate": 0.0001820172254596956, "loss": 44.0926, "step": 105 }, { "epoch": 0.004745655465413211, "eval_loss": 11.03187370300293, "eval_runtime": 176.1674, "eval_samples_per_second": 52.887, "eval_steps_per_second": 26.446, "step": 105 }, { "epoch": 0.004790852184131432, "grad_norm": 0.4443046748638153, "learning_rate": 0.00018164873461691986, "loss": 44.1211, "step": 106 }, { "epoch": 0.0048360489028496535, "grad_norm": 0.6192988753318787, "learning_rate": 0.00018127688751446027, "loss": 44.2023, "step": 107 }, { "epoch": 0.004881245621567874, "grad_norm": 0.49968671798706055, "learning_rate": 0.00018090169943749476, "loss": 44.1175, "step": 108 }, { "epoch": 0.004926442340286095, "grad_norm": 0.5411902070045471, "learning_rate": 0.0001805231858085356, "loss": 44.1106, "step": 109 }, { "epoch": 0.004971639059004316, "grad_norm": 0.7971486449241638, "learning_rate": 0.00018014136218679567, "loss": 44.1488, "step": 110 }, { "epoch": 0.004971639059004316, "eval_loss": 11.030839920043945, "eval_runtime": 176.4251, "eval_samples_per_second": 52.81, "eval_steps_per_second": 26.408, "step": 110 }, { "epoch": 0.005016835777722537, "grad_norm": 0.39622390270233154, "learning_rate": 0.00017975624426754848, "loss": 44.1091, "step": 111 }, { "epoch": 0.0050620324964407585, "grad_norm": 0.4785301089286804, "learning_rate": 0.00017936784788148328, "loss": 44.1038, "step": 112 }, { "epoch": 0.00510722921515898, "grad_norm": 0.5272740125656128, "learning_rate": 0.00017897618899405423, "loss": 44.1133, "step": 113 }, { "epoch": 0.005152425933877201, "grad_norm": 0.6231501698493958, "learning_rate": 0.00017858128370482426, "loss": 44.1085, "step": 114 }, { "epoch": 0.005197622652595422, "grad_norm": 0.5427981019020081, "learning_rate": 0.000178183148246803, "loss": 44.1395, "step": 115 }, { "epoch": 0.005197622652595422, "eval_loss": 11.029810905456543, "eval_runtime": 176.1516, "eval_samples_per_second": 52.892, "eval_steps_per_second": 26.449, "step": 115 }, { "epoch": 0.005242819371313642, "grad_norm": 0.4265317916870117, "learning_rate": 0.00017778179898577973, "loss": 44.1501, "step": 116 }, { "epoch": 0.0052880160900318635, "grad_norm": 0.9469470381736755, "learning_rate": 0.00017737725241965069, "loss": 44.2129, "step": 117 }, { "epoch": 0.005333212808750085, "grad_norm": 0.4538600742816925, "learning_rate": 0.00017696952517774062, "loss": 44.0941, "step": 118 }, { "epoch": 0.005378409527468306, "grad_norm": 0.7306213974952698, "learning_rate": 0.00017655863402011947, "loss": 44.1601, "step": 119 }, { "epoch": 0.005423606246186527, "grad_norm": 0.5303515195846558, "learning_rate": 0.00017614459583691346, "loss": 44.1485, "step": 120 }, { "epoch": 0.005423606246186527, "eval_loss": 11.029101371765137, "eval_runtime": 176.3314, "eval_samples_per_second": 52.838, "eval_steps_per_second": 26.422, "step": 120 }, { "epoch": 0.005468802964904748, "grad_norm": 0.43057698011398315, "learning_rate": 0.00017572742764761055, "loss": 44.1271, "step": 121 }, { "epoch": 0.005513999683622969, "grad_norm": 0.5054545402526855, "learning_rate": 0.00017530714660036112, "loss": 44.1574, "step": 122 }, { "epoch": 0.00555919640234119, "grad_norm": 0.47395941615104675, "learning_rate": 0.00017488376997127283, "loss": 44.0802, "step": 123 }, { "epoch": 0.005604393121059411, "grad_norm": 0.5438507795333862, "learning_rate": 0.0001744573151637007, "loss": 44.0974, "step": 124 }, { "epoch": 0.005649589839777632, "grad_norm": 0.5694723129272461, "learning_rate": 0.00017402779970753155, "loss": 44.1329, "step": 125 }, { "epoch": 0.005649589839777632, "eval_loss": 11.028435707092285, "eval_runtime": 176.0545, "eval_samples_per_second": 52.921, "eval_steps_per_second": 26.463, "step": 125 }, { "epoch": 0.005694786558495853, "grad_norm": 0.49188655614852905, "learning_rate": 0.0001735952412584635, "loss": 44.0859, "step": 126 }, { "epoch": 0.005739983277214074, "grad_norm": 0.5955361127853394, "learning_rate": 0.00017315965759728014, "loss": 44.0938, "step": 127 }, { "epoch": 0.0057851799959322955, "grad_norm": 0.4358704090118408, "learning_rate": 0.00017272106662911973, "loss": 44.1165, "step": 128 }, { "epoch": 0.005830376714650517, "grad_norm": 0.4302980899810791, "learning_rate": 0.00017227948638273916, "loss": 44.1088, "step": 129 }, { "epoch": 0.005875573433368738, "grad_norm": 0.5749801397323608, "learning_rate": 0.00017183493500977278, "loss": 44.1311, "step": 130 }, { "epoch": 0.005875573433368738, "eval_loss": 11.027961730957031, "eval_runtime": 176.2218, "eval_samples_per_second": 52.871, "eval_steps_per_second": 26.438, "step": 130 }, { "epoch": 0.005920770152086958, "grad_norm": 0.4459182620048523, "learning_rate": 0.0001713874307839863, "loss": 44.0874, "step": 131 }, { "epoch": 0.005965966870805179, "grad_norm": 0.5632774233818054, "learning_rate": 0.0001709369921005258, "loss": 44.1085, "step": 132 }, { "epoch": 0.0060111635895234005, "grad_norm": 0.5518532991409302, "learning_rate": 0.00017048363747516117, "loss": 44.0409, "step": 133 }, { "epoch": 0.006056360308241622, "grad_norm": 0.5138490200042725, "learning_rate": 0.00017002738554352552, "loss": 44.1078, "step": 134 }, { "epoch": 0.006101557026959843, "grad_norm": 0.44584622979164124, "learning_rate": 0.00016956825506034867, "loss": 44.1152, "step": 135 }, { "epoch": 0.006101557026959843, "eval_loss": 11.027368545532227, "eval_runtime": 175.9823, "eval_samples_per_second": 52.943, "eval_steps_per_second": 26.474, "step": 135 }, { "epoch": 0.006146753745678064, "grad_norm": 0.5159522294998169, "learning_rate": 0.00016910626489868649, "loss": 44.0946, "step": 136 }, { "epoch": 0.006191950464396285, "grad_norm": 0.4725247323513031, "learning_rate": 0.00016864143404914504, "loss": 44.1131, "step": 137 }, { "epoch": 0.0062371471831145055, "grad_norm": 0.5374069213867188, "learning_rate": 0.00016817378161909996, "loss": 44.1304, "step": 138 }, { "epoch": 0.006282343901832727, "grad_norm": 0.44262439012527466, "learning_rate": 0.00016770332683191096, "loss": 44.065, "step": 139 }, { "epoch": 0.006327540620550948, "grad_norm": 0.5221428871154785, "learning_rate": 0.0001672300890261317, "loss": 44.1053, "step": 140 }, { "epoch": 0.006327540620550948, "eval_loss": 11.026728630065918, "eval_runtime": 176.1986, "eval_samples_per_second": 52.878, "eval_steps_per_second": 26.442, "step": 140 }, { "epoch": 0.006372737339269169, "grad_norm": 0.47628021240234375, "learning_rate": 0.0001667540876547148, "loss": 44.1197, "step": 141 }, { "epoch": 0.00641793405798739, "grad_norm": 0.4244273006916046, "learning_rate": 0.0001662753422842123, "loss": 44.0529, "step": 142 }, { "epoch": 0.006463130776705611, "grad_norm": 0.4019363820552826, "learning_rate": 0.00016579387259397127, "loss": 44.107, "step": 143 }, { "epoch": 0.0065083274954238325, "grad_norm": 0.41666439175605774, "learning_rate": 0.00016530969837532487, "loss": 44.1185, "step": 144 }, { "epoch": 0.006553524214142053, "grad_norm": 0.52204829454422, "learning_rate": 0.00016482283953077887, "loss": 44.0868, "step": 145 }, { "epoch": 0.006553524214142053, "eval_loss": 11.026100158691406, "eval_runtime": 175.9985, "eval_samples_per_second": 52.938, "eval_steps_per_second": 26.472, "step": 145 }, { "epoch": 0.006598720932860274, "grad_norm": 0.4917082190513611, "learning_rate": 0.00016433331607319343, "loss": 44.0786, "step": 146 }, { "epoch": 0.006643917651578495, "grad_norm": 0.6054917573928833, "learning_rate": 0.00016384114812496056, "loss": 44.0952, "step": 147 }, { "epoch": 0.006689114370296716, "grad_norm": 0.46359196305274963, "learning_rate": 0.00016334635591717703, "loss": 44.1401, "step": 148 }, { "epoch": 0.0067343110890149376, "grad_norm": 0.5335073471069336, "learning_rate": 0.00016284895978881236, "loss": 44.0664, "step": 149 }, { "epoch": 0.006779507807733159, "grad_norm": 0.3754950761795044, "learning_rate": 0.00016234898018587337, "loss": 44.1361, "step": 150 }, { "epoch": 0.006779507807733159, "eval_loss": 11.025545120239258, "eval_runtime": 176.2544, "eval_samples_per_second": 52.861, "eval_steps_per_second": 26.433, "step": 150 }, { "epoch": 0.00682470452645138, "grad_norm": 0.48478755354881287, "learning_rate": 0.00016184643766056317, "loss": 44.14, "step": 151 }, { "epoch": 0.006869901245169601, "grad_norm": 0.4497169852256775, "learning_rate": 0.00016134135287043669, "loss": 44.0882, "step": 152 }, { "epoch": 0.006915097963887821, "grad_norm": 0.5556149482727051, "learning_rate": 0.00016083374657755134, "loss": 44.148, "step": 153 }, { "epoch": 0.0069602946826060426, "grad_norm": 0.4659099280834198, "learning_rate": 0.00016032363964761363, "loss": 44.0788, "step": 154 }, { "epoch": 0.007005491401324264, "grad_norm": 0.5520086288452148, "learning_rate": 0.00015981105304912162, "loss": 44.1322, "step": 155 }, { "epoch": 0.007005491401324264, "eval_loss": 11.025052070617676, "eval_runtime": 176.1047, "eval_samples_per_second": 52.906, "eval_steps_per_second": 26.456, "step": 155 }, { "epoch": 0.007050688120042485, "grad_norm": 0.5233341455459595, "learning_rate": 0.00015929600785250257, "loss": 44.0942, "step": 156 }, { "epoch": 0.007095884838760706, "grad_norm": 0.4378088712692261, "learning_rate": 0.00015877852522924732, "loss": 44.0818, "step": 157 }, { "epoch": 0.007141081557478927, "grad_norm": 0.46756836771965027, "learning_rate": 0.0001582586264510396, "loss": 44.1222, "step": 158 }, { "epoch": 0.007186278276197148, "grad_norm": 0.5881497859954834, "learning_rate": 0.00015773633288888197, "loss": 44.0838, "step": 159 }, { "epoch": 0.007231474994915369, "grad_norm": 0.4284621775150299, "learning_rate": 0.00015721166601221698, "loss": 44.1098, "step": 160 }, { "epoch": 0.007231474994915369, "eval_loss": 11.024553298950195, "eval_runtime": 176.2152, "eval_samples_per_second": 52.873, "eval_steps_per_second": 26.439, "step": 160 }, { "epoch": 0.00727667171363359, "grad_norm": 0.5078541040420532, "learning_rate": 0.000156684647388045, "loss": 44.0764, "step": 161 }, { "epoch": 0.007321868432351811, "grad_norm": 0.46269139647483826, "learning_rate": 0.0001561552986800375, "loss": 44.0991, "step": 162 }, { "epoch": 0.007367065151070032, "grad_norm": 0.5498519539833069, "learning_rate": 0.0001556236416476465, "loss": 44.1389, "step": 163 }, { "epoch": 0.007412261869788253, "grad_norm": 0.8603391647338867, "learning_rate": 0.00015508969814521025, "loss": 44.1567, "step": 164 }, { "epoch": 0.007457458588506475, "grad_norm": 0.6750001907348633, "learning_rate": 0.00015455349012105486, "loss": 44.1007, "step": 165 }, { "epoch": 0.007457458588506475, "eval_loss": 11.024243354797363, "eval_runtime": 175.9806, "eval_samples_per_second": 52.943, "eval_steps_per_second": 26.475, "step": 165 }, { "epoch": 0.007502655307224696, "grad_norm": 0.5474929809570312, "learning_rate": 0.00015401503961659204, "loss": 44.0842, "step": 166 }, { "epoch": 0.007547852025942917, "grad_norm": 0.5558362603187561, "learning_rate": 0.00015347436876541297, "loss": 44.1025, "step": 167 }, { "epoch": 0.007593048744661137, "grad_norm": 0.5435320138931274, "learning_rate": 0.00015293149979237876, "loss": 44.073, "step": 168 }, { "epoch": 0.007638245463379358, "grad_norm": 0.41495761275291443, "learning_rate": 0.00015238645501270654, "loss": 44.0608, "step": 169 }, { "epoch": 0.00768344218209758, "grad_norm": 0.4491158127784729, "learning_rate": 0.00015183925683105254, "loss": 44.0995, "step": 170 }, { "epoch": 0.00768344218209758, "eval_loss": 11.023889541625977, "eval_runtime": 176.2494, "eval_samples_per_second": 52.863, "eval_steps_per_second": 26.434, "step": 170 }, { "epoch": 0.007728638900815801, "grad_norm": 0.6309311389923096, "learning_rate": 0.00015128992774059063, "loss": 44.1244, "step": 171 }, { "epoch": 0.007773835619534022, "grad_norm": 0.4494941830635071, "learning_rate": 0.00015073849032208822, "loss": 44.1336, "step": 172 }, { "epoch": 0.007819032338252242, "grad_norm": 0.5996090173721313, "learning_rate": 0.00015018496724297778, "loss": 44.1116, "step": 173 }, { "epoch": 0.007864229056970463, "grad_norm": 0.73329097032547, "learning_rate": 0.00014962938125642503, "loss": 44.1541, "step": 174 }, { "epoch": 0.007909425775688685, "grad_norm": 0.5808178186416626, "learning_rate": 0.0001490717552003938, "loss": 44.114, "step": 175 }, { "epoch": 0.007909425775688685, "eval_loss": 11.023494720458984, "eval_runtime": 175.9386, "eval_samples_per_second": 52.956, "eval_steps_per_second": 26.481, "step": 175 }, { "epoch": 0.007954622494406906, "grad_norm": 0.46136102080345154, "learning_rate": 0.00014851211199670721, "loss": 44.0922, "step": 176 }, { "epoch": 0.007999819213125127, "grad_norm": 0.4197680354118347, "learning_rate": 0.0001479504746501054, "loss": 44.0494, "step": 177 }, { "epoch": 0.008045015931843348, "grad_norm": 0.4883246421813965, "learning_rate": 0.00014738686624729986, "loss": 44.0914, "step": 178 }, { "epoch": 0.00809021265056157, "grad_norm": 0.4930349588394165, "learning_rate": 0.0001468213099560246, "loss": 44.0695, "step": 179 }, { "epoch": 0.00813540936927979, "grad_norm": 0.5016703009605408, "learning_rate": 0.00014625382902408356, "loss": 44.0501, "step": 180 }, { "epoch": 0.00813540936927979, "eval_loss": 11.023147583007812, "eval_runtime": 176.3497, "eval_samples_per_second": 52.833, "eval_steps_per_second": 26.419, "step": 180 }, { "epoch": 0.008180606087998012, "grad_norm": 0.5716975927352905, "learning_rate": 0.00014568444677839516, "loss": 44.1164, "step": 181 }, { "epoch": 0.008225802806716233, "grad_norm": 0.6961561441421509, "learning_rate": 0.00014511318662403347, "loss": 44.1024, "step": 182 }, { "epoch": 0.008270999525434454, "grad_norm": 0.5740232467651367, "learning_rate": 0.0001445400720432659, "loss": 44.1379, "step": 183 }, { "epoch": 0.008316196244152675, "grad_norm": 0.5687277913093567, "learning_rate": 0.00014396512659458824, "loss": 44.1165, "step": 184 }, { "epoch": 0.008361392962870896, "grad_norm": 0.6230690479278564, "learning_rate": 0.00014338837391175582, "loss": 44.118, "step": 185 }, { "epoch": 0.008361392962870896, "eval_loss": 11.022916793823242, "eval_runtime": 176.0405, "eval_samples_per_second": 52.925, "eval_steps_per_second": 26.465, "step": 185 }, { "epoch": 0.008406589681589116, "grad_norm": 0.48787158727645874, "learning_rate": 0.0001428098377028126, "loss": 44.0875, "step": 186 }, { "epoch": 0.008451786400307337, "grad_norm": 0.44323569536209106, "learning_rate": 0.000142229541749116, "loss": 44.143, "step": 187 }, { "epoch": 0.008496983119025558, "grad_norm": 0.47104522585868835, "learning_rate": 0.0001416475099043599, "loss": 44.0804, "step": 188 }, { "epoch": 0.00854217983774378, "grad_norm": 0.549055814743042, "learning_rate": 0.0001410637660935938, "loss": 44.0923, "step": 189 }, { "epoch": 0.008587376556462, "grad_norm": 0.4136901795864105, "learning_rate": 0.00014047833431223938, "loss": 44.0967, "step": 190 }, { "epoch": 0.008587376556462, "eval_loss": 11.02279281616211, "eval_runtime": 176.1885, "eval_samples_per_second": 52.881, "eval_steps_per_second": 26.443, "step": 190 }, { "epoch": 0.008632573275180222, "grad_norm": 0.5897504091262817, "learning_rate": 0.0001398912386251042, "loss": 44.0428, "step": 191 }, { "epoch": 0.008677769993898443, "grad_norm": 0.4917847514152527, "learning_rate": 0.00013930250316539238, "loss": 44.0819, "step": 192 }, { "epoch": 0.008722966712616664, "grad_norm": 0.4644782245159149, "learning_rate": 0.00013871215213371284, "loss": 44.0209, "step": 193 }, { "epoch": 0.008768163431334885, "grad_norm": 0.6393492817878723, "learning_rate": 0.00013812020979708418, "loss": 44.101, "step": 194 }, { "epoch": 0.008813360150053106, "grad_norm": 0.60307377576828, "learning_rate": 0.00013752670048793744, "loss": 44.1646, "step": 195 }, { "epoch": 0.008813360150053106, "eval_loss": 11.022566795349121, "eval_runtime": 176.0184, "eval_samples_per_second": 52.932, "eval_steps_per_second": 26.469, "step": 195 }, { "epoch": 0.008858556868771328, "grad_norm": 0.4305557608604431, "learning_rate": 0.00013693164860311565, "loss": 44.0883, "step": 196 }, { "epoch": 0.008903753587489549, "grad_norm": 0.4658234119415283, "learning_rate": 0.00013633507860287116, "loss": 44.1006, "step": 197 }, { "epoch": 0.00894895030620777, "grad_norm": 0.5248441100120544, "learning_rate": 0.0001357370150098601, "loss": 44.0716, "step": 198 }, { "epoch": 0.008994147024925991, "grad_norm": 0.5177784562110901, "learning_rate": 0.0001351374824081343, "loss": 44.1013, "step": 199 }, { "epoch": 0.009039343743644212, "grad_norm": 0.5134817361831665, "learning_rate": 0.00013453650544213076, "loss": 44.0501, "step": 200 }, { "epoch": 0.009039343743644212, "eval_loss": 11.022467613220215, "eval_runtime": 176.1703, "eval_samples_per_second": 52.886, "eval_steps_per_second": 26.446, "step": 200 }, { "epoch": 0.009084540462362432, "grad_norm": 0.6612194776535034, "learning_rate": 0.00013393410881565876, "loss": 44.1568, "step": 201 }, { "epoch": 0.009129737181080653, "grad_norm": 0.5365848541259766, "learning_rate": 0.00013333031729088419, "loss": 44.0318, "step": 202 }, { "epoch": 0.009174933899798874, "grad_norm": 0.43999558687210083, "learning_rate": 0.0001327251556873117, "loss": 44.0544, "step": 203 }, { "epoch": 0.009220130618517095, "grad_norm": 0.5535528659820557, "learning_rate": 0.00013211864888076457, "loss": 44.0657, "step": 204 }, { "epoch": 0.009265327337235316, "grad_norm": 0.5289484262466431, "learning_rate": 0.0001315108218023621, "loss": 44.0946, "step": 205 }, { "epoch": 0.009265327337235316, "eval_loss": 11.022246360778809, "eval_runtime": 175.5631, "eval_samples_per_second": 53.069, "eval_steps_per_second": 26.537, "step": 205 }, { "epoch": 0.009310524055953538, "grad_norm": 0.515040934085846, "learning_rate": 0.00013090169943749476, "loss": 44.1026, "step": 206 }, { "epoch": 0.009355720774671759, "grad_norm": 0.43807700276374817, "learning_rate": 0.00013029130682479722, "loss": 44.0529, "step": 207 }, { "epoch": 0.00940091749338998, "grad_norm": 0.40383437275886536, "learning_rate": 0.00012967966905511906, "loss": 44.0854, "step": 208 }, { "epoch": 0.009446114212108201, "grad_norm": 0.42450079321861267, "learning_rate": 0.00012906681127049338, "loss": 44.0488, "step": 209 }, { "epoch": 0.009491310930826422, "grad_norm": 0.5043962597846985, "learning_rate": 0.00012845275866310324, "loss": 44.047, "step": 210 }, { "epoch": 0.009491310930826422, "eval_loss": 11.02186393737793, "eval_runtime": 176.4502, "eval_samples_per_second": 52.802, "eval_steps_per_second": 26.404, "step": 210 }, { "epoch": 0.009536507649544643, "grad_norm": 0.5239633321762085, "learning_rate": 0.00012783753647424635, "loss": 44.1326, "step": 211 }, { "epoch": 0.009581704368262865, "grad_norm": 0.4532044231891632, "learning_rate": 0.00012722116999329712, "loss": 44.1039, "step": 212 }, { "epoch": 0.009626901086981086, "grad_norm": 0.5784953832626343, "learning_rate": 0.00012660368455666752, "loss": 44.0902, "step": 213 }, { "epoch": 0.009672097805699307, "grad_norm": 0.46399155259132385, "learning_rate": 0.0001259851055467653, "loss": 44.0665, "step": 214 }, { "epoch": 0.009717294524417528, "grad_norm": 0.5353842973709106, "learning_rate": 0.00012536545839095074, "loss": 44.0339, "step": 215 }, { "epoch": 0.009717294524417528, "eval_loss": 11.021649360656738, "eval_runtime": 176.1431, "eval_samples_per_second": 52.895, "eval_steps_per_second": 26.45, "step": 215 }, { "epoch": 0.009762491243135748, "grad_norm": 0.4887973666191101, "learning_rate": 0.00012474476856049144, "loss": 44.074, "step": 216 }, { "epoch": 0.009807687961853969, "grad_norm": 0.44021403789520264, "learning_rate": 0.00012412306156951526, "loss": 44.0695, "step": 217 }, { "epoch": 0.00985288468057219, "grad_norm": 0.5092349052429199, "learning_rate": 0.00012350036297396154, "loss": 44.0596, "step": 218 }, { "epoch": 0.009898081399290411, "grad_norm": 0.47505757212638855, "learning_rate": 0.00012287669837053055, "loss": 44.0435, "step": 219 }, { "epoch": 0.009943278118008632, "grad_norm": 0.4098033308982849, "learning_rate": 0.00012225209339563145, "loss": 44.1334, "step": 220 }, { "epoch": 0.009943278118008632, "eval_loss": 11.021401405334473, "eval_runtime": 176.2917, "eval_samples_per_second": 52.85, "eval_steps_per_second": 26.428, "step": 220 }, { "epoch": 0.009988474836726853, "grad_norm": 0.5452781915664673, "learning_rate": 0.00012162657372432836, "loss": 44.0602, "step": 221 }, { "epoch": 0.010033671555445075, "grad_norm": 0.5344114303588867, "learning_rate": 0.00012100016506928493, "loss": 44.045, "step": 222 }, { "epoch": 0.010078868274163296, "grad_norm": 0.4083841145038605, "learning_rate": 0.00012037289317970757, "loss": 44.0642, "step": 223 }, { "epoch": 0.010124064992881517, "grad_norm": 0.4382067918777466, "learning_rate": 0.00011974478384028672, "loss": 44.0648, "step": 224 }, { "epoch": 0.010169261711599738, "grad_norm": 0.42340517044067383, "learning_rate": 0.00011911586287013725, "loss": 44.1315, "step": 225 }, { "epoch": 0.010169261711599738, "eval_loss": 11.021224975585938, "eval_runtime": 176.0622, "eval_samples_per_second": 52.919, "eval_steps_per_second": 26.462, "step": 225 }, { "epoch": 0.01021445843031796, "grad_norm": 0.5047578811645508, "learning_rate": 0.00011848615612173688, "loss": 44.123, "step": 226 }, { "epoch": 0.01025965514903618, "grad_norm": 0.5647579431533813, "learning_rate": 0.00011785568947986367, "loss": 44.0525, "step": 227 }, { "epoch": 0.010304851867754402, "grad_norm": 0.48243632912635803, "learning_rate": 0.0001172244888605319, "loss": 44.1143, "step": 228 }, { "epoch": 0.010350048586472623, "grad_norm": 0.5492759943008423, "learning_rate": 0.0001165925802099268, "loss": 44.0494, "step": 229 }, { "epoch": 0.010395245305190844, "grad_norm": 0.5804261565208435, "learning_rate": 0.00011595998950333793, "loss": 44.0785, "step": 230 }, { "epoch": 0.010395245305190844, "eval_loss": 11.021036148071289, "eval_runtime": 176.2523, "eval_samples_per_second": 52.862, "eval_steps_per_second": 26.434, "step": 230 }, { "epoch": 0.010440442023909063, "grad_norm": 0.4731612503528595, "learning_rate": 0.00011532674274409159, "loss": 44.1151, "step": 231 }, { "epoch": 0.010485638742627285, "grad_norm": 0.47020676732063293, "learning_rate": 0.00011469286596248181, "loss": 44.0772, "step": 232 }, { "epoch": 0.010530835461345506, "grad_norm": 0.4738229215145111, "learning_rate": 0.00011405838521470029, "loss": 44.1274, "step": 233 }, { "epoch": 0.010576032180063727, "grad_norm": 0.5980152487754822, "learning_rate": 0.00011342332658176555, "loss": 44.0543, "step": 234 }, { "epoch": 0.010621228898781948, "grad_norm": 0.45920702815055847, "learning_rate": 0.00011278771616845061, "loss": 44.0846, "step": 235 }, { "epoch": 0.010621228898781948, "eval_loss": 11.02093505859375, "eval_runtime": 176.025, "eval_samples_per_second": 52.93, "eval_steps_per_second": 26.468, "step": 235 }, { "epoch": 0.01066642561750017, "grad_norm": 0.48931440711021423, "learning_rate": 0.00011215158010221005, "loss": 44.0991, "step": 236 }, { "epoch": 0.01071162233621839, "grad_norm": 0.4345873296260834, "learning_rate": 0.00011151494453210596, "loss": 44.0491, "step": 237 }, { "epoch": 0.010756819054936612, "grad_norm": 0.43655380606651306, "learning_rate": 0.00011087783562773311, "loss": 44.0903, "step": 238 }, { "epoch": 0.010802015773654833, "grad_norm": 0.616533637046814, "learning_rate": 0.00011024027957814314, "loss": 44.1318, "step": 239 }, { "epoch": 0.010847212492373054, "grad_norm": 0.45536908507347107, "learning_rate": 0.00010960230259076818, "loss": 44.0812, "step": 240 }, { "epoch": 0.010847212492373054, "eval_loss": 11.020767211914062, "eval_runtime": 176.3636, "eval_samples_per_second": 52.828, "eval_steps_per_second": 26.417, "step": 240 }, { "epoch": 0.010892409211091275, "grad_norm": 0.47256338596343994, "learning_rate": 0.00010896393089034336, "loss": 44.0513, "step": 241 }, { "epoch": 0.010937605929809496, "grad_norm": 0.42103204131126404, "learning_rate": 0.00010832519071782894, "loss": 44.0399, "step": 242 }, { "epoch": 0.010982802648527717, "grad_norm": 0.49555832147598267, "learning_rate": 0.00010768610832933168, "loss": 44.1504, "step": 243 }, { "epoch": 0.011027999367245939, "grad_norm": 0.42800289392471313, "learning_rate": 0.0001070467099950254, "loss": 44.0886, "step": 244 }, { "epoch": 0.01107319608596416, "grad_norm": 0.6031785607337952, "learning_rate": 0.0001064070219980713, "loss": 44.0548, "step": 245 }, { "epoch": 0.01107319608596416, "eval_loss": 11.020543098449707, "eval_runtime": 176.1913, "eval_samples_per_second": 52.88, "eval_steps_per_second": 26.443, "step": 245 }, { "epoch": 0.01111839280468238, "grad_norm": 0.4927026629447937, "learning_rate": 0.00010576707063353746, "loss": 44.0813, "step": 246 }, { "epoch": 0.0111635895234006, "grad_norm": 0.6148269772529602, "learning_rate": 0.00010512688220731792, "loss": 44.0928, "step": 247 }, { "epoch": 0.011208786242118822, "grad_norm": 0.4395325779914856, "learning_rate": 0.00010448648303505151, "loss": 44.047, "step": 248 }, { "epoch": 0.011253982960837043, "grad_norm": 0.4433494806289673, "learning_rate": 0.00010384589944103984, "loss": 44.1, "step": 249 }, { "epoch": 0.011299179679555264, "grad_norm": 0.6447661519050598, "learning_rate": 0.00010320515775716555, "loss": 44.0861, "step": 250 }, { "epoch": 0.011299179679555264, "eval_loss": 11.020323753356934, "eval_runtime": 176.3276, "eval_samples_per_second": 52.839, "eval_steps_per_second": 26.422, "step": 250 }, { "epoch": 0.011344376398273485, "grad_norm": 0.5418515801429749, "learning_rate": 0.00010256428432180956, "loss": 44.0602, "step": 251 }, { "epoch": 0.011389573116991706, "grad_norm": 0.45757991075515747, "learning_rate": 0.00010192330547876871, "loss": 44.0788, "step": 252 }, { "epoch": 0.011434769835709927, "grad_norm": 0.5210107564926147, "learning_rate": 0.00010128224757617274, "loss": 44.0517, "step": 253 }, { "epoch": 0.011479966554428149, "grad_norm": 0.39198753237724304, "learning_rate": 0.00010064113696540111, "loss": 44.0776, "step": 254 }, { "epoch": 0.01152516327314637, "grad_norm": 0.4305363893508911, "learning_rate": 0.0001, "loss": 44.1121, "step": 255 }, { "epoch": 0.01152516327314637, "eval_loss": 11.02021312713623, "eval_runtime": 176.1601, "eval_samples_per_second": 52.889, "eval_steps_per_second": 26.448, "step": 255 }, { "epoch": 0.011570359991864591, "grad_norm": 0.4909750521183014, "learning_rate": 9.93588630345989e-05, "loss": 44.0858, "step": 256 }, { "epoch": 0.011615556710582812, "grad_norm": 0.4016626477241516, "learning_rate": 9.871775242382727e-05, "loss": 44.0732, "step": 257 }, { "epoch": 0.011660753429301033, "grad_norm": 0.5827097296714783, "learning_rate": 9.80766945212313e-05, "loss": 44.0957, "step": 258 }, { "epoch": 0.011705950148019255, "grad_norm": 0.48728469014167786, "learning_rate": 9.743571567819046e-05, "loss": 44.0648, "step": 259 }, { "epoch": 0.011751146866737476, "grad_norm": 0.455342173576355, "learning_rate": 9.679484224283449e-05, "loss": 44.0327, "step": 260 }, { "epoch": 0.011751146866737476, "eval_loss": 11.020062446594238, "eval_runtime": 176.2853, "eval_samples_per_second": 52.852, "eval_steps_per_second": 26.429, "step": 260 }, { "epoch": 0.011796343585455695, "grad_norm": 0.50531005859375, "learning_rate": 9.615410055896015e-05, "loss": 44.0094, "step": 261 }, { "epoch": 0.011841540304173916, "grad_norm": 0.6205224990844727, "learning_rate": 9.551351696494854e-05, "loss": 44.1, "step": 262 }, { "epoch": 0.011886737022892137, "grad_norm": 0.5274375081062317, "learning_rate": 9.48731177926821e-05, "loss": 44.1223, "step": 263 }, { "epoch": 0.011931933741610359, "grad_norm": 0.5149595141410828, "learning_rate": 9.423292936646257e-05, "loss": 44.1192, "step": 264 }, { "epoch": 0.01197713046032858, "grad_norm": 0.5359209179878235, "learning_rate": 9.359297800192872e-05, "loss": 44.1155, "step": 265 }, { "epoch": 0.01197713046032858, "eval_loss": 11.019892692565918, "eval_runtime": 176.1866, "eval_samples_per_second": 52.881, "eval_steps_per_second": 26.444, "step": 265 }, { "epoch": 0.012022327179046801, "grad_norm": 0.5752252340316772, "learning_rate": 9.29532900049746e-05, "loss": 44.0821, "step": 266 }, { "epoch": 0.012067523897765022, "grad_norm": 0.5125178098678589, "learning_rate": 9.231389167066837e-05, "loss": 44.061, "step": 267 }, { "epoch": 0.012112720616483243, "grad_norm": 0.5295204520225525, "learning_rate": 9.167480928217108e-05, "loss": 43.9889, "step": 268 }, { "epoch": 0.012157917335201465, "grad_norm": 0.40016570687294006, "learning_rate": 9.103606910965666e-05, "loss": 44.0684, "step": 269 }, { "epoch": 0.012203114053919686, "grad_norm": 0.42660149931907654, "learning_rate": 9.039769740923183e-05, "loss": 44.0547, "step": 270 }, { "epoch": 0.012203114053919686, "eval_loss": 11.01980209350586, "eval_runtime": 176.1599, "eval_samples_per_second": 52.889, "eval_steps_per_second": 26.448, "step": 270 }, { "epoch": 0.012248310772637907, "grad_norm": 0.636551022529602, "learning_rate": 8.975972042185687e-05, "loss": 44.1385, "step": 271 }, { "epoch": 0.012293507491356128, "grad_norm": 0.5031408071517944, "learning_rate": 8.912216437226693e-05, "loss": 44.1121, "step": 272 }, { "epoch": 0.01233870421007435, "grad_norm": 0.49243634939193726, "learning_rate": 8.848505546789408e-05, "loss": 44.0864, "step": 273 }, { "epoch": 0.01238390092879257, "grad_norm": 0.47308340668678284, "learning_rate": 8.784841989778996e-05, "loss": 44.0391, "step": 274 }, { "epoch": 0.012429097647510792, "grad_norm": 0.43966105580329895, "learning_rate": 8.721228383154939e-05, "loss": 44.0969, "step": 275 }, { "epoch": 0.012429097647510792, "eval_loss": 11.019760131835938, "eval_runtime": 176.1857, "eval_samples_per_second": 52.882, "eval_steps_per_second": 26.444, "step": 275 }, { "epoch": 0.012474294366229011, "grad_norm": 0.4853382706642151, "learning_rate": 8.657667341823448e-05, "loss": 44.079, "step": 276 }, { "epoch": 0.012519491084947232, "grad_norm": 0.453819215297699, "learning_rate": 8.594161478529974e-05, "loss": 44.0371, "step": 277 }, { "epoch": 0.012564687803665453, "grad_norm": 0.4855421483516693, "learning_rate": 8.530713403751821e-05, "loss": 44.0514, "step": 278 }, { "epoch": 0.012609884522383675, "grad_norm": 0.49890294671058655, "learning_rate": 8.46732572559084e-05, "loss": 44.0561, "step": 279 }, { "epoch": 0.012655081241101896, "grad_norm": 0.406686007976532, "learning_rate": 8.404001049666211e-05, "loss": 44.0746, "step": 280 }, { "epoch": 0.012655081241101896, "eval_loss": 11.01966381072998, "eval_runtime": 176.4032, "eval_samples_per_second": 52.817, "eval_steps_per_second": 26.411, "step": 280 }, { "epoch": 0.012700277959820117, "grad_norm": 0.584389865398407, "learning_rate": 8.340741979007325e-05, "loss": 44.0014, "step": 281 }, { "epoch": 0.012745474678538338, "grad_norm": 0.5981946587562561, "learning_rate": 8.277551113946812e-05, "loss": 44.1037, "step": 282 }, { "epoch": 0.01279067139725656, "grad_norm": 0.48125511407852173, "learning_rate": 8.214431052013634e-05, "loss": 44.1114, "step": 283 }, { "epoch": 0.01283586811597478, "grad_norm": 0.4403318762779236, "learning_rate": 8.151384387826313e-05, "loss": 44.0742, "step": 284 }, { "epoch": 0.012881064834693002, "grad_norm": 0.5336763262748718, "learning_rate": 8.08841371298628e-05, "loss": 44.0535, "step": 285 }, { "epoch": 0.012881064834693002, "eval_loss": 11.01951789855957, "eval_runtime": 176.2803, "eval_samples_per_second": 52.853, "eval_steps_per_second": 26.429, "step": 285 }, { "epoch": 0.012926261553411223, "grad_norm": 0.4550967216491699, "learning_rate": 8.02552161597133e-05, "loss": 44.0825, "step": 286 }, { "epoch": 0.012971458272129444, "grad_norm": 0.5073683261871338, "learning_rate": 7.962710682029245e-05, "loss": 44.0045, "step": 287 }, { "epoch": 0.013016654990847665, "grad_norm": 0.424605131149292, "learning_rate": 7.899983493071507e-05, "loss": 44.0451, "step": 288 }, { "epoch": 0.013061851709565886, "grad_norm": 0.48650291562080383, "learning_rate": 7.837342627567165e-05, "loss": 44.0424, "step": 289 }, { "epoch": 0.013107048428284106, "grad_norm": 0.5977911949157715, "learning_rate": 7.774790660436858e-05, "loss": 44.1303, "step": 290 }, { "epoch": 0.013107048428284106, "eval_loss": 11.019427299499512, "eval_runtime": 176.4378, "eval_samples_per_second": 52.806, "eval_steps_per_second": 26.406, "step": 290 }, { "epoch": 0.013152245147002327, "grad_norm": 0.5895593166351318, "learning_rate": 7.712330162946948e-05, "loss": 44.0645, "step": 291 }, { "epoch": 0.013197441865720548, "grad_norm": 0.4745809733867645, "learning_rate": 7.649963702603849e-05, "loss": 44.0755, "step": 292 }, { "epoch": 0.01324263858443877, "grad_norm": 0.5061216950416565, "learning_rate": 7.587693843048475e-05, "loss": 44.0751, "step": 293 }, { "epoch": 0.01328783530315699, "grad_norm": 0.42560261487960815, "learning_rate": 7.525523143950859e-05, "loss": 44.0495, "step": 294 }, { "epoch": 0.013333032021875212, "grad_norm": 0.44290590286254883, "learning_rate": 7.463454160904928e-05, "loss": 44.1142, "step": 295 }, { "epoch": 0.013333032021875212, "eval_loss": 11.019330978393555, "eval_runtime": 175.7063, "eval_samples_per_second": 53.026, "eval_steps_per_second": 26.516, "step": 295 }, { "epoch": 0.013378228740593433, "grad_norm": 0.6524297595024109, "learning_rate": 7.401489445323473e-05, "loss": 44.0737, "step": 296 }, { "epoch": 0.013423425459311654, "grad_norm": 0.49754655361175537, "learning_rate": 7.339631544333249e-05, "loss": 44.0838, "step": 297 }, { "epoch": 0.013468622178029875, "grad_norm": 0.4138273596763611, "learning_rate": 7.27788300067029e-05, "loss": 44.0653, "step": 298 }, { "epoch": 0.013513818896748096, "grad_norm": 0.5399671792984009, "learning_rate": 7.21624635257537e-05, "loss": 44.0646, "step": 299 }, { "epoch": 0.013559015615466317, "grad_norm": 0.41923409700393677, "learning_rate": 7.154724133689677e-05, "loss": 44.0685, "step": 300 }, { "epoch": 0.013559015615466317, "eval_loss": 11.019237518310547, "eval_runtime": 176.4288, "eval_samples_per_second": 52.809, "eval_steps_per_second": 26.407, "step": 300 }, { "epoch": 0.013604212334184539, "grad_norm": 0.49278682470321655, "learning_rate": 7.093318872950665e-05, "loss": 44.0319, "step": 301 }, { "epoch": 0.01364940905290276, "grad_norm": 0.5009450316429138, "learning_rate": 7.032033094488095e-05, "loss": 44.0988, "step": 302 }, { "epoch": 0.013694605771620981, "grad_norm": 0.4270615577697754, "learning_rate": 6.97086931752028e-05, "loss": 44.1025, "step": 303 }, { "epoch": 0.013739802490339202, "grad_norm": 0.49744102358818054, "learning_rate": 6.909830056250527e-05, "loss": 44.0652, "step": 304 }, { "epoch": 0.013784999209057422, "grad_norm": 0.48600587248802185, "learning_rate": 6.848917819763793e-05, "loss": 44.1292, "step": 305 }, { "epoch": 0.013784999209057422, "eval_loss": 11.01909351348877, "eval_runtime": 176.1259, "eval_samples_per_second": 52.9, "eval_steps_per_second": 26.453, "step": 305 }, { "epoch": 0.013830195927775643, "grad_norm": 0.4116569459438324, "learning_rate": 6.788135111923545e-05, "loss": 44.0897, "step": 306 }, { "epoch": 0.013875392646493864, "grad_norm": 0.4364916682243347, "learning_rate": 6.72748443126883e-05, "loss": 44.1195, "step": 307 }, { "epoch": 0.013920589365212085, "grad_norm": 0.5589216351509094, "learning_rate": 6.666968270911584e-05, "loss": 44.0911, "step": 308 }, { "epoch": 0.013965786083930306, "grad_norm": 0.5414496064186096, "learning_rate": 6.606589118434126e-05, "loss": 44.1532, "step": 309 }, { "epoch": 0.014010982802648527, "grad_norm": 0.4488687515258789, "learning_rate": 6.546349455786926e-05, "loss": 44.0637, "step": 310 }, { "epoch": 0.014010982802648527, "eval_loss": 11.018967628479004, "eval_runtime": 176.4018, "eval_samples_per_second": 52.817, "eval_steps_per_second": 26.411, "step": 310 }, { "epoch": 0.014056179521366749, "grad_norm": 0.5137606859207153, "learning_rate": 6.486251759186572e-05, "loss": 44.1158, "step": 311 }, { "epoch": 0.01410137624008497, "grad_norm": 0.5155542492866516, "learning_rate": 6.426298499013994e-05, "loss": 44.1199, "step": 312 }, { "epoch": 0.014146572958803191, "grad_norm": 0.37395790219306946, "learning_rate": 6.366492139712886e-05, "loss": 44.0457, "step": 313 }, { "epoch": 0.014191769677521412, "grad_norm": 0.6116747260093689, "learning_rate": 6.306835139688438e-05, "loss": 44.1012, "step": 314 }, { "epoch": 0.014236966396239633, "grad_norm": 0.5333120822906494, "learning_rate": 6.24732995120626e-05, "loss": 44.1035, "step": 315 }, { "epoch": 0.014236966396239633, "eval_loss": 11.018932342529297, "eval_runtime": 176.1972, "eval_samples_per_second": 52.878, "eval_steps_per_second": 26.442, "step": 315 }, { "epoch": 0.014282163114957854, "grad_norm": 0.43927499651908875, "learning_rate": 6.187979020291583e-05, "loss": 44.0191, "step": 316 }, { "epoch": 0.014327359833676076, "grad_norm": 0.4511764347553253, "learning_rate": 6.12878478662872e-05, "loss": 44.036, "step": 317 }, { "epoch": 0.014372556552394297, "grad_norm": 0.4678284823894501, "learning_rate": 6.069749683460765e-05, "loss": 44.1023, "step": 318 }, { "epoch": 0.014417753271112518, "grad_norm": 0.4449803829193115, "learning_rate": 6.010876137489584e-05, "loss": 44.0835, "step": 319 }, { "epoch": 0.014462949989830737, "grad_norm": 0.42860502004623413, "learning_rate": 5.952166568776062e-05, "loss": 44.0725, "step": 320 }, { "epoch": 0.014462949989830737, "eval_loss": 11.018913269042969, "eval_runtime": 176.3627, "eval_samples_per_second": 52.829, "eval_steps_per_second": 26.417, "step": 320 }, { "epoch": 0.014508146708548959, "grad_norm": 0.47462332248687744, "learning_rate": 5.893623390640621e-05, "loss": 44.0712, "step": 321 }, { "epoch": 0.01455334342726718, "grad_norm": 0.3999902307987213, "learning_rate": 5.835249009564012e-05, "loss": 44.0985, "step": 322 }, { "epoch": 0.014598540145985401, "grad_norm": 0.5390244126319885, "learning_rate": 5.777045825088404e-05, "loss": 44.0947, "step": 323 }, { "epoch": 0.014643736864703622, "grad_norm": 0.5316472053527832, "learning_rate": 5.7190162297187475e-05, "loss": 44.0887, "step": 324 }, { "epoch": 0.014688933583421843, "grad_norm": 0.43537721037864685, "learning_rate": 5.6611626088244194e-05, "loss": 44.1142, "step": 325 }, { "epoch": 0.014688933583421843, "eval_loss": 11.018884658813477, "eval_runtime": 176.0785, "eval_samples_per_second": 52.914, "eval_steps_per_second": 26.46, "step": 325 }, { "epoch": 0.014734130302140065, "grad_norm": 0.42780250310897827, "learning_rate": 5.60348734054118e-05, "loss": 44.0567, "step": 326 }, { "epoch": 0.014779327020858286, "grad_norm": 0.418026864528656, "learning_rate": 5.545992795673408e-05, "loss": 44.0578, "step": 327 }, { "epoch": 0.014824523739576507, "grad_norm": 0.507036030292511, "learning_rate": 5.488681337596653e-05, "loss": 44.0708, "step": 328 }, { "epoch": 0.014869720458294728, "grad_norm": 0.4779205322265625, "learning_rate": 5.431555322160483e-05, "loss": 44.0879, "step": 329 }, { "epoch": 0.01491491717701295, "grad_norm": 0.48253196477890015, "learning_rate": 5.37461709759165e-05, "loss": 44.005, "step": 330 }, { "epoch": 0.01491491717701295, "eval_loss": 11.018866539001465, "eval_runtime": 176.4141, "eval_samples_per_second": 52.813, "eval_steps_per_second": 26.409, "step": 330 }, { "epoch": 0.01496011389573117, "grad_norm": 0.503404438495636, "learning_rate": 5.317869004397544e-05, "loss": 44.0551, "step": 331 }, { "epoch": 0.015005310614449392, "grad_norm": 0.5667140483856201, "learning_rate": 5.261313375270014e-05, "loss": 44.1005, "step": 332 }, { "epoch": 0.015050507333167613, "grad_norm": 0.4343127906322479, "learning_rate": 5.2049525349894625e-05, "loss": 44.0367, "step": 333 }, { "epoch": 0.015095704051885834, "grad_norm": 0.4030550420284271, "learning_rate": 5.148788800329278e-05, "loss": 44.0094, "step": 334 }, { "epoch": 0.015140900770604053, "grad_norm": 0.7541276812553406, "learning_rate": 5.092824479960625e-05, "loss": 44.0686, "step": 335 }, { "epoch": 0.015140900770604053, "eval_loss": 11.018802642822266, "eval_runtime": 176.1322, "eval_samples_per_second": 52.898, "eval_steps_per_second": 26.452, "step": 335 }, { "epoch": 0.015186097489322275, "grad_norm": 0.4742172360420227, "learning_rate": 5.0370618743575026e-05, "loss": 44.0855, "step": 336 }, { "epoch": 0.015231294208040496, "grad_norm": 0.4134741723537445, "learning_rate": 4.981503275702227e-05, "loss": 44.0928, "step": 337 }, { "epoch": 0.015276490926758717, "grad_norm": 0.6316869258880615, "learning_rate": 4.92615096779118e-05, "loss": 44.0649, "step": 338 }, { "epoch": 0.015321687645476938, "grad_norm": 0.4112119674682617, "learning_rate": 4.87100722594094e-05, "loss": 44.0769, "step": 339 }, { "epoch": 0.01536688436419516, "grad_norm": 0.4423971474170685, "learning_rate": 4.8160743168947496e-05, "loss": 44.059, "step": 340 }, { "epoch": 0.01536688436419516, "eval_loss": 11.018733024597168, "eval_runtime": 176.2897, "eval_samples_per_second": 52.85, "eval_steps_per_second": 26.428, "step": 340 }, { "epoch": 0.01541208108291338, "grad_norm": 0.46714112162590027, "learning_rate": 4.7613544987293446e-05, "loss": 44.007, "step": 341 }, { "epoch": 0.015457277801631602, "grad_norm": 0.449789434671402, "learning_rate": 4.706850020762126e-05, "loss": 44.0599, "step": 342 }, { "epoch": 0.015502474520349823, "grad_norm": 0.5278406739234924, "learning_rate": 4.6525631234587034e-05, "loss": 44.0606, "step": 343 }, { "epoch": 0.015547671239068044, "grad_norm": 0.5856757760047913, "learning_rate": 4.5984960383408005e-05, "loss": 44.0862, "step": 344 }, { "epoch": 0.015592867957786265, "grad_norm": 0.48914504051208496, "learning_rate": 4.544650987894514e-05, "loss": 44.0642, "step": 345 }, { "epoch": 0.015592867957786265, "eval_loss": 11.018689155578613, "eval_runtime": 176.1852, "eval_samples_per_second": 52.882, "eval_steps_per_second": 26.444, "step": 345 }, { "epoch": 0.015638064676504485, "grad_norm": 0.5346770882606506, "learning_rate": 4.491030185478976e-05, "loss": 44.122, "step": 346 }, { "epoch": 0.015683261395222706, "grad_norm": 0.4303387701511383, "learning_rate": 4.437635835235353e-05, "loss": 44.0754, "step": 347 }, { "epoch": 0.015728458113940927, "grad_norm": 0.3995809555053711, "learning_rate": 4.384470131996252e-05, "loss": 44.1039, "step": 348 }, { "epoch": 0.015773654832659148, "grad_norm": 0.44882121682167053, "learning_rate": 4.331535261195504e-05, "loss": 44.1023, "step": 349 }, { "epoch": 0.01581885155137737, "grad_norm": 0.4910334646701813, "learning_rate": 4.278833398778306e-05, "loss": 44.0906, "step": 350 }, { "epoch": 0.01581885155137737, "eval_loss": 11.018669128417969, "eval_runtime": 176.2273, "eval_samples_per_second": 52.869, "eval_steps_per_second": 26.437, "step": 350 }, { "epoch": 0.01586404827009559, "grad_norm": 0.4974361062049866, "learning_rate": 4.2263667111118074e-05, "loss": 44.0836, "step": 351 }, { "epoch": 0.01590924498881381, "grad_norm": 0.4839700162410736, "learning_rate": 4.174137354896039e-05, "loss": 44.0984, "step": 352 }, { "epoch": 0.015954441707532033, "grad_norm": 0.4186987578868866, "learning_rate": 4.12214747707527e-05, "loss": 44.0672, "step": 353 }, { "epoch": 0.015999638426250254, "grad_norm": 0.5234962701797485, "learning_rate": 4.0703992147497425e-05, "loss": 44.0376, "step": 354 }, { "epoch": 0.016044835144968475, "grad_norm": 0.47532570362091064, "learning_rate": 4.0188946950878404e-05, "loss": 44.0386, "step": 355 }, { "epoch": 0.016044835144968475, "eval_loss": 11.018640518188477, "eval_runtime": 176.1029, "eval_samples_per_second": 52.907, "eval_steps_per_second": 26.456, "step": 355 }, { "epoch": 0.016090031863686696, "grad_norm": 0.397630900144577, "learning_rate": 3.9676360352386356e-05, "loss": 44.1375, "step": 356 }, { "epoch": 0.016135228582404917, "grad_norm": 0.530908465385437, "learning_rate": 3.9166253422448686e-05, "loss": 44.1015, "step": 357 }, { "epoch": 0.01618042530112314, "grad_norm": 0.41138243675231934, "learning_rate": 3.8658647129563364e-05, "loss": 44.0516, "step": 358 }, { "epoch": 0.01622562201984136, "grad_norm": 0.5258074402809143, "learning_rate": 3.8153562339436855e-05, "loss": 44.1157, "step": 359 }, { "epoch": 0.01627081873855958, "grad_norm": 0.3948734402656555, "learning_rate": 3.7651019814126654e-05, "loss": 44.0478, "step": 360 }, { "epoch": 0.01627081873855958, "eval_loss": 11.018574714660645, "eval_runtime": 176.3307, "eval_samples_per_second": 52.838, "eval_steps_per_second": 26.422, "step": 360 }, { "epoch": 0.016316015457277802, "grad_norm": 0.47562116384506226, "learning_rate": 3.7151040211187635e-05, "loss": 44.0571, "step": 361 }, { "epoch": 0.016361212175996023, "grad_norm": 0.439248651266098, "learning_rate": 3.665364408282305e-05, "loss": 44.0292, "step": 362 }, { "epoch": 0.016406408894714244, "grad_norm": 0.5355764031410217, "learning_rate": 3.615885187503946e-05, "loss": 44.1601, "step": 363 }, { "epoch": 0.016451605613432466, "grad_norm": 0.5143962502479553, "learning_rate": 3.566668392680662e-05, "loss": 44.0829, "step": 364 }, { "epoch": 0.016496802332150687, "grad_norm": 0.5054187178611755, "learning_rate": 3.517716046922118e-05, "loss": 44.144, "step": 365 }, { "epoch": 0.016496802332150687, "eval_loss": 11.018515586853027, "eval_runtime": 176.1346, "eval_samples_per_second": 52.897, "eval_steps_per_second": 26.451, "step": 365 }, { "epoch": 0.016541999050868908, "grad_norm": 0.44439616799354553, "learning_rate": 3.469030162467513e-05, "loss": 44.0321, "step": 366 }, { "epoch": 0.01658719576958713, "grad_norm": 0.5372561812400818, "learning_rate": 3.4206127406028745e-05, "loss": 44.0923, "step": 367 }, { "epoch": 0.01663239248830535, "grad_norm": 0.48407748341560364, "learning_rate": 3.372465771578771e-05, "loss": 44.1126, "step": 368 }, { "epoch": 0.01667758920702357, "grad_norm": 0.4682793915271759, "learning_rate": 3.32459123452852e-05, "loss": 44.0227, "step": 369 }, { "epoch": 0.016722785925741793, "grad_norm": 0.4110027551651001, "learning_rate": 3.276991097386831e-05, "loss": 44.0354, "step": 370 }, { "epoch": 0.016722785925741793, "eval_loss": 11.018465042114258, "eval_runtime": 176.3082, "eval_samples_per_second": 52.845, "eval_steps_per_second": 26.425, "step": 370 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3887490662400.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }