{ "best_metric": 0.9677584767341614, "best_model_checkpoint": "./bert_mlm_finetuned/checkpoint-7400", "epoch": 1.151936, "eval_steps": 100, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00064, "grad_norm": 7.650606632232666, "learning_rate": 1.0000000000000002e-06, "loss": 6.29, "step": 5 }, { "epoch": 0.00128, "grad_norm": 4.541823387145996, "learning_rate": 2.0000000000000003e-06, "loss": 6.3815, "step": 10 }, { "epoch": 0.00192, "grad_norm": 4.245054721832275, "learning_rate": 3e-06, "loss": 6.2854, "step": 15 }, { "epoch": 0.00256, "grad_norm": 4.5587897300720215, "learning_rate": 4.000000000000001e-06, "loss": 6.0674, "step": 20 }, { "epoch": 0.0032, "grad_norm": 3.7703804969787598, "learning_rate": 4.800000000000001e-06, "loss": 6.2961, "step": 25 }, { "epoch": 0.00384, "grad_norm": 3.8425862789154053, "learning_rate": 5.8e-06, "loss": 6.3326, "step": 30 }, { "epoch": 0.00448, "grad_norm": 4.413463115692139, "learning_rate": 6.800000000000001e-06, "loss": 6.183, "step": 35 }, { "epoch": 0.00512, "grad_norm": 4.1980509757995605, "learning_rate": 7.800000000000002e-06, "loss": 6.2654, "step": 40 }, { "epoch": 0.00576, "grad_norm": 3.9166719913482666, "learning_rate": 8.8e-06, "loss": 6.0916, "step": 45 }, { "epoch": 0.0064, "grad_norm": 3.4706904888153076, "learning_rate": 9.800000000000001e-06, "loss": 6.103, "step": 50 }, { "epoch": 0.00704, "grad_norm": 5.138203144073486, "learning_rate": 9.999998372356185e-06, "loss": 6.2379, "step": 55 }, { "epoch": 0.00768, "grad_norm": 3.7806520462036133, "learning_rate": 9.999991760055e-06, "loss": 6.1776, "step": 60 }, { "epoch": 0.00832, "grad_norm": 3.5731871128082275, "learning_rate": 9.999980061375427e-06, "loss": 6.2082, "step": 65 }, { "epoch": 0.00896, "grad_norm": 3.661797285079956, "learning_rate": 9.999963276329369e-06, "loss": 6.0704, "step": 70 }, { "epoch": 0.0096, "grad_norm": 3.6181113719940186, "learning_rate": 9.999941404933902e-06, "loss": 6.2081, "step": 75 }, { "epoch": 0.01024, "grad_norm": 3.3162803649902344, "learning_rate": 9.99991444721127e-06, "loss": 5.8807, "step": 80 }, { "epoch": 0.01088, "grad_norm": 3.6022472381591797, "learning_rate": 9.999882403188902e-06, "loss": 6.1092, "step": 85 }, { "epoch": 0.01152, "grad_norm": 7.291418552398682, "learning_rate": 9.999845272899393e-06, "loss": 5.7668, "step": 90 }, { "epoch": 0.01216, "grad_norm": 3.522437810897827, "learning_rate": 9.999803056380517e-06, "loss": 6.1621, "step": 95 }, { "epoch": 0.0128, "grad_norm": 3.9014439582824707, "learning_rate": 9.999755753675216e-06, "loss": 6.0573, "step": 100 }, { "epoch": 0.0128, "eval_loss": 1.5072969198226929, "eval_runtime": 11.1161, "eval_samples_per_second": 89.96, "eval_steps_per_second": 11.245, "step": 100 }, { "epoch": 0.01344, "grad_norm": 3.7579081058502197, "learning_rate": 9.999703364831614e-06, "loss": 6.1671, "step": 105 }, { "epoch": 0.01408, "grad_norm": 3.7058262825012207, "learning_rate": 9.999645889903002e-06, "loss": 6.1348, "step": 110 }, { "epoch": 0.01472, "grad_norm": 5.018667697906494, "learning_rate": 9.99958332894785e-06, "loss": 5.9376, "step": 115 }, { "epoch": 0.01536, "grad_norm": 3.5420188903808594, "learning_rate": 9.999515682029798e-06, "loss": 5.9961, "step": 120 }, { "epoch": 0.016, "grad_norm": 3.5725393295288086, "learning_rate": 9.999442949217663e-06, "loss": 5.8439, "step": 125 }, { "epoch": 0.01664, "grad_norm": 3.8440959453582764, "learning_rate": 9.999365130585435e-06, "loss": 5.7857, "step": 130 }, { "epoch": 0.01728, "grad_norm": 3.4371285438537598, "learning_rate": 9.999282226212276e-06, "loss": 5.799, "step": 135 }, { "epoch": 0.01792, "grad_norm": 3.996847152709961, "learning_rate": 9.999194236182523e-06, "loss": 6.0022, "step": 140 }, { "epoch": 0.01856, "grad_norm": 3.720330238342285, "learning_rate": 9.999101160585687e-06, "loss": 5.925, "step": 145 }, { "epoch": 0.0192, "grad_norm": 3.8822953701019287, "learning_rate": 9.99900299951645e-06, "loss": 5.8085, "step": 150 }, { "epoch": 0.01984, "grad_norm": 3.599283456802368, "learning_rate": 9.99889975307467e-06, "loss": 5.6533, "step": 155 }, { "epoch": 0.02048, "grad_norm": 3.4847381114959717, "learning_rate": 9.998791421365376e-06, "loss": 5.9021, "step": 160 }, { "epoch": 0.02112, "grad_norm": 3.4302055835723877, "learning_rate": 9.998678004498774e-06, "loss": 5.962, "step": 165 }, { "epoch": 0.02176, "grad_norm": 4.561929702758789, "learning_rate": 9.99855950259024e-06, "loss": 5.9011, "step": 170 }, { "epoch": 0.0224, "grad_norm": 4.069271087646484, "learning_rate": 9.998435915760323e-06, "loss": 5.6782, "step": 175 }, { "epoch": 0.02304, "grad_norm": 3.5959055423736572, "learning_rate": 9.998307244134741e-06, "loss": 5.8107, "step": 180 }, { "epoch": 0.02368, "grad_norm": 3.5477242469787598, "learning_rate": 9.998173487844396e-06, "loss": 5.8335, "step": 185 }, { "epoch": 0.02432, "grad_norm": 4.488218307495117, "learning_rate": 9.998034647025349e-06, "loss": 5.8285, "step": 190 }, { "epoch": 0.02496, "grad_norm": 3.555074691772461, "learning_rate": 9.997890721818844e-06, "loss": 5.817, "step": 195 }, { "epoch": 0.0256, "grad_norm": 3.6248419284820557, "learning_rate": 9.99774171237129e-06, "loss": 5.8368, "step": 200 }, { "epoch": 0.0256, "eval_loss": 1.440572738647461, "eval_runtime": 6.6468, "eval_samples_per_second": 150.448, "eval_steps_per_second": 18.806, "step": 200 }, { "epoch": 0.02624, "grad_norm": 3.432421922683716, "learning_rate": 9.997587618834272e-06, "loss": 5.7842, "step": 205 }, { "epoch": 0.02688, "grad_norm": 3.333038806915283, "learning_rate": 9.997428441364546e-06, "loss": 5.7173, "step": 210 }, { "epoch": 0.02752, "grad_norm": 3.7716541290283203, "learning_rate": 9.997264180124038e-06, "loss": 5.719, "step": 215 }, { "epoch": 0.02816, "grad_norm": 3.345600128173828, "learning_rate": 9.99709483527985e-06, "loss": 5.8428, "step": 220 }, { "epoch": 0.0288, "grad_norm": 3.7677502632141113, "learning_rate": 9.99692040700425e-06, "loss": 5.7393, "step": 225 }, { "epoch": 0.02944, "grad_norm": 11.996383666992188, "learning_rate": 9.996740895474682e-06, "loss": 5.5566, "step": 230 }, { "epoch": 0.03008, "grad_norm": 3.6089084148406982, "learning_rate": 9.996556300873758e-06, "loss": 5.6939, "step": 235 }, { "epoch": 0.03072, "grad_norm": 3.834825038909912, "learning_rate": 9.996366623389263e-06, "loss": 5.8123, "step": 240 }, { "epoch": 0.03136, "grad_norm": 3.570263147354126, "learning_rate": 9.99617186321415e-06, "loss": 5.6839, "step": 245 }, { "epoch": 0.032, "grad_norm": 3.5728812217712402, "learning_rate": 9.995972020546545e-06, "loss": 5.7764, "step": 250 }, { "epoch": 0.03264, "grad_norm": 3.4725637435913086, "learning_rate": 9.995767095589743e-06, "loss": 5.6879, "step": 255 }, { "epoch": 0.03328, "grad_norm": 3.811537742614746, "learning_rate": 9.99555708855221e-06, "loss": 5.6418, "step": 260 }, { "epoch": 0.03392, "grad_norm": 3.494992971420288, "learning_rate": 9.99534199964758e-06, "loss": 5.6927, "step": 265 }, { "epoch": 0.03456, "grad_norm": 3.8107383251190186, "learning_rate": 9.995121829094662e-06, "loss": 5.5658, "step": 270 }, { "epoch": 0.0352, "grad_norm": 3.570551633834839, "learning_rate": 9.994896577117425e-06, "loss": 5.8131, "step": 275 }, { "epoch": 0.03584, "grad_norm": 3.540811538696289, "learning_rate": 9.994666243945018e-06, "loss": 5.6009, "step": 280 }, { "epoch": 0.03648, "grad_norm": 3.7275819778442383, "learning_rate": 9.99443082981175e-06, "loss": 5.6407, "step": 285 }, { "epoch": 0.03712, "grad_norm": 4.194495677947998, "learning_rate": 9.994190334957103e-06, "loss": 5.8319, "step": 290 }, { "epoch": 0.03776, "grad_norm": 3.5107626914978027, "learning_rate": 9.993944759625728e-06, "loss": 5.5765, "step": 295 }, { "epoch": 0.0384, "grad_norm": 3.4100208282470703, "learning_rate": 9.993694104067444e-06, "loss": 5.7473, "step": 300 }, { "epoch": 0.0384, "eval_loss": 1.407908320426941, "eval_runtime": 6.6542, "eval_samples_per_second": 150.281, "eval_steps_per_second": 18.785, "step": 300 }, { "epoch": 0.03904, "grad_norm": 3.7727818489074707, "learning_rate": 9.993438368537236e-06, "loss": 5.6802, "step": 305 }, { "epoch": 0.03968, "grad_norm": 3.445909023284912, "learning_rate": 9.993177553295258e-06, "loss": 5.7484, "step": 310 }, { "epoch": 0.04032, "grad_norm": 3.4199888706207275, "learning_rate": 9.992911658606832e-06, "loss": 5.7648, "step": 315 }, { "epoch": 0.04096, "grad_norm": 4.9640655517578125, "learning_rate": 9.992640684742445e-06, "loss": 5.7922, "step": 320 }, { "epoch": 0.0416, "grad_norm": 3.3730976581573486, "learning_rate": 9.992364631977754e-06, "loss": 5.677, "step": 325 }, { "epoch": 0.04224, "grad_norm": 3.540597915649414, "learning_rate": 9.99208350059358e-06, "loss": 5.5495, "step": 330 }, { "epoch": 0.04288, "grad_norm": 3.6853768825531006, "learning_rate": 9.991797290875915e-06, "loss": 5.4089, "step": 335 }, { "epoch": 0.04352, "grad_norm": 3.6380045413970947, "learning_rate": 9.991506003115911e-06, "loss": 5.4849, "step": 340 }, { "epoch": 0.04416, "grad_norm": 3.265488862991333, "learning_rate": 9.991209637609887e-06, "loss": 5.523, "step": 345 }, { "epoch": 0.0448, "grad_norm": 3.2634189128875732, "learning_rate": 9.990908194659332e-06, "loss": 5.5664, "step": 350 }, { "epoch": 0.04544, "grad_norm": 3.569810152053833, "learning_rate": 9.990601674570895e-06, "loss": 5.5059, "step": 355 }, { "epoch": 0.04608, "grad_norm": 3.580211877822876, "learning_rate": 9.990290077656393e-06, "loss": 5.4079, "step": 360 }, { "epoch": 0.04672, "grad_norm": 3.4860317707061768, "learning_rate": 9.989973404232805e-06, "loss": 5.6858, "step": 365 }, { "epoch": 0.04736, "grad_norm": 4.026730060577393, "learning_rate": 9.989651654622277e-06, "loss": 5.5662, "step": 370 }, { "epoch": 0.048, "grad_norm": 3.364692449569702, "learning_rate": 9.989324829152119e-06, "loss": 5.5304, "step": 375 }, { "epoch": 0.04864, "grad_norm": 3.611964464187622, "learning_rate": 9.9889929281548e-06, "loss": 5.3911, "step": 380 }, { "epoch": 0.04928, "grad_norm": 3.2946035861968994, "learning_rate": 9.988655951967958e-06, "loss": 5.4102, "step": 385 }, { "epoch": 0.04992, "grad_norm": 3.963909864425659, "learning_rate": 9.98831390093439e-06, "loss": 5.549, "step": 390 }, { "epoch": 0.05056, "grad_norm": 3.2876341342926025, "learning_rate": 9.987966775402056e-06, "loss": 5.5388, "step": 395 }, { "epoch": 0.0512, "grad_norm": 3.8467471599578857, "learning_rate": 9.98761457572408e-06, "loss": 5.454, "step": 400 }, { "epoch": 0.0512, "eval_loss": 1.3826359510421753, "eval_runtime": 7.0199, "eval_samples_per_second": 142.452, "eval_steps_per_second": 17.807, "step": 400 }, { "epoch": 0.05184, "grad_norm": 3.675231695175171, "learning_rate": 9.987257302258748e-06, "loss": 5.674, "step": 405 }, { "epoch": 0.05248, "grad_norm": 3.787940263748169, "learning_rate": 9.986894955369504e-06, "loss": 5.5466, "step": 410 }, { "epoch": 0.05312, "grad_norm": 3.677966833114624, "learning_rate": 9.986527535424956e-06, "loss": 5.4762, "step": 415 }, { "epoch": 0.05376, "grad_norm": 3.5083606243133545, "learning_rate": 9.986155042798874e-06, "loss": 5.3145, "step": 420 }, { "epoch": 0.0544, "grad_norm": 3.536379098892212, "learning_rate": 9.98577747787018e-06, "loss": 5.3769, "step": 425 }, { "epoch": 0.05504, "grad_norm": 3.5448412895202637, "learning_rate": 9.98539484102297e-06, "loss": 5.3996, "step": 430 }, { "epoch": 0.05568, "grad_norm": 3.359647274017334, "learning_rate": 9.985007132646489e-06, "loss": 5.3114, "step": 435 }, { "epoch": 0.05632, "grad_norm": 3.3419110774993896, "learning_rate": 9.984614353135143e-06, "loss": 5.4383, "step": 440 }, { "epoch": 0.05696, "grad_norm": 3.558025360107422, "learning_rate": 9.984216502888496e-06, "loss": 5.5239, "step": 445 }, { "epoch": 0.0576, "grad_norm": 3.6349422931671143, "learning_rate": 9.983813582311277e-06, "loss": 5.5639, "step": 450 }, { "epoch": 0.05824, "grad_norm": 3.2916922569274902, "learning_rate": 9.983405591813362e-06, "loss": 5.3886, "step": 455 }, { "epoch": 0.05888, "grad_norm": 3.32891845703125, "learning_rate": 9.982992531809796e-06, "loss": 5.526, "step": 460 }, { "epoch": 0.05952, "grad_norm": 3.8752880096435547, "learning_rate": 9.982574402720773e-06, "loss": 5.6599, "step": 465 }, { "epoch": 0.06016, "grad_norm": 3.604433536529541, "learning_rate": 9.982151204971646e-06, "loss": 5.4567, "step": 470 }, { "epoch": 0.0608, "grad_norm": 3.3058159351348877, "learning_rate": 9.981722938992926e-06, "loss": 5.4981, "step": 475 }, { "epoch": 0.06144, "grad_norm": 3.7341926097869873, "learning_rate": 9.981289605220276e-06, "loss": 5.3278, "step": 480 }, { "epoch": 0.06208, "grad_norm": 3.51798415184021, "learning_rate": 9.980851204094519e-06, "loss": 5.5029, "step": 485 }, { "epoch": 0.06272, "grad_norm": 3.6541428565979004, "learning_rate": 9.980407736061629e-06, "loss": 5.3987, "step": 490 }, { "epoch": 0.06336, "grad_norm": 3.420767307281494, "learning_rate": 9.979959201572736e-06, "loss": 5.405, "step": 495 }, { "epoch": 0.064, "grad_norm": 3.7169559001922607, "learning_rate": 9.979505601084124e-06, "loss": 5.498, "step": 500 }, { "epoch": 0.064, "eval_loss": 1.3493109941482544, "eval_runtime": 7.1309, "eval_samples_per_second": 140.234, "eval_steps_per_second": 17.529, "step": 500 }, { "epoch": 0.06464, "grad_norm": 4.536627769470215, "learning_rate": 9.97904693505723e-06, "loss": 5.5237, "step": 505 }, { "epoch": 0.06528, "grad_norm": 3.204948902130127, "learning_rate": 9.978583203958649e-06, "loss": 5.3746, "step": 510 }, { "epoch": 0.06592, "grad_norm": 3.4658005237579346, "learning_rate": 9.978114408260118e-06, "loss": 5.4567, "step": 515 }, { "epoch": 0.06656, "grad_norm": 4.932333469390869, "learning_rate": 9.977640548438534e-06, "loss": 5.1959, "step": 520 }, { "epoch": 0.0672, "grad_norm": 3.4697563648223877, "learning_rate": 9.977161624975948e-06, "loss": 5.4013, "step": 525 }, { "epoch": 0.06784, "grad_norm": 3.441819667816162, "learning_rate": 9.976677638359553e-06, "loss": 5.4899, "step": 530 }, { "epoch": 0.06848, "grad_norm": 3.4293930530548096, "learning_rate": 9.9761885890817e-06, "loss": 5.3569, "step": 535 }, { "epoch": 0.06912, "grad_norm": 3.5388574600219727, "learning_rate": 9.975694477639885e-06, "loss": 5.2739, "step": 540 }, { "epoch": 0.06976, "grad_norm": 3.735548973083496, "learning_rate": 9.97519530453676e-06, "loss": 5.4253, "step": 545 }, { "epoch": 0.0704, "grad_norm": 3.33503794670105, "learning_rate": 9.974691070280121e-06, "loss": 5.1569, "step": 550 }, { "epoch": 0.07104, "grad_norm": 3.5171401500701904, "learning_rate": 9.974181775382915e-06, "loss": 5.3242, "step": 555 }, { "epoch": 0.07168, "grad_norm": 3.565356969833374, "learning_rate": 9.973667420363233e-06, "loss": 5.3893, "step": 560 }, { "epoch": 0.07232, "grad_norm": 3.172163248062134, "learning_rate": 9.973148005744319e-06, "loss": 5.3824, "step": 565 }, { "epoch": 0.07296, "grad_norm": 3.517838716506958, "learning_rate": 9.972623532054564e-06, "loss": 5.2673, "step": 570 }, { "epoch": 0.0736, "grad_norm": 3.328416585922241, "learning_rate": 9.9720939998275e-06, "loss": 5.2649, "step": 575 }, { "epoch": 0.07424, "grad_norm": 3.475539445877075, "learning_rate": 9.971559409601807e-06, "loss": 5.3318, "step": 580 }, { "epoch": 0.07488, "grad_norm": 3.492013692855835, "learning_rate": 9.971019761921317e-06, "loss": 5.2735, "step": 585 }, { "epoch": 0.07552, "grad_norm": 3.474803924560547, "learning_rate": 9.970475057334997e-06, "loss": 5.3722, "step": 590 }, { "epoch": 0.07616, "grad_norm": 3.4162726402282715, "learning_rate": 9.96992529639696e-06, "loss": 5.3901, "step": 595 }, { "epoch": 0.0768, "grad_norm": 3.3643155097961426, "learning_rate": 9.969370479666473e-06, "loss": 5.2384, "step": 600 }, { "epoch": 0.0768, "eval_loss": 1.3373793363571167, "eval_runtime": 6.5847, "eval_samples_per_second": 151.867, "eval_steps_per_second": 18.983, "step": 600 }, { "epoch": 0.07744, "grad_norm": 3.44301176071167, "learning_rate": 9.968810607707933e-06, "loss": 5.2322, "step": 605 }, { "epoch": 0.07808, "grad_norm": 3.422262668609619, "learning_rate": 9.968245681090887e-06, "loss": 5.1708, "step": 610 }, { "epoch": 0.07872, "grad_norm": 3.2879252433776855, "learning_rate": 9.96767570039002e-06, "loss": 5.2291, "step": 615 }, { "epoch": 0.07936, "grad_norm": 3.6026480197906494, "learning_rate": 9.967100666185163e-06, "loss": 5.4241, "step": 620 }, { "epoch": 0.08, "grad_norm": 3.3642101287841797, "learning_rate": 9.966520579061286e-06, "loss": 5.4473, "step": 625 }, { "epoch": 0.08064, "grad_norm": 3.5968470573425293, "learning_rate": 9.965935439608493e-06, "loss": 5.3982, "step": 630 }, { "epoch": 0.08128, "grad_norm": 3.352083206176758, "learning_rate": 9.96534524842204e-06, "loss": 5.3953, "step": 635 }, { "epoch": 0.08192, "grad_norm": 3.3571720123291016, "learning_rate": 9.964750006102311e-06, "loss": 5.3159, "step": 640 }, { "epoch": 0.08256, "grad_norm": 3.486246109008789, "learning_rate": 9.964149713254833e-06, "loss": 5.211, "step": 645 }, { "epoch": 0.0832, "grad_norm": 3.674906015396118, "learning_rate": 9.96354437049027e-06, "loss": 5.3374, "step": 650 }, { "epoch": 0.08384, "grad_norm": 3.590810537338257, "learning_rate": 9.962933978424426e-06, "loss": 5.2194, "step": 655 }, { "epoch": 0.08448, "grad_norm": 3.551786184310913, "learning_rate": 9.962318537678238e-06, "loss": 5.1187, "step": 660 }, { "epoch": 0.08512, "grad_norm": 3.5391581058502197, "learning_rate": 9.961698048877776e-06, "loss": 5.2001, "step": 665 }, { "epoch": 0.08576, "grad_norm": 3.6105592250823975, "learning_rate": 9.961072512654255e-06, "loss": 5.2758, "step": 670 }, { "epoch": 0.0864, "grad_norm": 3.7463858127593994, "learning_rate": 9.960441929644017e-06, "loss": 5.2137, "step": 675 }, { "epoch": 0.08704, "grad_norm": 3.9237470626831055, "learning_rate": 9.959806300488538e-06, "loss": 5.2047, "step": 680 }, { "epoch": 0.08768, "grad_norm": 3.392827272415161, "learning_rate": 9.95916562583443e-06, "loss": 5.3071, "step": 685 }, { "epoch": 0.08832, "grad_norm": 3.221484661102295, "learning_rate": 9.958519906333438e-06, "loss": 5.183, "step": 690 }, { "epoch": 0.08896, "grad_norm": 3.5143983364105225, "learning_rate": 9.957869142642437e-06, "loss": 5.3171, "step": 695 }, { "epoch": 0.0896, "grad_norm": 3.497072696685791, "learning_rate": 9.957213335423433e-06, "loss": 5.1784, "step": 700 }, { "epoch": 0.0896, "eval_loss": 1.2988511323928833, "eval_runtime": 6.9763, "eval_samples_per_second": 143.342, "eval_steps_per_second": 17.918, "step": 700 }, { "epoch": 0.09024, "grad_norm": 3.3822438716888428, "learning_rate": 9.956552485343566e-06, "loss": 5.1732, "step": 705 }, { "epoch": 0.09088, "grad_norm": 3.3949694633483887, "learning_rate": 9.955886593075101e-06, "loss": 5.2725, "step": 710 }, { "epoch": 0.09152, "grad_norm": 3.2577288150787354, "learning_rate": 9.955215659295438e-06, "loss": 5.2207, "step": 715 }, { "epoch": 0.09216, "grad_norm": 3.769519567489624, "learning_rate": 9.954539684687103e-06, "loss": 5.2152, "step": 720 }, { "epoch": 0.0928, "grad_norm": 3.3824892044067383, "learning_rate": 9.953858669937746e-06, "loss": 5.2085, "step": 725 }, { "epoch": 0.09344, "grad_norm": 3.771742105484009, "learning_rate": 9.953172615740152e-06, "loss": 5.1575, "step": 730 }, { "epoch": 0.09408, "grad_norm": 3.7706689834594727, "learning_rate": 9.952481522792226e-06, "loss": 4.9608, "step": 735 }, { "epoch": 0.09472, "grad_norm": 3.8110334873199463, "learning_rate": 9.951785391797001e-06, "loss": 5.21, "step": 740 }, { "epoch": 0.09536, "grad_norm": 3.3012993335723877, "learning_rate": 9.951084223462636e-06, "loss": 5.2475, "step": 745 }, { "epoch": 0.096, "grad_norm": 3.6353518962860107, "learning_rate": 9.950378018502415e-06, "loss": 5.0985, "step": 750 }, { "epoch": 0.09664, "grad_norm": 3.369378089904785, "learning_rate": 9.949666777634743e-06, "loss": 5.1986, "step": 755 }, { "epoch": 0.09728, "grad_norm": 3.2247676849365234, "learning_rate": 9.948950501583147e-06, "loss": 5.3192, "step": 760 }, { "epoch": 0.09792, "grad_norm": 3.6966888904571533, "learning_rate": 9.948229191076284e-06, "loss": 5.1654, "step": 765 }, { "epoch": 0.09856, "grad_norm": 3.5823962688446045, "learning_rate": 9.947502846847921e-06, "loss": 5.1351, "step": 770 }, { "epoch": 0.0992, "grad_norm": 3.5258729457855225, "learning_rate": 9.946771469636955e-06, "loss": 5.1745, "step": 775 }, { "epoch": 0.09984, "grad_norm": 3.42067813873291, "learning_rate": 9.946035060187398e-06, "loss": 5.1569, "step": 780 }, { "epoch": 0.10048, "grad_norm": 3.9832825660705566, "learning_rate": 9.945293619248383e-06, "loss": 4.9796, "step": 785 }, { "epoch": 0.10112, "grad_norm": 3.742013692855835, "learning_rate": 9.944547147574162e-06, "loss": 5.1625, "step": 790 }, { "epoch": 0.10176, "grad_norm": 3.3150367736816406, "learning_rate": 9.943795645924104e-06, "loss": 5.099, "step": 795 }, { "epoch": 0.1024, "grad_norm": 3.359069585800171, "learning_rate": 9.943039115062691e-06, "loss": 5.1877, "step": 800 }, { "epoch": 0.1024, "eval_loss": 1.2946017980575562, "eval_runtime": 7.4306, "eval_samples_per_second": 134.579, "eval_steps_per_second": 16.822, "step": 800 }, { "epoch": 0.10304, "grad_norm": 3.703000545501709, "learning_rate": 9.94227755575953e-06, "loss": 5.1581, "step": 805 }, { "epoch": 0.10368, "grad_norm": 3.5370070934295654, "learning_rate": 9.941510968789334e-06, "loss": 5.2402, "step": 810 }, { "epoch": 0.10432, "grad_norm": 3.5010828971862793, "learning_rate": 9.940739354931936e-06, "loss": 5.1828, "step": 815 }, { "epoch": 0.10496, "grad_norm": 3.4637820720672607, "learning_rate": 9.93996271497228e-06, "loss": 5.1792, "step": 820 }, { "epoch": 0.1056, "grad_norm": 3.409712076187134, "learning_rate": 9.939181049700427e-06, "loss": 5.0721, "step": 825 }, { "epoch": 0.10624, "grad_norm": 3.589414596557617, "learning_rate": 9.938394359911545e-06, "loss": 5.234, "step": 830 }, { "epoch": 0.10688, "grad_norm": 3.444977045059204, "learning_rate": 9.937602646405918e-06, "loss": 4.9763, "step": 835 }, { "epoch": 0.10752, "grad_norm": 3.3560900688171387, "learning_rate": 9.936805909988935e-06, "loss": 5.2006, "step": 840 }, { "epoch": 0.10816, "grad_norm": 3.345703601837158, "learning_rate": 9.9360041514711e-06, "loss": 5.0287, "step": 845 }, { "epoch": 0.1088, "grad_norm": 3.492363691329956, "learning_rate": 9.935197371668024e-06, "loss": 5.0908, "step": 850 }, { "epoch": 0.10944, "grad_norm": 7.459951400756836, "learning_rate": 9.934385571400425e-06, "loss": 5.1735, "step": 855 }, { "epoch": 0.11008, "grad_norm": 3.5033841133117676, "learning_rate": 9.933568751494131e-06, "loss": 5.053, "step": 860 }, { "epoch": 0.11072, "grad_norm": 3.5542259216308594, "learning_rate": 9.93274691278007e-06, "loss": 5.1463, "step": 865 }, { "epoch": 0.11136, "grad_norm": 3.3819243907928467, "learning_rate": 9.931920056094285e-06, "loss": 5.0397, "step": 870 }, { "epoch": 0.112, "grad_norm": 3.406768798828125, "learning_rate": 9.931088182277915e-06, "loss": 5.179, "step": 875 }, { "epoch": 0.11264, "grad_norm": 5.960773944854736, "learning_rate": 9.930251292177206e-06, "loss": 5.217, "step": 880 }, { "epoch": 0.11328, "grad_norm": 3.5821049213409424, "learning_rate": 9.929409386643511e-06, "loss": 5.0374, "step": 885 }, { "epoch": 0.11392, "grad_norm": 3.3204903602600098, "learning_rate": 9.928562466533279e-06, "loss": 5.1856, "step": 890 }, { "epoch": 0.11456, "grad_norm": 4.022350788116455, "learning_rate": 9.927710532708064e-06, "loss": 5.1051, "step": 895 }, { "epoch": 0.1152, "grad_norm": 3.3810718059539795, "learning_rate": 9.926853586034515e-06, "loss": 5.1691, "step": 900 }, { "epoch": 0.1152, "eval_loss": 1.2660380601882935, "eval_runtime": 6.8853, "eval_samples_per_second": 145.238, "eval_steps_per_second": 18.155, "step": 900 }, { "epoch": 0.11584, "grad_norm": 3.5757713317871094, "learning_rate": 9.92599162738439e-06, "loss": 5.1505, "step": 905 }, { "epoch": 0.11648, "grad_norm": 3.38582706451416, "learning_rate": 9.925124657634537e-06, "loss": 5.0915, "step": 910 }, { "epoch": 0.11712, "grad_norm": 3.4189300537109375, "learning_rate": 9.924252677666905e-06, "loss": 5.1992, "step": 915 }, { "epoch": 0.11776, "grad_norm": 3.4118812084198, "learning_rate": 9.92337568836854e-06, "loss": 5.1334, "step": 920 }, { "epoch": 0.1184, "grad_norm": 3.5167789459228516, "learning_rate": 9.922493690631583e-06, "loss": 5.1003, "step": 925 }, { "epoch": 0.11904, "grad_norm": 3.546893358230591, "learning_rate": 9.921606685353268e-06, "loss": 5.1346, "step": 930 }, { "epoch": 0.11968, "grad_norm": 3.1576385498046875, "learning_rate": 9.920714673435931e-06, "loss": 4.9601, "step": 935 }, { "epoch": 0.12032, "grad_norm": 3.4227495193481445, "learning_rate": 9.91981765578699e-06, "loss": 5.0087, "step": 940 }, { "epoch": 0.12096, "grad_norm": 3.4890694618225098, "learning_rate": 9.918915633318964e-06, "loss": 5.1319, "step": 945 }, { "epoch": 0.1216, "grad_norm": 3.7377865314483643, "learning_rate": 9.918008606949459e-06, "loss": 5.0618, "step": 950 }, { "epoch": 0.12224, "grad_norm": 3.793402671813965, "learning_rate": 9.917096577601172e-06, "loss": 4.9998, "step": 955 }, { "epoch": 0.12288, "grad_norm": 3.404918909072876, "learning_rate": 9.916179546201889e-06, "loss": 5.0865, "step": 960 }, { "epoch": 0.12352, "grad_norm": 3.6076908111572266, "learning_rate": 9.915257513684488e-06, "loss": 5.0004, "step": 965 }, { "epoch": 0.12416, "grad_norm": 3.631777286529541, "learning_rate": 9.914330480986932e-06, "loss": 5.2806, "step": 970 }, { "epoch": 0.1248, "grad_norm": 3.323333501815796, "learning_rate": 9.913398449052266e-06, "loss": 5.07, "step": 975 }, { "epoch": 0.12544, "grad_norm": 3.6380035877227783, "learning_rate": 9.912461418828628e-06, "loss": 5.0559, "step": 980 }, { "epoch": 0.12608, "grad_norm": 3.7685458660125732, "learning_rate": 9.911519391269238e-06, "loss": 5.0497, "step": 985 }, { "epoch": 0.12672, "grad_norm": 3.4882941246032715, "learning_rate": 9.910572367332397e-06, "loss": 5.0388, "step": 990 }, { "epoch": 0.12736, "grad_norm": 3.27787184715271, "learning_rate": 9.909620347981493e-06, "loss": 5.0285, "step": 995 }, { "epoch": 0.128, "grad_norm": 3.388284921646118, "learning_rate": 9.908663334184994e-06, "loss": 5.1426, "step": 1000 }, { "epoch": 0.128, "eval_loss": 1.2478246688842773, "eval_runtime": 9.3123, "eval_samples_per_second": 107.384, "eval_steps_per_second": 13.423, "step": 1000 }, { "epoch": 0.12864, "grad_norm": 3.4602177143096924, "learning_rate": 9.907701326916448e-06, "loss": 4.8852, "step": 1005 }, { "epoch": 0.12928, "grad_norm": 3.7464816570281982, "learning_rate": 9.906734327154481e-06, "loss": 4.9129, "step": 1010 }, { "epoch": 0.12992, "grad_norm": 6.138649940490723, "learning_rate": 9.905762335882804e-06, "loss": 5.1037, "step": 1015 }, { "epoch": 0.13056, "grad_norm": 3.5933375358581543, "learning_rate": 9.904785354090198e-06, "loss": 4.9644, "step": 1020 }, { "epoch": 0.1312, "grad_norm": 3.6777257919311523, "learning_rate": 9.903803382770528e-06, "loss": 5.0575, "step": 1025 }, { "epoch": 0.13184, "grad_norm": 3.4429285526275635, "learning_rate": 9.902816422922727e-06, "loss": 4.8722, "step": 1030 }, { "epoch": 0.13248, "grad_norm": 3.7400121688842773, "learning_rate": 9.90182447555081e-06, "loss": 4.9521, "step": 1035 }, { "epoch": 0.13312, "grad_norm": 3.2183690071105957, "learning_rate": 9.900827541663862e-06, "loss": 5.0314, "step": 1040 }, { "epoch": 0.13376, "grad_norm": 3.563539505004883, "learning_rate": 9.899825622276041e-06, "loss": 4.9471, "step": 1045 }, { "epoch": 0.1344, "grad_norm": 3.3289413452148438, "learning_rate": 9.898818718406578e-06, "loss": 5.0223, "step": 1050 }, { "epoch": 0.13504, "grad_norm": 3.3363258838653564, "learning_rate": 9.89780683107977e-06, "loss": 4.8883, "step": 1055 }, { "epoch": 0.13568, "grad_norm": 3.5950427055358887, "learning_rate": 9.896789961324991e-06, "loss": 4.9488, "step": 1060 }, { "epoch": 0.13632, "grad_norm": 3.2444112300872803, "learning_rate": 9.895768110176677e-06, "loss": 4.9408, "step": 1065 }, { "epoch": 0.13696, "grad_norm": 3.2985880374908447, "learning_rate": 9.894741278674337e-06, "loss": 4.9875, "step": 1070 }, { "epoch": 0.1376, "grad_norm": 3.474818229675293, "learning_rate": 9.89370946786254e-06, "loss": 5.0526, "step": 1075 }, { "epoch": 0.13824, "grad_norm": 4.721025466918945, "learning_rate": 9.892672678790926e-06, "loss": 5.1362, "step": 1080 }, { "epoch": 0.13888, "grad_norm": 3.84086012840271, "learning_rate": 9.891630912514197e-06, "loss": 4.9631, "step": 1085 }, { "epoch": 0.13952, "grad_norm": 3.487732172012329, "learning_rate": 9.890584170092115e-06, "loss": 4.9211, "step": 1090 }, { "epoch": 0.14016, "grad_norm": 3.398810625076294, "learning_rate": 9.889532452589512e-06, "loss": 4.9814, "step": 1095 }, { "epoch": 0.1408, "grad_norm": 3.3263680934906006, "learning_rate": 9.888475761076273e-06, "loss": 4.9985, "step": 1100 }, { "epoch": 0.1408, "eval_loss": 1.2442607879638672, "eval_runtime": 6.5582, "eval_samples_per_second": 152.481, "eval_steps_per_second": 19.06, "step": 1100 }, { "epoch": 0.14144, "grad_norm": 3.4481613636016846, "learning_rate": 9.887414096627348e-06, "loss": 5.0169, "step": 1105 }, { "epoch": 0.14208, "grad_norm": 3.2736401557922363, "learning_rate": 9.886347460322744e-06, "loss": 5.0703, "step": 1110 }, { "epoch": 0.14272, "grad_norm": 3.2973997592926025, "learning_rate": 9.885275853247526e-06, "loss": 4.9957, "step": 1115 }, { "epoch": 0.14336, "grad_norm": 3.6516940593719482, "learning_rate": 9.884199276491817e-06, "loss": 5.0162, "step": 1120 }, { "epoch": 0.144, "grad_norm": 3.1835155487060547, "learning_rate": 9.883117731150792e-06, "loss": 4.9765, "step": 1125 }, { "epoch": 0.14464, "grad_norm": 3.21928334236145, "learning_rate": 9.882031218324681e-06, "loss": 5.0611, "step": 1130 }, { "epoch": 0.14528, "grad_norm": 4.601723670959473, "learning_rate": 9.880939739118772e-06, "loss": 5.0637, "step": 1135 }, { "epoch": 0.14592, "grad_norm": 3.2973368167877197, "learning_rate": 9.879843294643402e-06, "loss": 4.9621, "step": 1140 }, { "epoch": 0.14656, "grad_norm": 3.4781899452209473, "learning_rate": 9.878741886013959e-06, "loss": 4.9482, "step": 1145 }, { "epoch": 0.1472, "grad_norm": 3.5175704956054688, "learning_rate": 9.877635514350878e-06, "loss": 4.8594, "step": 1150 }, { "epoch": 0.14784, "grad_norm": 3.4302468299865723, "learning_rate": 9.87652418077965e-06, "loss": 4.8865, "step": 1155 }, { "epoch": 0.14848, "grad_norm": 3.464651346206665, "learning_rate": 9.875407886430806e-06, "loss": 4.9922, "step": 1160 }, { "epoch": 0.14912, "grad_norm": 4.064827919006348, "learning_rate": 9.87428663243993e-06, "loss": 4.9592, "step": 1165 }, { "epoch": 0.14976, "grad_norm": 3.654902458190918, "learning_rate": 9.873160419947645e-06, "loss": 4.9286, "step": 1170 }, { "epoch": 0.1504, "grad_norm": 3.395596981048584, "learning_rate": 9.872029250099626e-06, "loss": 5.0057, "step": 1175 }, { "epoch": 0.15104, "grad_norm": 3.745281457901001, "learning_rate": 9.870893124046582e-06, "loss": 4.8671, "step": 1180 }, { "epoch": 0.15168, "grad_norm": 3.449518918991089, "learning_rate": 9.869752042944271e-06, "loss": 4.8306, "step": 1185 }, { "epoch": 0.15232, "grad_norm": 3.1926662921905518, "learning_rate": 9.868606007953487e-06, "loss": 5.0347, "step": 1190 }, { "epoch": 0.15296, "grad_norm": 3.4620425701141357, "learning_rate": 9.86745502024007e-06, "loss": 4.857, "step": 1195 }, { "epoch": 0.1536, "grad_norm": 3.5597681999206543, "learning_rate": 9.866299080974886e-06, "loss": 4.9225, "step": 1200 }, { "epoch": 0.1536, "eval_loss": 1.2185124158859253, "eval_runtime": 7.9383, "eval_samples_per_second": 125.972, "eval_steps_per_second": 15.746, "step": 1200 }, { "epoch": 0.15424, "grad_norm": 3.5934455394744873, "learning_rate": 9.865138191333852e-06, "loss": 4.7654, "step": 1205 }, { "epoch": 0.15488, "grad_norm": 3.8588831424713135, "learning_rate": 9.863972352497912e-06, "loss": 4.9993, "step": 1210 }, { "epoch": 0.15552, "grad_norm": 3.58868408203125, "learning_rate": 9.86280156565305e-06, "loss": 4.8217, "step": 1215 }, { "epoch": 0.15616, "grad_norm": 3.5407521724700928, "learning_rate": 9.861625831990278e-06, "loss": 4.875, "step": 1220 }, { "epoch": 0.1568, "grad_norm": 3.4974656105041504, "learning_rate": 9.860445152705644e-06, "loss": 5.0627, "step": 1225 }, { "epoch": 0.15744, "grad_norm": 3.655677556991577, "learning_rate": 9.859259529000228e-06, "loss": 4.8015, "step": 1230 }, { "epoch": 0.15808, "grad_norm": 3.55148983001709, "learning_rate": 9.858068962080136e-06, "loss": 5.1209, "step": 1235 }, { "epoch": 0.15872, "grad_norm": 3.4331536293029785, "learning_rate": 9.856873453156506e-06, "loss": 4.9739, "step": 1240 }, { "epoch": 0.15936, "grad_norm": 3.374394655227661, "learning_rate": 9.855673003445502e-06, "loss": 4.8138, "step": 1245 }, { "epoch": 0.16, "grad_norm": 3.5296385288238525, "learning_rate": 9.854467614168315e-06, "loss": 5.0274, "step": 1250 }, { "epoch": 0.16064, "grad_norm": 3.6533989906311035, "learning_rate": 9.85325728655116e-06, "loss": 4.9979, "step": 1255 }, { "epoch": 0.16128, "grad_norm": 3.3504199981689453, "learning_rate": 9.852042021825272e-06, "loss": 4.8317, "step": 1260 }, { "epoch": 0.16192, "grad_norm": 3.614529609680176, "learning_rate": 9.850821821226918e-06, "loss": 4.9413, "step": 1265 }, { "epoch": 0.16256, "grad_norm": 3.4821839332580566, "learning_rate": 9.849596685997376e-06, "loss": 4.904, "step": 1270 }, { "epoch": 0.1632, "grad_norm": 3.3400087356567383, "learning_rate": 9.848366617382951e-06, "loss": 4.9039, "step": 1275 }, { "epoch": 0.16384, "grad_norm": 4.062397003173828, "learning_rate": 9.847131616634963e-06, "loss": 4.7378, "step": 1280 }, { "epoch": 0.16448, "grad_norm": 3.689796209335327, "learning_rate": 9.845891685009751e-06, "loss": 4.8799, "step": 1285 }, { "epoch": 0.16512, "grad_norm": 3.509657621383667, "learning_rate": 9.84464682376867e-06, "loss": 4.8513, "step": 1290 }, { "epoch": 0.16576, "grad_norm": 3.4828646183013916, "learning_rate": 9.843397034178088e-06, "loss": 5.0151, "step": 1295 }, { "epoch": 0.1664, "grad_norm": 3.394510507583618, "learning_rate": 9.842142317509387e-06, "loss": 4.7585, "step": 1300 }, { "epoch": 0.1664, "eval_loss": 1.2179418802261353, "eval_runtime": 6.7952, "eval_samples_per_second": 147.163, "eval_steps_per_second": 18.395, "step": 1300 }, { "epoch": 0.16704, "grad_norm": 3.4089293479919434, "learning_rate": 9.840882675038962e-06, "loss": 4.7646, "step": 1305 }, { "epoch": 0.16768, "grad_norm": 3.1607353687286377, "learning_rate": 9.83961810804822e-06, "loss": 4.9528, "step": 1310 }, { "epoch": 0.16832, "grad_norm": 3.30869197845459, "learning_rate": 9.838348617823573e-06, "loss": 5.0086, "step": 1315 }, { "epoch": 0.16896, "grad_norm": 3.6550564765930176, "learning_rate": 9.837074205656452e-06, "loss": 4.8675, "step": 1320 }, { "epoch": 0.1696, "grad_norm": 3.6141419410705566, "learning_rate": 9.835794872843281e-06, "loss": 4.8885, "step": 1325 }, { "epoch": 0.17024, "grad_norm": 3.4006361961364746, "learning_rate": 9.834510620685497e-06, "loss": 4.7784, "step": 1330 }, { "epoch": 0.17088, "grad_norm": 3.4397149085998535, "learning_rate": 9.833221450489543e-06, "loss": 4.929, "step": 1335 }, { "epoch": 0.17152, "grad_norm": 3.613502025604248, "learning_rate": 9.83192736356686e-06, "loss": 4.8763, "step": 1340 }, { "epoch": 0.17216, "grad_norm": 3.613837957382202, "learning_rate": 9.830628361233896e-06, "loss": 4.8765, "step": 1345 }, { "epoch": 0.1728, "grad_norm": 3.775621175765991, "learning_rate": 9.829324444812096e-06, "loss": 4.8103, "step": 1350 }, { "epoch": 0.17344, "grad_norm": 3.6856908798217773, "learning_rate": 9.828015615627904e-06, "loss": 4.8867, "step": 1355 }, { "epoch": 0.17408, "grad_norm": 3.3510427474975586, "learning_rate": 9.826701875012763e-06, "loss": 4.7708, "step": 1360 }, { "epoch": 0.17472, "grad_norm": 3.342366933822632, "learning_rate": 9.82538322430311e-06, "loss": 4.8404, "step": 1365 }, { "epoch": 0.17536, "grad_norm": 3.5898385047912598, "learning_rate": 9.824059664840378e-06, "loss": 4.8205, "step": 1370 }, { "epoch": 0.176, "grad_norm": 3.1588313579559326, "learning_rate": 9.822731197970998e-06, "loss": 4.7214, "step": 1375 }, { "epoch": 0.17664, "grad_norm": 3.431478261947632, "learning_rate": 9.821397825046387e-06, "loss": 4.8892, "step": 1380 }, { "epoch": 0.17728, "grad_norm": 3.7104616165161133, "learning_rate": 9.820059547422952e-06, "loss": 4.8027, "step": 1385 }, { "epoch": 0.17792, "grad_norm": 3.189239263534546, "learning_rate": 9.818716366462098e-06, "loss": 4.8692, "step": 1390 }, { "epoch": 0.17856, "grad_norm": 3.3543105125427246, "learning_rate": 9.81736828353021e-06, "loss": 4.9076, "step": 1395 }, { "epoch": 0.1792, "grad_norm": 3.2962117195129395, "learning_rate": 9.816015299998663e-06, "loss": 4.93, "step": 1400 }, { "epoch": 0.1792, "eval_loss": 1.2212570905685425, "eval_runtime": 7.0462, "eval_samples_per_second": 141.92, "eval_steps_per_second": 17.74, "step": 1400 }, { "epoch": 0.17984, "grad_norm": 3.2857654094696045, "learning_rate": 9.814657417243814e-06, "loss": 4.7544, "step": 1405 }, { "epoch": 0.18048, "grad_norm": 3.31211256980896, "learning_rate": 9.813294636647009e-06, "loss": 4.9007, "step": 1410 }, { "epoch": 0.18112, "grad_norm": 3.3026342391967773, "learning_rate": 9.81192695959457e-06, "loss": 4.8136, "step": 1415 }, { "epoch": 0.18176, "grad_norm": 3.6015031337738037, "learning_rate": 9.810554387477812e-06, "loss": 4.8296, "step": 1420 }, { "epoch": 0.1824, "grad_norm": 3.5558950901031494, "learning_rate": 9.809176921693013e-06, "loss": 4.9049, "step": 1425 }, { "epoch": 0.18304, "grad_norm": 3.272860288619995, "learning_rate": 9.807794563641442e-06, "loss": 4.868, "step": 1430 }, { "epoch": 0.18368, "grad_norm": 3.427809715270996, "learning_rate": 9.806407314729341e-06, "loss": 4.7899, "step": 1435 }, { "epoch": 0.18432, "grad_norm": 3.545553207397461, "learning_rate": 9.805015176367924e-06, "loss": 4.9774, "step": 1440 }, { "epoch": 0.18496, "grad_norm": 3.5434036254882812, "learning_rate": 9.803618149973383e-06, "loss": 4.8174, "step": 1445 }, { "epoch": 0.1856, "grad_norm": 3.5401341915130615, "learning_rate": 9.802216236966882e-06, "loss": 4.8138, "step": 1450 }, { "epoch": 0.18624, "grad_norm": 3.339459180831909, "learning_rate": 9.800809438774557e-06, "loss": 4.9385, "step": 1455 }, { "epoch": 0.18688, "grad_norm": 3.541703224182129, "learning_rate": 9.799397756827508e-06, "loss": 4.8764, "step": 1460 }, { "epoch": 0.18752, "grad_norm": 3.3053269386291504, "learning_rate": 9.79798119256181e-06, "loss": 4.5765, "step": 1465 }, { "epoch": 0.18816, "grad_norm": 3.461660146713257, "learning_rate": 9.7965597474185e-06, "loss": 4.6125, "step": 1470 }, { "epoch": 0.1888, "grad_norm": 3.564030885696411, "learning_rate": 9.795133422843583e-06, "loss": 4.8758, "step": 1475 }, { "epoch": 0.18944, "grad_norm": 3.635293483734131, "learning_rate": 9.793702220288028e-06, "loss": 4.7954, "step": 1480 }, { "epoch": 0.19008, "grad_norm": 3.4663326740264893, "learning_rate": 9.792266141207763e-06, "loss": 4.8442, "step": 1485 }, { "epoch": 0.19072, "grad_norm": 3.556608200073242, "learning_rate": 9.790825187063677e-06, "loss": 4.8431, "step": 1490 }, { "epoch": 0.19136, "grad_norm": 3.726987838745117, "learning_rate": 9.789379359321624e-06, "loss": 4.8309, "step": 1495 }, { "epoch": 0.192, "grad_norm": 3.535627603530884, "learning_rate": 9.78792865945241e-06, "loss": 4.8779, "step": 1500 }, { "epoch": 0.192, "eval_loss": 1.2059489488601685, "eval_runtime": 6.8452, "eval_samples_per_second": 146.087, "eval_steps_per_second": 18.261, "step": 1500 }, { "epoch": 0.19264, "grad_norm": 3.3409645557403564, "learning_rate": 9.7864730889318e-06, "loss": 4.8398, "step": 1505 }, { "epoch": 0.19328, "grad_norm": 3.240247964859009, "learning_rate": 9.78501264924051e-06, "loss": 4.689, "step": 1510 }, { "epoch": 0.19392, "grad_norm": 3.6355326175689697, "learning_rate": 9.783547341864216e-06, "loss": 4.7737, "step": 1515 }, { "epoch": 0.19456, "grad_norm": 3.4650771617889404, "learning_rate": 9.78207716829354e-06, "loss": 4.7844, "step": 1520 }, { "epoch": 0.1952, "grad_norm": 3.281463146209717, "learning_rate": 9.780602130024055e-06, "loss": 4.6872, "step": 1525 }, { "epoch": 0.19584, "grad_norm": 3.264622926712036, "learning_rate": 9.779122228556289e-06, "loss": 4.7438, "step": 1530 }, { "epoch": 0.19648, "grad_norm": 3.598848342895508, "learning_rate": 9.777637465395706e-06, "loss": 4.6983, "step": 1535 }, { "epoch": 0.19712, "grad_norm": 3.3951942920684814, "learning_rate": 9.776147842052725e-06, "loss": 4.8429, "step": 1540 }, { "epoch": 0.19776, "grad_norm": 3.088014841079712, "learning_rate": 9.774653360042706e-06, "loss": 4.8207, "step": 1545 }, { "epoch": 0.1984, "grad_norm": 3.4452457427978516, "learning_rate": 9.773154020885953e-06, "loss": 4.8426, "step": 1550 }, { "epoch": 0.19904, "grad_norm": 3.3782291412353516, "learning_rate": 9.771649826107707e-06, "loss": 4.9081, "step": 1555 }, { "epoch": 0.19968, "grad_norm": 3.420620918273926, "learning_rate": 9.770140777238153e-06, "loss": 4.8296, "step": 1560 }, { "epoch": 0.20032, "grad_norm": 3.3439509868621826, "learning_rate": 9.76862687581241e-06, "loss": 4.626, "step": 1565 }, { "epoch": 0.20096, "grad_norm": 3.2657105922698975, "learning_rate": 9.76710812337054e-06, "loss": 4.8035, "step": 1570 }, { "epoch": 0.2016, "grad_norm": 3.4240477085113525, "learning_rate": 9.765584521457533e-06, "loss": 4.776, "step": 1575 }, { "epoch": 0.20224, "grad_norm": 3.7116453647613525, "learning_rate": 9.764056071623314e-06, "loss": 4.8099, "step": 1580 }, { "epoch": 0.20288, "grad_norm": 3.3470919132232666, "learning_rate": 9.762522775422741e-06, "loss": 4.6686, "step": 1585 }, { "epoch": 0.20352, "grad_norm": 3.552156925201416, "learning_rate": 9.760984634415602e-06, "loss": 4.7256, "step": 1590 }, { "epoch": 0.20416, "grad_norm": 3.144547939300537, "learning_rate": 9.759441650166612e-06, "loss": 4.6914, "step": 1595 }, { "epoch": 0.2048, "grad_norm": 3.3078038692474365, "learning_rate": 9.757893824245414e-06, "loss": 4.7828, "step": 1600 }, { "epoch": 0.2048, "eval_loss": 1.1752163171768188, "eval_runtime": 6.6642, "eval_samples_per_second": 150.056, "eval_steps_per_second": 18.757, "step": 1600 }, { "epoch": 0.20544, "grad_norm": 3.658411741256714, "learning_rate": 9.756341158226578e-06, "loss": 4.8328, "step": 1605 }, { "epoch": 0.20608, "grad_norm": 3.2841243743896484, "learning_rate": 9.754783653689595e-06, "loss": 4.7692, "step": 1610 }, { "epoch": 0.20672, "grad_norm": 3.305380344390869, "learning_rate": 9.75322131221888e-06, "loss": 4.7308, "step": 1615 }, { "epoch": 0.20736, "grad_norm": 3.596205472946167, "learning_rate": 9.751654135403764e-06, "loss": 4.7954, "step": 1620 }, { "epoch": 0.208, "grad_norm": 3.223118782043457, "learning_rate": 9.750082124838505e-06, "loss": 4.7433, "step": 1625 }, { "epoch": 0.20864, "grad_norm": 3.376711845397949, "learning_rate": 9.748505282122269e-06, "loss": 4.8109, "step": 1630 }, { "epoch": 0.20928, "grad_norm": 3.416400194168091, "learning_rate": 9.746923608859147e-06, "loss": 4.8006, "step": 1635 }, { "epoch": 0.20992, "grad_norm": 3.3802921772003174, "learning_rate": 9.745337106658139e-06, "loss": 4.7256, "step": 1640 }, { "epoch": 0.21056, "grad_norm": 3.3929057121276855, "learning_rate": 9.743745777133153e-06, "loss": 4.6883, "step": 1645 }, { "epoch": 0.2112, "grad_norm": 3.730523109436035, "learning_rate": 9.742149621903018e-06, "loss": 4.9453, "step": 1650 }, { "epoch": 0.21184, "grad_norm": 3.5694522857666016, "learning_rate": 9.740548642591463e-06, "loss": 4.9234, "step": 1655 }, { "epoch": 0.21248, "grad_norm": 3.386958122253418, "learning_rate": 9.73894284082713e-06, "loss": 4.6964, "step": 1660 }, { "epoch": 0.21312, "grad_norm": 3.6847736835479736, "learning_rate": 9.737332218243565e-06, "loss": 4.8865, "step": 1665 }, { "epoch": 0.21376, "grad_norm": 3.856374979019165, "learning_rate": 9.735716776479215e-06, "loss": 4.7383, "step": 1670 }, { "epoch": 0.2144, "grad_norm": 3.566790819168091, "learning_rate": 9.734096517177436e-06, "loss": 4.7605, "step": 1675 }, { "epoch": 0.21504, "grad_norm": 4.066038131713867, "learning_rate": 9.732471441986479e-06, "loss": 4.6265, "step": 1680 }, { "epoch": 0.21568, "grad_norm": 3.6826140880584717, "learning_rate": 9.730841552559496e-06, "loss": 4.8244, "step": 1685 }, { "epoch": 0.21632, "grad_norm": 3.5930440425872803, "learning_rate": 9.729206850554537e-06, "loss": 4.8389, "step": 1690 }, { "epoch": 0.21696, "grad_norm": 3.6579551696777344, "learning_rate": 9.727567337634547e-06, "loss": 4.7213, "step": 1695 }, { "epoch": 0.2176, "grad_norm": 3.4516382217407227, "learning_rate": 9.725923015467368e-06, "loss": 4.7244, "step": 1700 }, { "epoch": 0.2176, "eval_loss": 1.1895771026611328, "eval_runtime": 6.8327, "eval_samples_per_second": 146.355, "eval_steps_per_second": 18.294, "step": 1700 }, { "epoch": 0.21824, "grad_norm": 3.3916966915130615, "learning_rate": 9.724273885725728e-06, "loss": 4.7104, "step": 1705 }, { "epoch": 0.21888, "grad_norm": 3.560164451599121, "learning_rate": 9.72261995008725e-06, "loss": 4.7226, "step": 1710 }, { "epoch": 0.21952, "grad_norm": 3.647092342376709, "learning_rate": 9.72096121023445e-06, "loss": 4.8889, "step": 1715 }, { "epoch": 0.22016, "grad_norm": 3.1930177211761475, "learning_rate": 9.719297667854718e-06, "loss": 4.6211, "step": 1720 }, { "epoch": 0.2208, "grad_norm": 3.2158684730529785, "learning_rate": 9.717629324640347e-06, "loss": 4.684, "step": 1725 }, { "epoch": 0.22144, "grad_norm": 3.610088348388672, "learning_rate": 9.715956182288498e-06, "loss": 4.858, "step": 1730 }, { "epoch": 0.22208, "grad_norm": 3.80896258354187, "learning_rate": 9.714278242501222e-06, "loss": 4.6532, "step": 1735 }, { "epoch": 0.22272, "grad_norm": 3.3281309604644775, "learning_rate": 9.712595506985452e-06, "loss": 4.7788, "step": 1740 }, { "epoch": 0.22336, "grad_norm": 3.551541328430176, "learning_rate": 9.710907977452995e-06, "loss": 4.46, "step": 1745 }, { "epoch": 0.224, "grad_norm": 3.282968044281006, "learning_rate": 9.709215655620538e-06, "loss": 4.6924, "step": 1750 }, { "epoch": 0.22464, "grad_norm": 3.20308780670166, "learning_rate": 9.707518543209638e-06, "loss": 4.7473, "step": 1755 }, { "epoch": 0.22528, "grad_norm": 3.8178443908691406, "learning_rate": 9.705816641946733e-06, "loss": 4.6526, "step": 1760 }, { "epoch": 0.22592, "grad_norm": 3.6582953929901123, "learning_rate": 9.704109953563126e-06, "loss": 4.5572, "step": 1765 }, { "epoch": 0.22656, "grad_norm": 3.1742563247680664, "learning_rate": 9.702398479794994e-06, "loss": 4.6242, "step": 1770 }, { "epoch": 0.2272, "grad_norm": 3.8248085975646973, "learning_rate": 9.70068222238338e-06, "loss": 4.6163, "step": 1775 }, { "epoch": 0.22784, "grad_norm": 3.4912667274475098, "learning_rate": 9.698961183074194e-06, "loss": 4.6932, "step": 1780 }, { "epoch": 0.22848, "grad_norm": 3.227597236633301, "learning_rate": 9.69723536361821e-06, "loss": 4.7776, "step": 1785 }, { "epoch": 0.22912, "grad_norm": 3.329366445541382, "learning_rate": 9.695504765771066e-06, "loss": 4.6441, "step": 1790 }, { "epoch": 0.22976, "grad_norm": 3.5038723945617676, "learning_rate": 9.693769391293257e-06, "loss": 4.6334, "step": 1795 }, { "epoch": 0.2304, "grad_norm": 3.5325582027435303, "learning_rate": 9.692029241950144e-06, "loss": 4.5945, "step": 1800 }, { "epoch": 0.2304, "eval_loss": 1.1731183528900146, "eval_runtime": 6.8034, "eval_samples_per_second": 146.984, "eval_steps_per_second": 18.373, "step": 1800 }, { "epoch": 0.23104, "grad_norm": 3.3038785457611084, "learning_rate": 9.69028431951194e-06, "loss": 4.6645, "step": 1805 }, { "epoch": 0.23168, "grad_norm": 5.602443695068359, "learning_rate": 9.688534625753713e-06, "loss": 4.7724, "step": 1810 }, { "epoch": 0.23232, "grad_norm": 3.3495841026306152, "learning_rate": 9.686780162455389e-06, "loss": 4.6676, "step": 1815 }, { "epoch": 0.23296, "grad_norm": 3.6556856632232666, "learning_rate": 9.685020931401745e-06, "loss": 4.6832, "step": 1820 }, { "epoch": 0.2336, "grad_norm": 3.3571536540985107, "learning_rate": 9.683256934382406e-06, "loss": 4.693, "step": 1825 }, { "epoch": 0.23424, "grad_norm": 3.542635917663574, "learning_rate": 9.681488173191843e-06, "loss": 4.6774, "step": 1830 }, { "epoch": 0.23488, "grad_norm": 3.4921188354492188, "learning_rate": 9.679714649629381e-06, "loss": 4.6432, "step": 1835 }, { "epoch": 0.23552, "grad_norm": 3.424345016479492, "learning_rate": 9.677936365499183e-06, "loss": 4.7415, "step": 1840 }, { "epoch": 0.23616, "grad_norm": 3.3347842693328857, "learning_rate": 9.676153322610259e-06, "loss": 4.61, "step": 1845 }, { "epoch": 0.2368, "grad_norm": 3.375143051147461, "learning_rate": 9.674365522776456e-06, "loss": 4.6775, "step": 1850 }, { "epoch": 0.23744, "grad_norm": 3.3928396701812744, "learning_rate": 9.672572967816464e-06, "loss": 4.7787, "step": 1855 }, { "epoch": 0.23808, "grad_norm": 3.4334990978240967, "learning_rate": 9.670775659553808e-06, "loss": 4.7256, "step": 1860 }, { "epoch": 0.23872, "grad_norm": 3.456284761428833, "learning_rate": 9.668973599816847e-06, "loss": 4.5238, "step": 1865 }, { "epoch": 0.23936, "grad_norm": 3.2843079566955566, "learning_rate": 9.66716679043878e-06, "loss": 4.6657, "step": 1870 }, { "epoch": 0.24, "grad_norm": 3.8443307876586914, "learning_rate": 9.66535523325763e-06, "loss": 4.7962, "step": 1875 }, { "epoch": 0.24064, "grad_norm": 3.5036277770996094, "learning_rate": 9.663538930116251e-06, "loss": 4.6989, "step": 1880 }, { "epoch": 0.24128, "grad_norm": 3.1674866676330566, "learning_rate": 9.661717882862333e-06, "loss": 4.6334, "step": 1885 }, { "epoch": 0.24192, "grad_norm": 3.5648207664489746, "learning_rate": 9.659892093348383e-06, "loss": 4.7952, "step": 1890 }, { "epoch": 0.24256, "grad_norm": 3.5110340118408203, "learning_rate": 9.658061563431734e-06, "loss": 4.5461, "step": 1895 }, { "epoch": 0.2432, "grad_norm": 3.374955415725708, "learning_rate": 9.656226294974545e-06, "loss": 4.5967, "step": 1900 }, { "epoch": 0.2432, "eval_loss": 1.1671475172042847, "eval_runtime": 6.824, "eval_samples_per_second": 146.542, "eval_steps_per_second": 18.318, "step": 1900 }, { "epoch": 0.24384, "grad_norm": 3.341566324234009, "learning_rate": 9.65438628984379e-06, "loss": 4.7055, "step": 1905 }, { "epoch": 0.24448, "grad_norm": 3.5377919673919678, "learning_rate": 9.652541549911267e-06, "loss": 4.729, "step": 1910 }, { "epoch": 0.24512, "grad_norm": 3.4675512313842773, "learning_rate": 9.65069207705359e-06, "loss": 4.5535, "step": 1915 }, { "epoch": 0.24576, "grad_norm": 3.2654201984405518, "learning_rate": 9.648837873152182e-06, "loss": 4.7749, "step": 1920 }, { "epoch": 0.2464, "grad_norm": 3.5051968097686768, "learning_rate": 9.646978940093283e-06, "loss": 4.774, "step": 1925 }, { "epoch": 0.24704, "grad_norm": 3.352511405944824, "learning_rate": 9.645115279767947e-06, "loss": 4.6353, "step": 1930 }, { "epoch": 0.24768, "grad_norm": 3.4990932941436768, "learning_rate": 9.64324689407203e-06, "loss": 4.5353, "step": 1935 }, { "epoch": 0.24832, "grad_norm": 3.782811164855957, "learning_rate": 9.641373784906198e-06, "loss": 4.7274, "step": 1940 }, { "epoch": 0.24896, "grad_norm": 4.125209808349609, "learning_rate": 9.639495954175926e-06, "loss": 4.6549, "step": 1945 }, { "epoch": 0.2496, "grad_norm": 3.609104633331299, "learning_rate": 9.637613403791487e-06, "loss": 4.7379, "step": 1950 }, { "epoch": 0.25024, "grad_norm": 3.548325538635254, "learning_rate": 9.635726135667955e-06, "loss": 4.6096, "step": 1955 }, { "epoch": 0.25088, "grad_norm": 3.4659600257873535, "learning_rate": 9.63383415172521e-06, "loss": 4.6373, "step": 1960 }, { "epoch": 0.25152, "grad_norm": 3.2835114002227783, "learning_rate": 9.631937453887917e-06, "loss": 4.7666, "step": 1965 }, { "epoch": 0.25216, "grad_norm": 3.5387489795684814, "learning_rate": 9.63003604408555e-06, "loss": 4.6144, "step": 1970 }, { "epoch": 0.2528, "grad_norm": 3.3846347332000732, "learning_rate": 9.628129924252368e-06, "loss": 4.6594, "step": 1975 }, { "epoch": 0.25344, "grad_norm": 3.6264524459838867, "learning_rate": 9.626219096327424e-06, "loss": 4.5614, "step": 1980 }, { "epoch": 0.25408, "grad_norm": 3.254225492477417, "learning_rate": 9.62430356225456e-06, "loss": 4.6334, "step": 1985 }, { "epoch": 0.25472, "grad_norm": 3.4200801849365234, "learning_rate": 9.622383323982404e-06, "loss": 4.6949, "step": 1990 }, { "epoch": 0.25536, "grad_norm": 3.5615053176879883, "learning_rate": 9.620458383464372e-06, "loss": 4.6219, "step": 1995 }, { "epoch": 0.256, "grad_norm": 3.5107123851776123, "learning_rate": 9.618528742658662e-06, "loss": 4.7314, "step": 2000 }, { "epoch": 0.256, "eval_loss": 1.1562089920043945, "eval_runtime": 6.9874, "eval_samples_per_second": 143.114, "eval_steps_per_second": 17.889, "step": 2000 }, { "epoch": 0.25664, "grad_norm": 3.184966802597046, "learning_rate": 9.616594403528255e-06, "loss": 4.6189, "step": 2005 }, { "epoch": 0.25728, "grad_norm": 3.4554250240325928, "learning_rate": 9.61465536804091e-06, "loss": 4.568, "step": 2010 }, { "epoch": 0.25792, "grad_norm": 3.6082921028137207, "learning_rate": 9.612711638169163e-06, "loss": 4.6917, "step": 2015 }, { "epoch": 0.25856, "grad_norm": 3.2755281925201416, "learning_rate": 9.610763215890326e-06, "loss": 4.5872, "step": 2020 }, { "epoch": 0.2592, "grad_norm": 3.3548803329467773, "learning_rate": 9.608810103186488e-06, "loss": 4.5392, "step": 2025 }, { "epoch": 0.25984, "grad_norm": 4.0107293128967285, "learning_rate": 9.606852302044502e-06, "loss": 4.6321, "step": 2030 }, { "epoch": 0.26048, "grad_norm": 3.364811420440674, "learning_rate": 9.604889814455997e-06, "loss": 4.653, "step": 2035 }, { "epoch": 0.26112, "grad_norm": 3.319972038269043, "learning_rate": 9.602922642417368e-06, "loss": 4.8393, "step": 2040 }, { "epoch": 0.26176, "grad_norm": 3.2800612449645996, "learning_rate": 9.600950787929773e-06, "loss": 4.694, "step": 2045 }, { "epoch": 0.2624, "grad_norm": 3.339520215988159, "learning_rate": 9.598974252999136e-06, "loss": 4.5774, "step": 2050 }, { "epoch": 0.26304, "grad_norm": 3.2331771850585938, "learning_rate": 9.59699303963614e-06, "loss": 4.547, "step": 2055 }, { "epoch": 0.26368, "grad_norm": 3.783783197402954, "learning_rate": 9.595007149856228e-06, "loss": 4.659, "step": 2060 }, { "epoch": 0.26432, "grad_norm": 3.4253666400909424, "learning_rate": 9.593016585679605e-06, "loss": 4.6534, "step": 2065 }, { "epoch": 0.26496, "grad_norm": 4.119887828826904, "learning_rate": 9.591021349131222e-06, "loss": 4.6676, "step": 2070 }, { "epoch": 0.2656, "grad_norm": 3.2150423526763916, "learning_rate": 9.589021442240789e-06, "loss": 4.6495, "step": 2075 }, { "epoch": 0.26624, "grad_norm": 3.347801923751831, "learning_rate": 9.58701686704277e-06, "loss": 4.5401, "step": 2080 }, { "epoch": 0.26688, "grad_norm": 3.378760814666748, "learning_rate": 9.585007625576368e-06, "loss": 4.6443, "step": 2085 }, { "epoch": 0.26752, "grad_norm": 3.516089677810669, "learning_rate": 9.58299371988554e-06, "loss": 4.6488, "step": 2090 }, { "epoch": 0.26816, "grad_norm": 3.7253007888793945, "learning_rate": 9.58097515201899e-06, "loss": 4.5439, "step": 2095 }, { "epoch": 0.2688, "grad_norm": 3.3970401287078857, "learning_rate": 9.57895192403016e-06, "loss": 4.5624, "step": 2100 }, { "epoch": 0.2688, "eval_loss": 1.1436244249343872, "eval_runtime": 6.885, "eval_samples_per_second": 145.242, "eval_steps_per_second": 18.155, "step": 2100 }, { "epoch": 0.26944, "grad_norm": 3.578413486480713, "learning_rate": 9.576924037977233e-06, "loss": 4.6812, "step": 2105 }, { "epoch": 0.27008, "grad_norm": 3.292541027069092, "learning_rate": 9.574891495923133e-06, "loss": 4.6228, "step": 2110 }, { "epoch": 0.27072, "grad_norm": 3.393113851547241, "learning_rate": 9.572854299935517e-06, "loss": 4.6127, "step": 2115 }, { "epoch": 0.27136, "grad_norm": 3.20076322555542, "learning_rate": 9.570812452086779e-06, "loss": 4.6345, "step": 2120 }, { "epoch": 0.272, "grad_norm": 3.2253365516662598, "learning_rate": 9.568765954454047e-06, "loss": 4.6655, "step": 2125 }, { "epoch": 0.27264, "grad_norm": 3.3711695671081543, "learning_rate": 9.566714809119173e-06, "loss": 4.519, "step": 2130 }, { "epoch": 0.27328, "grad_norm": 3.2909939289093018, "learning_rate": 9.564659018168743e-06, "loss": 4.669, "step": 2135 }, { "epoch": 0.27392, "grad_norm": 3.535353899002075, "learning_rate": 9.562598583694067e-06, "loss": 4.7123, "step": 2140 }, { "epoch": 0.27456, "grad_norm": 3.3832180500030518, "learning_rate": 9.560533507791174e-06, "loss": 4.6207, "step": 2145 }, { "epoch": 0.2752, "grad_norm": 3.3504884243011475, "learning_rate": 9.558463792560826e-06, "loss": 4.5949, "step": 2150 }, { "epoch": 0.27584, "grad_norm": 3.9149844646453857, "learning_rate": 9.556389440108493e-06, "loss": 4.6805, "step": 2155 }, { "epoch": 0.27648, "grad_norm": 3.232273817062378, "learning_rate": 9.554310452544366e-06, "loss": 4.6718, "step": 2160 }, { "epoch": 0.27712, "grad_norm": 3.4460339546203613, "learning_rate": 9.552226831983353e-06, "loss": 4.4873, "step": 2165 }, { "epoch": 0.27776, "grad_norm": 3.3323307037353516, "learning_rate": 9.550138580545077e-06, "loss": 4.6764, "step": 2170 }, { "epoch": 0.2784, "grad_norm": 3.325422525405884, "learning_rate": 9.548045700353865e-06, "loss": 4.6288, "step": 2175 }, { "epoch": 0.27904, "grad_norm": 3.620631694793701, "learning_rate": 9.545948193538759e-06, "loss": 4.397, "step": 2180 }, { "epoch": 0.27968, "grad_norm": 3.4073946475982666, "learning_rate": 9.543846062233502e-06, "loss": 4.6072, "step": 2185 }, { "epoch": 0.28032, "grad_norm": 3.269423246383667, "learning_rate": 9.54173930857655e-06, "loss": 4.6166, "step": 2190 }, { "epoch": 0.28096, "grad_norm": 3.4265708923339844, "learning_rate": 9.539627934711049e-06, "loss": 4.7151, "step": 2195 }, { "epoch": 0.2816, "grad_norm": 3.5224685668945312, "learning_rate": 9.537511942784857e-06, "loss": 4.6422, "step": 2200 }, { "epoch": 0.2816, "eval_loss": 1.1447081565856934, "eval_runtime": 6.5029, "eval_samples_per_second": 153.778, "eval_steps_per_second": 19.222, "step": 2200 }, { "epoch": 0.28224, "grad_norm": 3.5605506896972656, "learning_rate": 9.535391334950523e-06, "loss": 4.5522, "step": 2205 }, { "epoch": 0.28288, "grad_norm": 3.5018362998962402, "learning_rate": 9.533266113365293e-06, "loss": 4.5282, "step": 2210 }, { "epoch": 0.28352, "grad_norm": 3.2997825145721436, "learning_rate": 9.531136280191107e-06, "loss": 4.5023, "step": 2215 }, { "epoch": 0.28416, "grad_norm": 3.419358015060425, "learning_rate": 9.529001837594599e-06, "loss": 4.5891, "step": 2220 }, { "epoch": 0.2848, "grad_norm": 3.175732374191284, "learning_rate": 9.526862787747081e-06, "loss": 4.6561, "step": 2225 }, { "epoch": 0.28544, "grad_norm": 4.888601779937744, "learning_rate": 9.524719132824569e-06, "loss": 4.3645, "step": 2230 }, { "epoch": 0.28608, "grad_norm": 3.487359046936035, "learning_rate": 9.52257087500775e-06, "loss": 4.4451, "step": 2235 }, { "epoch": 0.28672, "grad_norm": 3.3482818603515625, "learning_rate": 9.520418016482001e-06, "loss": 4.7056, "step": 2240 }, { "epoch": 0.28736, "grad_norm": 3.410296678543091, "learning_rate": 9.518260559437371e-06, "loss": 4.5788, "step": 2245 }, { "epoch": 0.288, "grad_norm": 3.3393115997314453, "learning_rate": 9.516098506068596e-06, "loss": 4.5909, "step": 2250 }, { "epoch": 0.28864, "grad_norm": 3.641216993331909, "learning_rate": 9.513931858575084e-06, "loss": 4.4144, "step": 2255 }, { "epoch": 0.28928, "grad_norm": 3.332921266555786, "learning_rate": 9.511760619160915e-06, "loss": 4.5648, "step": 2260 }, { "epoch": 0.28992, "grad_norm": 3.7218687534332275, "learning_rate": 9.509584790034842e-06, "loss": 4.5362, "step": 2265 }, { "epoch": 0.29056, "grad_norm": 3.2724854946136475, "learning_rate": 9.50740437341029e-06, "loss": 4.601, "step": 2270 }, { "epoch": 0.2912, "grad_norm": 3.7653846740722656, "learning_rate": 9.50521937150534e-06, "loss": 4.5099, "step": 2275 }, { "epoch": 0.29184, "grad_norm": 3.398200035095215, "learning_rate": 9.503029786542753e-06, "loss": 4.6251, "step": 2280 }, { "epoch": 0.29248, "grad_norm": 3.4145519733428955, "learning_rate": 9.50083562074994e-06, "loss": 4.517, "step": 2285 }, { "epoch": 0.29312, "grad_norm": 3.541825294494629, "learning_rate": 9.498636876358975e-06, "loss": 4.6727, "step": 2290 }, { "epoch": 0.29376, "grad_norm": 3.482581615447998, "learning_rate": 9.496433555606594e-06, "loss": 4.6123, "step": 2295 }, { "epoch": 0.2944, "grad_norm": 3.4173848628997803, "learning_rate": 9.494225660734186e-06, "loss": 4.5458, "step": 2300 }, { "epoch": 0.2944, "eval_loss": 1.1363893747329712, "eval_runtime": 7.4169, "eval_samples_per_second": 134.828, "eval_steps_per_second": 16.853, "step": 2300 }, { "epoch": 0.29504, "grad_norm": 3.2871265411376953, "learning_rate": 9.492013193987788e-06, "loss": 4.4816, "step": 2305 }, { "epoch": 0.29568, "grad_norm": 3.55437970161438, "learning_rate": 9.489796157618094e-06, "loss": 4.5505, "step": 2310 }, { "epoch": 0.29632, "grad_norm": 3.7020423412323, "learning_rate": 9.487574553880447e-06, "loss": 4.4503, "step": 2315 }, { "epoch": 0.29696, "grad_norm": 3.1921563148498535, "learning_rate": 9.485348385034834e-06, "loss": 4.4986, "step": 2320 }, { "epoch": 0.2976, "grad_norm": 3.4034197330474854, "learning_rate": 9.483117653345883e-06, "loss": 4.4827, "step": 2325 }, { "epoch": 0.29824, "grad_norm": 3.3942666053771973, "learning_rate": 9.480882361082871e-06, "loss": 4.6199, "step": 2330 }, { "epoch": 0.29888, "grad_norm": 3.1674904823303223, "learning_rate": 9.478642510519706e-06, "loss": 4.5244, "step": 2335 }, { "epoch": 0.29952, "grad_norm": 3.4777538776397705, "learning_rate": 9.476398103934941e-06, "loss": 4.5746, "step": 2340 }, { "epoch": 0.30016, "grad_norm": 3.476076602935791, "learning_rate": 9.474149143611757e-06, "loss": 4.5492, "step": 2345 }, { "epoch": 0.3008, "grad_norm": 3.8204102516174316, "learning_rate": 9.471895631837972e-06, "loss": 4.6468, "step": 2350 }, { "epoch": 0.30144, "grad_norm": 3.413212299346924, "learning_rate": 9.469637570906032e-06, "loss": 4.4807, "step": 2355 }, { "epoch": 0.30208, "grad_norm": 3.604353904724121, "learning_rate": 9.467374963113011e-06, "loss": 4.5727, "step": 2360 }, { "epoch": 0.30272, "grad_norm": 3.6752219200134277, "learning_rate": 9.46510781076061e-06, "loss": 4.6183, "step": 2365 }, { "epoch": 0.30336, "grad_norm": 3.5153515338897705, "learning_rate": 9.462836116155151e-06, "loss": 4.4947, "step": 2370 }, { "epoch": 0.304, "grad_norm": 3.4051671028137207, "learning_rate": 9.460559881607579e-06, "loss": 4.5685, "step": 2375 }, { "epoch": 0.30464, "grad_norm": 3.418074131011963, "learning_rate": 9.45827910943345e-06, "loss": 4.7324, "step": 2380 }, { "epoch": 0.30528, "grad_norm": 3.331264019012451, "learning_rate": 9.455993801952949e-06, "loss": 4.6685, "step": 2385 }, { "epoch": 0.30592, "grad_norm": 3.6609222888946533, "learning_rate": 9.453703961490863e-06, "loss": 4.6168, "step": 2390 }, { "epoch": 0.30656, "grad_norm": 3.3909974098205566, "learning_rate": 9.451409590376598e-06, "loss": 4.6097, "step": 2395 }, { "epoch": 0.3072, "grad_norm": 3.4507110118865967, "learning_rate": 9.449110690944163e-06, "loss": 4.5849, "step": 2400 }, { "epoch": 0.3072, "eval_loss": 1.1387531757354736, "eval_runtime": 6.7091, "eval_samples_per_second": 149.052, "eval_steps_per_second": 18.631, "step": 2400 }, { "epoch": 0.30784, "grad_norm": 3.8138394355773926, "learning_rate": 9.44680726553218e-06, "loss": 4.5321, "step": 2405 }, { "epoch": 0.30848, "grad_norm": 3.298640727996826, "learning_rate": 9.444499316483865e-06, "loss": 4.5425, "step": 2410 }, { "epoch": 0.30912, "grad_norm": 3.421692371368408, "learning_rate": 9.442186846147048e-06, "loss": 4.517, "step": 2415 }, { "epoch": 0.30976, "grad_norm": 3.3749470710754395, "learning_rate": 9.439869856874153e-06, "loss": 4.527, "step": 2420 }, { "epoch": 0.3104, "grad_norm": 3.163632869720459, "learning_rate": 9.437548351022197e-06, "loss": 4.639, "step": 2425 }, { "epoch": 0.31104, "grad_norm": 3.22723126411438, "learning_rate": 9.435222330952799e-06, "loss": 4.5637, "step": 2430 }, { "epoch": 0.31168, "grad_norm": 3.6903302669525146, "learning_rate": 9.432891799032162e-06, "loss": 4.5799, "step": 2435 }, { "epoch": 0.31232, "grad_norm": 3.5754201412200928, "learning_rate": 9.430556757631087e-06, "loss": 4.5296, "step": 2440 }, { "epoch": 0.31296, "grad_norm": 3.152308702468872, "learning_rate": 9.428217209124958e-06, "loss": 4.5233, "step": 2445 }, { "epoch": 0.3136, "grad_norm": 3.4750640392303467, "learning_rate": 9.425873155893744e-06, "loss": 4.3894, "step": 2450 }, { "epoch": 0.31424, "grad_norm": 3.1524953842163086, "learning_rate": 9.423524600321999e-06, "loss": 4.3978, "step": 2455 }, { "epoch": 0.31488, "grad_norm": 3.344024658203125, "learning_rate": 9.421171544798854e-06, "loss": 4.5563, "step": 2460 }, { "epoch": 0.31552, "grad_norm": 3.460477352142334, "learning_rate": 9.418813991718017e-06, "loss": 4.5687, "step": 2465 }, { "epoch": 0.31616, "grad_norm": 3.2936015129089355, "learning_rate": 9.416451943477778e-06, "loss": 4.6714, "step": 2470 }, { "epoch": 0.3168, "grad_norm": 3.373152732849121, "learning_rate": 9.41408540248099e-06, "loss": 4.5469, "step": 2475 }, { "epoch": 0.31744, "grad_norm": 3.7370991706848145, "learning_rate": 9.411714371135087e-06, "loss": 4.5638, "step": 2480 }, { "epoch": 0.31808, "grad_norm": 3.427574872970581, "learning_rate": 9.40933885185206e-06, "loss": 4.4045, "step": 2485 }, { "epoch": 0.31872, "grad_norm": 3.459003448486328, "learning_rate": 9.406958847048477e-06, "loss": 4.5258, "step": 2490 }, { "epoch": 0.31936, "grad_norm": 3.361318588256836, "learning_rate": 9.40457435914546e-06, "loss": 4.5269, "step": 2495 }, { "epoch": 0.32, "grad_norm": 3.2740869522094727, "learning_rate": 9.402185390568693e-06, "loss": 4.6282, "step": 2500 }, { "epoch": 0.32, "eval_loss": 1.1232563257217407, "eval_runtime": 6.9879, "eval_samples_per_second": 143.105, "eval_steps_per_second": 17.888, "step": 2500 }, { "epoch": 0.32064, "grad_norm": 3.3837263584136963, "learning_rate": 9.399791943748419e-06, "loss": 4.5378, "step": 2505 }, { "epoch": 0.32128, "grad_norm": 3.248398542404175, "learning_rate": 9.397394021119441e-06, "loss": 4.4764, "step": 2510 }, { "epoch": 0.32192, "grad_norm": 3.6042652130126953, "learning_rate": 9.39499162512111e-06, "loss": 4.5116, "step": 2515 }, { "epoch": 0.32256, "grad_norm": 3.6905040740966797, "learning_rate": 9.39258475819733e-06, "loss": 4.3691, "step": 2520 }, { "epoch": 0.3232, "grad_norm": 3.299175500869751, "learning_rate": 9.390173422796548e-06, "loss": 4.4966, "step": 2525 }, { "epoch": 0.32384, "grad_norm": 3.312781810760498, "learning_rate": 9.387757621371765e-06, "loss": 4.6154, "step": 2530 }, { "epoch": 0.32448, "grad_norm": 3.447141408920288, "learning_rate": 9.38533735638052e-06, "loss": 4.4373, "step": 2535 }, { "epoch": 0.32512, "grad_norm": 3.611647605895996, "learning_rate": 9.382912630284893e-06, "loss": 4.5573, "step": 2540 }, { "epoch": 0.32576, "grad_norm": 3.256063461303711, "learning_rate": 9.380483445551503e-06, "loss": 4.3667, "step": 2545 }, { "epoch": 0.3264, "grad_norm": 3.6989307403564453, "learning_rate": 9.378049804651506e-06, "loss": 4.5231, "step": 2550 }, { "epoch": 0.32704, "grad_norm": 3.1487154960632324, "learning_rate": 9.37561171006059e-06, "loss": 4.5691, "step": 2555 }, { "epoch": 0.32768, "grad_norm": 3.951996326446533, "learning_rate": 9.373169164258971e-06, "loss": 4.6015, "step": 2560 }, { "epoch": 0.32832, "grad_norm": 3.4129538536071777, "learning_rate": 9.370722169731396e-06, "loss": 4.499, "step": 2565 }, { "epoch": 0.32896, "grad_norm": 3.4212002754211426, "learning_rate": 9.36827072896714e-06, "loss": 4.531, "step": 2570 }, { "epoch": 0.3296, "grad_norm": 3.6318917274475098, "learning_rate": 9.365814844459994e-06, "loss": 4.5956, "step": 2575 }, { "epoch": 0.33024, "grad_norm": 3.5327017307281494, "learning_rate": 9.363354518708277e-06, "loss": 4.506, "step": 2580 }, { "epoch": 0.33088, "grad_norm": 3.4287726879119873, "learning_rate": 9.360889754214823e-06, "loss": 4.4378, "step": 2585 }, { "epoch": 0.33152, "grad_norm": 3.153348207473755, "learning_rate": 9.358420553486977e-06, "loss": 4.5132, "step": 2590 }, { "epoch": 0.33216, "grad_norm": 3.2982351779937744, "learning_rate": 9.355946919036605e-06, "loss": 4.5825, "step": 2595 }, { "epoch": 0.3328, "grad_norm": 3.3084805011749268, "learning_rate": 9.353468853380079e-06, "loss": 4.4806, "step": 2600 }, { "epoch": 0.3328, "eval_loss": 1.1227222681045532, "eval_runtime": 7.0684, "eval_samples_per_second": 141.475, "eval_steps_per_second": 17.684, "step": 2600 }, { "epoch": 0.33344, "grad_norm": 3.333228588104248, "learning_rate": 9.350986359038277e-06, "loss": 4.5084, "step": 2605 }, { "epoch": 0.33408, "grad_norm": 3.350033760070801, "learning_rate": 9.348499438536585e-06, "loss": 4.4618, "step": 2610 }, { "epoch": 0.33472, "grad_norm": 3.4025425910949707, "learning_rate": 9.34600809440489e-06, "loss": 4.531, "step": 2615 }, { "epoch": 0.33536, "grad_norm": 3.4806454181671143, "learning_rate": 9.343512329177582e-06, "loss": 4.4493, "step": 2620 }, { "epoch": 0.336, "grad_norm": 3.3315722942352295, "learning_rate": 9.341012145393546e-06, "loss": 4.3422, "step": 2625 }, { "epoch": 0.33664, "grad_norm": 3.1910319328308105, "learning_rate": 9.338507545596162e-06, "loss": 4.3205, "step": 2630 }, { "epoch": 0.33728, "grad_norm": 3.3633666038513184, "learning_rate": 9.335998532333303e-06, "loss": 4.3457, "step": 2635 }, { "epoch": 0.33792, "grad_norm": 3.2793116569519043, "learning_rate": 9.333485108157329e-06, "loss": 4.424, "step": 2640 }, { "epoch": 0.33856, "grad_norm": 3.2517988681793213, "learning_rate": 9.330967275625094e-06, "loss": 4.4829, "step": 2645 }, { "epoch": 0.3392, "grad_norm": 3.5355207920074463, "learning_rate": 9.328445037297929e-06, "loss": 4.516, "step": 2650 }, { "epoch": 0.33984, "grad_norm": 3.376213312149048, "learning_rate": 9.32591839574165e-06, "loss": 4.355, "step": 2655 }, { "epoch": 0.34048, "grad_norm": 3.498028516769409, "learning_rate": 9.323387353526552e-06, "loss": 4.4594, "step": 2660 }, { "epoch": 0.34112, "grad_norm": 3.289701461791992, "learning_rate": 9.320851913227407e-06, "loss": 4.3759, "step": 2665 }, { "epoch": 0.34176, "grad_norm": 3.2121317386627197, "learning_rate": 9.318312077423463e-06, "loss": 4.5469, "step": 2670 }, { "epoch": 0.3424, "grad_norm": 3.5415709018707275, "learning_rate": 9.315767848698435e-06, "loss": 4.5208, "step": 2675 }, { "epoch": 0.34304, "grad_norm": 3.387648820877075, "learning_rate": 9.313219229640511e-06, "loss": 4.3834, "step": 2680 }, { "epoch": 0.34368, "grad_norm": 3.4357047080993652, "learning_rate": 9.310666222842343e-06, "loss": 4.4647, "step": 2685 }, { "epoch": 0.34432, "grad_norm": 3.5037624835968018, "learning_rate": 9.308108830901046e-06, "loss": 4.4008, "step": 2690 }, { "epoch": 0.34496, "grad_norm": 3.3989768028259277, "learning_rate": 9.305547056418198e-06, "loss": 4.4072, "step": 2695 }, { "epoch": 0.3456, "grad_norm": 3.28924298286438, "learning_rate": 9.302980901999833e-06, "loss": 4.4879, "step": 2700 }, { "epoch": 0.3456, "eval_loss": 1.1052284240722656, "eval_runtime": 6.8594, "eval_samples_per_second": 145.785, "eval_steps_per_second": 18.223, "step": 2700 }, { "epoch": 0.34624, "grad_norm": 3.323194980621338, "learning_rate": 9.300410370256444e-06, "loss": 4.39, "step": 2705 }, { "epoch": 0.34688, "grad_norm": 3.3750932216644287, "learning_rate": 9.297835463802972e-06, "loss": 4.4385, "step": 2710 }, { "epoch": 0.34752, "grad_norm": 3.1815054416656494, "learning_rate": 9.295256185258811e-06, "loss": 4.6009, "step": 2715 }, { "epoch": 0.34816, "grad_norm": 3.5953733921051025, "learning_rate": 9.292672537247808e-06, "loss": 4.5078, "step": 2720 }, { "epoch": 0.3488, "grad_norm": 3.0168521404266357, "learning_rate": 9.290084522398243e-06, "loss": 4.4222, "step": 2725 }, { "epoch": 0.34944, "grad_norm": 3.386521816253662, "learning_rate": 9.287492143342847e-06, "loss": 4.4376, "step": 2730 }, { "epoch": 0.35008, "grad_norm": 3.3437111377716064, "learning_rate": 9.28489540271879e-06, "loss": 4.4659, "step": 2735 }, { "epoch": 0.35072, "grad_norm": 3.2060422897338867, "learning_rate": 9.282294303167677e-06, "loss": 4.5246, "step": 2740 }, { "epoch": 0.35136, "grad_norm": 3.335606813430786, "learning_rate": 9.279688847335545e-06, "loss": 4.4502, "step": 2745 }, { "epoch": 0.352, "grad_norm": 3.2676444053649902, "learning_rate": 9.27707903787287e-06, "loss": 4.4734, "step": 2750 }, { "epoch": 0.35264, "grad_norm": 3.4844555854797363, "learning_rate": 9.274464877434548e-06, "loss": 4.4142, "step": 2755 }, { "epoch": 0.35328, "grad_norm": 3.7898640632629395, "learning_rate": 9.271846368679907e-06, "loss": 4.3501, "step": 2760 }, { "epoch": 0.35392, "grad_norm": 3.182154893875122, "learning_rate": 9.269223514272697e-06, "loss": 4.3514, "step": 2765 }, { "epoch": 0.35456, "grad_norm": 3.4136388301849365, "learning_rate": 9.266596316881085e-06, "loss": 4.5064, "step": 2770 }, { "epoch": 0.3552, "grad_norm": 3.0961410999298096, "learning_rate": 9.263964779177663e-06, "loss": 4.3653, "step": 2775 }, { "epoch": 0.35584, "grad_norm": 3.2388274669647217, "learning_rate": 9.261328903839434e-06, "loss": 4.4265, "step": 2780 }, { "epoch": 0.35648, "grad_norm": 3.3582630157470703, "learning_rate": 9.258688693547815e-06, "loss": 4.4644, "step": 2785 }, { "epoch": 0.35712, "grad_norm": 3.393575429916382, "learning_rate": 9.25604415098863e-06, "loss": 4.4185, "step": 2790 }, { "epoch": 0.35776, "grad_norm": 3.30346941947937, "learning_rate": 9.253395278852115e-06, "loss": 4.497, "step": 2795 }, { "epoch": 0.3584, "grad_norm": 3.3295273780822754, "learning_rate": 9.250742079832905e-06, "loss": 4.3593, "step": 2800 }, { "epoch": 0.3584, "eval_loss": 1.1154146194458008, "eval_runtime": 6.98, "eval_samples_per_second": 143.267, "eval_steps_per_second": 17.908, "step": 2800 }, { "epoch": 0.35904, "grad_norm": 3.4841437339782715, "learning_rate": 9.248084556630039e-06, "loss": 4.5087, "step": 2805 }, { "epoch": 0.35968, "grad_norm": 3.5100257396698, "learning_rate": 9.245422711946959e-06, "loss": 4.4858, "step": 2810 }, { "epoch": 0.36032, "grad_norm": 3.8446364402770996, "learning_rate": 9.242756548491496e-06, "loss": 4.3412, "step": 2815 }, { "epoch": 0.36096, "grad_norm": 3.121630907058716, "learning_rate": 9.240086068975878e-06, "loss": 4.3509, "step": 2820 }, { "epoch": 0.3616, "grad_norm": 3.465914726257324, "learning_rate": 9.237411276116724e-06, "loss": 4.428, "step": 2825 }, { "epoch": 0.36224, "grad_norm": 3.416682243347168, "learning_rate": 9.234732172635041e-06, "loss": 4.5354, "step": 2830 }, { "epoch": 0.36288, "grad_norm": 3.1812517642974854, "learning_rate": 9.232048761256218e-06, "loss": 4.5089, "step": 2835 }, { "epoch": 0.36352, "grad_norm": 3.454968214035034, "learning_rate": 9.22936104471003e-06, "loss": 4.2911, "step": 2840 }, { "epoch": 0.36416, "grad_norm": 3.487147569656372, "learning_rate": 9.226669025730633e-06, "loss": 4.3171, "step": 2845 }, { "epoch": 0.3648, "grad_norm": 3.4108667373657227, "learning_rate": 9.22397270705655e-06, "loss": 4.3087, "step": 2850 }, { "epoch": 0.36544, "grad_norm": 3.26461124420166, "learning_rate": 9.22127209143069e-06, "loss": 4.4842, "step": 2855 }, { "epoch": 0.36608, "grad_norm": 3.8364768028259277, "learning_rate": 9.21856718160033e-06, "loss": 4.538, "step": 2860 }, { "epoch": 0.36672, "grad_norm": 3.4763057231903076, "learning_rate": 9.215857980317109e-06, "loss": 4.4181, "step": 2865 }, { "epoch": 0.36736, "grad_norm": 3.5775294303894043, "learning_rate": 9.213144490337036e-06, "loss": 4.4466, "step": 2870 }, { "epoch": 0.368, "grad_norm": 3.724712610244751, "learning_rate": 9.210426714420487e-06, "loss": 4.3985, "step": 2875 }, { "epoch": 0.36864, "grad_norm": 3.675708532333374, "learning_rate": 9.20770465533219e-06, "loss": 4.4455, "step": 2880 }, { "epoch": 0.36928, "grad_norm": 3.481708526611328, "learning_rate": 9.204978315841238e-06, "loss": 4.408, "step": 2885 }, { "epoch": 0.36992, "grad_norm": 3.3906009197235107, "learning_rate": 9.20224769872107e-06, "loss": 4.4277, "step": 2890 }, { "epoch": 0.37056, "grad_norm": 3.425987720489502, "learning_rate": 9.199512806749485e-06, "loss": 4.3105, "step": 2895 }, { "epoch": 0.3712, "grad_norm": 3.183659791946411, "learning_rate": 9.196773642708623e-06, "loss": 4.449, "step": 2900 }, { "epoch": 0.3712, "eval_loss": 1.1105434894561768, "eval_runtime": 9.6182, "eval_samples_per_second": 103.97, "eval_steps_per_second": 12.996, "step": 2900 }, { "epoch": 0.37184, "grad_norm": 3.461367130279541, "learning_rate": 9.194030209384975e-06, "loss": 4.5999, "step": 2905 }, { "epoch": 0.37248, "grad_norm": 3.140835762023926, "learning_rate": 9.191282509569375e-06, "loss": 4.4779, "step": 2910 }, { "epoch": 0.37312, "grad_norm": 3.2149133682250977, "learning_rate": 9.188530546056993e-06, "loss": 4.4018, "step": 2915 }, { "epoch": 0.37376, "grad_norm": 3.6455352306365967, "learning_rate": 9.185774321647343e-06, "loss": 4.5308, "step": 2920 }, { "epoch": 0.3744, "grad_norm": 3.063474178314209, "learning_rate": 9.183013839144266e-06, "loss": 4.4842, "step": 2925 }, { "epoch": 0.37504, "grad_norm": 3.5478434562683105, "learning_rate": 9.18024910135594e-06, "loss": 4.3344, "step": 2930 }, { "epoch": 0.37568, "grad_norm": 3.481499433517456, "learning_rate": 9.177480111094871e-06, "loss": 4.4559, "step": 2935 }, { "epoch": 0.37632, "grad_norm": 3.8366904258728027, "learning_rate": 9.174706871177888e-06, "loss": 4.3211, "step": 2940 }, { "epoch": 0.37696, "grad_norm": 3.365673303604126, "learning_rate": 9.171929384426146e-06, "loss": 4.2437, "step": 2945 }, { "epoch": 0.3776, "grad_norm": 3.6502323150634766, "learning_rate": 9.16914765366512e-06, "loss": 4.3133, "step": 2950 }, { "epoch": 0.37824, "grad_norm": 4.653753757476807, "learning_rate": 9.166361681724602e-06, "loss": 4.3793, "step": 2955 }, { "epoch": 0.37888, "grad_norm": 3.373015880584717, "learning_rate": 9.163571471438696e-06, "loss": 4.3469, "step": 2960 }, { "epoch": 0.37952, "grad_norm": 3.6513311862945557, "learning_rate": 9.160777025645822e-06, "loss": 4.2295, "step": 2965 }, { "epoch": 0.38016, "grad_norm": 3.158674955368042, "learning_rate": 9.157978347188706e-06, "loss": 4.4705, "step": 2970 }, { "epoch": 0.3808, "grad_norm": 3.2553975582122803, "learning_rate": 9.15517543891438e-06, "loss": 4.3731, "step": 2975 }, { "epoch": 0.38144, "grad_norm": 3.6190109252929688, "learning_rate": 9.152368303674178e-06, "loss": 4.482, "step": 2980 }, { "epoch": 0.38208, "grad_norm": 3.3591349124908447, "learning_rate": 9.149556944323737e-06, "loss": 4.4614, "step": 2985 }, { "epoch": 0.38272, "grad_norm": 3.4077179431915283, "learning_rate": 9.146741363722987e-06, "loss": 4.4401, "step": 2990 }, { "epoch": 0.38336, "grad_norm": 3.372915267944336, "learning_rate": 9.143921564736156e-06, "loss": 4.4644, "step": 2995 }, { "epoch": 0.384, "grad_norm": 3.4412505626678467, "learning_rate": 9.141097550231762e-06, "loss": 4.1504, "step": 3000 }, { "epoch": 0.384, "eval_loss": 1.101137399673462, "eval_runtime": 7.1886, "eval_samples_per_second": 139.11, "eval_steps_per_second": 17.389, "step": 3000 }, { "epoch": 0.38464, "grad_norm": 3.461183786392212, "learning_rate": 9.13826932308261e-06, "loss": 4.373, "step": 3005 }, { "epoch": 0.38528, "grad_norm": 3.3834474086761475, "learning_rate": 9.136003710192325e-06, "loss": 4.3295, "step": 3010 }, { "epoch": 0.38592, "grad_norm": 3.351191520690918, "learning_rate": 9.133167907535756e-06, "loss": 4.4578, "step": 3015 }, { "epoch": 0.38656, "grad_norm": 3.236766815185547, "learning_rate": 9.130327900301077e-06, "loss": 4.4021, "step": 3020 }, { "epoch": 0.3872, "grad_norm": 3.217759370803833, "learning_rate": 9.12748369137736e-06, "loss": 4.3417, "step": 3025 }, { "epoch": 0.38784, "grad_norm": 3.5346148014068604, "learning_rate": 9.124635283657956e-06, "loss": 4.5156, "step": 3030 }, { "epoch": 0.38848, "grad_norm": 3.3899123668670654, "learning_rate": 9.121782680040487e-06, "loss": 4.3682, "step": 3035 }, { "epoch": 0.38912, "grad_norm": 3.3264894485473633, "learning_rate": 9.11892588342684e-06, "loss": 4.5256, "step": 3040 }, { "epoch": 0.38976, "grad_norm": 3.329000473022461, "learning_rate": 9.116064896723167e-06, "loss": 4.4699, "step": 3045 }, { "epoch": 0.3904, "grad_norm": 3.4215424060821533, "learning_rate": 9.113199722839889e-06, "loss": 4.4322, "step": 3050 }, { "epoch": 0.39104, "grad_norm": 3.363974094390869, "learning_rate": 9.110330364691682e-06, "loss": 4.4147, "step": 3055 }, { "epoch": 0.39168, "grad_norm": 3.2653472423553467, "learning_rate": 9.10745682519748e-06, "loss": 4.4322, "step": 3060 }, { "epoch": 0.39232, "grad_norm": 3.5401949882507324, "learning_rate": 9.104579107280465e-06, "loss": 4.3729, "step": 3065 }, { "epoch": 0.39296, "grad_norm": 3.2343528270721436, "learning_rate": 9.101697213868079e-06, "loss": 4.4719, "step": 3070 }, { "epoch": 0.3936, "grad_norm": 3.2686214447021484, "learning_rate": 9.098811147892004e-06, "loss": 4.4189, "step": 3075 }, { "epoch": 0.39424, "grad_norm": 3.325529098510742, "learning_rate": 9.095920912288173e-06, "loss": 4.2017, "step": 3080 }, { "epoch": 0.39488, "grad_norm": 3.393070936203003, "learning_rate": 9.093026509996752e-06, "loss": 4.3442, "step": 3085 }, { "epoch": 0.39552, "grad_norm": 3.3179240226745605, "learning_rate": 9.090127943962156e-06, "loss": 4.3873, "step": 3090 }, { "epoch": 0.39616, "grad_norm": 3.1389029026031494, "learning_rate": 9.087225217133029e-06, "loss": 4.3073, "step": 3095 }, { "epoch": 0.3968, "grad_norm": 3.4911952018737793, "learning_rate": 9.084318332462247e-06, "loss": 4.4417, "step": 3100 }, { "epoch": 0.3968, "eval_loss": 1.0903624296188354, "eval_runtime": 6.757, "eval_samples_per_second": 147.995, "eval_steps_per_second": 18.499, "step": 3100 }, { "epoch": 0.39744, "grad_norm": 3.211641550064087, "learning_rate": 9.08140729290692e-06, "loss": 4.2307, "step": 3105 }, { "epoch": 0.39808, "grad_norm": 3.5543551445007324, "learning_rate": 9.078492101428381e-06, "loss": 4.4694, "step": 3110 }, { "epoch": 0.39872, "grad_norm": 4.135831832885742, "learning_rate": 9.075572760992193e-06, "loss": 4.4585, "step": 3115 }, { "epoch": 0.39936, "grad_norm": 3.209195613861084, "learning_rate": 9.07264927456813e-06, "loss": 4.4306, "step": 3120 }, { "epoch": 0.4, "grad_norm": 3.220716714859009, "learning_rate": 9.06972164513019e-06, "loss": 4.5434, "step": 3125 }, { "epoch": 0.40064, "grad_norm": 3.3828017711639404, "learning_rate": 9.066789875656583e-06, "loss": 4.2933, "step": 3130 }, { "epoch": 0.40128, "grad_norm": 3.2086288928985596, "learning_rate": 9.063853969129734e-06, "loss": 4.2783, "step": 3135 }, { "epoch": 0.40192, "grad_norm": 3.4894447326660156, "learning_rate": 9.060913928536272e-06, "loss": 4.4412, "step": 3140 }, { "epoch": 0.40256, "grad_norm": 3.0965099334716797, "learning_rate": 9.057969756867036e-06, "loss": 4.2927, "step": 3145 }, { "epoch": 0.4032, "grad_norm": 3.0679054260253906, "learning_rate": 9.055021457117064e-06, "loss": 4.3709, "step": 3150 }, { "epoch": 0.40384, "grad_norm": 3.4988932609558105, "learning_rate": 9.052069032285594e-06, "loss": 4.2851, "step": 3155 }, { "epoch": 0.40448, "grad_norm": 3.3882834911346436, "learning_rate": 9.04911248537606e-06, "loss": 4.4369, "step": 3160 }, { "epoch": 0.40512, "grad_norm": 3.63289737701416, "learning_rate": 9.046151819396094e-06, "loss": 4.4555, "step": 3165 }, { "epoch": 0.40576, "grad_norm": 3.321216106414795, "learning_rate": 9.04318703735751e-06, "loss": 4.4082, "step": 3170 }, { "epoch": 0.4064, "grad_norm": 3.799292802810669, "learning_rate": 9.040218142276318e-06, "loss": 4.3891, "step": 3175 }, { "epoch": 0.40704, "grad_norm": 3.191375255584717, "learning_rate": 9.037245137172703e-06, "loss": 4.2423, "step": 3180 }, { "epoch": 0.40768, "grad_norm": 3.5997314453125, "learning_rate": 9.03426802507104e-06, "loss": 4.3708, "step": 3185 }, { "epoch": 0.40832, "grad_norm": 3.4215621948242188, "learning_rate": 9.031286808999875e-06, "loss": 4.3509, "step": 3190 }, { "epoch": 0.40896, "grad_norm": 3.418410539627075, "learning_rate": 9.028301491991932e-06, "loss": 4.3573, "step": 3195 }, { "epoch": 0.4096, "grad_norm": 3.293621778488159, "learning_rate": 9.025312077084109e-06, "loss": 4.2765, "step": 3200 }, { "epoch": 0.4096, "eval_loss": 1.0897722244262695, "eval_runtime": 7.2579, "eval_samples_per_second": 137.781, "eval_steps_per_second": 17.223, "step": 3200 }, { "epoch": 0.41024, "grad_norm": 3.231015205383301, "learning_rate": 9.022318567317468e-06, "loss": 4.4425, "step": 3205 }, { "epoch": 0.41088, "grad_norm": 3.2523105144500732, "learning_rate": 9.019320965737237e-06, "loss": 4.3704, "step": 3210 }, { "epoch": 0.41152, "grad_norm": 3.635181188583374, "learning_rate": 9.01631927539281e-06, "loss": 4.2446, "step": 3215 }, { "epoch": 0.41216, "grad_norm": 5.132925033569336, "learning_rate": 9.01331349933774e-06, "loss": 4.2925, "step": 3220 }, { "epoch": 0.4128, "grad_norm": 3.6050596237182617, "learning_rate": 9.010303640629733e-06, "loss": 4.3073, "step": 3225 }, { "epoch": 0.41344, "grad_norm": 3.5942418575286865, "learning_rate": 9.007289702330649e-06, "loss": 4.2469, "step": 3230 }, { "epoch": 0.41408, "grad_norm": 3.1797220706939697, "learning_rate": 9.004271687506503e-06, "loss": 4.2212, "step": 3235 }, { "epoch": 0.41472, "grad_norm": 3.2658779621124268, "learning_rate": 9.001249599227448e-06, "loss": 4.2773, "step": 3240 }, { "epoch": 0.41536, "grad_norm": 3.512411117553711, "learning_rate": 8.998223440567792e-06, "loss": 4.376, "step": 3245 }, { "epoch": 0.416, "grad_norm": 3.5214426517486572, "learning_rate": 8.995193214605972e-06, "loss": 4.3563, "step": 3250 }, { "epoch": 0.41664, "grad_norm": 3.3954243659973145, "learning_rate": 8.992158924424572e-06, "loss": 4.4415, "step": 3255 }, { "epoch": 0.41728, "grad_norm": 3.4646153450012207, "learning_rate": 8.989120573110307e-06, "loss": 4.3228, "step": 3260 }, { "epoch": 0.41792, "grad_norm": 3.5404181480407715, "learning_rate": 8.986078163754017e-06, "loss": 4.3483, "step": 3265 }, { "epoch": 0.41856, "grad_norm": 3.178579092025757, "learning_rate": 8.983031699450683e-06, "loss": 4.3122, "step": 3270 }, { "epoch": 0.4192, "grad_norm": 3.556442975997925, "learning_rate": 8.979981183299402e-06, "loss": 4.3761, "step": 3275 }, { "epoch": 0.41984, "grad_norm": 12.68402099609375, "learning_rate": 8.976926618403395e-06, "loss": 4.2578, "step": 3280 }, { "epoch": 0.42048, "grad_norm": 3.3316643238067627, "learning_rate": 8.973868007870001e-06, "loss": 4.3723, "step": 3285 }, { "epoch": 0.42112, "grad_norm": 3.6438074111938477, "learning_rate": 8.970805354810676e-06, "loss": 4.3699, "step": 3290 }, { "epoch": 0.42176, "grad_norm": 2.941915988922119, "learning_rate": 8.967738662340985e-06, "loss": 4.2281, "step": 3295 }, { "epoch": 0.4224, "grad_norm": 3.2409346103668213, "learning_rate": 8.96466793358061e-06, "loss": 4.3198, "step": 3300 }, { "epoch": 0.4224, "eval_loss": 1.0934110879898071, "eval_runtime": 7.852, "eval_samples_per_second": 127.356, "eval_steps_per_second": 15.92, "step": 3300 }, { "epoch": 0.42304, "grad_norm": 3.6601648330688477, "learning_rate": 8.961593171653329e-06, "loss": 4.4013, "step": 3305 }, { "epoch": 0.42368, "grad_norm": 3.4598097801208496, "learning_rate": 8.95851437968703e-06, "loss": 4.3119, "step": 3310 }, { "epoch": 0.42432, "grad_norm": 6.440554141998291, "learning_rate": 8.955431560813698e-06, "loss": 4.2568, "step": 3315 }, { "epoch": 0.42496, "grad_norm": 3.2681210041046143, "learning_rate": 8.952344718169415e-06, "loss": 4.4502, "step": 3320 }, { "epoch": 0.4256, "grad_norm": 3.3809800148010254, "learning_rate": 8.949253854894356e-06, "loss": 4.3411, "step": 3325 }, { "epoch": 0.42624, "grad_norm": 3.58083438873291, "learning_rate": 8.946158974132783e-06, "loss": 4.2869, "step": 3330 }, { "epoch": 0.42688, "grad_norm": 3.4199907779693604, "learning_rate": 8.943060079033054e-06, "loss": 4.3011, "step": 3335 }, { "epoch": 0.42752, "grad_norm": 3.229564905166626, "learning_rate": 8.939957172747602e-06, "loss": 4.3878, "step": 3340 }, { "epoch": 0.42816, "grad_norm": 3.4797444343566895, "learning_rate": 8.936850258432943e-06, "loss": 4.3084, "step": 3345 }, { "epoch": 0.4288, "grad_norm": 3.4006948471069336, "learning_rate": 8.933739339249669e-06, "loss": 4.3248, "step": 3350 }, { "epoch": 0.42944, "grad_norm": 3.5028109550476074, "learning_rate": 8.930624418362452e-06, "loss": 4.1799, "step": 3355 }, { "epoch": 0.43008, "grad_norm": 3.072166681289673, "learning_rate": 8.927505498940027e-06, "loss": 4.3052, "step": 3360 }, { "epoch": 0.43072, "grad_norm": 3.346359968185425, "learning_rate": 8.9243825841552e-06, "loss": 4.4108, "step": 3365 }, { "epoch": 0.43136, "grad_norm": 3.1402132511138916, "learning_rate": 8.921255677184844e-06, "loss": 4.4025, "step": 3370 }, { "epoch": 0.432, "grad_norm": 3.7172560691833496, "learning_rate": 8.918124781209889e-06, "loss": 4.426, "step": 3375 }, { "epoch": 0.43264, "grad_norm": 3.6983797550201416, "learning_rate": 8.914989899415323e-06, "loss": 4.4391, "step": 3380 }, { "epoch": 0.43328, "grad_norm": 3.75880765914917, "learning_rate": 8.911851034990194e-06, "loss": 4.4118, "step": 3385 }, { "epoch": 0.43392, "grad_norm": 3.346653699874878, "learning_rate": 8.908708191127596e-06, "loss": 4.3974, "step": 3390 }, { "epoch": 0.43456, "grad_norm": 3.2262954711914062, "learning_rate": 8.90556137102467e-06, "loss": 4.3007, "step": 3395 }, { "epoch": 0.4352, "grad_norm": 3.443249225616455, "learning_rate": 8.90241057788261e-06, "loss": 4.3532, "step": 3400 }, { "epoch": 0.4352, "eval_loss": 1.0824164152145386, "eval_runtime": 8.7963, "eval_samples_per_second": 113.684, "eval_steps_per_second": 14.21, "step": 3400 }, { "epoch": 0.43584, "grad_norm": 3.880614757537842, "learning_rate": 8.899255814906643e-06, "loss": 4.3109, "step": 3405 }, { "epoch": 0.43648, "grad_norm": 3.4008679389953613, "learning_rate": 8.896097085306036e-06, "loss": 4.3569, "step": 3410 }, { "epoch": 0.43712, "grad_norm": 3.386129856109619, "learning_rate": 8.8929343922941e-06, "loss": 4.4246, "step": 3415 }, { "epoch": 0.43776, "grad_norm": 3.3957948684692383, "learning_rate": 8.889767739088165e-06, "loss": 4.3674, "step": 3420 }, { "epoch": 0.4384, "grad_norm": 3.2077996730804443, "learning_rate": 8.886597128909598e-06, "loss": 4.1689, "step": 3425 }, { "epoch": 0.43904, "grad_norm": 2.9480648040771484, "learning_rate": 8.883422564983789e-06, "loss": 4.3577, "step": 3430 }, { "epoch": 0.43968, "grad_norm": 3.6000452041625977, "learning_rate": 8.880244050540147e-06, "loss": 4.2518, "step": 3435 }, { "epoch": 0.44032, "grad_norm": 3.707974672317505, "learning_rate": 8.877061588812107e-06, "loss": 4.2916, "step": 3440 }, { "epoch": 0.44096, "grad_norm": 3.2383322715759277, "learning_rate": 8.873875183037115e-06, "loss": 4.2048, "step": 3445 }, { "epoch": 0.4416, "grad_norm": 3.4959189891815186, "learning_rate": 8.870684836456625e-06, "loss": 4.2896, "step": 3450 }, { "epoch": 0.44224, "grad_norm": 3.520314931869507, "learning_rate": 8.867490552316109e-06, "loss": 4.3005, "step": 3455 }, { "epoch": 0.44288, "grad_norm": 3.2967355251312256, "learning_rate": 8.864292333865037e-06, "loss": 4.2496, "step": 3460 }, { "epoch": 0.44352, "grad_norm": 3.222911834716797, "learning_rate": 8.861090184356887e-06, "loss": 4.2884, "step": 3465 }, { "epoch": 0.44416, "grad_norm": 3.0785837173461914, "learning_rate": 8.857884107049128e-06, "loss": 4.2479, "step": 3470 }, { "epoch": 0.4448, "grad_norm": 3.3931188583374023, "learning_rate": 8.854674105203236e-06, "loss": 4.2806, "step": 3475 }, { "epoch": 0.44544, "grad_norm": 3.117079257965088, "learning_rate": 8.85146018208467e-06, "loss": 4.309, "step": 3480 }, { "epoch": 0.44608, "grad_norm": 3.419799566268921, "learning_rate": 8.848242340962882e-06, "loss": 4.2736, "step": 3485 }, { "epoch": 0.44672, "grad_norm": 3.147315740585327, "learning_rate": 8.845020585111307e-06, "loss": 4.4178, "step": 3490 }, { "epoch": 0.44736, "grad_norm": 3.2093615531921387, "learning_rate": 8.841794917807369e-06, "loss": 4.2141, "step": 3495 }, { "epoch": 0.448, "grad_norm": 3.126490354537964, "learning_rate": 8.838565342332462e-06, "loss": 4.3183, "step": 3500 }, { "epoch": 0.448, "eval_loss": 1.0745737552642822, "eval_runtime": 6.765, "eval_samples_per_second": 147.82, "eval_steps_per_second": 18.478, "step": 3500 }, { "epoch": 0.44864, "grad_norm": 3.4378812313079834, "learning_rate": 8.83533186197196e-06, "loss": 4.3091, "step": 3505 }, { "epoch": 0.44928, "grad_norm": 3.225221633911133, "learning_rate": 8.832094480015211e-06, "loss": 4.2099, "step": 3510 }, { "epoch": 0.44992, "grad_norm": 3.491705894470215, "learning_rate": 8.82885319975553e-06, "loss": 4.3552, "step": 3515 }, { "epoch": 0.45056, "grad_norm": 3.602252960205078, "learning_rate": 8.825608024490198e-06, "loss": 4.3082, "step": 3520 }, { "epoch": 0.4512, "grad_norm": 3.412418842315674, "learning_rate": 8.822358957520459e-06, "loss": 4.1762, "step": 3525 }, { "epoch": 0.45184, "grad_norm": 3.254765272140503, "learning_rate": 8.819106002151513e-06, "loss": 4.334, "step": 3530 }, { "epoch": 0.45248, "grad_norm": 3.0931169986724854, "learning_rate": 8.81584916169252e-06, "loss": 4.1329, "step": 3535 }, { "epoch": 0.45312, "grad_norm": 3.564138889312744, "learning_rate": 8.812588439456588e-06, "loss": 4.157, "step": 3540 }, { "epoch": 0.45376, "grad_norm": 3.654914140701294, "learning_rate": 8.809323838760778e-06, "loss": 4.3768, "step": 3545 }, { "epoch": 0.4544, "grad_norm": 3.495839834213257, "learning_rate": 8.806055362926093e-06, "loss": 4.224, "step": 3550 }, { "epoch": 0.45504, "grad_norm": 3.4077813625335693, "learning_rate": 8.802783015277483e-06, "loss": 4.3817, "step": 3555 }, { "epoch": 0.45568, "grad_norm": 3.136080741882324, "learning_rate": 8.799506799143826e-06, "loss": 4.2902, "step": 3560 }, { "epoch": 0.45632, "grad_norm": 3.2735695838928223, "learning_rate": 8.79622671785795e-06, "loss": 4.1645, "step": 3565 }, { "epoch": 0.45696, "grad_norm": 3.584965944290161, "learning_rate": 8.792942774756602e-06, "loss": 4.4454, "step": 3570 }, { "epoch": 0.4576, "grad_norm": 3.2631680965423584, "learning_rate": 8.789654973180465e-06, "loss": 4.349, "step": 3575 }, { "epoch": 0.45824, "grad_norm": 3.2688302993774414, "learning_rate": 8.786363316474147e-06, "loss": 4.3671, "step": 3580 }, { "epoch": 0.45888, "grad_norm": 4.0966572761535645, "learning_rate": 8.783067807986172e-06, "loss": 4.3114, "step": 3585 }, { "epoch": 0.45952, "grad_norm": 3.381613254547119, "learning_rate": 8.779768451068988e-06, "loss": 4.2705, "step": 3590 }, { "epoch": 0.46016, "grad_norm": 3.2401769161224365, "learning_rate": 8.776465249078958e-06, "loss": 4.3132, "step": 3595 }, { "epoch": 0.4608, "grad_norm": 3.099407911300659, "learning_rate": 8.773158205376351e-06, "loss": 4.1963, "step": 3600 }, { "epoch": 0.4608, "eval_loss": 1.0701223611831665, "eval_runtime": 6.7989, "eval_samples_per_second": 147.084, "eval_steps_per_second": 18.385, "step": 3600 }, { "epoch": 0.46144, "grad_norm": 3.2464311122894287, "learning_rate": 8.76984732332535e-06, "loss": 4.309, "step": 3605 }, { "epoch": 0.46208, "grad_norm": 3.6688852310180664, "learning_rate": 8.76653260629404e-06, "loss": 4.2009, "step": 3610 }, { "epoch": 0.46272, "grad_norm": 3.16396164894104, "learning_rate": 8.763214057654405e-06, "loss": 4.3162, "step": 3615 }, { "epoch": 0.46336, "grad_norm": 3.2529261112213135, "learning_rate": 8.759891680782336e-06, "loss": 4.3083, "step": 3620 }, { "epoch": 0.464, "grad_norm": 3.212313175201416, "learning_rate": 8.756565479057604e-06, "loss": 4.3336, "step": 3625 }, { "epoch": 0.46464, "grad_norm": 3.034917116165161, "learning_rate": 8.753235455863883e-06, "loss": 4.3935, "step": 3630 }, { "epoch": 0.46528, "grad_norm": 3.362323045730591, "learning_rate": 8.749901614588728e-06, "loss": 4.1964, "step": 3635 }, { "epoch": 0.46592, "grad_norm": 3.1814639568328857, "learning_rate": 8.746563958623584e-06, "loss": 4.3555, "step": 3640 }, { "epoch": 0.46656, "grad_norm": 3.348071336746216, "learning_rate": 8.743222491363767e-06, "loss": 4.3297, "step": 3645 }, { "epoch": 0.4672, "grad_norm": 3.4134304523468018, "learning_rate": 8.739877216208483e-06, "loss": 4.1711, "step": 3650 }, { "epoch": 0.46784, "grad_norm": 3.3224220275878906, "learning_rate": 8.736528136560798e-06, "loss": 4.4583, "step": 3655 }, { "epoch": 0.46848, "grad_norm": 3.0759694576263428, "learning_rate": 8.73317525582766e-06, "loss": 4.1875, "step": 3660 }, { "epoch": 0.46912, "grad_norm": 3.429316997528076, "learning_rate": 8.729818577419875e-06, "loss": 4.2585, "step": 3665 }, { "epoch": 0.46976, "grad_norm": 3.197808265686035, "learning_rate": 8.72645810475212e-06, "loss": 4.3153, "step": 3670 }, { "epoch": 0.4704, "grad_norm": 3.236347198486328, "learning_rate": 8.723093841242922e-06, "loss": 4.2445, "step": 3675 }, { "epoch": 0.47104, "grad_norm": 3.3201181888580322, "learning_rate": 8.719725790314675e-06, "loss": 4.2265, "step": 3680 }, { "epoch": 0.47168, "grad_norm": 3.4637041091918945, "learning_rate": 8.716353955393618e-06, "loss": 4.1416, "step": 3685 }, { "epoch": 0.47232, "grad_norm": 3.4432976245880127, "learning_rate": 8.712978339909845e-06, "loss": 4.2733, "step": 3690 }, { "epoch": 0.47296, "grad_norm": 3.350703239440918, "learning_rate": 8.709598947297291e-06, "loss": 4.2555, "step": 3695 }, { "epoch": 0.4736, "grad_norm": 3.470686197280884, "learning_rate": 8.706215780993735e-06, "loss": 4.2987, "step": 3700 }, { "epoch": 0.4736, "eval_loss": 1.0674490928649902, "eval_runtime": 6.691, "eval_samples_per_second": 149.454, "eval_steps_per_second": 18.682, "step": 3700 }, { "epoch": 0.47424, "grad_norm": 3.153869390487671, "learning_rate": 8.702828844440798e-06, "loss": 4.1894, "step": 3705 }, { "epoch": 0.47488, "grad_norm": 3.3305816650390625, "learning_rate": 8.699438141083933e-06, "loss": 4.3086, "step": 3710 }, { "epoch": 0.47552, "grad_norm": 3.200063705444336, "learning_rate": 8.696043674372424e-06, "loss": 4.1944, "step": 3715 }, { "epoch": 0.47616, "grad_norm": 3.437494993209839, "learning_rate": 8.692645447759387e-06, "loss": 4.219, "step": 3720 }, { "epoch": 0.4768, "grad_norm": 3.2587502002716064, "learning_rate": 8.68924346470176e-06, "loss": 4.1808, "step": 3725 }, { "epoch": 0.47744, "grad_norm": 3.0455007553100586, "learning_rate": 8.685837728660305e-06, "loss": 4.142, "step": 3730 }, { "epoch": 0.47808, "grad_norm": 3.3263583183288574, "learning_rate": 8.6824282430996e-06, "loss": 4.2156, "step": 3735 }, { "epoch": 0.47872, "grad_norm": 3.3989768028259277, "learning_rate": 8.679015011488032e-06, "loss": 4.2058, "step": 3740 }, { "epoch": 0.47936, "grad_norm": 3.58839750289917, "learning_rate": 8.675598037297812e-06, "loss": 4.2954, "step": 3745 }, { "epoch": 0.48, "grad_norm": 3.5100045204162598, "learning_rate": 8.672177324004946e-06, "loss": 4.3007, "step": 3750 }, { "epoch": 0.48064, "grad_norm": 3.6057488918304443, "learning_rate": 8.668752875089248e-06, "loss": 4.4091, "step": 3755 }, { "epoch": 0.48128, "grad_norm": 3.4816482067108154, "learning_rate": 8.665324694034335e-06, "loss": 4.2736, "step": 3760 }, { "epoch": 0.48192, "grad_norm": 3.406355857849121, "learning_rate": 8.661892784327616e-06, "loss": 4.3212, "step": 3765 }, { "epoch": 0.48256, "grad_norm": 3.6660380363464355, "learning_rate": 8.658457149460296e-06, "loss": 4.1134, "step": 3770 }, { "epoch": 0.4832, "grad_norm": 3.302281618118286, "learning_rate": 8.655017792927367e-06, "loss": 4.2658, "step": 3775 }, { "epoch": 0.48384, "grad_norm": 3.288099765777588, "learning_rate": 8.65157471822761e-06, "loss": 4.298, "step": 3780 }, { "epoch": 0.48448, "grad_norm": 4.572479248046875, "learning_rate": 8.648127928863586e-06, "loss": 4.2769, "step": 3785 }, { "epoch": 0.48512, "grad_norm": 3.6005682945251465, "learning_rate": 8.644677428341637e-06, "loss": 4.2542, "step": 3790 }, { "epoch": 0.48576, "grad_norm": 3.3716723918914795, "learning_rate": 8.641223220171877e-06, "loss": 4.2514, "step": 3795 }, { "epoch": 0.4864, "grad_norm": 3.3423805236816406, "learning_rate": 8.637765307868197e-06, "loss": 4.1449, "step": 3800 }, { "epoch": 0.4864, "eval_loss": 1.071823239326477, "eval_runtime": 7.6566, "eval_samples_per_second": 130.606, "eval_steps_per_second": 16.326, "step": 3800 }, { "epoch": 0.48704, "grad_norm": 3.4172847270965576, "learning_rate": 8.634303694948249e-06, "loss": 4.1549, "step": 3805 }, { "epoch": 0.48768, "grad_norm": 3.3541908264160156, "learning_rate": 8.630838384933456e-06, "loss": 4.5025, "step": 3810 }, { "epoch": 0.48832, "grad_norm": 3.3319921493530273, "learning_rate": 8.627369381349e-06, "loss": 4.2343, "step": 3815 }, { "epoch": 0.48896, "grad_norm": 3.2183337211608887, "learning_rate": 8.623896687723817e-06, "loss": 4.209, "step": 3820 }, { "epoch": 0.4896, "grad_norm": 3.541289806365967, "learning_rate": 8.6204203075906e-06, "loss": 4.2734, "step": 3825 }, { "epoch": 0.49024, "grad_norm": 3.3665432929992676, "learning_rate": 8.616940244485794e-06, "loss": 4.3104, "step": 3830 }, { "epoch": 0.49088, "grad_norm": 3.4722721576690674, "learning_rate": 8.61345650194959e-06, "loss": 4.2407, "step": 3835 }, { "epoch": 0.49152, "grad_norm": 3.581308126449585, "learning_rate": 8.609969083525913e-06, "loss": 4.2074, "step": 3840 }, { "epoch": 0.49216, "grad_norm": 4.030796051025391, "learning_rate": 8.606477992762442e-06, "loss": 4.2008, "step": 3845 }, { "epoch": 0.4928, "grad_norm": 3.2765376567840576, "learning_rate": 8.602983233210582e-06, "loss": 4.2747, "step": 3850 }, { "epoch": 0.49344, "grad_norm": 3.3987598419189453, "learning_rate": 8.599484808425471e-06, "loss": 4.3367, "step": 3855 }, { "epoch": 0.49408, "grad_norm": 3.40262508392334, "learning_rate": 8.59598272196598e-06, "loss": 4.2551, "step": 3860 }, { "epoch": 0.49472, "grad_norm": 3.381237268447876, "learning_rate": 8.592476977394703e-06, "loss": 4.2735, "step": 3865 }, { "epoch": 0.49536, "grad_norm": 3.3735103607177734, "learning_rate": 8.588967578277952e-06, "loss": 4.2059, "step": 3870 }, { "epoch": 0.496, "grad_norm": 3.5227203369140625, "learning_rate": 8.585454528185758e-06, "loss": 4.2452, "step": 3875 }, { "epoch": 0.49664, "grad_norm": 3.357236623764038, "learning_rate": 8.58193783069187e-06, "loss": 4.4393, "step": 3880 }, { "epoch": 0.49728, "grad_norm": 3.4981918334960938, "learning_rate": 8.578417489373747e-06, "loss": 4.3561, "step": 3885 }, { "epoch": 0.49792, "grad_norm": 3.3115687370300293, "learning_rate": 8.574893507812548e-06, "loss": 4.2126, "step": 3890 }, { "epoch": 0.49856, "grad_norm": 3.4311251640319824, "learning_rate": 8.571365889593139e-06, "loss": 4.2719, "step": 3895 }, { "epoch": 0.4992, "grad_norm": 3.6359755992889404, "learning_rate": 8.56783463830409e-06, "loss": 4.2107, "step": 3900 }, { "epoch": 0.4992, "eval_loss": 1.0585970878601074, "eval_runtime": 6.76, "eval_samples_per_second": 147.928, "eval_steps_per_second": 18.491, "step": 3900 }, { "epoch": 0.49984, "grad_norm": 3.3010311126708984, "learning_rate": 8.564299757537663e-06, "loss": 4.1933, "step": 3905 }, { "epoch": 0.50048, "grad_norm": 3.1731021404266357, "learning_rate": 8.560761250889808e-06, "loss": 4.2303, "step": 3910 }, { "epoch": 0.50112, "grad_norm": 3.087322950363159, "learning_rate": 8.557219121960173e-06, "loss": 4.1189, "step": 3915 }, { "epoch": 0.50176, "grad_norm": 3.6227645874023438, "learning_rate": 8.553673374352081e-06, "loss": 4.438, "step": 3920 }, { "epoch": 0.5024, "grad_norm": 3.2071166038513184, "learning_rate": 8.550124011672543e-06, "loss": 4.2284, "step": 3925 }, { "epoch": 0.50304, "grad_norm": 3.6163535118103027, "learning_rate": 8.546571037532244e-06, "loss": 4.2896, "step": 3930 }, { "epoch": 0.50368, "grad_norm": 3.202073335647583, "learning_rate": 8.543014455545545e-06, "loss": 4.1561, "step": 3935 }, { "epoch": 0.50432, "grad_norm": 3.4429636001586914, "learning_rate": 8.539454269330476e-06, "loss": 4.1965, "step": 3940 }, { "epoch": 0.50496, "grad_norm": 3.5354838371276855, "learning_rate": 8.535890482508735e-06, "loss": 4.2258, "step": 3945 }, { "epoch": 0.5056, "grad_norm": 4.535614967346191, "learning_rate": 8.532323098705679e-06, "loss": 4.2261, "step": 3950 }, { "epoch": 0.50624, "grad_norm": 3.529849052429199, "learning_rate": 8.52875212155033e-06, "loss": 4.2718, "step": 3955 }, { "epoch": 0.50688, "grad_norm": 3.2982187271118164, "learning_rate": 8.525177554675361e-06, "loss": 4.0328, "step": 3960 }, { "epoch": 0.50752, "grad_norm": 3.3150134086608887, "learning_rate": 8.521599401717095e-06, "loss": 4.1423, "step": 3965 }, { "epoch": 0.50816, "grad_norm": 3.494948387145996, "learning_rate": 8.51801766631551e-06, "loss": 4.2847, "step": 3970 }, { "epoch": 0.5088, "grad_norm": 3.396127700805664, "learning_rate": 8.514432352114224e-06, "loss": 4.4652, "step": 3975 }, { "epoch": 0.50944, "grad_norm": 3.2317397594451904, "learning_rate": 8.510843462760494e-06, "loss": 4.2053, "step": 3980 }, { "epoch": 0.51008, "grad_norm": 3.7028400897979736, "learning_rate": 8.507251001905216e-06, "loss": 4.2045, "step": 3985 }, { "epoch": 0.51072, "grad_norm": 3.373598337173462, "learning_rate": 8.50365497320292e-06, "loss": 4.0433, "step": 3990 }, { "epoch": 0.51136, "grad_norm": 3.230725049972534, "learning_rate": 8.500055380311763e-06, "loss": 4.2078, "step": 3995 }, { "epoch": 0.512, "grad_norm": 3.5658986568450928, "learning_rate": 8.496452226893533e-06, "loss": 4.281, "step": 4000 }, { "epoch": 0.512, "eval_loss": 1.0626306533813477, "eval_runtime": 7.4047, "eval_samples_per_second": 135.049, "eval_steps_per_second": 16.881, "step": 4000 }, { "epoch": 0.51264, "grad_norm": 3.789924144744873, "learning_rate": 8.492845516613632e-06, "loss": 4.2659, "step": 4005 }, { "epoch": 0.51328, "grad_norm": 3.218669891357422, "learning_rate": 8.489235253141088e-06, "loss": 4.1622, "step": 4010 }, { "epoch": 0.51392, "grad_norm": 3.1417505741119385, "learning_rate": 8.485621440148538e-06, "loss": 4.3878, "step": 4015 }, { "epoch": 0.51456, "grad_norm": 3.402475118637085, "learning_rate": 8.482004081312234e-06, "loss": 4.2486, "step": 4020 }, { "epoch": 0.5152, "grad_norm": 3.214193820953369, "learning_rate": 8.47838318031203e-06, "loss": 4.2596, "step": 4025 }, { "epoch": 0.51584, "grad_norm": 3.7518293857574463, "learning_rate": 8.47475874083139e-06, "loss": 4.1408, "step": 4030 }, { "epoch": 0.51648, "grad_norm": 3.5070385932922363, "learning_rate": 8.471130766557373e-06, "loss": 4.3088, "step": 4035 }, { "epoch": 0.51712, "grad_norm": 3.1660029888153076, "learning_rate": 8.467499261180636e-06, "loss": 4.237, "step": 4040 }, { "epoch": 0.51776, "grad_norm": 3.3773226737976074, "learning_rate": 8.463864228395426e-06, "loss": 4.2198, "step": 4045 }, { "epoch": 0.5184, "grad_norm": 3.256511926651001, "learning_rate": 8.46022567189958e-06, "loss": 4.21, "step": 4050 }, { "epoch": 0.51904, "grad_norm": 3.1459224224090576, "learning_rate": 8.456583595394519e-06, "loss": 4.3253, "step": 4055 }, { "epoch": 0.51968, "grad_norm": 3.153252363204956, "learning_rate": 8.452938002585243e-06, "loss": 4.2464, "step": 4060 }, { "epoch": 0.52032, "grad_norm": 3.502012014389038, "learning_rate": 8.449288897180335e-06, "loss": 4.1066, "step": 4065 }, { "epoch": 0.52096, "grad_norm": 3.99371075630188, "learning_rate": 8.445636282891945e-06, "loss": 4.1363, "step": 4070 }, { "epoch": 0.5216, "grad_norm": 3.0267117023468018, "learning_rate": 8.441980163435793e-06, "loss": 4.3352, "step": 4075 }, { "epoch": 0.52224, "grad_norm": 3.478861093521118, "learning_rate": 8.43832054253117e-06, "loss": 4.2754, "step": 4080 }, { "epoch": 0.52288, "grad_norm": 3.4219508171081543, "learning_rate": 8.434657423900925e-06, "loss": 4.2462, "step": 4085 }, { "epoch": 0.52352, "grad_norm": 3.4356322288513184, "learning_rate": 8.430990811271464e-06, "loss": 4.2702, "step": 4090 }, { "epoch": 0.52416, "grad_norm": 3.3565707206726074, "learning_rate": 8.427320708372749e-06, "loss": 4.3038, "step": 4095 }, { "epoch": 0.5248, "grad_norm": 3.221428871154785, "learning_rate": 8.423647118938293e-06, "loss": 4.1941, "step": 4100 }, { "epoch": 0.5248, "eval_loss": 1.0621204376220703, "eval_runtime": 6.8853, "eval_samples_per_second": 145.237, "eval_steps_per_second": 18.155, "step": 4100 }, { "epoch": 0.52544, "grad_norm": 3.1570796966552734, "learning_rate": 8.419970046705155e-06, "loss": 4.2666, "step": 4105 }, { "epoch": 0.52608, "grad_norm": 3.1513359546661377, "learning_rate": 8.416289495413939e-06, "loss": 4.2907, "step": 4110 }, { "epoch": 0.52672, "grad_norm": 3.519578456878662, "learning_rate": 8.412605468808786e-06, "loss": 4.1469, "step": 4115 }, { "epoch": 0.52736, "grad_norm": 3.4287312030792236, "learning_rate": 8.408917970637372e-06, "loss": 4.3648, "step": 4120 }, { "epoch": 0.528, "grad_norm": 3.1880617141723633, "learning_rate": 8.405227004650903e-06, "loss": 4.265, "step": 4125 }, { "epoch": 0.52864, "grad_norm": 3.3588428497314453, "learning_rate": 8.40153257460412e-06, "loss": 4.1882, "step": 4130 }, { "epoch": 0.52928, "grad_norm": 3.5101263523101807, "learning_rate": 8.397834684255279e-06, "loss": 4.2479, "step": 4135 }, { "epoch": 0.52992, "grad_norm": 3.492445945739746, "learning_rate": 8.394133337366164e-06, "loss": 4.0681, "step": 4140 }, { "epoch": 0.53056, "grad_norm": 3.169119358062744, "learning_rate": 8.390428537702066e-06, "loss": 4.2595, "step": 4145 }, { "epoch": 0.5312, "grad_norm": 3.325134038925171, "learning_rate": 8.3867202890318e-06, "loss": 4.1674, "step": 4150 }, { "epoch": 0.53184, "grad_norm": 3.290656805038452, "learning_rate": 8.38300859512768e-06, "loss": 4.3233, "step": 4155 }, { "epoch": 0.53248, "grad_norm": 3.161649465560913, "learning_rate": 8.379293459765527e-06, "loss": 4.2074, "step": 4160 }, { "epoch": 0.53312, "grad_norm": 3.6237411499023438, "learning_rate": 8.375574886724666e-06, "loss": 4.1539, "step": 4165 }, { "epoch": 0.53376, "grad_norm": 3.308765172958374, "learning_rate": 8.371852879787917e-06, "loss": 4.2598, "step": 4170 }, { "epoch": 0.5344, "grad_norm": 3.173496723175049, "learning_rate": 8.368127442741592e-06, "loss": 4.2533, "step": 4175 }, { "epoch": 0.53504, "grad_norm": 3.7781951427459717, "learning_rate": 8.364398579375496e-06, "loss": 4.253, "step": 4180 }, { "epoch": 0.53568, "grad_norm": 3.1767258644104004, "learning_rate": 8.360666293482915e-06, "loss": 4.2968, "step": 4185 }, { "epoch": 0.53632, "grad_norm": 3.150240659713745, "learning_rate": 8.356930588860622e-06, "loss": 4.1578, "step": 4190 }, { "epoch": 0.53696, "grad_norm": 3.2313241958618164, "learning_rate": 8.35319146930886e-06, "loss": 4.33, "step": 4195 }, { "epoch": 0.5376, "grad_norm": 3.2731451988220215, "learning_rate": 8.349448938631354e-06, "loss": 4.3148, "step": 4200 }, { "epoch": 0.5376, "eval_loss": 1.0485703945159912, "eval_runtime": 6.9069, "eval_samples_per_second": 144.782, "eval_steps_per_second": 18.098, "step": 4200 }, { "epoch": 0.53824, "grad_norm": 3.3490631580352783, "learning_rate": 8.345703000635297e-06, "loss": 4.1731, "step": 4205 }, { "epoch": 0.53888, "grad_norm": 3.5214695930480957, "learning_rate": 8.341953659131343e-06, "loss": 4.2613, "step": 4210 }, { "epoch": 0.53952, "grad_norm": 4.7053632736206055, "learning_rate": 8.338200917933616e-06, "loss": 4.2745, "step": 4215 }, { "epoch": 0.54016, "grad_norm": 3.4698004722595215, "learning_rate": 8.334444780859689e-06, "loss": 4.1812, "step": 4220 }, { "epoch": 0.5408, "grad_norm": 3.492169141769409, "learning_rate": 8.330685251730603e-06, "loss": 4.3112, "step": 4225 }, { "epoch": 0.54144, "grad_norm": 3.470062017440796, "learning_rate": 8.326922334370835e-06, "loss": 4.2367, "step": 4230 }, { "epoch": 0.54208, "grad_norm": 3.188481330871582, "learning_rate": 8.32315603260832e-06, "loss": 4.2524, "step": 4235 }, { "epoch": 0.54272, "grad_norm": 3.286050319671631, "learning_rate": 8.31938635027443e-06, "loss": 4.1992, "step": 4240 }, { "epoch": 0.54336, "grad_norm": 3.456850051879883, "learning_rate": 8.315613291203977e-06, "loss": 4.267, "step": 4245 }, { "epoch": 0.544, "grad_norm": 3.5051589012145996, "learning_rate": 8.311836859235208e-06, "loss": 4.1083, "step": 4250 }, { "epoch": 0.54464, "grad_norm": 3.4664196968078613, "learning_rate": 8.308057058209803e-06, "loss": 4.2551, "step": 4255 }, { "epoch": 0.54528, "grad_norm": 3.6029019355773926, "learning_rate": 8.304273891972869e-06, "loss": 4.2654, "step": 4260 }, { "epoch": 0.54592, "grad_norm": 3.2132017612457275, "learning_rate": 8.300487364372934e-06, "loss": 4.1052, "step": 4265 }, { "epoch": 0.54656, "grad_norm": 3.590670585632324, "learning_rate": 8.296697479261944e-06, "loss": 4.2485, "step": 4270 }, { "epoch": 0.5472, "grad_norm": 3.327327013015747, "learning_rate": 8.292904240495267e-06, "loss": 4.1854, "step": 4275 }, { "epoch": 0.54784, "grad_norm": 3.534122943878174, "learning_rate": 8.28910765193168e-06, "loss": 4.192, "step": 4280 }, { "epoch": 0.54848, "grad_norm": 3.2001917362213135, "learning_rate": 8.285307717433363e-06, "loss": 4.2273, "step": 4285 }, { "epoch": 0.54912, "grad_norm": 3.3666787147521973, "learning_rate": 8.281504440865905e-06, "loss": 4.2344, "step": 4290 }, { "epoch": 0.54976, "grad_norm": 3.4703445434570312, "learning_rate": 8.277697826098291e-06, "loss": 4.114, "step": 4295 }, { "epoch": 0.5504, "grad_norm": 3.424564838409424, "learning_rate": 8.27388787700291e-06, "loss": 4.195, "step": 4300 }, { "epoch": 0.5504, "eval_loss": 1.0426100492477417, "eval_runtime": 6.7447, "eval_samples_per_second": 148.265, "eval_steps_per_second": 18.533, "step": 4300 }, { "epoch": 0.55104, "grad_norm": 3.4007487297058105, "learning_rate": 8.27007459745553e-06, "loss": 4.2053, "step": 4305 }, { "epoch": 0.55168, "grad_norm": 3.2374963760375977, "learning_rate": 8.266257991335316e-06, "loss": 4.1603, "step": 4310 }, { "epoch": 0.55232, "grad_norm": 3.1317055225372314, "learning_rate": 8.262438062524817e-06, "loss": 4.3015, "step": 4315 }, { "epoch": 0.55296, "grad_norm": 3.174532890319824, "learning_rate": 8.25861481490996e-06, "loss": 4.2574, "step": 4320 }, { "epoch": 0.5536, "grad_norm": 3.3424572944641113, "learning_rate": 8.254788252380046e-06, "loss": 4.0812, "step": 4325 }, { "epoch": 0.55424, "grad_norm": 3.1913397312164307, "learning_rate": 8.250958378827752e-06, "loss": 4.2152, "step": 4330 }, { "epoch": 0.55488, "grad_norm": 3.294013500213623, "learning_rate": 8.24712519814912e-06, "loss": 4.1572, "step": 4335 }, { "epoch": 0.55552, "grad_norm": 3.1539976596832275, "learning_rate": 8.24328871424356e-06, "loss": 4.2349, "step": 4340 }, { "epoch": 0.55616, "grad_norm": 3.6340556144714355, "learning_rate": 8.239448931013839e-06, "loss": 4.262, "step": 4345 }, { "epoch": 0.5568, "grad_norm": 3.097135305404663, "learning_rate": 8.235605852366082e-06, "loss": 4.2263, "step": 4350 }, { "epoch": 0.55744, "grad_norm": 3.1234707832336426, "learning_rate": 8.231759482209764e-06, "loss": 4.1102, "step": 4355 }, { "epoch": 0.55808, "grad_norm": 3.3812243938446045, "learning_rate": 8.227909824457714e-06, "loss": 4.1727, "step": 4360 }, { "epoch": 0.55872, "grad_norm": 3.526226043701172, "learning_rate": 8.224056883026097e-06, "loss": 4.0455, "step": 4365 }, { "epoch": 0.55936, "grad_norm": 3.2296152114868164, "learning_rate": 8.220200661834428e-06, "loss": 4.2122, "step": 4370 }, { "epoch": 0.56, "grad_norm": 3.571004629135132, "learning_rate": 8.216341164805547e-06, "loss": 4.2038, "step": 4375 }, { "epoch": 0.56064, "grad_norm": 4.520260334014893, "learning_rate": 8.212478395865642e-06, "loss": 4.2543, "step": 4380 }, { "epoch": 0.56128, "grad_norm": 3.549973249435425, "learning_rate": 8.208612358944212e-06, "loss": 4.3407, "step": 4385 }, { "epoch": 0.56192, "grad_norm": 3.5902371406555176, "learning_rate": 8.204743057974093e-06, "loss": 4.1843, "step": 4390 }, { "epoch": 0.56256, "grad_norm": 3.5541188716888428, "learning_rate": 8.200870496891437e-06, "loss": 4.1567, "step": 4395 }, { "epoch": 0.5632, "grad_norm": 3.159623146057129, "learning_rate": 8.196994679635713e-06, "loss": 4.2037, "step": 4400 }, { "epoch": 0.5632, "eval_loss": 1.0485684871673584, "eval_runtime": 6.7526, "eval_samples_per_second": 148.091, "eval_steps_per_second": 18.511, "step": 4400 }, { "epoch": 0.56384, "grad_norm": 3.462564468383789, "learning_rate": 8.1931156101497e-06, "loss": 4.2367, "step": 4405 }, { "epoch": 0.56448, "grad_norm": 3.467090606689453, "learning_rate": 8.189233292379488e-06, "loss": 4.1135, "step": 4410 }, { "epoch": 0.56512, "grad_norm": 3.3077733516693115, "learning_rate": 8.185347730274471e-06, "loss": 4.1801, "step": 4415 }, { "epoch": 0.56576, "grad_norm": 3.233238697052002, "learning_rate": 8.181458927787347e-06, "loss": 4.1566, "step": 4420 }, { "epoch": 0.5664, "grad_norm": 3.4241669178009033, "learning_rate": 8.177566888874101e-06, "loss": 4.2094, "step": 4425 }, { "epoch": 0.56704, "grad_norm": 3.7361552715301514, "learning_rate": 8.17367161749402e-06, "loss": 4.0974, "step": 4430 }, { "epoch": 0.56768, "grad_norm": 3.274073362350464, "learning_rate": 8.169773117609675e-06, "loss": 4.1142, "step": 4435 }, { "epoch": 0.56832, "grad_norm": 3.3000011444091797, "learning_rate": 8.165871393186919e-06, "loss": 4.2093, "step": 4440 }, { "epoch": 0.56896, "grad_norm": 3.3069565296173096, "learning_rate": 8.16196644819489e-06, "loss": 4.2301, "step": 4445 }, { "epoch": 0.5696, "grad_norm": 3.4007809162139893, "learning_rate": 8.158058286606e-06, "loss": 4.111, "step": 4450 }, { "epoch": 0.57024, "grad_norm": 3.8727948665618896, "learning_rate": 8.154146912395933e-06, "loss": 4.196, "step": 4455 }, { "epoch": 0.57088, "grad_norm": 3.4134340286254883, "learning_rate": 8.150232329543643e-06, "loss": 4.1377, "step": 4460 }, { "epoch": 0.57152, "grad_norm": 3.355658531188965, "learning_rate": 8.146314542031343e-06, "loss": 4.0884, "step": 4465 }, { "epoch": 0.57216, "grad_norm": 3.8688745498657227, "learning_rate": 8.142393553844511e-06, "loss": 4.1839, "step": 4470 }, { "epoch": 0.5728, "grad_norm": 3.614151954650879, "learning_rate": 8.138469368971882e-06, "loss": 4.2426, "step": 4475 }, { "epoch": 0.57344, "grad_norm": 3.294654607772827, "learning_rate": 8.134541991405438e-06, "loss": 4.0466, "step": 4480 }, { "epoch": 0.57408, "grad_norm": 3.144278049468994, "learning_rate": 8.130611425140412e-06, "loss": 4.0591, "step": 4485 }, { "epoch": 0.57472, "grad_norm": 3.2101783752441406, "learning_rate": 8.126677674175278e-06, "loss": 4.132, "step": 4490 }, { "epoch": 0.57536, "grad_norm": 3.0196874141693115, "learning_rate": 8.122740742511754e-06, "loss": 4.1155, "step": 4495 }, { "epoch": 0.576, "grad_norm": 3.2182085514068604, "learning_rate": 8.118800634154792e-06, "loss": 4.2686, "step": 4500 }, { "epoch": 0.576, "eval_loss": 1.0499341487884521, "eval_runtime": 6.7956, "eval_samples_per_second": 147.154, "eval_steps_per_second": 18.394, "step": 4500 }, { "epoch": 0.57664, "grad_norm": 3.595808982849121, "learning_rate": 8.114857353112572e-06, "loss": 4.1841, "step": 4505 }, { "epoch": 0.57728, "grad_norm": 3.4274849891662598, "learning_rate": 8.110910903396508e-06, "loss": 4.1231, "step": 4510 }, { "epoch": 0.57792, "grad_norm": 3.538372278213501, "learning_rate": 8.106961289021232e-06, "loss": 4.1426, "step": 4515 }, { "epoch": 0.57856, "grad_norm": 3.3008785247802734, "learning_rate": 8.103008514004596e-06, "loss": 4.0978, "step": 4520 }, { "epoch": 0.5792, "grad_norm": 3.7046406269073486, "learning_rate": 8.099052582367671e-06, "loss": 4.1844, "step": 4525 }, { "epoch": 0.57984, "grad_norm": 3.538774013519287, "learning_rate": 8.095093498134736e-06, "loss": 4.1799, "step": 4530 }, { "epoch": 0.58048, "grad_norm": 3.4396979808807373, "learning_rate": 8.091131265333277e-06, "loss": 4.2845, "step": 4535 }, { "epoch": 0.58112, "grad_norm": 3.499345302581787, "learning_rate": 8.087165887993984e-06, "loss": 4.3056, "step": 4540 }, { "epoch": 0.58176, "grad_norm": 3.0737617015838623, "learning_rate": 8.083197370150748e-06, "loss": 4.0669, "step": 4545 }, { "epoch": 0.5824, "grad_norm": 3.5053136348724365, "learning_rate": 8.079225715840646e-06, "loss": 4.1382, "step": 4550 }, { "epoch": 0.58304, "grad_norm": 3.3943347930908203, "learning_rate": 8.075250929103959e-06, "loss": 4.2206, "step": 4555 }, { "epoch": 0.58368, "grad_norm": 3.1711487770080566, "learning_rate": 8.071273013984144e-06, "loss": 4.2383, "step": 4560 }, { "epoch": 0.58432, "grad_norm": 3.3679018020629883, "learning_rate": 8.067291974527845e-06, "loss": 4.1345, "step": 4565 }, { "epoch": 0.58496, "grad_norm": 3.348994255065918, "learning_rate": 8.063307814784882e-06, "loss": 4.2213, "step": 4570 }, { "epoch": 0.5856, "grad_norm": 3.837846517562866, "learning_rate": 8.059320538808251e-06, "loss": 4.1051, "step": 4575 }, { "epoch": 0.58624, "grad_norm": 3.153294801712036, "learning_rate": 8.05533015065412e-06, "loss": 4.0953, "step": 4580 }, { "epoch": 0.58688, "grad_norm": 3.3224375247955322, "learning_rate": 8.051336654381816e-06, "loss": 4.1112, "step": 4585 }, { "epoch": 0.58752, "grad_norm": 3.488499641418457, "learning_rate": 8.047340054053836e-06, "loss": 4.0806, "step": 4590 }, { "epoch": 0.58816, "grad_norm": 3.5140116214752197, "learning_rate": 8.043340353735828e-06, "loss": 4.132, "step": 4595 }, { "epoch": 0.5888, "grad_norm": 3.4562926292419434, "learning_rate": 8.0393375574966e-06, "loss": 4.0194, "step": 4600 }, { "epoch": 0.5888, "eval_loss": 1.0482516288757324, "eval_runtime": 7.2824, "eval_samples_per_second": 137.317, "eval_steps_per_second": 17.165, "step": 4600 }, { "epoch": 0.58944, "grad_norm": 3.2118759155273438, "learning_rate": 8.035331669408104e-06, "loss": 4.108, "step": 4605 }, { "epoch": 0.59008, "grad_norm": 3.044614315032959, "learning_rate": 8.031322693545438e-06, "loss": 4.1502, "step": 4610 }, { "epoch": 0.59072, "grad_norm": 3.399355888366699, "learning_rate": 8.027310633986845e-06, "loss": 4.1071, "step": 4615 }, { "epoch": 0.59136, "grad_norm": 3.31364107131958, "learning_rate": 8.023295494813701e-06, "loss": 4.205, "step": 4620 }, { "epoch": 0.592, "grad_norm": 3.1222352981567383, "learning_rate": 8.019277280110516e-06, "loss": 4.1323, "step": 4625 }, { "epoch": 0.59264, "grad_norm": 3.3674120903015137, "learning_rate": 8.01525599396493e-06, "loss": 4.0599, "step": 4630 }, { "epoch": 0.59328, "grad_norm": 3.2672739028930664, "learning_rate": 8.011231640467705e-06, "loss": 4.1417, "step": 4635 }, { "epoch": 0.59392, "grad_norm": 3.2652289867401123, "learning_rate": 8.007204223712726e-06, "loss": 4.223, "step": 4640 }, { "epoch": 0.59456, "grad_norm": 4.0131001472473145, "learning_rate": 8.00317374779699e-06, "loss": 4.1341, "step": 4645 }, { "epoch": 0.5952, "grad_norm": 3.488124132156372, "learning_rate": 7.999140216820613e-06, "loss": 4.0751, "step": 4650 }, { "epoch": 0.59584, "grad_norm": 3.4057059288024902, "learning_rate": 7.99510363488681e-06, "loss": 4.182, "step": 4655 }, { "epoch": 0.59648, "grad_norm": 3.633147954940796, "learning_rate": 7.991064006101909e-06, "loss": 4.1708, "step": 4660 }, { "epoch": 0.59712, "grad_norm": 3.4964585304260254, "learning_rate": 7.98702133457533e-06, "loss": 4.123, "step": 4665 }, { "epoch": 0.59776, "grad_norm": 3.620976448059082, "learning_rate": 7.982975624419591e-06, "loss": 4.3387, "step": 4670 }, { "epoch": 0.5984, "grad_norm": 3.191387414932251, "learning_rate": 7.978926879750303e-06, "loss": 4.1979, "step": 4675 }, { "epoch": 0.59904, "grad_norm": 3.342824697494507, "learning_rate": 7.974875104686164e-06, "loss": 4.2519, "step": 4680 }, { "epoch": 0.59968, "grad_norm": 3.3812179565429688, "learning_rate": 7.97082030334895e-06, "loss": 4.1762, "step": 4685 }, { "epoch": 0.60032, "grad_norm": 3.3878660202026367, "learning_rate": 7.966762479863517e-06, "loss": 4.1721, "step": 4690 }, { "epoch": 0.60096, "grad_norm": 3.517085075378418, "learning_rate": 7.962701638357799e-06, "loss": 4.0974, "step": 4695 }, { "epoch": 0.6016, "grad_norm": 3.536073923110962, "learning_rate": 7.9586377829628e-06, "loss": 4.2323, "step": 4700 }, { "epoch": 0.6016, "eval_loss": 1.0386443138122559, "eval_runtime": 7.8627, "eval_samples_per_second": 127.183, "eval_steps_per_second": 15.898, "step": 4700 }, { "epoch": 0.60224, "grad_norm": 3.297013759613037, "learning_rate": 7.954570917812585e-06, "loss": 4.0663, "step": 4705 }, { "epoch": 0.60288, "grad_norm": 3.464735507965088, "learning_rate": 7.950501047044287e-06, "loss": 4.2503, "step": 4710 }, { "epoch": 0.60352, "grad_norm": 3.2324845790863037, "learning_rate": 7.946428174798089e-06, "loss": 4.0498, "step": 4715 }, { "epoch": 0.60416, "grad_norm": 3.490805149078369, "learning_rate": 7.942352305217236e-06, "loss": 4.2563, "step": 4720 }, { "epoch": 0.6048, "grad_norm": 3.37969708442688, "learning_rate": 7.938273442448015e-06, "loss": 3.9272, "step": 4725 }, { "epoch": 0.60544, "grad_norm": 3.358905076980591, "learning_rate": 7.934191590639762e-06, "loss": 4.1162, "step": 4730 }, { "epoch": 0.60608, "grad_norm": 3.1589395999908447, "learning_rate": 7.930106753944853e-06, "loss": 3.951, "step": 4735 }, { "epoch": 0.60672, "grad_norm": 3.8178842067718506, "learning_rate": 7.926018936518698e-06, "loss": 4.3294, "step": 4740 }, { "epoch": 0.60736, "grad_norm": 3.419395923614502, "learning_rate": 7.921928142519742e-06, "loss": 4.2388, "step": 4745 }, { "epoch": 0.608, "grad_norm": 3.332667827606201, "learning_rate": 7.91783437610946e-06, "loss": 4.2338, "step": 4750 }, { "epoch": 0.60864, "grad_norm": 3.7604660987854004, "learning_rate": 7.913737641452342e-06, "loss": 4.2115, "step": 4755 }, { "epoch": 0.60928, "grad_norm": 3.185386896133423, "learning_rate": 7.909637942715906e-06, "loss": 4.091, "step": 4760 }, { "epoch": 0.60992, "grad_norm": 3.2540390491485596, "learning_rate": 7.905535284070685e-06, "loss": 4.1399, "step": 4765 }, { "epoch": 0.61056, "grad_norm": 3.3786585330963135, "learning_rate": 7.901429669690218e-06, "loss": 4.1883, "step": 4770 }, { "epoch": 0.6112, "grad_norm": 3.554351329803467, "learning_rate": 7.897321103751054e-06, "loss": 4.209, "step": 4775 }, { "epoch": 0.61184, "grad_norm": 3.373978614807129, "learning_rate": 7.893209590432744e-06, "loss": 4.2378, "step": 4780 }, { "epoch": 0.61248, "grad_norm": 3.1719655990600586, "learning_rate": 7.889095133917839e-06, "loss": 4.1013, "step": 4785 }, { "epoch": 0.61312, "grad_norm": 3.1673762798309326, "learning_rate": 7.884977738391882e-06, "loss": 4.0557, "step": 4790 }, { "epoch": 0.61376, "grad_norm": 3.32163143157959, "learning_rate": 7.880857408043404e-06, "loss": 4.2376, "step": 4795 }, { "epoch": 0.6144, "grad_norm": 3.029355764389038, "learning_rate": 7.876734147063927e-06, "loss": 4.1745, "step": 4800 }, { "epoch": 0.6144, "eval_loss": 1.0420140027999878, "eval_runtime": 6.7065, "eval_samples_per_second": 149.109, "eval_steps_per_second": 18.639, "step": 4800 }, { "epoch": 0.61504, "grad_norm": 3.270620822906494, "learning_rate": 7.872607959647947e-06, "loss": 4.0588, "step": 4805 }, { "epoch": 0.61568, "grad_norm": 3.68705153465271, "learning_rate": 7.868478849992944e-06, "loss": 4.1343, "step": 4810 }, { "epoch": 0.61632, "grad_norm": 3.301039934158325, "learning_rate": 7.86434682229937e-06, "loss": 4.1913, "step": 4815 }, { "epoch": 0.61696, "grad_norm": 3.441736936569214, "learning_rate": 7.860211880770637e-06, "loss": 4.1783, "step": 4820 }, { "epoch": 0.6176, "grad_norm": 3.1942312717437744, "learning_rate": 7.85607402961313e-06, "loss": 4.1702, "step": 4825 }, { "epoch": 0.61824, "grad_norm": 3.3528614044189453, "learning_rate": 7.851933273036194e-06, "loss": 4.0512, "step": 4830 }, { "epoch": 0.61888, "grad_norm": 3.235706090927124, "learning_rate": 7.847789615252123e-06, "loss": 3.9943, "step": 4835 }, { "epoch": 0.61952, "grad_norm": 3.4161102771759033, "learning_rate": 7.84364306047617e-06, "loss": 4.1663, "step": 4840 }, { "epoch": 0.62016, "grad_norm": 3.2843005657196045, "learning_rate": 7.839493612926528e-06, "loss": 4.1458, "step": 4845 }, { "epoch": 0.6208, "grad_norm": 3.2336485385894775, "learning_rate": 7.835341276824338e-06, "loss": 4.1267, "step": 4850 }, { "epoch": 0.62144, "grad_norm": 3.2358272075653076, "learning_rate": 7.831186056393679e-06, "loss": 4.1099, "step": 4855 }, { "epoch": 0.62208, "grad_norm": 3.2333428859710693, "learning_rate": 7.827027955861557e-06, "loss": 4.0636, "step": 4860 }, { "epoch": 0.62272, "grad_norm": 3.440809726715088, "learning_rate": 7.822866979457917e-06, "loss": 4.2263, "step": 4865 }, { "epoch": 0.62336, "grad_norm": 3.2690722942352295, "learning_rate": 7.818703131415627e-06, "loss": 4.1923, "step": 4870 }, { "epoch": 0.624, "grad_norm": 3.135249614715576, "learning_rate": 7.814536415970475e-06, "loss": 3.9522, "step": 4875 }, { "epoch": 0.62464, "grad_norm": 3.6807265281677246, "learning_rate": 7.810366837361165e-06, "loss": 4.1948, "step": 4880 }, { "epoch": 0.62528, "grad_norm": 3.511955738067627, "learning_rate": 7.806194399829314e-06, "loss": 4.1177, "step": 4885 }, { "epoch": 0.62592, "grad_norm": 3.115957260131836, "learning_rate": 7.802019107619452e-06, "loss": 4.1148, "step": 4890 }, { "epoch": 0.62656, "grad_norm": 3.540649890899658, "learning_rate": 7.797840964979007e-06, "loss": 4.1329, "step": 4895 }, { "epoch": 0.6272, "grad_norm": 3.370652675628662, "learning_rate": 7.793659976158306e-06, "loss": 4.0991, "step": 4900 }, { "epoch": 0.6272, "eval_loss": 1.0484163761138916, "eval_runtime": 6.6185, "eval_samples_per_second": 151.092, "eval_steps_per_second": 18.886, "step": 4900 }, { "epoch": 0.62784, "grad_norm": 3.38389253616333, "learning_rate": 7.78947614541058e-06, "loss": 4.1487, "step": 4905 }, { "epoch": 0.62848, "grad_norm": 3.2582650184631348, "learning_rate": 7.78528947699194e-06, "loss": 4.0647, "step": 4910 }, { "epoch": 0.62912, "grad_norm": 3.4006094932556152, "learning_rate": 7.781099975161393e-06, "loss": 4.0973, "step": 4915 }, { "epoch": 0.62976, "grad_norm": 3.3823771476745605, "learning_rate": 7.776907644180822e-06, "loss": 4.0354, "step": 4920 }, { "epoch": 0.6304, "grad_norm": 3.21818470954895, "learning_rate": 7.772712488314991e-06, "loss": 4.0933, "step": 4925 }, { "epoch": 0.63104, "grad_norm": 3.893780469894409, "learning_rate": 7.768514511831537e-06, "loss": 4.1754, "step": 4930 }, { "epoch": 0.63168, "grad_norm": 3.3422024250030518, "learning_rate": 7.764313719000966e-06, "loss": 4.1792, "step": 4935 }, { "epoch": 0.63232, "grad_norm": 3.573408365249634, "learning_rate": 7.76011011409665e-06, "loss": 3.9603, "step": 4940 }, { "epoch": 0.63296, "grad_norm": 3.3298516273498535, "learning_rate": 7.755903701394822e-06, "loss": 4.1212, "step": 4945 }, { "epoch": 0.6336, "grad_norm": 3.294663429260254, "learning_rate": 7.75169448517457e-06, "loss": 4.0832, "step": 4950 }, { "epoch": 0.63424, "grad_norm": 3.372905731201172, "learning_rate": 7.747482469717832e-06, "loss": 4.1185, "step": 4955 }, { "epoch": 0.63488, "grad_norm": 3.442746639251709, "learning_rate": 7.743267659309396e-06, "loss": 4.0458, "step": 4960 }, { "epoch": 0.63552, "grad_norm": 3.5581016540527344, "learning_rate": 7.739050058236898e-06, "loss": 4.206, "step": 4965 }, { "epoch": 0.63616, "grad_norm": 3.5394022464752197, "learning_rate": 7.734829670790804e-06, "loss": 4.0824, "step": 4970 }, { "epoch": 0.6368, "grad_norm": 3.5203707218170166, "learning_rate": 7.73060650126442e-06, "loss": 4.1966, "step": 4975 }, { "epoch": 0.63744, "grad_norm": 3.2009241580963135, "learning_rate": 7.726380553953879e-06, "loss": 4.1998, "step": 4980 }, { "epoch": 0.63808, "grad_norm": 3.217941999435425, "learning_rate": 7.722151833158142e-06, "loss": 3.9779, "step": 4985 }, { "epoch": 0.63872, "grad_norm": 3.275895118713379, "learning_rate": 7.717920343178993e-06, "loss": 4.1102, "step": 4990 }, { "epoch": 0.63936, "grad_norm": 3.0897116661071777, "learning_rate": 7.713686088321029e-06, "loss": 4.1574, "step": 4995 }, { "epoch": 0.64, "grad_norm": 3.437019109725952, "learning_rate": 7.709449072891661e-06, "loss": 4.3435, "step": 5000 }, { "epoch": 0.64, "eval_loss": 1.0283212661743164, "eval_runtime": 6.8036, "eval_samples_per_second": 146.98, "eval_steps_per_second": 18.373, "step": 5000 }, { "epoch": 0.64064, "grad_norm": 3.3842532634735107, "learning_rate": 7.70520930120111e-06, "loss": 4.0462, "step": 5005 }, { "epoch": 0.64128, "grad_norm": 4.276431083679199, "learning_rate": 7.700966777562402e-06, "loss": 4.1034, "step": 5010 }, { "epoch": 0.64192, "grad_norm": 3.322986125946045, "learning_rate": 7.696721506291353e-06, "loss": 4.0853, "step": 5015 }, { "epoch": 0.64256, "grad_norm": 3.593050718307495, "learning_rate": 7.69247349170659e-06, "loss": 4.0402, "step": 5020 }, { "epoch": 0.6432, "grad_norm": 3.6134753227233887, "learning_rate": 7.688222738129519e-06, "loss": 4.0731, "step": 5025 }, { "epoch": 0.64384, "grad_norm": 3.3642125129699707, "learning_rate": 7.683969249884331e-06, "loss": 4.2386, "step": 5030 }, { "epoch": 0.64448, "grad_norm": 3.5435409545898438, "learning_rate": 7.679713031298009e-06, "loss": 4.0897, "step": 5035 }, { "epoch": 0.64512, "grad_norm": 3.3521769046783447, "learning_rate": 7.675454086700307e-06, "loss": 4.0395, "step": 5040 }, { "epoch": 0.64576, "grad_norm": 3.2340495586395264, "learning_rate": 7.671192420423748e-06, "loss": 4.1929, "step": 5045 }, { "epoch": 0.6464, "grad_norm": 3.205925226211548, "learning_rate": 7.666928036803635e-06, "loss": 4.122, "step": 5050 }, { "epoch": 0.64704, "grad_norm": 3.4584217071533203, "learning_rate": 7.662660940178024e-06, "loss": 4.132, "step": 5055 }, { "epoch": 0.64768, "grad_norm": 3.254892110824585, "learning_rate": 7.65839113488774e-06, "loss": 4.1643, "step": 5060 }, { "epoch": 0.64832, "grad_norm": 3.1381659507751465, "learning_rate": 7.654118625276355e-06, "loss": 4.0238, "step": 5065 }, { "epoch": 0.64896, "grad_norm": 3.4624714851379395, "learning_rate": 7.649843415690198e-06, "loss": 4.0247, "step": 5070 }, { "epoch": 0.6496, "grad_norm": 3.3247270584106445, "learning_rate": 7.645565510478344e-06, "loss": 4.03, "step": 5075 }, { "epoch": 0.65024, "grad_norm": 3.419830322265625, "learning_rate": 7.641284913992608e-06, "loss": 4.0039, "step": 5080 }, { "epoch": 0.65088, "grad_norm": 3.156008243560791, "learning_rate": 7.637001630587544e-06, "loss": 4.0387, "step": 5085 }, { "epoch": 0.65152, "grad_norm": 3.25286865234375, "learning_rate": 7.63271566462044e-06, "loss": 4.1358, "step": 5090 }, { "epoch": 0.65216, "grad_norm": 3.1121902465820312, "learning_rate": 7.62842702045131e-06, "loss": 4.1283, "step": 5095 }, { "epoch": 0.6528, "grad_norm": 3.177816390991211, "learning_rate": 7.624135702442896e-06, "loss": 4.0129, "step": 5100 }, { "epoch": 0.6528, "eval_loss": 1.0441325902938843, "eval_runtime": 6.9093, "eval_samples_per_second": 144.732, "eval_steps_per_second": 18.091, "step": 5100 }, { "epoch": 0.65344, "grad_norm": 3.3002333641052246, "learning_rate": 7.61984171496066e-06, "loss": 4.0557, "step": 5105 }, { "epoch": 0.65408, "grad_norm": 3.5926401615142822, "learning_rate": 7.615545062372775e-06, "loss": 4.1188, "step": 5110 }, { "epoch": 0.65472, "grad_norm": 3.4342427253723145, "learning_rate": 7.611245749050132e-06, "loss": 4.0734, "step": 5115 }, { "epoch": 0.65536, "grad_norm": 3.499502658843994, "learning_rate": 7.606943779366324e-06, "loss": 4.1231, "step": 5120 }, { "epoch": 0.656, "grad_norm": 3.2994086742401123, "learning_rate": 7.602639157697645e-06, "loss": 4.0108, "step": 5125 }, { "epoch": 0.65664, "grad_norm": 3.3510470390319824, "learning_rate": 7.5983318884230915e-06, "loss": 4.1677, "step": 5130 }, { "epoch": 0.65728, "grad_norm": 3.6185731887817383, "learning_rate": 7.5940219759243495e-06, "loss": 4.0468, "step": 5135 }, { "epoch": 0.65792, "grad_norm": 3.3450589179992676, "learning_rate": 7.589709424585796e-06, "loss": 4.2103, "step": 5140 }, { "epoch": 0.65856, "grad_norm": 3.277587413787842, "learning_rate": 7.585394238794492e-06, "loss": 4.1963, "step": 5145 }, { "epoch": 0.6592, "grad_norm": 3.7275032997131348, "learning_rate": 7.581076422940179e-06, "loss": 4.1603, "step": 5150 }, { "epoch": 0.65984, "grad_norm": 3.197981595993042, "learning_rate": 7.5767559814152735e-06, "loss": 4.0356, "step": 5155 }, { "epoch": 0.66048, "grad_norm": 3.3903021812438965, "learning_rate": 7.57243291861486e-06, "loss": 4.1375, "step": 5160 }, { "epoch": 0.66112, "grad_norm": 3.2676641941070557, "learning_rate": 7.568107238936694e-06, "loss": 4.0345, "step": 5165 }, { "epoch": 0.66176, "grad_norm": 3.254869222640991, "learning_rate": 7.563778946781193e-06, "loss": 4.0845, "step": 5170 }, { "epoch": 0.6624, "grad_norm": 3.259308338165283, "learning_rate": 7.559448046551429e-06, "loss": 4.039, "step": 5175 }, { "epoch": 0.66304, "grad_norm": 3.208815813064575, "learning_rate": 7.555114542653128e-06, "loss": 4.1614, "step": 5180 }, { "epoch": 0.66368, "grad_norm": 3.4502336978912354, "learning_rate": 7.550778439494668e-06, "loss": 4.0919, "step": 5185 }, { "epoch": 0.66432, "grad_norm": 3.1363143920898438, "learning_rate": 7.546439741487066e-06, "loss": 4.0748, "step": 5190 }, { "epoch": 0.66496, "grad_norm": 3.3942441940307617, "learning_rate": 7.5420984530439826e-06, "loss": 4.1168, "step": 5195 }, { "epoch": 0.6656, "grad_norm": 3.31434965133667, "learning_rate": 7.537754578581711e-06, "loss": 4.189, "step": 5200 }, { "epoch": 0.6656, "eval_loss": 1.0360530614852905, "eval_runtime": 7.2424, "eval_samples_per_second": 138.076, "eval_steps_per_second": 17.26, "step": 5200 }, { "epoch": 0.66624, "grad_norm": 3.268174648284912, "learning_rate": 7.533408122519177e-06, "loss": 4.1333, "step": 5205 }, { "epoch": 0.66688, "grad_norm": 3.358441114425659, "learning_rate": 7.5290590892779325e-06, "loss": 4.2067, "step": 5210 }, { "epoch": 0.66752, "grad_norm": 3.2438085079193115, "learning_rate": 7.5247074832821495e-06, "loss": 4.0523, "step": 5215 }, { "epoch": 0.66816, "grad_norm": 3.4631876945495605, "learning_rate": 7.52035330895862e-06, "loss": 4.1713, "step": 5220 }, { "epoch": 0.6688, "grad_norm": 3.3959803581237793, "learning_rate": 7.515996570736746e-06, "loss": 4.1517, "step": 5225 }, { "epoch": 0.66944, "grad_norm": 3.5359513759613037, "learning_rate": 7.511637273048538e-06, "loss": 4.0533, "step": 5230 }, { "epoch": 0.67008, "grad_norm": 3.3612425327301025, "learning_rate": 7.50727542032861e-06, "loss": 4.0731, "step": 5235 }, { "epoch": 0.67072, "grad_norm": 3.462785243988037, "learning_rate": 7.502911017014177e-06, "loss": 4.1512, "step": 5240 }, { "epoch": 0.67136, "grad_norm": 3.3726119995117188, "learning_rate": 7.49854406754505e-06, "loss": 4.0378, "step": 5245 }, { "epoch": 0.672, "grad_norm": 3.513679027557373, "learning_rate": 7.494174576363623e-06, "loss": 4.1591, "step": 5250 }, { "epoch": 0.67264, "grad_norm": 3.5844063758850098, "learning_rate": 7.489802547914885e-06, "loss": 4.1491, "step": 5255 }, { "epoch": 0.67328, "grad_norm": 3.381340742111206, "learning_rate": 7.485427986646399e-06, "loss": 4.0664, "step": 5260 }, { "epoch": 0.67392, "grad_norm": 3.2730493545532227, "learning_rate": 7.481050897008308e-06, "loss": 4.1653, "step": 5265 }, { "epoch": 0.67456, "grad_norm": 3.4449875354766846, "learning_rate": 7.476671283453325e-06, "loss": 4.0288, "step": 5270 }, { "epoch": 0.6752, "grad_norm": 3.37095308303833, "learning_rate": 7.472289150436734e-06, "loss": 4.1401, "step": 5275 }, { "epoch": 0.67584, "grad_norm": 3.202610969543457, "learning_rate": 7.4679045024163765e-06, "loss": 4.0416, "step": 5280 }, { "epoch": 0.67648, "grad_norm": 3.40617036819458, "learning_rate": 7.463517343852659e-06, "loss": 3.9643, "step": 5285 }, { "epoch": 0.67712, "grad_norm": 3.3042664527893066, "learning_rate": 7.459127679208536e-06, "loss": 4.0707, "step": 5290 }, { "epoch": 0.67776, "grad_norm": 3.46657657623291, "learning_rate": 7.454735512949515e-06, "loss": 4.0634, "step": 5295 }, { "epoch": 0.6784, "grad_norm": 3.1254940032958984, "learning_rate": 7.450340849543647e-06, "loss": 4.0323, "step": 5300 }, { "epoch": 0.6784, "eval_loss": 1.018658995628357, "eval_runtime": 7.1603, "eval_samples_per_second": 139.658, "eval_steps_per_second": 17.457, "step": 5300 }, { "epoch": 0.67904, "grad_norm": 3.234954595565796, "learning_rate": 7.445943693461524e-06, "loss": 4.1678, "step": 5305 }, { "epoch": 0.67968, "grad_norm": 3.1998000144958496, "learning_rate": 7.441544049176272e-06, "loss": 4.0177, "step": 5310 }, { "epoch": 0.68032, "grad_norm": 3.3766448497772217, "learning_rate": 7.437141921163551e-06, "loss": 4.0408, "step": 5315 }, { "epoch": 0.68096, "grad_norm": 3.486940860748291, "learning_rate": 7.432737313901546e-06, "loss": 4.0692, "step": 5320 }, { "epoch": 0.6816, "grad_norm": 3.4825565814971924, "learning_rate": 7.428330231870963e-06, "loss": 4.1543, "step": 5325 }, { "epoch": 0.68224, "grad_norm": 3.2892098426818848, "learning_rate": 7.423920679555029e-06, "loss": 4.0388, "step": 5330 }, { "epoch": 0.68288, "grad_norm": 3.7257015705108643, "learning_rate": 7.419508661439479e-06, "loss": 4.1455, "step": 5335 }, { "epoch": 0.68352, "grad_norm": 3.36126708984375, "learning_rate": 7.415094182012561e-06, "loss": 4.0867, "step": 5340 }, { "epoch": 0.68416, "grad_norm": 3.403200387954712, "learning_rate": 7.410677245765024e-06, "loss": 3.9311, "step": 5345 }, { "epoch": 0.6848, "grad_norm": 3.3477866649627686, "learning_rate": 7.406257857190118e-06, "loss": 4.0525, "step": 5350 }, { "epoch": 0.68544, "grad_norm": 3.6489646434783936, "learning_rate": 7.401836020783586e-06, "loss": 4.1045, "step": 5355 }, { "epoch": 0.68608, "grad_norm": 3.322072744369507, "learning_rate": 7.397411741043663e-06, "loss": 4.065, "step": 5360 }, { "epoch": 0.68672, "grad_norm": 3.329051971435547, "learning_rate": 7.3929850224710675e-06, "loss": 4.0974, "step": 5365 }, { "epoch": 0.68736, "grad_norm": 2.996154308319092, "learning_rate": 7.388555869569001e-06, "loss": 4.127, "step": 5370 }, { "epoch": 0.688, "grad_norm": 3.2369041442871094, "learning_rate": 7.3841242868431395e-06, "loss": 4.2309, "step": 5375 }, { "epoch": 0.68864, "grad_norm": 3.5981876850128174, "learning_rate": 7.379690278801633e-06, "loss": 4.0174, "step": 5380 }, { "epoch": 0.68928, "grad_norm": 3.3885374069213867, "learning_rate": 7.375253849955097e-06, "loss": 4.0164, "step": 5385 }, { "epoch": 0.68992, "grad_norm": 3.208528518676758, "learning_rate": 7.37081500481661e-06, "loss": 4.2045, "step": 5390 }, { "epoch": 0.69056, "grad_norm": 3.3751118183135986, "learning_rate": 7.366373747901708e-06, "loss": 3.9949, "step": 5395 }, { "epoch": 0.6912, "grad_norm": 3.512998580932617, "learning_rate": 7.361930083728383e-06, "loss": 3.9627, "step": 5400 }, { "epoch": 0.6912, "eval_loss": 1.028464436531067, "eval_runtime": 7.0644, "eval_samples_per_second": 141.555, "eval_steps_per_second": 17.694, "step": 5400 }, { "epoch": 0.69184, "grad_norm": 3.4754066467285156, "learning_rate": 7.35748401681707e-06, "loss": 4.1225, "step": 5405 }, { "epoch": 0.69248, "grad_norm": 3.389253854751587, "learning_rate": 7.353035551690657e-06, "loss": 4.1547, "step": 5410 }, { "epoch": 0.69312, "grad_norm": 3.2215898036956787, "learning_rate": 7.3485846928744635e-06, "loss": 4.1092, "step": 5415 }, { "epoch": 0.69376, "grad_norm": 3.4230103492736816, "learning_rate": 7.344131444896249e-06, "loss": 4.0162, "step": 5420 }, { "epoch": 0.6944, "grad_norm": 3.33898663520813, "learning_rate": 7.3396758122862e-06, "loss": 4.0614, "step": 5425 }, { "epoch": 0.69504, "grad_norm": 2.969716787338257, "learning_rate": 7.335217799576935e-06, "loss": 3.9111, "step": 5430 }, { "epoch": 0.69568, "grad_norm": 3.3126251697540283, "learning_rate": 7.3307574113034825e-06, "loss": 4.0983, "step": 5435 }, { "epoch": 0.69632, "grad_norm": 3.4935758113861084, "learning_rate": 7.326294652003301e-06, "loss": 4.1387, "step": 5440 }, { "epoch": 0.69696, "grad_norm": 4.090250492095947, "learning_rate": 7.3218295262162506e-06, "loss": 4.2359, "step": 5445 }, { "epoch": 0.6976, "grad_norm": 3.3421008586883545, "learning_rate": 7.317362038484603e-06, "loss": 3.9774, "step": 5450 }, { "epoch": 0.69824, "grad_norm": 3.265035629272461, "learning_rate": 7.312892193353035e-06, "loss": 4.0956, "step": 5455 }, { "epoch": 0.69888, "grad_norm": 3.2407443523406982, "learning_rate": 7.308419995368616e-06, "loss": 4.1528, "step": 5460 }, { "epoch": 0.69952, "grad_norm": 3.419646739959717, "learning_rate": 7.303945449080813e-06, "loss": 3.9912, "step": 5465 }, { "epoch": 0.70016, "grad_norm": 3.481679677963257, "learning_rate": 7.29946855904148e-06, "loss": 4.1215, "step": 5470 }, { "epoch": 0.7008, "grad_norm": 3.273656129837036, "learning_rate": 7.294989329804857e-06, "loss": 3.9999, "step": 5475 }, { "epoch": 0.70144, "grad_norm": 3.0722243785858154, "learning_rate": 7.29050776592756e-06, "loss": 4.1311, "step": 5480 }, { "epoch": 0.70208, "grad_norm": 3.490076780319214, "learning_rate": 7.286023871968585e-06, "loss": 4.1776, "step": 5485 }, { "epoch": 0.70272, "grad_norm": 3.279628276824951, "learning_rate": 7.281537652489295e-06, "loss": 4.0772, "step": 5490 }, { "epoch": 0.70336, "grad_norm": 3.328392505645752, "learning_rate": 7.277049112053418e-06, "loss": 4.0715, "step": 5495 }, { "epoch": 0.704, "grad_norm": 3.36295747756958, "learning_rate": 7.272558255227047e-06, "loss": 4.1156, "step": 5500 }, { "epoch": 0.704, "eval_loss": 1.024857997894287, "eval_runtime": 6.7913, "eval_samples_per_second": 147.247, "eval_steps_per_second": 18.406, "step": 5500 }, { "epoch": 0.70464, "grad_norm": 3.70607328414917, "learning_rate": 7.268065086578627e-06, "loss": 4.0388, "step": 5505 }, { "epoch": 0.70528, "grad_norm": 3.330237865447998, "learning_rate": 7.263569610678958e-06, "loss": 4.2135, "step": 5510 }, { "epoch": 0.70592, "grad_norm": 3.2931463718414307, "learning_rate": 7.259071832101186e-06, "loss": 4.1573, "step": 5515 }, { "epoch": 0.70656, "grad_norm": 3.4031548500061035, "learning_rate": 7.254571755420796e-06, "loss": 4.2077, "step": 5520 }, { "epoch": 0.7072, "grad_norm": 3.3596532344818115, "learning_rate": 7.250069385215619e-06, "loss": 4.0455, "step": 5525 }, { "epoch": 0.70784, "grad_norm": 3.28298282623291, "learning_rate": 7.245564726065811e-06, "loss": 4.0913, "step": 5530 }, { "epoch": 0.70848, "grad_norm": 3.297868251800537, "learning_rate": 7.241057782553862e-06, "loss": 4.098, "step": 5535 }, { "epoch": 0.70912, "grad_norm": 3.110238552093506, "learning_rate": 7.2365485592645815e-06, "loss": 4.0421, "step": 5540 }, { "epoch": 0.70976, "grad_norm": 3.42336106300354, "learning_rate": 7.232037060785102e-06, "loss": 4.1329, "step": 5545 }, { "epoch": 0.7104, "grad_norm": 3.1975576877593994, "learning_rate": 7.227523291704866e-06, "loss": 4.1384, "step": 5550 }, { "epoch": 0.71104, "grad_norm": 3.52624773979187, "learning_rate": 7.2230072566156305e-06, "loss": 4.0816, "step": 5555 }, { "epoch": 0.71168, "grad_norm": 3.3925981521606445, "learning_rate": 7.218488960111455e-06, "loss": 4.023, "step": 5560 }, { "epoch": 0.71232, "grad_norm": 3.7120208740234375, "learning_rate": 7.213968406788703e-06, "loss": 4.2338, "step": 5565 }, { "epoch": 0.71296, "grad_norm": 3.375377893447876, "learning_rate": 7.209445601246027e-06, "loss": 4.1258, "step": 5570 }, { "epoch": 0.7136, "grad_norm": 3.4685449600219727, "learning_rate": 7.204920548084378e-06, "loss": 4.0949, "step": 5575 }, { "epoch": 0.71424, "grad_norm": 3.564234733581543, "learning_rate": 7.200393251906985e-06, "loss": 4.0926, "step": 5580 }, { "epoch": 0.71488, "grad_norm": 3.3394558429718018, "learning_rate": 7.19586371731937e-06, "loss": 4.0769, "step": 5585 }, { "epoch": 0.71552, "grad_norm": 3.314234972000122, "learning_rate": 7.191331948929323e-06, "loss": 4.0644, "step": 5590 }, { "epoch": 0.71616, "grad_norm": 3.2954790592193604, "learning_rate": 7.18679795134691e-06, "loss": 4.0076, "step": 5595 }, { "epoch": 0.7168, "grad_norm": 3.253471851348877, "learning_rate": 7.182261729184463e-06, "loss": 4.013, "step": 5600 }, { "epoch": 0.7168, "eval_loss": 1.016201376914978, "eval_runtime": 6.8775, "eval_samples_per_second": 145.401, "eval_steps_per_second": 18.175, "step": 5600 }, { "epoch": 0.71744, "grad_norm": 3.3830933570861816, "learning_rate": 7.17772328705658e-06, "loss": 4.1317, "step": 5605 }, { "epoch": 0.71808, "grad_norm": 2.9672000408172607, "learning_rate": 7.173182629580113e-06, "loss": 3.9456, "step": 5610 }, { "epoch": 0.71872, "grad_norm": 3.272923469543457, "learning_rate": 7.168639761374173e-06, "loss": 4.0034, "step": 5615 }, { "epoch": 0.71936, "grad_norm": 3.0897445678710938, "learning_rate": 7.1640946870601135e-06, "loss": 3.9826, "step": 5620 }, { "epoch": 0.72, "grad_norm": 3.123854398727417, "learning_rate": 7.159547411261538e-06, "loss": 3.9751, "step": 5625 }, { "epoch": 0.72064, "grad_norm": 3.2446231842041016, "learning_rate": 7.154997938604287e-06, "loss": 3.9737, "step": 5630 }, { "epoch": 0.72128, "grad_norm": 3.1185810565948486, "learning_rate": 7.150446273716435e-06, "loss": 3.9893, "step": 5635 }, { "epoch": 0.72192, "grad_norm": 3.1517674922943115, "learning_rate": 7.145892421228289e-06, "loss": 4.022, "step": 5640 }, { "epoch": 0.72256, "grad_norm": 3.4120028018951416, "learning_rate": 7.141336385772377e-06, "loss": 4.119, "step": 5645 }, { "epoch": 0.7232, "grad_norm": 3.3022778034210205, "learning_rate": 7.136778171983456e-06, "loss": 3.9917, "step": 5650 }, { "epoch": 0.72384, "grad_norm": 3.258190155029297, "learning_rate": 7.1322177844984884e-06, "loss": 4.0934, "step": 5655 }, { "epoch": 0.72448, "grad_norm": 4.144087314605713, "learning_rate": 7.127655227956656e-06, "loss": 4.1012, "step": 5660 }, { "epoch": 0.72512, "grad_norm": 3.4005286693573, "learning_rate": 7.123090506999342e-06, "loss": 4.1081, "step": 5665 }, { "epoch": 0.72576, "grad_norm": 3.2259960174560547, "learning_rate": 7.118523626270137e-06, "loss": 4.1488, "step": 5670 }, { "epoch": 0.7264, "grad_norm": 3.616593599319458, "learning_rate": 7.113954590414822e-06, "loss": 4.0788, "step": 5675 }, { "epoch": 0.72704, "grad_norm": 3.442378520965576, "learning_rate": 7.109383404081378e-06, "loss": 3.9939, "step": 5680 }, { "epoch": 0.72768, "grad_norm": 3.6199593544006348, "learning_rate": 7.104810071919964e-06, "loss": 3.9184, "step": 5685 }, { "epoch": 0.72832, "grad_norm": 3.5864577293395996, "learning_rate": 7.10023459858293e-06, "loss": 4.1875, "step": 5690 }, { "epoch": 0.72896, "grad_norm": 3.2463033199310303, "learning_rate": 7.095656988724802e-06, "loss": 4.0096, "step": 5695 }, { "epoch": 0.7296, "grad_norm": 3.480454206466675, "learning_rate": 7.0910772470022784e-06, "loss": 4.1788, "step": 5700 }, { "epoch": 0.7296, "eval_loss": 1.0143406391143799, "eval_runtime": 7.072, "eval_samples_per_second": 141.402, "eval_steps_per_second": 17.675, "step": 5700 }, { "epoch": 0.73024, "grad_norm": 3.228858709335327, "learning_rate": 7.086495378074225e-06, "loss": 4.0522, "step": 5705 }, { "epoch": 0.73088, "grad_norm": 3.310290813446045, "learning_rate": 7.081911386601677e-06, "loss": 4.0268, "step": 5710 }, { "epoch": 0.73152, "grad_norm": 3.3378937244415283, "learning_rate": 7.07732527724782e-06, "loss": 4.1019, "step": 5715 }, { "epoch": 0.73216, "grad_norm": 3.3621530532836914, "learning_rate": 7.072737054678004e-06, "loss": 4.1155, "step": 5720 }, { "epoch": 0.7328, "grad_norm": 3.041964530944824, "learning_rate": 7.06814672355972e-06, "loss": 4.113, "step": 5725 }, { "epoch": 0.73344, "grad_norm": 3.450547218322754, "learning_rate": 7.063554288562611e-06, "loss": 3.9201, "step": 5730 }, { "epoch": 0.73408, "grad_norm": 3.349160671234131, "learning_rate": 7.058959754358455e-06, "loss": 4.1675, "step": 5735 }, { "epoch": 0.73472, "grad_norm": 3.076353073120117, "learning_rate": 7.0543631256211705e-06, "loss": 4.0993, "step": 5740 }, { "epoch": 0.73536, "grad_norm": 3.2057204246520996, "learning_rate": 7.0497644070268e-06, "loss": 4.0656, "step": 5745 }, { "epoch": 0.736, "grad_norm": 3.22353196144104, "learning_rate": 7.045163603253519e-06, "loss": 3.9592, "step": 5750 }, { "epoch": 0.73664, "grad_norm": 3.1400163173675537, "learning_rate": 7.040560718981618e-06, "loss": 4.1322, "step": 5755 }, { "epoch": 0.73728, "grad_norm": 3.4224133491516113, "learning_rate": 7.035955758893509e-06, "loss": 3.9546, "step": 5760 }, { "epoch": 0.73792, "grad_norm": 3.4225850105285645, "learning_rate": 7.031348727673713e-06, "loss": 4.1259, "step": 5765 }, { "epoch": 0.73856, "grad_norm": 3.331876039505005, "learning_rate": 7.026739630008861e-06, "loss": 4.026, "step": 5770 }, { "epoch": 0.7392, "grad_norm": 3.3707172870635986, "learning_rate": 7.022128470587679e-06, "loss": 4.2298, "step": 5775 }, { "epoch": 0.73984, "grad_norm": 3.1047780513763428, "learning_rate": 7.017515254100998e-06, "loss": 4.0686, "step": 5780 }, { "epoch": 0.74048, "grad_norm": 3.0352683067321777, "learning_rate": 7.012899985241738e-06, "loss": 3.9605, "step": 5785 }, { "epoch": 0.74112, "grad_norm": 3.595513343811035, "learning_rate": 7.008282668704907e-06, "loss": 4.023, "step": 5790 }, { "epoch": 0.74176, "grad_norm": 3.1861355304718018, "learning_rate": 7.0036633091875985e-06, "loss": 4.1268, "step": 5795 }, { "epoch": 0.7424, "grad_norm": 3.247361898422241, "learning_rate": 6.99904191138898e-06, "loss": 4.1142, "step": 5800 }, { "epoch": 0.7424, "eval_loss": 1.005723476409912, "eval_runtime": 7.0291, "eval_samples_per_second": 142.265, "eval_steps_per_second": 17.783, "step": 5800 }, { "epoch": 0.74304, "grad_norm": 3.3942816257476807, "learning_rate": 6.994418480010297e-06, "loss": 4.0313, "step": 5805 }, { "epoch": 0.74368, "grad_norm": 3.075566291809082, "learning_rate": 6.989793019754858e-06, "loss": 3.9531, "step": 5810 }, { "epoch": 0.74432, "grad_norm": 3.2327611446380615, "learning_rate": 6.985165535328042e-06, "loss": 4.0313, "step": 5815 }, { "epoch": 0.74496, "grad_norm": 3.170311689376831, "learning_rate": 6.980536031437284e-06, "loss": 4.1406, "step": 5820 }, { "epoch": 0.7456, "grad_norm": 3.5235507488250732, "learning_rate": 6.975904512792073e-06, "loss": 4.0573, "step": 5825 }, { "epoch": 0.74624, "grad_norm": 3.7707526683807373, "learning_rate": 6.971270984103947e-06, "loss": 3.9786, "step": 5830 }, { "epoch": 0.74688, "grad_norm": 3.1437079906463623, "learning_rate": 6.966635450086492e-06, "loss": 4.1191, "step": 5835 }, { "epoch": 0.74752, "grad_norm": 3.192091703414917, "learning_rate": 6.961997915455328e-06, "loss": 4.0604, "step": 5840 }, { "epoch": 0.74816, "grad_norm": 3.214885950088501, "learning_rate": 6.957358384928119e-06, "loss": 4.2096, "step": 5845 }, { "epoch": 0.7488, "grad_norm": 3.4825382232666016, "learning_rate": 6.952716863224551e-06, "loss": 4.1515, "step": 5850 }, { "epoch": 0.74944, "grad_norm": 3.2055091857910156, "learning_rate": 6.948073355066339e-06, "loss": 3.993, "step": 5855 }, { "epoch": 0.75008, "grad_norm": 3.43379282951355, "learning_rate": 6.9434278651772205e-06, "loss": 3.9822, "step": 5860 }, { "epoch": 0.75072, "grad_norm": 2.9562249183654785, "learning_rate": 6.938780398282945e-06, "loss": 3.9083, "step": 5865 }, { "epoch": 0.75136, "grad_norm": 3.5767860412597656, "learning_rate": 6.934130959111276e-06, "loss": 4.1058, "step": 5870 }, { "epoch": 0.752, "grad_norm": 3.2050375938415527, "learning_rate": 6.929479552391985e-06, "loss": 4.1939, "step": 5875 }, { "epoch": 0.75264, "grad_norm": 3.266775369644165, "learning_rate": 6.924826182856839e-06, "loss": 4.011, "step": 5880 }, { "epoch": 0.75328, "grad_norm": 3.255232810974121, "learning_rate": 6.920170855239607e-06, "loss": 3.9561, "step": 5885 }, { "epoch": 0.75392, "grad_norm": 3.48268985748291, "learning_rate": 6.915513574276049e-06, "loss": 4.2584, "step": 5890 }, { "epoch": 0.75456, "grad_norm": 3.360429525375366, "learning_rate": 6.910854344703912e-06, "loss": 4.0792, "step": 5895 }, { "epoch": 0.7552, "grad_norm": 3.310023307800293, "learning_rate": 6.906193171262922e-06, "loss": 4.1032, "step": 5900 }, { "epoch": 0.7552, "eval_loss": 1.0207964181900024, "eval_runtime": 6.8554, "eval_samples_per_second": 145.871, "eval_steps_per_second": 18.234, "step": 5900 }, { "epoch": 0.75584, "grad_norm": 3.7510457038879395, "learning_rate": 6.9015300586947876e-06, "loss": 4.1674, "step": 5905 }, { "epoch": 0.75648, "grad_norm": 3.67040753364563, "learning_rate": 6.896865011743187e-06, "loss": 4.0561, "step": 5910 }, { "epoch": 0.75712, "grad_norm": 3.0624032020568848, "learning_rate": 6.892198035153767e-06, "loss": 3.8555, "step": 5915 }, { "epoch": 0.75776, "grad_norm": 3.3737337589263916, "learning_rate": 6.887529133674137e-06, "loss": 4.1243, "step": 5920 }, { "epoch": 0.7584, "grad_norm": 3.3432435989379883, "learning_rate": 6.882858312053864e-06, "loss": 4.0493, "step": 5925 }, { "epoch": 0.75904, "grad_norm": 3.475452423095703, "learning_rate": 6.8781855750444704e-06, "loss": 4.141, "step": 5930 }, { "epoch": 0.75968, "grad_norm": 3.452732801437378, "learning_rate": 6.873510927399425e-06, "loss": 4.0296, "step": 5935 }, { "epoch": 0.76032, "grad_norm": 3.238232135772705, "learning_rate": 6.86883437387414e-06, "loss": 4.045, "step": 5940 }, { "epoch": 0.76096, "grad_norm": 3.2131717205047607, "learning_rate": 6.86415591922597e-06, "loss": 4.0805, "step": 5945 }, { "epoch": 0.7616, "grad_norm": 3.2297801971435547, "learning_rate": 6.859475568214199e-06, "loss": 3.993, "step": 5950 }, { "epoch": 0.76224, "grad_norm": 3.0972037315368652, "learning_rate": 6.854793325600042e-06, "loss": 3.9922, "step": 5955 }, { "epoch": 0.76288, "grad_norm": 3.390723943710327, "learning_rate": 6.850109196146641e-06, "loss": 4.0131, "step": 5960 }, { "epoch": 0.76352, "grad_norm": 4.386783599853516, "learning_rate": 6.84542318461905e-06, "loss": 3.9761, "step": 5965 }, { "epoch": 0.76416, "grad_norm": 3.243786096572876, "learning_rate": 6.840735295784245e-06, "loss": 3.9654, "step": 5970 }, { "epoch": 0.7648, "grad_norm": 3.3371636867523193, "learning_rate": 6.83604553441111e-06, "loss": 4.0202, "step": 5975 }, { "epoch": 0.76544, "grad_norm": 3.3030002117156982, "learning_rate": 6.831353905270433e-06, "loss": 4.0206, "step": 5980 }, { "epoch": 0.76608, "grad_norm": 3.3030149936676025, "learning_rate": 6.8266604131349015e-06, "loss": 4.0615, "step": 5985 }, { "epoch": 0.76672, "grad_norm": 3.28432559967041, "learning_rate": 6.821965062779098e-06, "loss": 3.9029, "step": 5990 }, { "epoch": 0.76736, "grad_norm": 3.092695951461792, "learning_rate": 6.817267858979497e-06, "loss": 3.9879, "step": 5995 }, { "epoch": 0.768, "grad_norm": 3.263754367828369, "learning_rate": 6.812568806514457e-06, "loss": 3.9958, "step": 6000 }, { "epoch": 0.768, "eval_loss": 1.0079742670059204, "eval_runtime": 6.9911, "eval_samples_per_second": 143.039, "eval_steps_per_second": 17.88, "step": 6000 }, { "epoch": 0.76864, "grad_norm": 3.4394893646240234, "learning_rate": 6.807867910164216e-06, "loss": 4.0567, "step": 6005 }, { "epoch": 0.76928, "grad_norm": 3.2342753410339355, "learning_rate": 6.803165174710895e-06, "loss": 3.9647, "step": 6010 }, { "epoch": 0.76992, "grad_norm": 3.2559092044830322, "learning_rate": 6.798460604938475e-06, "loss": 4.0027, "step": 6015 }, { "epoch": 0.77056, "grad_norm": 3.206169843673706, "learning_rate": 6.79375420563281e-06, "loss": 4.0209, "step": 6020 }, { "epoch": 0.7712, "grad_norm": 3.1606526374816895, "learning_rate": 6.789045981581612e-06, "loss": 3.9584, "step": 6025 }, { "epoch": 0.77184, "grad_norm": 3.431877613067627, "learning_rate": 6.784335937574456e-06, "loss": 3.9336, "step": 6030 }, { "epoch": 0.77248, "grad_norm": 3.3919317722320557, "learning_rate": 6.779624078402755e-06, "loss": 4.0337, "step": 6035 }, { "epoch": 0.77312, "grad_norm": 3.114204168319702, "learning_rate": 6.774910408859781e-06, "loss": 4.0327, "step": 6040 }, { "epoch": 0.77376, "grad_norm": 3.5469913482666016, "learning_rate": 6.770194933740645e-06, "loss": 4.1115, "step": 6045 }, { "epoch": 0.7744, "grad_norm": 3.3311476707458496, "learning_rate": 6.76547765784229e-06, "loss": 4.0417, "step": 6050 }, { "epoch": 0.77504, "grad_norm": 3.26802921295166, "learning_rate": 6.760758585963495e-06, "loss": 4.0431, "step": 6055 }, { "epoch": 0.77568, "grad_norm": 3.6536500453948975, "learning_rate": 6.756037722904867e-06, "loss": 4.0063, "step": 6060 }, { "epoch": 0.77632, "grad_norm": 3.4150497913360596, "learning_rate": 6.7513150734688285e-06, "loss": 4.076, "step": 6065 }, { "epoch": 0.77696, "grad_norm": 3.2071707248687744, "learning_rate": 6.746590642459628e-06, "loss": 4.0914, "step": 6070 }, { "epoch": 0.7776, "grad_norm": 3.3766379356384277, "learning_rate": 6.741864434683319e-06, "loss": 3.9751, "step": 6075 }, { "epoch": 0.77824, "grad_norm": 3.4125139713287354, "learning_rate": 6.737136454947768e-06, "loss": 4.3161, "step": 6080 }, { "epoch": 0.77888, "grad_norm": 3.257509231567383, "learning_rate": 6.73240670806264e-06, "loss": 3.925, "step": 6085 }, { "epoch": 0.77952, "grad_norm": 3.6318936347961426, "learning_rate": 6.727675198839403e-06, "loss": 4.025, "step": 6090 }, { "epoch": 0.78016, "grad_norm": 3.7387313842773438, "learning_rate": 6.722941932091309e-06, "loss": 4.2462, "step": 6095 }, { "epoch": 0.7808, "grad_norm": 3.1425154209136963, "learning_rate": 6.718206912633407e-06, "loss": 4.0087, "step": 6100 }, { "epoch": 0.7808, "eval_loss": 1.0091650485992432, "eval_runtime": 8.1261, "eval_samples_per_second": 123.06, "eval_steps_per_second": 15.383, "step": 6100 }, { "epoch": 0.78144, "grad_norm": 3.4126670360565186, "learning_rate": 6.7134701452825225e-06, "loss": 4.0624, "step": 6105 }, { "epoch": 0.78208, "grad_norm": 3.345456600189209, "learning_rate": 6.7087316348572626e-06, "loss": 4.0735, "step": 6110 }, { "epoch": 0.78272, "grad_norm": 3.3764288425445557, "learning_rate": 6.703991386178008e-06, "loss": 3.9901, "step": 6115 }, { "epoch": 0.78336, "grad_norm": 3.302940845489502, "learning_rate": 6.699249404066906e-06, "loss": 4.0439, "step": 6120 }, { "epoch": 0.784, "grad_norm": 3.295030355453491, "learning_rate": 6.694505693347866e-06, "loss": 3.9957, "step": 6125 }, { "epoch": 0.78464, "grad_norm": 3.1348817348480225, "learning_rate": 6.689760258846557e-06, "loss": 3.8965, "step": 6130 }, { "epoch": 0.78528, "grad_norm": 3.459271192550659, "learning_rate": 6.685013105390404e-06, "loss": 4.0026, "step": 6135 }, { "epoch": 0.78592, "grad_norm": 3.3416779041290283, "learning_rate": 6.680264237808578e-06, "loss": 3.9384, "step": 6140 }, { "epoch": 0.78656, "grad_norm": 3.24820876121521, "learning_rate": 6.6755136609319945e-06, "loss": 3.992, "step": 6145 }, { "epoch": 0.7872, "grad_norm": 3.351857900619507, "learning_rate": 6.670761379593308e-06, "loss": 4.1481, "step": 6150 }, { "epoch": 0.78784, "grad_norm": 3.287588119506836, "learning_rate": 6.666007398626907e-06, "loss": 3.9051, "step": 6155 }, { "epoch": 0.78848, "grad_norm": 3.1063315868377686, "learning_rate": 6.661251722868907e-06, "loss": 3.9763, "step": 6160 }, { "epoch": 0.78912, "grad_norm": 3.2057151794433594, "learning_rate": 6.65649435715715e-06, "loss": 3.9564, "step": 6165 }, { "epoch": 0.78976, "grad_norm": 3.377319574356079, "learning_rate": 6.6517353063311985e-06, "loss": 4.0139, "step": 6170 }, { "epoch": 0.7904, "grad_norm": 3.3640501499176025, "learning_rate": 6.646974575232326e-06, "loss": 3.9932, "step": 6175 }, { "epoch": 0.79104, "grad_norm": 3.394688367843628, "learning_rate": 6.642212168703512e-06, "loss": 3.9131, "step": 6180 }, { "epoch": 0.79168, "grad_norm": 3.154578447341919, "learning_rate": 6.637448091589451e-06, "loss": 4.0465, "step": 6185 }, { "epoch": 0.79232, "grad_norm": 3.3188998699188232, "learning_rate": 6.632682348736529e-06, "loss": 3.9791, "step": 6190 }, { "epoch": 0.79296, "grad_norm": 3.3017475605010986, "learning_rate": 6.627914944992827e-06, "loss": 3.9438, "step": 6195 }, { "epoch": 0.7936, "grad_norm": 3.4923770427703857, "learning_rate": 6.623145885208117e-06, "loss": 4.0598, "step": 6200 }, { "epoch": 0.7936, "eval_loss": 1.0139714479446411, "eval_runtime": 6.7565, "eval_samples_per_second": 148.006, "eval_steps_per_second": 18.501, "step": 6200 }, { "epoch": 0.79424, "grad_norm": 3.5045602321624756, "learning_rate": 6.618375174233857e-06, "loss": 3.9786, "step": 6205 }, { "epoch": 0.79488, "grad_norm": 3.394674301147461, "learning_rate": 6.613602816923183e-06, "loss": 3.9143, "step": 6210 }, { "epoch": 0.79552, "grad_norm": 3.285093307495117, "learning_rate": 6.608828818130903e-06, "loss": 3.9806, "step": 6215 }, { "epoch": 0.79616, "grad_norm": 3.3816416263580322, "learning_rate": 6.604053182713501e-06, "loss": 3.9389, "step": 6220 }, { "epoch": 0.7968, "grad_norm": 3.101391315460205, "learning_rate": 6.599275915529124e-06, "loss": 4.0285, "step": 6225 }, { "epoch": 0.79744, "grad_norm": 3.581902027130127, "learning_rate": 6.594497021437573e-06, "loss": 4.1661, "step": 6230 }, { "epoch": 0.79808, "grad_norm": 3.477958917617798, "learning_rate": 6.5897165053003145e-06, "loss": 4.0367, "step": 6235 }, { "epoch": 0.79872, "grad_norm": 3.292573928833008, "learning_rate": 6.584934371980452e-06, "loss": 4.0413, "step": 6240 }, { "epoch": 0.79936, "grad_norm": 3.326529026031494, "learning_rate": 6.58015062634275e-06, "loss": 4.0497, "step": 6245 }, { "epoch": 0.8, "grad_norm": 3.278815269470215, "learning_rate": 6.575365273253598e-06, "loss": 4.0507, "step": 6250 }, { "epoch": 0.80064, "grad_norm": 3.2232770919799805, "learning_rate": 6.570578317581029e-06, "loss": 4.0234, "step": 6255 }, { "epoch": 0.80128, "grad_norm": 3.564846992492676, "learning_rate": 6.5657897641947045e-06, "loss": 3.8909, "step": 6260 }, { "epoch": 0.80192, "grad_norm": 3.3442466259002686, "learning_rate": 6.560999617965914e-06, "loss": 4.0105, "step": 6265 }, { "epoch": 0.80256, "grad_norm": 3.1271111965179443, "learning_rate": 6.5562078837675625e-06, "loss": 3.9133, "step": 6270 }, { "epoch": 0.8032, "grad_norm": 3.4506609439849854, "learning_rate": 6.551414566474173e-06, "loss": 3.9542, "step": 6275 }, { "epoch": 0.80384, "grad_norm": 3.406909942626953, "learning_rate": 6.546619670961878e-06, "loss": 4.0518, "step": 6280 }, { "epoch": 0.80448, "grad_norm": 3.421560525894165, "learning_rate": 6.5418232021084175e-06, "loss": 4.0037, "step": 6285 }, { "epoch": 0.80512, "grad_norm": 3.2941651344299316, "learning_rate": 6.537025164793129e-06, "loss": 3.8791, "step": 6290 }, { "epoch": 0.80576, "grad_norm": 3.4293181896209717, "learning_rate": 6.532225563896949e-06, "loss": 3.9889, "step": 6295 }, { "epoch": 0.8064, "grad_norm": 3.2118945121765137, "learning_rate": 6.527424404302403e-06, "loss": 4.1149, "step": 6300 }, { "epoch": 0.8064, "eval_loss": 0.9931904077529907, "eval_runtime": 6.719, "eval_samples_per_second": 148.832, "eval_steps_per_second": 18.604, "step": 6300 }, { "epoch": 0.80704, "grad_norm": 3.647317409515381, "learning_rate": 6.522621690893598e-06, "loss": 3.9717, "step": 6305 }, { "epoch": 0.80768, "grad_norm": 3.0840768814086914, "learning_rate": 6.517817428556231e-06, "loss": 3.9465, "step": 6310 }, { "epoch": 0.80832, "grad_norm": 3.4289841651916504, "learning_rate": 6.513011622177565e-06, "loss": 3.9262, "step": 6315 }, { "epoch": 0.80896, "grad_norm": 3.369891881942749, "learning_rate": 6.508204276646441e-06, "loss": 4.0869, "step": 6320 }, { "epoch": 0.8096, "grad_norm": 3.300008535385132, "learning_rate": 6.5033953968532604e-06, "loss": 3.9217, "step": 6325 }, { "epoch": 0.81024, "grad_norm": 3.2289252281188965, "learning_rate": 6.4985849876899894e-06, "loss": 4.1345, "step": 6330 }, { "epoch": 0.81088, "grad_norm": 3.045872688293457, "learning_rate": 6.493773054050147e-06, "loss": 3.942, "step": 6335 }, { "epoch": 0.81152, "grad_norm": 3.694000482559204, "learning_rate": 6.4889596008288065e-06, "loss": 3.9691, "step": 6340 }, { "epoch": 0.81216, "grad_norm": 3.224107503890991, "learning_rate": 6.484144632922582e-06, "loss": 4.1082, "step": 6345 }, { "epoch": 0.8128, "grad_norm": 3.7049143314361572, "learning_rate": 6.479328155229634e-06, "loss": 4.1134, "step": 6350 }, { "epoch": 0.81344, "grad_norm": 3.076693534851074, "learning_rate": 6.474510172649653e-06, "loss": 3.9307, "step": 6355 }, { "epoch": 0.81408, "grad_norm": 3.1345927715301514, "learning_rate": 6.469690690083867e-06, "loss": 4.0009, "step": 6360 }, { "epoch": 0.81472, "grad_norm": 3.370507001876831, "learning_rate": 6.464869712435024e-06, "loss": 4.0593, "step": 6365 }, { "epoch": 0.81536, "grad_norm": 3.774531126022339, "learning_rate": 6.460047244607397e-06, "loss": 3.9653, "step": 6370 }, { "epoch": 0.816, "grad_norm": 3.2934482097625732, "learning_rate": 6.455223291506772e-06, "loss": 3.9456, "step": 6375 }, { "epoch": 0.81664, "grad_norm": 3.284327745437622, "learning_rate": 6.450397858040449e-06, "loss": 3.9911, "step": 6380 }, { "epoch": 0.81728, "grad_norm": 3.251145124435425, "learning_rate": 6.4455709491172295e-06, "loss": 3.9766, "step": 6385 }, { "epoch": 0.81792, "grad_norm": 3.2382752895355225, "learning_rate": 6.44074256964742e-06, "loss": 4.0249, "step": 6390 }, { "epoch": 0.81856, "grad_norm": 3.185112476348877, "learning_rate": 6.435912724542822e-06, "loss": 4.0807, "step": 6395 }, { "epoch": 0.8192, "grad_norm": 3.3324737548828125, "learning_rate": 6.431081418716729e-06, "loss": 3.9421, "step": 6400 }, { "epoch": 0.8192, "eval_loss": 1.0041779279708862, "eval_runtime": 6.7694, "eval_samples_per_second": 147.723, "eval_steps_per_second": 18.465, "step": 6400 }, { "epoch": 0.81984, "grad_norm": 3.61067271232605, "learning_rate": 6.426248657083916e-06, "loss": 4.0269, "step": 6405 }, { "epoch": 0.82048, "grad_norm": 3.5463812351226807, "learning_rate": 6.421414444560643e-06, "loss": 3.9552, "step": 6410 }, { "epoch": 0.82112, "grad_norm": 3.159905433654785, "learning_rate": 6.416578786064645e-06, "loss": 3.9453, "step": 6415 }, { "epoch": 0.82176, "grad_norm": 3.876019239425659, "learning_rate": 6.41174168651513e-06, "loss": 3.8959, "step": 6420 }, { "epoch": 0.8224, "grad_norm": 3.502474308013916, "learning_rate": 6.406903150832766e-06, "loss": 3.8898, "step": 6425 }, { "epoch": 0.82304, "grad_norm": 3.2995212078094482, "learning_rate": 6.402063183939687e-06, "loss": 4.0422, "step": 6430 }, { "epoch": 0.82368, "grad_norm": 3.3530595302581787, "learning_rate": 6.397221790759484e-06, "loss": 4.1236, "step": 6435 }, { "epoch": 0.82432, "grad_norm": 3.162374973297119, "learning_rate": 6.392378976217195e-06, "loss": 4.0261, "step": 6440 }, { "epoch": 0.82496, "grad_norm": 3.0621769428253174, "learning_rate": 6.387534745239306e-06, "loss": 3.9925, "step": 6445 }, { "epoch": 0.8256, "grad_norm": 3.232025146484375, "learning_rate": 6.382689102753741e-06, "loss": 3.8912, "step": 6450 }, { "epoch": 0.82624, "grad_norm": 3.550158739089966, "learning_rate": 6.377842053689865e-06, "loss": 4.0563, "step": 6455 }, { "epoch": 0.82688, "grad_norm": 3.5361146926879883, "learning_rate": 6.372993602978471e-06, "loss": 4.1191, "step": 6460 }, { "epoch": 0.82752, "grad_norm": 3.995157241821289, "learning_rate": 6.368143755551779e-06, "loss": 4.1001, "step": 6465 }, { "epoch": 0.82816, "grad_norm": 3.403334856033325, "learning_rate": 6.363292516343427e-06, "loss": 4.0052, "step": 6470 }, { "epoch": 0.8288, "grad_norm": 3.6193649768829346, "learning_rate": 6.358439890288471e-06, "loss": 4.0026, "step": 6475 }, { "epoch": 0.82944, "grad_norm": 3.133255958557129, "learning_rate": 6.353585882323378e-06, "loss": 4.0432, "step": 6480 }, { "epoch": 0.83008, "grad_norm": 3.300370693206787, "learning_rate": 6.348730497386022e-06, "loss": 4.0762, "step": 6485 }, { "epoch": 0.83072, "grad_norm": 3.2615299224853516, "learning_rate": 6.3438737404156725e-06, "loss": 4.0634, "step": 6490 }, { "epoch": 0.83136, "grad_norm": 3.20989727973938, "learning_rate": 6.3390156163530015e-06, "loss": 4.0828, "step": 6495 }, { "epoch": 0.832, "grad_norm": 3.197261333465576, "learning_rate": 6.334156130140068e-06, "loss": 3.8688, "step": 6500 }, { "epoch": 0.832, "eval_loss": 1.004305362701416, "eval_runtime": 7.1118, "eval_samples_per_second": 140.611, "eval_steps_per_second": 17.576, "step": 6500 }, { "epoch": 0.83264, "grad_norm": 3.9616518020629883, "learning_rate": 6.329295286720316e-06, "loss": 3.9318, "step": 6505 }, { "epoch": 0.83328, "grad_norm": 3.124983787536621, "learning_rate": 6.324433091038573e-06, "loss": 4.0023, "step": 6510 }, { "epoch": 0.83392, "grad_norm": 3.1082472801208496, "learning_rate": 6.31956954804104e-06, "loss": 4.0784, "step": 6515 }, { "epoch": 0.83456, "grad_norm": 3.4243500232696533, "learning_rate": 6.314704662675289e-06, "loss": 4.1357, "step": 6520 }, { "epoch": 0.8352, "grad_norm": 3.482208490371704, "learning_rate": 6.3098384398902565e-06, "loss": 4.3399, "step": 6525 }, { "epoch": 0.83584, "grad_norm": 3.1997745037078857, "learning_rate": 6.3049708846362425e-06, "loss": 4.0126, "step": 6530 }, { "epoch": 0.83648, "grad_norm": 3.6432223320007324, "learning_rate": 6.300102001864902e-06, "loss": 3.9565, "step": 6535 }, { "epoch": 0.83712, "grad_norm": 3.2619552612304688, "learning_rate": 6.2952317965292355e-06, "loss": 4.0278, "step": 6540 }, { "epoch": 0.83776, "grad_norm": 3.5579400062561035, "learning_rate": 6.290360273583596e-06, "loss": 4.067, "step": 6545 }, { "epoch": 0.8384, "grad_norm": 4.077335834503174, "learning_rate": 6.28548743798367e-06, "loss": 3.9901, "step": 6550 }, { "epoch": 0.83904, "grad_norm": 3.147125482559204, "learning_rate": 6.280613294686486e-06, "loss": 3.941, "step": 6555 }, { "epoch": 0.83968, "grad_norm": 3.3126308917999268, "learning_rate": 6.275737848650398e-06, "loss": 4.1567, "step": 6560 }, { "epoch": 0.84032, "grad_norm": 3.32991886138916, "learning_rate": 6.270861104835086e-06, "loss": 3.8803, "step": 6565 }, { "epoch": 0.84096, "grad_norm": 3.420491933822632, "learning_rate": 6.265983068201553e-06, "loss": 3.9706, "step": 6570 }, { "epoch": 0.8416, "grad_norm": 4.031120300292969, "learning_rate": 6.261103743712116e-06, "loss": 3.9856, "step": 6575 }, { "epoch": 0.84224, "grad_norm": 3.198617696762085, "learning_rate": 6.256223136330398e-06, "loss": 3.9416, "step": 6580 }, { "epoch": 0.84288, "grad_norm": 3.415013313293457, "learning_rate": 6.251341251021334e-06, "loss": 4.1193, "step": 6585 }, { "epoch": 0.84352, "grad_norm": 3.3532445430755615, "learning_rate": 6.246458092751151e-06, "loss": 4.1583, "step": 6590 }, { "epoch": 0.84416, "grad_norm": 3.5485520362854004, "learning_rate": 6.241573666487379e-06, "loss": 3.9059, "step": 6595 }, { "epoch": 0.8448, "grad_norm": 3.8386924266815186, "learning_rate": 6.236687977198832e-06, "loss": 3.9479, "step": 6600 }, { "epoch": 0.8448, "eval_loss": 0.9972337484359741, "eval_runtime": 6.9957, "eval_samples_per_second": 142.944, "eval_steps_per_second": 17.868, "step": 6600 }, { "epoch": 0.84544, "grad_norm": 3.2764065265655518, "learning_rate": 6.231801029855614e-06, "loss": 4.0085, "step": 6605 }, { "epoch": 0.84608, "grad_norm": 3.119434118270874, "learning_rate": 6.226912829429104e-06, "loss": 3.9582, "step": 6610 }, { "epoch": 0.84672, "grad_norm": 3.6029162406921387, "learning_rate": 6.222023380891955e-06, "loss": 3.8984, "step": 6615 }, { "epoch": 0.84736, "grad_norm": 3.146862268447876, "learning_rate": 6.217132689218097e-06, "loss": 4.0213, "step": 6620 }, { "epoch": 0.848, "grad_norm": 3.5199005603790283, "learning_rate": 6.212240759382717e-06, "loss": 3.9775, "step": 6625 }, { "epoch": 0.84864, "grad_norm": 3.4542529582977295, "learning_rate": 6.207347596362265e-06, "loss": 3.9534, "step": 6630 }, { "epoch": 0.84928, "grad_norm": 3.4714694023132324, "learning_rate": 6.202453205134444e-06, "loss": 3.9529, "step": 6635 }, { "epoch": 0.84992, "grad_norm": 3.232454299926758, "learning_rate": 6.19755759067821e-06, "loss": 4.0047, "step": 6640 }, { "epoch": 0.85056, "grad_norm": 3.1209444999694824, "learning_rate": 6.192660757973758e-06, "loss": 3.973, "step": 6645 }, { "epoch": 0.8512, "grad_norm": 3.401189088821411, "learning_rate": 6.187762712002529e-06, "loss": 3.9679, "step": 6650 }, { "epoch": 0.85184, "grad_norm": 3.1131510734558105, "learning_rate": 6.182863457747188e-06, "loss": 3.971, "step": 6655 }, { "epoch": 0.85248, "grad_norm": 3.463775396347046, "learning_rate": 6.177963000191642e-06, "loss": 3.9328, "step": 6660 }, { "epoch": 0.85312, "grad_norm": 3.3299026489257812, "learning_rate": 6.17306134432101e-06, "loss": 3.9863, "step": 6665 }, { "epoch": 0.85376, "grad_norm": 3.549071788787842, "learning_rate": 6.168158495121637e-06, "loss": 3.9282, "step": 6670 }, { "epoch": 0.8544, "grad_norm": 3.52746844291687, "learning_rate": 6.163254457581083e-06, "loss": 3.8922, "step": 6675 }, { "epoch": 0.85504, "grad_norm": 3.465867757797241, "learning_rate": 6.158349236688111e-06, "loss": 4.0779, "step": 6680 }, { "epoch": 0.85568, "grad_norm": 3.3179733753204346, "learning_rate": 6.153442837432694e-06, "loss": 4.0155, "step": 6685 }, { "epoch": 0.85632, "grad_norm": 3.2581310272216797, "learning_rate": 6.148535264806001e-06, "loss": 3.9893, "step": 6690 }, { "epoch": 0.85696, "grad_norm": 3.1983416080474854, "learning_rate": 6.14362652380039e-06, "loss": 3.945, "step": 6695 }, { "epoch": 0.8576, "grad_norm": 3.1799731254577637, "learning_rate": 6.138716619409416e-06, "loss": 4.0342, "step": 6700 }, { "epoch": 0.8576, "eval_loss": 1.0036591291427612, "eval_runtime": 8.0911, "eval_samples_per_second": 123.593, "eval_steps_per_second": 15.449, "step": 6700 }, { "epoch": 0.85824, "grad_norm": 3.089611291885376, "learning_rate": 6.133805556627813e-06, "loss": 3.9963, "step": 6705 }, { "epoch": 0.85888, "grad_norm": 3.209955930709839, "learning_rate": 6.128893340451495e-06, "loss": 4.0122, "step": 6710 }, { "epoch": 0.85952, "grad_norm": 3.2523767948150635, "learning_rate": 6.123979975877546e-06, "loss": 3.9918, "step": 6715 }, { "epoch": 0.86016, "grad_norm": 3.52829909324646, "learning_rate": 6.11906546790422e-06, "loss": 4.0027, "step": 6720 }, { "epoch": 0.8608, "grad_norm": 3.4737231731414795, "learning_rate": 6.114149821530938e-06, "loss": 4.2401, "step": 6725 }, { "epoch": 0.86144, "grad_norm": 3.210808277130127, "learning_rate": 6.109233041758274e-06, "loss": 3.9913, "step": 6730 }, { "epoch": 0.86208, "grad_norm": 3.2163102626800537, "learning_rate": 6.104315133587955e-06, "loss": 3.9215, "step": 6735 }, { "epoch": 0.86272, "grad_norm": 3.3578877449035645, "learning_rate": 6.099396102022859e-06, "loss": 3.9928, "step": 6740 }, { "epoch": 0.86336, "grad_norm": 3.540017604827881, "learning_rate": 6.094475952067006e-06, "loss": 4.0551, "step": 6745 }, { "epoch": 0.864, "grad_norm": 3.1817259788513184, "learning_rate": 6.089554688725554e-06, "loss": 3.9157, "step": 6750 }, { "epoch": 0.86464, "grad_norm": 3.662182331085205, "learning_rate": 6.0846323170047895e-06, "loss": 3.9394, "step": 6755 }, { "epoch": 0.86528, "grad_norm": 4.058024883270264, "learning_rate": 6.079708841912133e-06, "loss": 4.0893, "step": 6760 }, { "epoch": 0.86592, "grad_norm": 3.132685899734497, "learning_rate": 6.074784268456125e-06, "loss": 3.8952, "step": 6765 }, { "epoch": 0.86656, "grad_norm": 3.383697986602783, "learning_rate": 6.069858601646416e-06, "loss": 4.118, "step": 6770 }, { "epoch": 0.8672, "grad_norm": 3.267975091934204, "learning_rate": 6.064931846493782e-06, "loss": 3.9389, "step": 6775 }, { "epoch": 0.86784, "grad_norm": 3.556748151779175, "learning_rate": 6.060004008010096e-06, "loss": 3.8576, "step": 6780 }, { "epoch": 0.86848, "grad_norm": 3.565812587738037, "learning_rate": 6.05507509120834e-06, "loss": 4.0734, "step": 6785 }, { "epoch": 0.86912, "grad_norm": 3.3282663822174072, "learning_rate": 6.050145101102586e-06, "loss": 4.057, "step": 6790 }, { "epoch": 0.86976, "grad_norm": 3.3860867023468018, "learning_rate": 6.045214042708003e-06, "loss": 3.949, "step": 6795 }, { "epoch": 0.8704, "grad_norm": 3.271362543106079, "learning_rate": 6.0402819210408435e-06, "loss": 3.9863, "step": 6800 }, { "epoch": 0.8704, "eval_loss": 0.998278021812439, "eval_runtime": 7.4136, "eval_samples_per_second": 134.888, "eval_steps_per_second": 16.861, "step": 6800 }, { "epoch": 0.87104, "grad_norm": 3.317915678024292, "learning_rate": 6.035348741118444e-06, "loss": 4.0284, "step": 6805 }, { "epoch": 0.87168, "grad_norm": 3.094139814376831, "learning_rate": 6.030414507959217e-06, "loss": 3.8683, "step": 6810 }, { "epoch": 0.87232, "grad_norm": 3.2260992527008057, "learning_rate": 6.025479226582647e-06, "loss": 4.1184, "step": 6815 }, { "epoch": 0.87296, "grad_norm": 3.1242997646331787, "learning_rate": 6.020542902009282e-06, "loss": 3.9198, "step": 6820 }, { "epoch": 0.8736, "grad_norm": 4.395246505737305, "learning_rate": 6.015605539260736e-06, "loss": 3.8812, "step": 6825 }, { "epoch": 0.87424, "grad_norm": 3.3219218254089355, "learning_rate": 6.010667143359672e-06, "loss": 3.9868, "step": 6830 }, { "epoch": 0.87488, "grad_norm": 3.319570541381836, "learning_rate": 6.005727719329813e-06, "loss": 3.9658, "step": 6835 }, { "epoch": 0.87552, "grad_norm": 3.454033136367798, "learning_rate": 6.000787272195919e-06, "loss": 4.0934, "step": 6840 }, { "epoch": 0.87616, "grad_norm": 3.215515613555908, "learning_rate": 5.995845806983798e-06, "loss": 3.9793, "step": 6845 }, { "epoch": 0.8768, "grad_norm": 3.322244882583618, "learning_rate": 5.99090332872029e-06, "loss": 4.0305, "step": 6850 }, { "epoch": 0.87744, "grad_norm": 3.3186748027801514, "learning_rate": 5.9859598424332656e-06, "loss": 4.034, "step": 6855 }, { "epoch": 0.87808, "grad_norm": 3.5926856994628906, "learning_rate": 5.9810153531516215e-06, "loss": 3.9167, "step": 6860 }, { "epoch": 0.87872, "grad_norm": 3.6226749420166016, "learning_rate": 5.976069865905276e-06, "loss": 3.8649, "step": 6865 }, { "epoch": 0.87936, "grad_norm": 3.1754322052001953, "learning_rate": 5.971123385725159e-06, "loss": 4.06, "step": 6870 }, { "epoch": 0.88, "grad_norm": 3.3285293579101562, "learning_rate": 5.966175917643214e-06, "loss": 3.8871, "step": 6875 }, { "epoch": 0.88064, "grad_norm": 3.416433334350586, "learning_rate": 5.961227466692388e-06, "loss": 3.983, "step": 6880 }, { "epoch": 0.88128, "grad_norm": 3.308425188064575, "learning_rate": 5.95627803790663e-06, "loss": 3.9962, "step": 6885 }, { "epoch": 0.88192, "grad_norm": 3.1291959285736084, "learning_rate": 5.951327636320878e-06, "loss": 4.047, "step": 6890 }, { "epoch": 0.88256, "grad_norm": 3.337362766265869, "learning_rate": 5.946376266971068e-06, "loss": 4.0805, "step": 6895 }, { "epoch": 0.8832, "grad_norm": 3.3893113136291504, "learning_rate": 5.94142393489411e-06, "loss": 4.0791, "step": 6900 }, { "epoch": 0.8832, "eval_loss": 0.9899783730506897, "eval_runtime": 6.6267, "eval_samples_per_second": 150.904, "eval_steps_per_second": 18.863, "step": 6900 }, { "epoch": 0.88384, "grad_norm": 3.4587414264678955, "learning_rate": 5.936470645127906e-06, "loss": 4.0523, "step": 6905 }, { "epoch": 0.88448, "grad_norm": 3.2666921615600586, "learning_rate": 5.93151640271132e-06, "loss": 4.0537, "step": 6910 }, { "epoch": 0.88512, "grad_norm": 3.273695707321167, "learning_rate": 5.926561212684194e-06, "loss": 3.9064, "step": 6915 }, { "epoch": 0.88576, "grad_norm": 3.240837574005127, "learning_rate": 5.921605080087328e-06, "loss": 3.8327, "step": 6920 }, { "epoch": 0.8864, "grad_norm": 3.1848044395446777, "learning_rate": 5.916648009962487e-06, "loss": 3.8566, "step": 6925 }, { "epoch": 0.88704, "grad_norm": 3.440258741378784, "learning_rate": 5.911690007352384e-06, "loss": 3.8696, "step": 6930 }, { "epoch": 0.88768, "grad_norm": 16.90064239501953, "learning_rate": 5.906731077300681e-06, "loss": 4.1432, "step": 6935 }, { "epoch": 0.88832, "grad_norm": 13.862409591674805, "learning_rate": 5.901771224851989e-06, "loss": 3.9778, "step": 6940 }, { "epoch": 0.88896, "grad_norm": 3.237497568130493, "learning_rate": 5.896810455051849e-06, "loss": 3.9928, "step": 6945 }, { "epoch": 0.8896, "grad_norm": 3.367391347885132, "learning_rate": 5.891848772946744e-06, "loss": 3.9045, "step": 6950 }, { "epoch": 0.89024, "grad_norm": 3.2093420028686523, "learning_rate": 5.88688618358408e-06, "loss": 3.9564, "step": 6955 }, { "epoch": 0.89088, "grad_norm": 3.18294358253479, "learning_rate": 5.8819226920121855e-06, "loss": 3.8482, "step": 6960 }, { "epoch": 0.89152, "grad_norm": 3.097348928451538, "learning_rate": 5.876958303280308e-06, "loss": 3.7759, "step": 6965 }, { "epoch": 0.89216, "grad_norm": 3.166609764099121, "learning_rate": 5.871993022438609e-06, "loss": 3.8114, "step": 6970 }, { "epoch": 0.8928, "grad_norm": 3.337101459503174, "learning_rate": 5.867026854538156e-06, "loss": 4.0362, "step": 6975 }, { "epoch": 0.89344, "grad_norm": 3.179570436477661, "learning_rate": 5.862059804630917e-06, "loss": 4.0118, "step": 6980 }, { "epoch": 0.89408, "grad_norm": 3.0153615474700928, "learning_rate": 5.857091877769762e-06, "loss": 3.9093, "step": 6985 }, { "epoch": 0.89472, "grad_norm": 3.310558557510376, "learning_rate": 5.852123079008451e-06, "loss": 3.9642, "step": 6990 }, { "epoch": 0.89536, "grad_norm": 3.084897518157959, "learning_rate": 5.8471534134016274e-06, "loss": 3.9152, "step": 6995 }, { "epoch": 0.896, "grad_norm": 3.2825677394866943, "learning_rate": 5.842182886004823e-06, "loss": 3.9253, "step": 7000 }, { "epoch": 0.896, "eval_loss": 0.981210470199585, "eval_runtime": 6.7345, "eval_samples_per_second": 148.489, "eval_steps_per_second": 18.561, "step": 7000 }, { "epoch": 0.89664, "grad_norm": 3.5258679389953613, "learning_rate": 5.837211501874438e-06, "loss": 3.9532, "step": 7005 }, { "epoch": 0.89728, "grad_norm": 3.6216719150543213, "learning_rate": 5.832239266067754e-06, "loss": 4.1073, "step": 7010 }, { "epoch": 0.89792, "grad_norm": 3.2402870655059814, "learning_rate": 5.8272661836429115e-06, "loss": 3.9726, "step": 7015 }, { "epoch": 0.89856, "grad_norm": 3.2960052490234375, "learning_rate": 5.822292259658914e-06, "loss": 3.795, "step": 7020 }, { "epoch": 0.8992, "grad_norm": 3.0338668823242188, "learning_rate": 5.817317499175622e-06, "loss": 3.8567, "step": 7025 }, { "epoch": 0.89984, "grad_norm": 3.07011342048645, "learning_rate": 5.812341907253749e-06, "loss": 3.8671, "step": 7030 }, { "epoch": 0.90048, "grad_norm": 3.0267298221588135, "learning_rate": 5.807365488954849e-06, "loss": 3.9413, "step": 7035 }, { "epoch": 0.90112, "grad_norm": 3.2175261974334717, "learning_rate": 5.802388249341322e-06, "loss": 3.8435, "step": 7040 }, { "epoch": 0.90176, "grad_norm": 3.4809112548828125, "learning_rate": 5.797410193476399e-06, "loss": 4.1278, "step": 7045 }, { "epoch": 0.9024, "grad_norm": 3.0254108905792236, "learning_rate": 5.792431326424144e-06, "loss": 4.05, "step": 7050 }, { "epoch": 0.90304, "grad_norm": 3.203744888305664, "learning_rate": 5.787451653249448e-06, "loss": 3.8707, "step": 7055 }, { "epoch": 0.90368, "grad_norm": 3.6568257808685303, "learning_rate": 5.782471179018016e-06, "loss": 3.8331, "step": 7060 }, { "epoch": 0.90432, "grad_norm": 3.4851605892181396, "learning_rate": 5.777489908796374e-06, "loss": 4.0123, "step": 7065 }, { "epoch": 0.90496, "grad_norm": 5.221633434295654, "learning_rate": 5.772507847651857e-06, "loss": 3.8164, "step": 7070 }, { "epoch": 0.9056, "grad_norm": 3.0933420658111572, "learning_rate": 5.7675250006525985e-06, "loss": 3.8729, "step": 7075 }, { "epoch": 0.90624, "grad_norm": 3.4247007369995117, "learning_rate": 5.7625413728675405e-06, "loss": 3.9543, "step": 7080 }, { "epoch": 0.90688, "grad_norm": 3.3053174018859863, "learning_rate": 5.75755696936641e-06, "loss": 4.0038, "step": 7085 }, { "epoch": 0.90752, "grad_norm": 3.358633279800415, "learning_rate": 5.752571795219732e-06, "loss": 3.9082, "step": 7090 }, { "epoch": 0.90816, "grad_norm": 3.2632157802581787, "learning_rate": 5.74758585549881e-06, "loss": 3.9467, "step": 7095 }, { "epoch": 0.9088, "grad_norm": 3.311492443084717, "learning_rate": 5.742599155275726e-06, "loss": 3.9281, "step": 7100 }, { "epoch": 0.9088, "eval_loss": 1.0036808252334595, "eval_runtime": 6.9664, "eval_samples_per_second": 143.545, "eval_steps_per_second": 17.943, "step": 7100 }, { "epoch": 0.90944, "grad_norm": 3.206937789916992, "learning_rate": 5.73761169962334e-06, "loss": 4.1132, "step": 7105 }, { "epoch": 0.91008, "grad_norm": 3.1101412773132324, "learning_rate": 5.732623493615273e-06, "loss": 3.9453, "step": 7110 }, { "epoch": 0.91072, "grad_norm": 3.2870442867279053, "learning_rate": 5.72763454232592e-06, "loss": 3.9204, "step": 7115 }, { "epoch": 0.91136, "grad_norm": 3.2983288764953613, "learning_rate": 5.722644850830423e-06, "loss": 3.9068, "step": 7120 }, { "epoch": 0.912, "grad_norm": 3.8806018829345703, "learning_rate": 5.717654424204686e-06, "loss": 3.9637, "step": 7125 }, { "epoch": 0.91264, "grad_norm": 3.4543910026550293, "learning_rate": 5.7126632675253555e-06, "loss": 3.9966, "step": 7130 }, { "epoch": 0.91328, "grad_norm": 3.230792760848999, "learning_rate": 5.707671385869822e-06, "loss": 3.9935, "step": 7135 }, { "epoch": 0.91392, "grad_norm": 3.474703073501587, "learning_rate": 5.702678784316213e-06, "loss": 3.8884, "step": 7140 }, { "epoch": 0.91456, "grad_norm": 3.442061185836792, "learning_rate": 5.697685467943391e-06, "loss": 3.9302, "step": 7145 }, { "epoch": 0.9152, "grad_norm": 3.4667983055114746, "learning_rate": 5.6926914418309405e-06, "loss": 4.066, "step": 7150 }, { "epoch": 0.91584, "grad_norm": 3.329434394836426, "learning_rate": 5.687696711059174e-06, "loss": 3.9571, "step": 7155 }, { "epoch": 0.91648, "grad_norm": 3.290762424468994, "learning_rate": 5.682701280709117e-06, "loss": 3.9454, "step": 7160 }, { "epoch": 0.91712, "grad_norm": 3.295773983001709, "learning_rate": 5.677705155862508e-06, "loss": 3.9735, "step": 7165 }, { "epoch": 0.91776, "grad_norm": 3.379249334335327, "learning_rate": 5.672708341601791e-06, "loss": 4.0056, "step": 7170 }, { "epoch": 0.9184, "grad_norm": 3.3451154232025146, "learning_rate": 5.667710843010113e-06, "loss": 4.0466, "step": 7175 }, { "epoch": 0.91904, "grad_norm": 3.2415895462036133, "learning_rate": 5.662712665171315e-06, "loss": 3.8953, "step": 7180 }, { "epoch": 0.91968, "grad_norm": 3.1182918548583984, "learning_rate": 5.657713813169932e-06, "loss": 3.9718, "step": 7185 }, { "epoch": 0.92032, "grad_norm": 3.2211804389953613, "learning_rate": 5.6527142920911796e-06, "loss": 3.9707, "step": 7190 }, { "epoch": 0.92096, "grad_norm": 3.490556478500366, "learning_rate": 5.64771410702096e-06, "loss": 3.9644, "step": 7195 }, { "epoch": 0.9216, "grad_norm": 3.5458273887634277, "learning_rate": 5.642713263045847e-06, "loss": 4.0083, "step": 7200 }, { "epoch": 0.9216, "eval_loss": 0.9912606477737427, "eval_runtime": 7.2742, "eval_samples_per_second": 137.471, "eval_steps_per_second": 17.184, "step": 7200 }, { "epoch": 0.92224, "grad_norm": 3.4939253330230713, "learning_rate": 5.637711765253088e-06, "loss": 3.9563, "step": 7205 }, { "epoch": 0.92288, "grad_norm": 3.1346709728240967, "learning_rate": 5.63270961873059e-06, "loss": 3.9072, "step": 7210 }, { "epoch": 0.92352, "grad_norm": 3.3341445922851562, "learning_rate": 5.627706828566928e-06, "loss": 3.9547, "step": 7215 }, { "epoch": 0.92416, "grad_norm": 3.094125986099243, "learning_rate": 5.622703399851321e-06, "loss": 3.9011, "step": 7220 }, { "epoch": 0.9248, "grad_norm": 3.118218421936035, "learning_rate": 5.61769933767365e-06, "loss": 3.7838, "step": 7225 }, { "epoch": 0.92544, "grad_norm": 3.289456844329834, "learning_rate": 5.61269464712443e-06, "loss": 4.0058, "step": 7230 }, { "epoch": 0.92608, "grad_norm": 3.357569694519043, "learning_rate": 5.6076893332948215e-06, "loss": 4.03, "step": 7235 }, { "epoch": 0.92672, "grad_norm": 3.568295955657959, "learning_rate": 5.6026834012766155e-06, "loss": 3.9034, "step": 7240 }, { "epoch": 0.92736, "grad_norm": 3.2108635902404785, "learning_rate": 5.597676856162235e-06, "loss": 4.0486, "step": 7245 }, { "epoch": 0.928, "grad_norm": 3.312995672225952, "learning_rate": 5.592669703044722e-06, "loss": 3.9875, "step": 7250 }, { "epoch": 0.92864, "grad_norm": 3.5728683471679688, "learning_rate": 5.587661947017744e-06, "loss": 4.0232, "step": 7255 }, { "epoch": 0.92928, "grad_norm": 3.0373477935791016, "learning_rate": 5.582653593175574e-06, "loss": 3.8868, "step": 7260 }, { "epoch": 0.92992, "grad_norm": 3.6742284297943115, "learning_rate": 5.577644646613099e-06, "loss": 3.8612, "step": 7265 }, { "epoch": 0.93056, "grad_norm": 3.126483917236328, "learning_rate": 5.572635112425806e-06, "loss": 3.7641, "step": 7270 }, { "epoch": 0.9312, "grad_norm": 3.2737932205200195, "learning_rate": 5.567624995709781e-06, "loss": 3.9605, "step": 7275 }, { "epoch": 0.93184, "grad_norm": 3.3350517749786377, "learning_rate": 5.562614301561704e-06, "loss": 4.0245, "step": 7280 }, { "epoch": 0.93248, "grad_norm": 3.372002601623535, "learning_rate": 5.557603035078838e-06, "loss": 3.999, "step": 7285 }, { "epoch": 0.93312, "grad_norm": 3.2564427852630615, "learning_rate": 5.552591201359031e-06, "loss": 3.8675, "step": 7290 }, { "epoch": 0.93376, "grad_norm": 3.0988609790802, "learning_rate": 5.547578805500711e-06, "loss": 3.804, "step": 7295 }, { "epoch": 0.9344, "grad_norm": 3.1962246894836426, "learning_rate": 5.542565852602872e-06, "loss": 3.986, "step": 7300 }, { "epoch": 0.9344, "eval_loss": 0.9871136546134949, "eval_runtime": 6.8497, "eval_samples_per_second": 145.991, "eval_steps_per_second": 18.249, "step": 7300 }, { "epoch": 0.93504, "grad_norm": 3.38354229927063, "learning_rate": 5.537552347765078e-06, "loss": 3.8876, "step": 7305 }, { "epoch": 0.93568, "grad_norm": 3.4597370624542236, "learning_rate": 5.5325382960874544e-06, "loss": 3.9455, "step": 7310 }, { "epoch": 0.93632, "grad_norm": 3.528806686401367, "learning_rate": 5.5275237026706805e-06, "loss": 4.0232, "step": 7315 }, { "epoch": 0.93696, "grad_norm": 3.351661205291748, "learning_rate": 5.522508572615993e-06, "loss": 3.9921, "step": 7320 }, { "epoch": 0.9376, "grad_norm": 3.342860221862793, "learning_rate": 5.517492911025165e-06, "loss": 4.051, "step": 7325 }, { "epoch": 0.93824, "grad_norm": 3.2882394790649414, "learning_rate": 5.51247672300052e-06, "loss": 3.819, "step": 7330 }, { "epoch": 0.93888, "grad_norm": 3.3351001739501953, "learning_rate": 5.507460013644907e-06, "loss": 3.873, "step": 7335 }, { "epoch": 0.93952, "grad_norm": 3.4074318408966064, "learning_rate": 5.502442788061718e-06, "loss": 3.7729, "step": 7340 }, { "epoch": 0.94016, "grad_norm": 3.293647289276123, "learning_rate": 5.497425051354856e-06, "loss": 3.859, "step": 7345 }, { "epoch": 0.9408, "grad_norm": 3.289747714996338, "learning_rate": 5.492406808628757e-06, "loss": 4.0432, "step": 7350 }, { "epoch": 0.94144, "grad_norm": 3.2682368755340576, "learning_rate": 5.487388064988361e-06, "loss": 4.0227, "step": 7355 }, { "epoch": 0.94208, "grad_norm": 3.1242454051971436, "learning_rate": 5.482368825539125e-06, "loss": 3.9899, "step": 7360 }, { "epoch": 0.94272, "grad_norm": 5.209671974182129, "learning_rate": 5.478353080428558e-06, "loss": 3.8918, "step": 7365 }, { "epoch": 0.94336, "grad_norm": 3.0842902660369873, "learning_rate": 5.4733329613907585e-06, "loss": 3.8996, "step": 7370 }, { "epoch": 0.944, "grad_norm": 3.6524879932403564, "learning_rate": 5.468312360842056e-06, "loss": 4.1494, "step": 7375 }, { "epoch": 0.94464, "grad_norm": 3.5056962966918945, "learning_rate": 5.463291283889796e-06, "loss": 3.766, "step": 7380 }, { "epoch": 0.94528, "grad_norm": 3.3486692905426025, "learning_rate": 5.4582697356418036e-06, "loss": 3.9352, "step": 7385 }, { "epoch": 0.94592, "grad_norm": 3.435131549835205, "learning_rate": 5.4532477212063876e-06, "loss": 4.045, "step": 7390 }, { "epoch": 0.94656, "grad_norm": 3.6526622772216797, "learning_rate": 5.448225245692329e-06, "loss": 4.046, "step": 7395 }, { "epoch": 0.9472, "grad_norm": 3.1922192573547363, "learning_rate": 5.443202314208879e-06, "loss": 4.0255, "step": 7400 }, { "epoch": 0.9472, "eval_loss": 0.9677584767341614, "eval_runtime": 8.257, "eval_samples_per_second": 121.109, "eval_steps_per_second": 15.139, "step": 7400 }, { "epoch": 0.94784, "grad_norm": 3.202497959136963, "learning_rate": 5.4381789318657505e-06, "loss": 3.9725, "step": 7405 }, { "epoch": 0.94848, "grad_norm": 3.406205177307129, "learning_rate": 5.433155103773118e-06, "loss": 3.8766, "step": 7410 }, { "epoch": 0.94912, "grad_norm": 3.2586803436279297, "learning_rate": 5.428130835041609e-06, "loss": 3.8658, "step": 7415 }, { "epoch": 0.94976, "grad_norm": 3.390014410018921, "learning_rate": 5.4231061307822966e-06, "loss": 4.0069, "step": 7420 }, { "epoch": 0.9504, "grad_norm": 3.1910688877105713, "learning_rate": 5.418080996106698e-06, "loss": 3.9555, "step": 7425 }, { "epoch": 0.95104, "grad_norm": 3.1069347858428955, "learning_rate": 5.413055436126771e-06, "loss": 3.8353, "step": 7430 }, { "epoch": 0.95168, "grad_norm": 3.4949285984039307, "learning_rate": 5.408029455954902e-06, "loss": 4.0338, "step": 7435 }, { "epoch": 0.95232, "grad_norm": 3.304924488067627, "learning_rate": 5.403003060703908e-06, "loss": 4.0458, "step": 7440 }, { "epoch": 0.95296, "grad_norm": 3.4705872535705566, "learning_rate": 5.397976255487028e-06, "loss": 4.0438, "step": 7445 }, { "epoch": 0.9536, "grad_norm": 3.259751796722412, "learning_rate": 5.3929490454179155e-06, "loss": 3.8711, "step": 7450 }, { "epoch": 0.95424, "grad_norm": 3.047318458557129, "learning_rate": 5.387921435610637e-06, "loss": 3.9242, "step": 7455 }, { "epoch": 0.95488, "grad_norm": 3.2884445190429688, "learning_rate": 5.382893431179668e-06, "loss": 3.8527, "step": 7460 }, { "epoch": 0.95552, "grad_norm": 3.164912462234497, "learning_rate": 5.377865037239882e-06, "loss": 3.9622, "step": 7465 }, { "epoch": 0.95616, "grad_norm": 3.415105104446411, "learning_rate": 5.372836258906552e-06, "loss": 3.8921, "step": 7470 }, { "epoch": 0.9568, "grad_norm": 3.607869863510132, "learning_rate": 5.367807101295337e-06, "loss": 3.8691, "step": 7475 }, { "epoch": 0.95744, "grad_norm": 3.4621803760528564, "learning_rate": 5.362777569522288e-06, "loss": 3.9839, "step": 7480 }, { "epoch": 0.95808, "grad_norm": 3.2376821041107178, "learning_rate": 5.357747668703834e-06, "loss": 3.9305, "step": 7485 }, { "epoch": 0.95872, "grad_norm": 3.524980306625366, "learning_rate": 5.352717403956777e-06, "loss": 3.9235, "step": 7490 }, { "epoch": 0.95936, "grad_norm": 3.1107211112976074, "learning_rate": 5.347686780398293e-06, "loss": 3.9071, "step": 7495 }, { "epoch": 0.96, "grad_norm": 3.0860273838043213, "learning_rate": 5.342655803145923e-06, "loss": 3.9086, "step": 7500 }, { "epoch": 0.96, "eval_loss": 0.9887832403182983, "eval_runtime": 7.0587, "eval_samples_per_second": 141.668, "eval_steps_per_second": 17.709, "step": 7500 }, { "epoch": 0.96064, "grad_norm": 3.2908318042755127, "learning_rate": 5.337624477317562e-06, "loss": 3.8602, "step": 7505 }, { "epoch": 0.96128, "grad_norm": 3.1805858612060547, "learning_rate": 5.332592808031467e-06, "loss": 3.8726, "step": 7510 }, { "epoch": 0.96192, "grad_norm": 3.564436912536621, "learning_rate": 5.327560800406241e-06, "loss": 3.9942, "step": 7515 }, { "epoch": 0.96256, "grad_norm": 3.2359700202941895, "learning_rate": 5.322528459560829e-06, "loss": 4.0791, "step": 7520 }, { "epoch": 0.9632, "grad_norm": 3.3315274715423584, "learning_rate": 5.317495790614522e-06, "loss": 3.9426, "step": 7525 }, { "epoch": 0.96384, "grad_norm": 3.3361990451812744, "learning_rate": 5.312462798686935e-06, "loss": 3.9885, "step": 7530 }, { "epoch": 0.96448, "grad_norm": 3.4341814517974854, "learning_rate": 5.30742948889802e-06, "loss": 4.0134, "step": 7535 }, { "epoch": 0.96512, "grad_norm": 3.099053144454956, "learning_rate": 5.302395866368046e-06, "loss": 3.9386, "step": 7540 }, { "epoch": 0.96576, "grad_norm": 3.2882211208343506, "learning_rate": 5.2973619362176064e-06, "loss": 3.9915, "step": 7545 }, { "epoch": 0.9664, "grad_norm": 3.2575504779815674, "learning_rate": 5.292327703567604e-06, "loss": 3.9657, "step": 7550 }, { "epoch": 0.96704, "grad_norm": 3.5143399238586426, "learning_rate": 5.287293173539248e-06, "loss": 3.9202, "step": 7555 }, { "epoch": 0.96768, "grad_norm": 3.3682284355163574, "learning_rate": 5.282258351254054e-06, "loss": 3.9375, "step": 7560 }, { "epoch": 0.96832, "grad_norm": 3.3960514068603516, "learning_rate": 5.277223241833831e-06, "loss": 3.8977, "step": 7565 }, { "epoch": 0.96896, "grad_norm": 3.5102431774139404, "learning_rate": 5.27218785040068e-06, "loss": 4.0427, "step": 7570 }, { "epoch": 0.9696, "grad_norm": 3.05222487449646, "learning_rate": 5.267152182076996e-06, "loss": 3.8364, "step": 7575 }, { "epoch": 0.97024, "grad_norm": 3.2691268920898438, "learning_rate": 5.262116241985446e-06, "loss": 3.8768, "step": 7580 }, { "epoch": 0.97088, "grad_norm": 3.137843132019043, "learning_rate": 5.257080035248977e-06, "loss": 3.9938, "step": 7585 }, { "epoch": 0.97152, "grad_norm": 3.9772887229919434, "learning_rate": 5.2520435669908106e-06, "loss": 3.9055, "step": 7590 }, { "epoch": 0.97216, "grad_norm": 3.306682825088501, "learning_rate": 5.247006842334433e-06, "loss": 4.0086, "step": 7595 }, { "epoch": 0.9728, "grad_norm": 3.2649736404418945, "learning_rate": 5.241969866403588e-06, "loss": 3.9806, "step": 7600 }, { "epoch": 0.9728, "eval_loss": 0.9813916683197021, "eval_runtime": 6.8057, "eval_samples_per_second": 146.935, "eval_steps_per_second": 18.367, "step": 7600 }, { "epoch": 0.97344, "grad_norm": 3.297919988632202, "learning_rate": 5.236932644322278e-06, "loss": 3.9641, "step": 7605 }, { "epoch": 0.97408, "grad_norm": 3.2999958992004395, "learning_rate": 5.231895181214753e-06, "loss": 3.8996, "step": 7610 }, { "epoch": 0.97472, "grad_norm": 3.5226521492004395, "learning_rate": 5.226857482205513e-06, "loss": 3.9295, "step": 7615 }, { "epoch": 0.97536, "grad_norm": 3.5334079265594482, "learning_rate": 5.221819552419293e-06, "loss": 3.8712, "step": 7620 }, { "epoch": 0.976, "grad_norm": 3.410144805908203, "learning_rate": 5.216781396981066e-06, "loss": 3.9702, "step": 7625 }, { "epoch": 0.97664, "grad_norm": 3.240433931350708, "learning_rate": 5.211743021016033e-06, "loss": 4.043, "step": 7630 }, { "epoch": 0.97728, "grad_norm": 3.2035484313964844, "learning_rate": 5.206704429649621e-06, "loss": 3.8622, "step": 7635 }, { "epoch": 0.97792, "grad_norm": 3.1555721759796143, "learning_rate": 5.2016656280074725e-06, "loss": 3.9523, "step": 7640 }, { "epoch": 0.97856, "grad_norm": 3.2172229290008545, "learning_rate": 5.196626621215449e-06, "loss": 3.9912, "step": 7645 }, { "epoch": 0.9792, "grad_norm": 3.324141025543213, "learning_rate": 5.191587414399615e-06, "loss": 3.9707, "step": 7650 }, { "epoch": 0.97984, "grad_norm": 3.333359479904175, "learning_rate": 5.1865480126862436e-06, "loss": 3.8726, "step": 7655 }, { "epoch": 0.98048, "grad_norm": 3.3763959407806396, "learning_rate": 5.181508421201803e-06, "loss": 4.063, "step": 7660 }, { "epoch": 0.98112, "grad_norm": 3.8444693088531494, "learning_rate": 5.1764686450729575e-06, "loss": 3.9117, "step": 7665 }, { "epoch": 0.98176, "grad_norm": 3.223740577697754, "learning_rate": 5.171428689426554e-06, "loss": 3.8024, "step": 7670 }, { "epoch": 0.9824, "grad_norm": 3.107224941253662, "learning_rate": 5.166388559389628e-06, "loss": 3.9839, "step": 7675 }, { "epoch": 0.98304, "grad_norm": 3.4000933170318604, "learning_rate": 5.161348260089388e-06, "loss": 4.0005, "step": 7680 }, { "epoch": 0.98368, "grad_norm": 3.662874937057495, "learning_rate": 5.156307796653217e-06, "loss": 3.896, "step": 7685 }, { "epoch": 0.98432, "grad_norm": 3.305297613143921, "learning_rate": 5.151267174208665e-06, "loss": 3.9105, "step": 7690 }, { "epoch": 0.98496, "grad_norm": 3.186823844909668, "learning_rate": 5.146226397883442e-06, "loss": 3.9882, "step": 7695 }, { "epoch": 0.9856, "grad_norm": 3.5797548294067383, "learning_rate": 5.1411854728054155e-06, "loss": 4.0733, "step": 7700 }, { "epoch": 0.9856, "eval_loss": 0.9768617749214172, "eval_runtime": 7.1654, "eval_samples_per_second": 139.559, "eval_steps_per_second": 17.445, "step": 7700 }, { "epoch": 0.98624, "grad_norm": 3.3153810501098633, "learning_rate": 5.136144404102606e-06, "loss": 4.0024, "step": 7705 }, { "epoch": 0.98688, "grad_norm": 3.4547183513641357, "learning_rate": 5.131103196903175e-06, "loss": 4.0543, "step": 7710 }, { "epoch": 0.98752, "grad_norm": 3.2928266525268555, "learning_rate": 5.126061856335432e-06, "loss": 3.9963, "step": 7715 }, { "epoch": 0.98816, "grad_norm": 3.229193687438965, "learning_rate": 5.121020387527818e-06, "loss": 4.004, "step": 7720 }, { "epoch": 0.9888, "grad_norm": 3.1806933879852295, "learning_rate": 5.115978795608903e-06, "loss": 3.9922, "step": 7725 }, { "epoch": 0.98944, "grad_norm": 3.0850744247436523, "learning_rate": 5.110937085707388e-06, "loss": 3.8514, "step": 7730 }, { "epoch": 0.99008, "grad_norm": 3.4792258739471436, "learning_rate": 5.105895262952087e-06, "loss": 3.9743, "step": 7735 }, { "epoch": 0.99072, "grad_norm": 3.4325263500213623, "learning_rate": 5.100853332471932e-06, "loss": 3.9103, "step": 7740 }, { "epoch": 0.99136, "grad_norm": 3.19690203666687, "learning_rate": 5.095811299395967e-06, "loss": 3.9015, "step": 7745 }, { "epoch": 0.992, "grad_norm": 3.202651023864746, "learning_rate": 5.090769168853337e-06, "loss": 3.8134, "step": 7750 }, { "epoch": 0.99264, "grad_norm": 3.395777940750122, "learning_rate": 5.085726945973285e-06, "loss": 3.9866, "step": 7755 }, { "epoch": 0.99328, "grad_norm": 3.347100019454956, "learning_rate": 5.080684635885155e-06, "loss": 3.9207, "step": 7760 }, { "epoch": 0.99392, "grad_norm": 3.1593613624572754, "learning_rate": 5.0756422437183705e-06, "loss": 3.9379, "step": 7765 }, { "epoch": 0.99456, "grad_norm": 3.2734835147857666, "learning_rate": 5.070599774602445e-06, "loss": 3.7975, "step": 7770 }, { "epoch": 0.9952, "grad_norm": 3.32014536857605, "learning_rate": 5.065557233666968e-06, "loss": 4.0122, "step": 7775 }, { "epoch": 0.99584, "grad_norm": 3.6023435592651367, "learning_rate": 5.060514626041602e-06, "loss": 3.9277, "step": 7780 }, { "epoch": 0.99648, "grad_norm": 3.0981125831604004, "learning_rate": 5.055471956856076e-06, "loss": 3.8617, "step": 7785 }, { "epoch": 0.99712, "grad_norm": 3.361649513244629, "learning_rate": 5.0504292312401845e-06, "loss": 3.8414, "step": 7790 }, { "epoch": 0.99776, "grad_norm": 3.3960330486297607, "learning_rate": 5.0453864543237786e-06, "loss": 3.8511, "step": 7795 }, { "epoch": 0.9984, "grad_norm": 3.236677646636963, "learning_rate": 5.040343631236761e-06, "loss": 3.8611, "step": 7800 }, { "epoch": 0.9984, "eval_loss": 0.991203248500824, "eval_runtime": 7.463, "eval_samples_per_second": 133.995, "eval_steps_per_second": 16.749, "step": 7800 }, { "epoch": 0.99904, "grad_norm": 3.3027491569519043, "learning_rate": 5.035300767109081e-06, "loss": 3.8784, "step": 7805 }, { "epoch": 0.99968, "grad_norm": 3.059692144393921, "learning_rate": 5.03025786707073e-06, "loss": 3.856, "step": 7810 }, { "epoch": 1.000256, "grad_norm": 3.319530725479126, "learning_rate": 5.025214936251735e-06, "loss": 3.515, "step": 7815 }, { "epoch": 1.000896, "grad_norm": 3.3164639472961426, "learning_rate": 5.0201719797821595e-06, "loss": 3.8596, "step": 7820 }, { "epoch": 1.001536, "grad_norm": 3.717644691467285, "learning_rate": 5.015129002792082e-06, "loss": 3.9824, "step": 7825 }, { "epoch": 1.002176, "grad_norm": 3.3352081775665283, "learning_rate": 5.0100860104116135e-06, "loss": 3.9783, "step": 7830 }, { "epoch": 1.002816, "grad_norm": 3.228071689605713, "learning_rate": 5.0050430077708756e-06, "loss": 4.0029, "step": 7835 }, { "epoch": 1.003456, "grad_norm": 3.2079076766967773, "learning_rate": 5e-06, "loss": 3.8237, "step": 7840 }, { "epoch": 1.004096, "grad_norm": 3.2352001667022705, "learning_rate": 4.994956992229126e-06, "loss": 3.9639, "step": 7845 }, { "epoch": 1.004736, "grad_norm": 3.2654805183410645, "learning_rate": 4.989913989588388e-06, "loss": 3.9077, "step": 7850 }, { "epoch": 1.005376, "grad_norm": 3.5538816452026367, "learning_rate": 4.9848709972079195e-06, "loss": 3.96, "step": 7855 }, { "epoch": 1.006016, "grad_norm": 3.4138593673706055, "learning_rate": 4.979828020217843e-06, "loss": 3.9993, "step": 7860 }, { "epoch": 1.006656, "grad_norm": 3.351036787033081, "learning_rate": 4.974785063748266e-06, "loss": 3.9911, "step": 7865 }, { "epoch": 1.007296, "grad_norm": 3.1687943935394287, "learning_rate": 4.969742132929272e-06, "loss": 3.9839, "step": 7870 }, { "epoch": 1.007936, "grad_norm": 3.325587034225464, "learning_rate": 4.964699232890919e-06, "loss": 3.9703, "step": 7875 }, { "epoch": 1.008576, "grad_norm": 3.418788194656372, "learning_rate": 4.95965636876324e-06, "loss": 3.9424, "step": 7880 }, { "epoch": 1.009216, "grad_norm": 3.2540955543518066, "learning_rate": 4.954613545676223e-06, "loss": 3.9614, "step": 7885 }, { "epoch": 1.009856, "grad_norm": 3.170858383178711, "learning_rate": 4.949570768759817e-06, "loss": 4.0629, "step": 7890 }, { "epoch": 1.010496, "grad_norm": 3.227459192276001, "learning_rate": 4.944528043143926e-06, "loss": 3.9351, "step": 7895 }, { "epoch": 1.011136, "grad_norm": 3.218439817428589, "learning_rate": 4.9394853739584e-06, "loss": 3.8864, "step": 7900 }, { "epoch": 1.011136, "eval_loss": 0.9792375564575195, "eval_runtime": 6.9749, "eval_samples_per_second": 143.371, "eval_steps_per_second": 17.921, "step": 7900 }, { "epoch": 1.011776, "grad_norm": 3.564470052719116, "learning_rate": 4.934442766333034e-06, "loss": 3.9028, "step": 7905 }, { "epoch": 1.012416, "grad_norm": 3.1783535480499268, "learning_rate": 4.9294002253975575e-06, "loss": 3.8552, "step": 7910 }, { "epoch": 1.013056, "grad_norm": 3.098893165588379, "learning_rate": 4.92435775628163e-06, "loss": 3.8879, "step": 7915 }, { "epoch": 1.013696, "grad_norm": 3.3577730655670166, "learning_rate": 4.9193153641148465e-06, "loss": 3.8814, "step": 7920 }, { "epoch": 1.014336, "grad_norm": 3.4143130779266357, "learning_rate": 4.914273054026717e-06, "loss": 3.9688, "step": 7925 }, { "epoch": 1.014976, "grad_norm": 3.3033559322357178, "learning_rate": 4.9092308311466655e-06, "loss": 3.9102, "step": 7930 }, { "epoch": 1.015616, "grad_norm": 3.276418924331665, "learning_rate": 4.904188700604033e-06, "loss": 4.0419, "step": 7935 }, { "epoch": 1.016256, "grad_norm": 3.209242105484009, "learning_rate": 4.899146667528069e-06, "loss": 3.8513, "step": 7940 }, { "epoch": 1.016896, "grad_norm": 3.382869005203247, "learning_rate": 4.894104737047916e-06, "loss": 3.9987, "step": 7945 }, { "epoch": 1.017536, "grad_norm": 3.677175760269165, "learning_rate": 4.889062914292615e-06, "loss": 3.8717, "step": 7950 }, { "epoch": 1.018176, "grad_norm": 3.4745471477508545, "learning_rate": 4.884021204391097e-06, "loss": 3.9395, "step": 7955 }, { "epoch": 1.018816, "grad_norm": 3.321906805038452, "learning_rate": 4.878979612472183e-06, "loss": 3.8124, "step": 7960 }, { "epoch": 1.019456, "grad_norm": 3.5144152641296387, "learning_rate": 4.8739381436645685e-06, "loss": 3.9267, "step": 7965 }, { "epoch": 1.020096, "grad_norm": 3.3462071418762207, "learning_rate": 4.8688968030968265e-06, "loss": 3.9816, "step": 7970 }, { "epoch": 1.020736, "grad_norm": 3.219229221343994, "learning_rate": 4.863855595897395e-06, "loss": 3.8884, "step": 7975 }, { "epoch": 1.021376, "grad_norm": 3.2813336849212646, "learning_rate": 4.858814527194586e-06, "loss": 4.0285, "step": 7980 }, { "epoch": 1.022016, "grad_norm": 3.3408925533294678, "learning_rate": 4.85377360211656e-06, "loss": 3.9853, "step": 7985 }, { "epoch": 1.022656, "grad_norm": 3.3936686515808105, "learning_rate": 4.848732825791338e-06, "loss": 3.8653, "step": 7990 }, { "epoch": 1.023296, "grad_norm": 3.290966749191284, "learning_rate": 4.843692203346783e-06, "loss": 4.0207, "step": 7995 }, { "epoch": 1.023936, "grad_norm": 3.2207400798797607, "learning_rate": 4.838651739910613e-06, "loss": 3.8461, "step": 8000 }, { "epoch": 1.023936, "eval_loss": 0.9756978154182434, "eval_runtime": 6.6826, "eval_samples_per_second": 149.642, "eval_steps_per_second": 18.705, "step": 8000 }, { "epoch": 1.024576, "grad_norm": 3.4297149181365967, "learning_rate": 4.8336114406103725e-06, "loss": 3.9118, "step": 8005 }, { "epoch": 1.025216, "grad_norm": 3.3404061794281006, "learning_rate": 4.828571310573447e-06, "loss": 3.8817, "step": 8010 }, { "epoch": 1.025856, "grad_norm": 3.070565700531006, "learning_rate": 4.823531354927046e-06, "loss": 3.8727, "step": 8015 }, { "epoch": 1.026496, "grad_norm": 3.4748332500457764, "learning_rate": 4.8184915787981975e-06, "loss": 4.0154, "step": 8020 }, { "epoch": 1.027136, "grad_norm": 3.987419366836548, "learning_rate": 4.813451987313758e-06, "loss": 3.8994, "step": 8025 }, { "epoch": 1.027776, "grad_norm": 3.248142719268799, "learning_rate": 4.808412585600387e-06, "loss": 3.9946, "step": 8030 }, { "epoch": 1.028416, "grad_norm": 3.4826786518096924, "learning_rate": 4.8033733787845535e-06, "loss": 3.7973, "step": 8035 }, { "epoch": 1.029056, "grad_norm": 3.183786630630493, "learning_rate": 4.7983343719925275e-06, "loss": 4.1816, "step": 8040 }, { "epoch": 1.029696, "grad_norm": 3.292250394821167, "learning_rate": 4.79329557035038e-06, "loss": 3.9218, "step": 8045 }, { "epoch": 1.030336, "grad_norm": 3.4107160568237305, "learning_rate": 4.788256978983968e-06, "loss": 3.9041, "step": 8050 }, { "epoch": 1.030976, "grad_norm": 3.4038548469543457, "learning_rate": 4.783218603018936e-06, "loss": 3.965, "step": 8055 }, { "epoch": 1.031616, "grad_norm": 3.390043258666992, "learning_rate": 4.778180447580707e-06, "loss": 3.7506, "step": 8060 }, { "epoch": 1.032256, "grad_norm": 3.275566577911377, "learning_rate": 4.773142517794488e-06, "loss": 3.9673, "step": 8065 }, { "epoch": 1.032896, "grad_norm": 3.6536107063293457, "learning_rate": 4.768104818785248e-06, "loss": 3.8182, "step": 8070 }, { "epoch": 1.033536, "grad_norm": 3.2812538146972656, "learning_rate": 4.763067355677724e-06, "loss": 3.9372, "step": 8075 }, { "epoch": 1.034176, "grad_norm": 3.0662550926208496, "learning_rate": 4.758030133596413e-06, "loss": 3.8224, "step": 8080 }, { "epoch": 1.034816, "grad_norm": 3.2141222953796387, "learning_rate": 4.752993157665568e-06, "loss": 3.9668, "step": 8085 }, { "epoch": 1.035456, "grad_norm": 3.3035459518432617, "learning_rate": 4.74795643300919e-06, "loss": 3.8405, "step": 8090 }, { "epoch": 1.036096, "grad_norm": 3.179811477661133, "learning_rate": 4.742919964751025e-06, "loss": 3.8759, "step": 8095 }, { "epoch": 1.036736, "grad_norm": 3.3282361030578613, "learning_rate": 4.737883758014557e-06, "loss": 3.8409, "step": 8100 }, { "epoch": 1.036736, "eval_loss": 0.9715144038200378, "eval_runtime": 6.8358, "eval_samples_per_second": 146.288, "eval_steps_per_second": 18.286, "step": 8100 }, { "epoch": 1.037376, "grad_norm": 3.3947505950927734, "learning_rate": 4.732847817923005e-06, "loss": 3.9228, "step": 8105 }, { "epoch": 1.038016, "grad_norm": 3.3737497329711914, "learning_rate": 4.7278121495993205e-06, "loss": 3.9072, "step": 8110 }, { "epoch": 1.038656, "grad_norm": 3.2646024227142334, "learning_rate": 4.7227767581661714e-06, "loss": 3.7719, "step": 8115 }, { "epoch": 1.039296, "grad_norm": 3.168193817138672, "learning_rate": 4.717741648745946e-06, "loss": 3.8807, "step": 8120 }, { "epoch": 1.039936, "grad_norm": 4.204946041107178, "learning_rate": 4.712706826460753e-06, "loss": 3.8374, "step": 8125 }, { "epoch": 1.040576, "grad_norm": 3.0786685943603516, "learning_rate": 4.707672296432397e-06, "loss": 3.8454, "step": 8130 }, { "epoch": 1.041216, "grad_norm": 3.35418701171875, "learning_rate": 4.702638063782394e-06, "loss": 3.9096, "step": 8135 }, { "epoch": 1.041856, "grad_norm": 3.190556526184082, "learning_rate": 4.6976041336319545e-06, "loss": 3.8653, "step": 8140 }, { "epoch": 1.042496, "grad_norm": 3.4106967449188232, "learning_rate": 4.692570511101982e-06, "loss": 3.9108, "step": 8145 }, { "epoch": 1.043136, "grad_norm": 3.1676368713378906, "learning_rate": 4.687537201313067e-06, "loss": 3.8812, "step": 8150 }, { "epoch": 1.043776, "grad_norm": 3.254140615463257, "learning_rate": 4.682504209385481e-06, "loss": 3.9211, "step": 8155 }, { "epoch": 1.044416, "grad_norm": 3.426715850830078, "learning_rate": 4.677471540439171e-06, "loss": 3.763, "step": 8160 }, { "epoch": 1.045056, "grad_norm": 3.3520352840423584, "learning_rate": 4.672439199593761e-06, "loss": 3.8946, "step": 8165 }, { "epoch": 1.045696, "grad_norm": 3.071427583694458, "learning_rate": 4.667407191968535e-06, "loss": 3.9504, "step": 8170 }, { "epoch": 1.046336, "grad_norm": 3.1410775184631348, "learning_rate": 4.662375522682439e-06, "loss": 3.8667, "step": 8175 }, { "epoch": 1.046976, "grad_norm": 3.2197885513305664, "learning_rate": 4.6573441968540795e-06, "loss": 3.9225, "step": 8180 }, { "epoch": 1.047616, "grad_norm": 3.2275218963623047, "learning_rate": 4.652313219601706e-06, "loss": 3.9144, "step": 8185 }, { "epoch": 1.048256, "grad_norm": 3.601731538772583, "learning_rate": 4.647282596043224e-06, "loss": 3.9631, "step": 8190 }, { "epoch": 1.048896, "grad_norm": 3.432614803314209, "learning_rate": 4.642252331296168e-06, "loss": 3.8468, "step": 8195 }, { "epoch": 1.049536, "grad_norm": 3.2367265224456787, "learning_rate": 4.637222430477713e-06, "loss": 3.9908, "step": 8200 }, { "epoch": 1.049536, "eval_loss": 0.9731603860855103, "eval_runtime": 6.8061, "eval_samples_per_second": 146.927, "eval_steps_per_second": 18.366, "step": 8200 }, { "epoch": 1.050176, "grad_norm": 3.158475399017334, "learning_rate": 4.632192898704664e-06, "loss": 3.7899, "step": 8205 }, { "epoch": 1.050816, "grad_norm": 3.394278049468994, "learning_rate": 4.62716374109345e-06, "loss": 4.0507, "step": 8210 }, { "epoch": 1.051456, "grad_norm": 3.066319704055786, "learning_rate": 4.6221349627601195e-06, "loss": 3.7521, "step": 8215 }, { "epoch": 1.052096, "grad_norm": 3.577451229095459, "learning_rate": 4.617106568820334e-06, "loss": 3.8836, "step": 8220 }, { "epoch": 1.052736, "grad_norm": 3.3249411582946777, "learning_rate": 4.612078564389363e-06, "loss": 3.9723, "step": 8225 }, { "epoch": 1.053376, "grad_norm": 3.228684902191162, "learning_rate": 4.607050954582086e-06, "loss": 3.7932, "step": 8230 }, { "epoch": 1.054016, "grad_norm": 3.2836358547210693, "learning_rate": 4.602023744512974e-06, "loss": 3.8523, "step": 8235 }, { "epoch": 1.054656, "grad_norm": 3.2514820098876953, "learning_rate": 4.596996939296093e-06, "loss": 3.8587, "step": 8240 }, { "epoch": 1.055296, "grad_norm": 3.3256592750549316, "learning_rate": 4.591970544045099e-06, "loss": 3.9497, "step": 8245 }, { "epoch": 1.055936, "grad_norm": 3.186540365219116, "learning_rate": 4.58694456387323e-06, "loss": 3.7457, "step": 8250 }, { "epoch": 1.056576, "grad_norm": 3.2372641563415527, "learning_rate": 4.5819190038933035e-06, "loss": 3.7401, "step": 8255 }, { "epoch": 1.057216, "grad_norm": 3.463500738143921, "learning_rate": 4.576893869217707e-06, "loss": 3.8811, "step": 8260 }, { "epoch": 1.057856, "grad_norm": 3.2895185947418213, "learning_rate": 4.571869164958392e-06, "loss": 4.0712, "step": 8265 }, { "epoch": 1.058496, "grad_norm": 3.387270450592041, "learning_rate": 4.566844896226883e-06, "loss": 3.7833, "step": 8270 }, { "epoch": 1.059136, "grad_norm": 3.3072781562805176, "learning_rate": 4.56182106813425e-06, "loss": 3.9508, "step": 8275 }, { "epoch": 1.059776, "grad_norm": 3.262228012084961, "learning_rate": 4.556797685791123e-06, "loss": 4.0602, "step": 8280 }, { "epoch": 1.060416, "grad_norm": 3.3029568195343018, "learning_rate": 4.551774754307672e-06, "loss": 3.8243, "step": 8285 }, { "epoch": 1.061056, "grad_norm": 3.6015522480010986, "learning_rate": 4.546752278793613e-06, "loss": 3.9187, "step": 8290 }, { "epoch": 1.061696, "grad_norm": 3.132108449935913, "learning_rate": 4.541730264358198e-06, "loss": 3.9011, "step": 8295 }, { "epoch": 1.062336, "grad_norm": 3.4112889766693115, "learning_rate": 4.536708716110207e-06, "loss": 3.9141, "step": 8300 }, { "epoch": 1.062336, "eval_loss": 0.9754996299743652, "eval_runtime": 7.1041, "eval_samples_per_second": 140.763, "eval_steps_per_second": 17.595, "step": 8300 }, { "epoch": 1.062976, "grad_norm": 3.3974194526672363, "learning_rate": 4.5316876391579444e-06, "loss": 3.9216, "step": 8305 }, { "epoch": 1.0636160000000001, "grad_norm": 3.4178879261016846, "learning_rate": 4.526667038609244e-06, "loss": 3.8112, "step": 8310 }, { "epoch": 1.064256, "grad_norm": 3.3445000648498535, "learning_rate": 4.521646919571444e-06, "loss": 3.9406, "step": 8315 }, { "epoch": 1.064896, "grad_norm": 2.9815402030944824, "learning_rate": 4.516627287151402e-06, "loss": 3.9258, "step": 8320 }, { "epoch": 1.065536, "grad_norm": 3.2848947048187256, "learning_rate": 4.511608146455471e-06, "loss": 3.8195, "step": 8325 }, { "epoch": 1.066176, "grad_norm": 3.2756311893463135, "learning_rate": 4.506589502589514e-06, "loss": 3.844, "step": 8330 }, { "epoch": 1.066816, "grad_norm": 3.47074294090271, "learning_rate": 4.501571360658884e-06, "loss": 3.986, "step": 8335 }, { "epoch": 1.067456, "grad_norm": 3.780287981033325, "learning_rate": 4.49655372576842e-06, "loss": 4.0033, "step": 8340 }, { "epoch": 1.068096, "grad_norm": 3.6877105236053467, "learning_rate": 4.491536603022449e-06, "loss": 3.8884, "step": 8345 }, { "epoch": 1.068736, "grad_norm": 3.375788927078247, "learning_rate": 4.486519997524776e-06, "loss": 3.9015, "step": 8350 }, { "epoch": 1.069376, "grad_norm": 3.4512062072753906, "learning_rate": 4.481503914378683e-06, "loss": 3.8001, "step": 8355 }, { "epoch": 1.070016, "grad_norm": 3.4298641681671143, "learning_rate": 4.476488358686916e-06, "loss": 3.9691, "step": 8360 }, { "epoch": 1.070656, "grad_norm": 3.2674472332000732, "learning_rate": 4.471473335551687e-06, "loss": 3.758, "step": 8365 }, { "epoch": 1.071296, "grad_norm": 3.0024731159210205, "learning_rate": 4.466458850074661e-06, "loss": 3.9368, "step": 8370 }, { "epoch": 1.071936, "grad_norm": 3.393695116043091, "learning_rate": 4.461444907356967e-06, "loss": 3.8741, "step": 8375 }, { "epoch": 1.072576, "grad_norm": 3.399658441543579, "learning_rate": 4.456431512499171e-06, "loss": 3.8315, "step": 8380 }, { "epoch": 1.073216, "grad_norm": 3.303518056869507, "learning_rate": 4.45141867060129e-06, "loss": 3.7508, "step": 8385 }, { "epoch": 1.073856, "grad_norm": 3.3047292232513428, "learning_rate": 4.446406386762768e-06, "loss": 3.6943, "step": 8390 }, { "epoch": 1.074496, "grad_norm": 3.38957142829895, "learning_rate": 4.441394666082496e-06, "loss": 3.8631, "step": 8395 }, { "epoch": 1.075136, "grad_norm": 3.3375697135925293, "learning_rate": 4.436383513658778e-06, "loss": 3.7307, "step": 8400 }, { "epoch": 1.075136, "eval_loss": 0.9679771661758423, "eval_runtime": 6.7924, "eval_samples_per_second": 147.224, "eval_steps_per_second": 18.403, "step": 8400 }, { "epoch": 1.075776, "grad_norm": 2.9178521633148193, "learning_rate": 4.431372934589349e-06, "loss": 3.7509, "step": 8405 }, { "epoch": 1.076416, "grad_norm": 3.609673023223877, "learning_rate": 4.426362933971354e-06, "loss": 3.9966, "step": 8410 }, { "epoch": 1.077056, "grad_norm": 3.0642848014831543, "learning_rate": 4.421353516901358e-06, "loss": 3.8124, "step": 8415 }, { "epoch": 1.077696, "grad_norm": 3.655663013458252, "learning_rate": 4.416344688475324e-06, "loss": 3.8068, "step": 8420 }, { "epoch": 1.078336, "grad_norm": 3.383075475692749, "learning_rate": 4.411336453788622e-06, "loss": 3.96, "step": 8425 }, { "epoch": 1.078976, "grad_norm": 3.0383999347686768, "learning_rate": 4.406328817936012e-06, "loss": 3.7295, "step": 8430 }, { "epoch": 1.079616, "grad_norm": 3.2788867950439453, "learning_rate": 4.401321786011653e-06, "loss": 3.9375, "step": 8435 }, { "epoch": 1.0802559999999999, "grad_norm": 3.3316409587860107, "learning_rate": 4.396315363109084e-06, "loss": 3.9218, "step": 8440 }, { "epoch": 1.080896, "grad_norm": 3.043272018432617, "learning_rate": 4.391309554321224e-06, "loss": 3.9933, "step": 8445 }, { "epoch": 1.081536, "grad_norm": 3.2069058418273926, "learning_rate": 4.3863043647403695e-06, "loss": 3.812, "step": 8450 }, { "epoch": 1.082176, "grad_norm": 3.4925637245178223, "learning_rate": 4.381299799458186e-06, "loss": 3.8843, "step": 8455 }, { "epoch": 1.082816, "grad_norm": 3.4238312244415283, "learning_rate": 4.376295863565708e-06, "loss": 3.8669, "step": 8460 }, { "epoch": 1.083456, "grad_norm": 3.490819215774536, "learning_rate": 4.371292562153322e-06, "loss": 3.9678, "step": 8465 }, { "epoch": 1.084096, "grad_norm": 3.41055965423584, "learning_rate": 4.366289900310773e-06, "loss": 3.8562, "step": 8470 }, { "epoch": 1.084736, "grad_norm": 3.2907748222351074, "learning_rate": 4.36128788312716e-06, "loss": 3.8839, "step": 8475 }, { "epoch": 1.085376, "grad_norm": 3.337765693664551, "learning_rate": 4.356286515690919e-06, "loss": 3.9953, "step": 8480 }, { "epoch": 1.086016, "grad_norm": 3.3621182441711426, "learning_rate": 4.351285803089827e-06, "loss": 3.8961, "step": 8485 }, { "epoch": 1.086656, "grad_norm": 3.304739236831665, "learning_rate": 4.346285750410996e-06, "loss": 3.8328, "step": 8490 }, { "epoch": 1.087296, "grad_norm": 3.2477059364318848, "learning_rate": 4.341286362740867e-06, "loss": 3.9207, "step": 8495 }, { "epoch": 1.087936, "grad_norm": 3.1985952854156494, "learning_rate": 4.336287645165205e-06, "loss": 3.8237, "step": 8500 }, { "epoch": 1.087936, "eval_loss": 0.9712408185005188, "eval_runtime": 7.5304, "eval_samples_per_second": 132.795, "eval_steps_per_second": 16.599, "step": 8500 }, { "epoch": 1.088576, "grad_norm": 3.250664234161377, "learning_rate": 4.331289602769091e-06, "loss": 3.8786, "step": 8505 }, { "epoch": 1.089216, "grad_norm": 3.40374493598938, "learning_rate": 4.32629224063692e-06, "loss": 3.9622, "step": 8510 }, { "epoch": 1.089856, "grad_norm": 3.1335885524749756, "learning_rate": 4.321295563852394e-06, "loss": 3.9723, "step": 8515 }, { "epoch": 1.090496, "grad_norm": 3.25437331199646, "learning_rate": 4.316299577498522e-06, "loss": 3.9138, "step": 8520 }, { "epoch": 1.091136, "grad_norm": 3.3222084045410156, "learning_rate": 4.311304286657608e-06, "loss": 3.9344, "step": 8525 }, { "epoch": 1.091776, "grad_norm": 3.0206170082092285, "learning_rate": 4.306309696411246e-06, "loss": 3.7114, "step": 8530 }, { "epoch": 1.092416, "grad_norm": 3.313721179962158, "learning_rate": 4.301315811840319e-06, "loss": 3.831, "step": 8535 }, { "epoch": 1.093056, "grad_norm": 2.9408037662506104, "learning_rate": 4.296322638024996e-06, "loss": 3.8785, "step": 8540 }, { "epoch": 1.093696, "grad_norm": 3.5085701942443848, "learning_rate": 4.291330180044717e-06, "loss": 3.9432, "step": 8545 }, { "epoch": 1.094336, "grad_norm": 3.4924538135528564, "learning_rate": 4.286338442978196e-06, "loss": 3.9569, "step": 8550 }, { "epoch": 1.094976, "grad_norm": 3.2100634574890137, "learning_rate": 4.281347431903416e-06, "loss": 3.8973, "step": 8555 }, { "epoch": 1.095616, "grad_norm": 3.3668410778045654, "learning_rate": 4.276357151897619e-06, "loss": 3.7831, "step": 8560 }, { "epoch": 1.096256, "grad_norm": 3.127249240875244, "learning_rate": 4.271367608037304e-06, "loss": 3.919, "step": 8565 }, { "epoch": 1.096896, "grad_norm": 3.3912453651428223, "learning_rate": 4.266378805398221e-06, "loss": 3.9414, "step": 8570 }, { "epoch": 1.097536, "grad_norm": 3.356959342956543, "learning_rate": 4.261390749055363e-06, "loss": 3.7983, "step": 8575 }, { "epoch": 1.098176, "grad_norm": 4.657166957855225, "learning_rate": 4.256403444082972e-06, "loss": 3.7763, "step": 8580 }, { "epoch": 1.098816, "grad_norm": 3.484509229660034, "learning_rate": 4.251416895554517e-06, "loss": 3.9047, "step": 8585 }, { "epoch": 1.099456, "grad_norm": 3.3309333324432373, "learning_rate": 4.246431108542701e-06, "loss": 3.7863, "step": 8590 }, { "epoch": 1.100096, "grad_norm": 3.1959686279296875, "learning_rate": 4.241446088119452e-06, "loss": 3.9393, "step": 8595 }, { "epoch": 1.100736, "grad_norm": 3.439521074295044, "learning_rate": 4.236461839355921e-06, "loss": 3.8784, "step": 8600 }, { "epoch": 1.100736, "eval_loss": 0.9694692492485046, "eval_runtime": 7.2173, "eval_samples_per_second": 138.556, "eval_steps_per_second": 17.319, "step": 8600 }, { "epoch": 1.101376, "grad_norm": 3.6734774112701416, "learning_rate": 4.23147836732247e-06, "loss": 3.8169, "step": 8605 }, { "epoch": 1.1020159999999999, "grad_norm": 3.252636432647705, "learning_rate": 4.226495677088671e-06, "loss": 3.8374, "step": 8610 }, { "epoch": 1.102656, "grad_norm": 3.2438418865203857, "learning_rate": 4.221513773723301e-06, "loss": 3.8564, "step": 8615 }, { "epoch": 1.103296, "grad_norm": 3.2973501682281494, "learning_rate": 4.216532662294342e-06, "loss": 3.9575, "step": 8620 }, { "epoch": 1.103936, "grad_norm": 3.2574779987335205, "learning_rate": 4.211552347868961e-06, "loss": 3.8727, "step": 8625 }, { "epoch": 1.104576, "grad_norm": 3.1038432121276855, "learning_rate": 4.2065728355135225e-06, "loss": 3.7745, "step": 8630 }, { "epoch": 1.105216, "grad_norm": 3.3146615028381348, "learning_rate": 4.201594130293568e-06, "loss": 3.7769, "step": 8635 }, { "epoch": 1.105856, "grad_norm": 3.2965011596679688, "learning_rate": 4.196616237273826e-06, "loss": 3.8676, "step": 8640 }, { "epoch": 1.106496, "grad_norm": 3.0500552654266357, "learning_rate": 4.191639161518193e-06, "loss": 4.0099, "step": 8645 }, { "epoch": 1.107136, "grad_norm": 3.3446359634399414, "learning_rate": 4.1866629080897345e-06, "loss": 3.9098, "step": 8650 }, { "epoch": 1.107776, "grad_norm": 3.277162551879883, "learning_rate": 4.181687482050679e-06, "loss": 3.8826, "step": 8655 }, { "epoch": 1.108416, "grad_norm": 3.272045135498047, "learning_rate": 4.176712888462417e-06, "loss": 3.8741, "step": 8660 }, { "epoch": 1.109056, "grad_norm": 3.469515323638916, "learning_rate": 4.171739132385488e-06, "loss": 3.8808, "step": 8665 }, { "epoch": 1.109696, "grad_norm": 3.2473506927490234, "learning_rate": 4.166766218879586e-06, "loss": 3.8313, "step": 8670 }, { "epoch": 1.110336, "grad_norm": 3.272639274597168, "learning_rate": 4.161794153003538e-06, "loss": 3.922, "step": 8675 }, { "epoch": 1.110976, "grad_norm": 3.539156198501587, "learning_rate": 4.156822939815314e-06, "loss": 3.9184, "step": 8680 }, { "epoch": 1.111616, "grad_norm": 3.2175755500793457, "learning_rate": 4.151852584372021e-06, "loss": 3.84, "step": 8685 }, { "epoch": 1.112256, "grad_norm": 3.154583692550659, "learning_rate": 4.146883091729887e-06, "loss": 3.8663, "step": 8690 }, { "epoch": 1.112896, "grad_norm": 3.3193087577819824, "learning_rate": 4.141914466944262e-06, "loss": 3.9162, "step": 8695 }, { "epoch": 1.113536, "grad_norm": 7.432323932647705, "learning_rate": 4.136946715069617e-06, "loss": 3.8672, "step": 8700 }, { "epoch": 1.113536, "eval_loss": 0.978622317314148, "eval_runtime": 6.508, "eval_samples_per_second": 153.658, "eval_steps_per_second": 19.207, "step": 8700 }, { "epoch": 1.114176, "grad_norm": 3.340735912322998, "learning_rate": 4.1319798411595366e-06, "loss": 3.8371, "step": 8705 }, { "epoch": 1.114816, "grad_norm": 3.2261412143707275, "learning_rate": 4.127013850266706e-06, "loss": 3.8953, "step": 8710 }, { "epoch": 1.115456, "grad_norm": 3.2747981548309326, "learning_rate": 4.122048747442915e-06, "loss": 3.8204, "step": 8715 }, { "epoch": 1.116096, "grad_norm": 3.2461555004119873, "learning_rate": 4.117084537739049e-06, "loss": 3.9218, "step": 8720 }, { "epoch": 1.116736, "grad_norm": 3.4664454460144043, "learning_rate": 4.112121226205091e-06, "loss": 3.8422, "step": 8725 }, { "epoch": 1.117376, "grad_norm": 3.183678150177002, "learning_rate": 4.107158817890101e-06, "loss": 3.8495, "step": 8730 }, { "epoch": 1.118016, "grad_norm": 3.3027968406677246, "learning_rate": 4.102197317842227e-06, "loss": 3.847, "step": 8735 }, { "epoch": 1.118656, "grad_norm": 3.284727096557617, "learning_rate": 4.097236731108688e-06, "loss": 3.9299, "step": 8740 }, { "epoch": 1.119296, "grad_norm": 3.5089738368988037, "learning_rate": 4.092277062735779e-06, "loss": 3.9105, "step": 8745 }, { "epoch": 1.119936, "grad_norm": 3.26532244682312, "learning_rate": 4.0873183177688595e-06, "loss": 3.8122, "step": 8750 }, { "epoch": 1.120576, "grad_norm": 3.258549213409424, "learning_rate": 4.082360501252345e-06, "loss": 3.8606, "step": 8755 }, { "epoch": 1.121216, "grad_norm": 3.922128677368164, "learning_rate": 4.077403618229711e-06, "loss": 3.8658, "step": 8760 }, { "epoch": 1.121856, "grad_norm": 3.0818393230438232, "learning_rate": 4.072447673743484e-06, "loss": 3.9445, "step": 8765 }, { "epoch": 1.122496, "grad_norm": 3.128241777420044, "learning_rate": 4.067492672835231e-06, "loss": 3.8835, "step": 8770 }, { "epoch": 1.123136, "grad_norm": 3.4902095794677734, "learning_rate": 4.0625386205455675e-06, "loss": 3.8119, "step": 8775 }, { "epoch": 1.1237759999999999, "grad_norm": 3.1120331287384033, "learning_rate": 4.057585521914132e-06, "loss": 3.9596, "step": 8780 }, { "epoch": 1.124416, "grad_norm": 3.81187105178833, "learning_rate": 4.052633381979605e-06, "loss": 3.8186, "step": 8785 }, { "epoch": 1.125056, "grad_norm": 3.3931918144226074, "learning_rate": 4.047682205779684e-06, "loss": 3.877, "step": 8790 }, { "epoch": 1.125696, "grad_norm": 3.126008987426758, "learning_rate": 4.042731998351088e-06, "loss": 3.844, "step": 8795 }, { "epoch": 1.126336, "grad_norm": 3.5310118198394775, "learning_rate": 4.037782764729552e-06, "loss": 3.9613, "step": 8800 }, { "epoch": 1.126336, "eval_loss": 0.9762705564498901, "eval_runtime": 6.6433, "eval_samples_per_second": 150.527, "eval_steps_per_second": 18.816, "step": 8800 }, { "epoch": 1.126976, "grad_norm": 3.327422857284546, "learning_rate": 4.032834509949818e-06, "loss": 3.8829, "step": 8805 }, { "epoch": 1.127616, "grad_norm": 3.368654727935791, "learning_rate": 4.027887239045636e-06, "loss": 3.8485, "step": 8810 }, { "epoch": 1.128256, "grad_norm": 3.317504644393921, "learning_rate": 4.022940957049752e-06, "loss": 3.8833, "step": 8815 }, { "epoch": 1.1288960000000001, "grad_norm": 3.361346483230591, "learning_rate": 4.017995668993904e-06, "loss": 3.7834, "step": 8820 }, { "epoch": 1.129536, "grad_norm": 3.4562320709228516, "learning_rate": 4.013051379908822e-06, "loss": 3.9239, "step": 8825 }, { "epoch": 1.130176, "grad_norm": 3.381047487258911, "learning_rate": 4.008108094824222e-06, "loss": 3.8583, "step": 8830 }, { "epoch": 1.130816, "grad_norm": 3.192483425140381, "learning_rate": 4.0031658187687946e-06, "loss": 3.9943, "step": 8835 }, { "epoch": 1.131456, "grad_norm": 3.2975564002990723, "learning_rate": 3.998224556770205e-06, "loss": 3.8301, "step": 8840 }, { "epoch": 1.132096, "grad_norm": 3.210685968399048, "learning_rate": 3.993284313855086e-06, "loss": 3.8343, "step": 8845 }, { "epoch": 1.132736, "grad_norm": 3.294238328933716, "learning_rate": 3.988345095049039e-06, "loss": 3.9252, "step": 8850 }, { "epoch": 1.133376, "grad_norm": 3.1468687057495117, "learning_rate": 3.983406905376615e-06, "loss": 3.8875, "step": 8855 }, { "epoch": 1.134016, "grad_norm": 3.2530410289764404, "learning_rate": 3.978469749861326e-06, "loss": 3.7618, "step": 8860 }, { "epoch": 1.134656, "grad_norm": 3.5931050777435303, "learning_rate": 3.973533633525623e-06, "loss": 3.9476, "step": 8865 }, { "epoch": 1.135296, "grad_norm": 3.414227247238159, "learning_rate": 3.968598561390911e-06, "loss": 3.8739, "step": 8870 }, { "epoch": 1.135936, "grad_norm": 3.238137722015381, "learning_rate": 3.963664538477527e-06, "loss": 3.8407, "step": 8875 }, { "epoch": 1.136576, "grad_norm": 3.327712297439575, "learning_rate": 3.958731569804738e-06, "loss": 3.8728, "step": 8880 }, { "epoch": 1.137216, "grad_norm": 3.3951661586761475, "learning_rate": 3.95379966039074e-06, "loss": 3.9143, "step": 8885 }, { "epoch": 1.137856, "grad_norm": 3.6422483921051025, "learning_rate": 3.948868815252658e-06, "loss": 3.9115, "step": 8890 }, { "epoch": 1.138496, "grad_norm": 3.4670307636260986, "learning_rate": 3.9439390394065245e-06, "loss": 3.9659, "step": 8895 }, { "epoch": 1.139136, "grad_norm": 3.2578067779541016, "learning_rate": 3.93901033786729e-06, "loss": 3.9247, "step": 8900 }, { "epoch": 1.139136, "eval_loss": 0.9684814214706421, "eval_runtime": 6.8159, "eval_samples_per_second": 146.715, "eval_steps_per_second": 18.339, "step": 8900 }, { "epoch": 1.139776, "grad_norm": 3.3462729454040527, "learning_rate": 3.934082715648812e-06, "loss": 4.0389, "step": 8905 }, { "epoch": 1.140416, "grad_norm": 3.5041468143463135, "learning_rate": 3.9291561777638486e-06, "loss": 3.8389, "step": 8910 }, { "epoch": 1.141056, "grad_norm": 3.22247576713562, "learning_rate": 3.924230729224056e-06, "loss": 3.8226, "step": 8915 }, { "epoch": 1.141696, "grad_norm": 3.356802463531494, "learning_rate": 3.91930637503998e-06, "loss": 3.9254, "step": 8920 }, { "epoch": 1.142336, "grad_norm": 3.6070470809936523, "learning_rate": 3.914383120221053e-06, "loss": 3.8787, "step": 8925 }, { "epoch": 1.142976, "grad_norm": 3.1773595809936523, "learning_rate": 3.909460969775595e-06, "loss": 3.7098, "step": 8930 }, { "epoch": 1.143616, "grad_norm": 3.2447359561920166, "learning_rate": 3.904539928710796e-06, "loss": 3.8433, "step": 8935 }, { "epoch": 1.144256, "grad_norm": 3.0843183994293213, "learning_rate": 3.899620002032718e-06, "loss": 3.961, "step": 8940 }, { "epoch": 1.144896, "grad_norm": 3.4813742637634277, "learning_rate": 3.894701194746291e-06, "loss": 3.8173, "step": 8945 }, { "epoch": 1.1455359999999999, "grad_norm": 3.433196783065796, "learning_rate": 3.889783511855311e-06, "loss": 3.944, "step": 8950 }, { "epoch": 1.146176, "grad_norm": 3.2032461166381836, "learning_rate": 3.884866958362421e-06, "loss": 3.8223, "step": 8955 }, { "epoch": 1.146816, "grad_norm": 3.171764850616455, "learning_rate": 3.879951539269122e-06, "loss": 3.8118, "step": 8960 }, { "epoch": 1.147456, "grad_norm": 3.438223123550415, "learning_rate": 3.8750372595757545e-06, "loss": 3.8781, "step": 8965 }, { "epoch": 1.148096, "grad_norm": 3.287339687347412, "learning_rate": 3.870124124281509e-06, "loss": 3.8611, "step": 8970 }, { "epoch": 1.148736, "grad_norm": 3.185896158218384, "learning_rate": 3.8652121383844035e-06, "loss": 3.7097, "step": 8975 }, { "epoch": 1.149376, "grad_norm": 3.296564817428589, "learning_rate": 3.860301306881292e-06, "loss": 3.8195, "step": 8980 }, { "epoch": 1.150016, "grad_norm": 3.3196237087249756, "learning_rate": 3.85539163476785e-06, "loss": 3.8855, "step": 8985 }, { "epoch": 1.1506560000000001, "grad_norm": 3.337581157684326, "learning_rate": 3.8504831270385765e-06, "loss": 3.8602, "step": 8990 }, { "epoch": 1.151296, "grad_norm": 3.3622496128082275, "learning_rate": 3.845575788686787e-06, "loss": 3.8893, "step": 8995 }, { "epoch": 1.151936, "grad_norm": 3.600410223007202, "learning_rate": 3.840669624704605e-06, "loss": 3.9255, "step": 9000 }, { "epoch": 1.151936, "eval_loss": 0.9813051223754883, "eval_runtime": 6.8153, "eval_samples_per_second": 146.729, "eval_steps_per_second": 18.341, "step": 9000 } ], "logging_steps": 5, "max_steps": 15624, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.1416322171475067e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }