{ "best_metric": 1.2660380601882935, "best_model_checkpoint": "./bert_mlm_finetuned/checkpoint-900", "epoch": 0.1152, "eval_steps": 100, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00064, "grad_norm": 7.650606632232666, "learning_rate": 1.0000000000000002e-06, "loss": 6.29, "step": 5 }, { "epoch": 0.00128, "grad_norm": 4.541823387145996, "learning_rate": 2.0000000000000003e-06, "loss": 6.3815, "step": 10 }, { "epoch": 0.00192, "grad_norm": 4.245054721832275, "learning_rate": 3e-06, "loss": 6.2854, "step": 15 }, { "epoch": 0.00256, "grad_norm": 4.5587897300720215, "learning_rate": 4.000000000000001e-06, "loss": 6.0674, "step": 20 }, { "epoch": 0.0032, "grad_norm": 3.7703804969787598, "learning_rate": 4.800000000000001e-06, "loss": 6.2961, "step": 25 }, { "epoch": 0.00384, "grad_norm": 3.8425862789154053, "learning_rate": 5.8e-06, "loss": 6.3326, "step": 30 }, { "epoch": 0.00448, "grad_norm": 4.413463115692139, "learning_rate": 6.800000000000001e-06, "loss": 6.183, "step": 35 }, { "epoch": 0.00512, "grad_norm": 4.1980509757995605, "learning_rate": 7.800000000000002e-06, "loss": 6.2654, "step": 40 }, { "epoch": 0.00576, "grad_norm": 3.9166719913482666, "learning_rate": 8.8e-06, "loss": 6.0916, "step": 45 }, { "epoch": 0.0064, "grad_norm": 3.4706904888153076, "learning_rate": 9.800000000000001e-06, "loss": 6.103, "step": 50 }, { "epoch": 0.00704, "grad_norm": 5.138203144073486, "learning_rate": 9.999998372356185e-06, "loss": 6.2379, "step": 55 }, { "epoch": 0.00768, "grad_norm": 3.7806520462036133, "learning_rate": 9.999991760055e-06, "loss": 6.1776, "step": 60 }, { "epoch": 0.00832, "grad_norm": 3.5731871128082275, "learning_rate": 9.999980061375427e-06, "loss": 6.2082, "step": 65 }, { "epoch": 0.00896, "grad_norm": 3.661797285079956, "learning_rate": 9.999963276329369e-06, "loss": 6.0704, "step": 70 }, { "epoch": 0.0096, "grad_norm": 3.6181113719940186, "learning_rate": 9.999941404933902e-06, "loss": 6.2081, "step": 75 }, { "epoch": 0.01024, "grad_norm": 3.3162803649902344, "learning_rate": 9.99991444721127e-06, "loss": 5.8807, "step": 80 }, { "epoch": 0.01088, "grad_norm": 3.6022472381591797, "learning_rate": 9.999882403188902e-06, "loss": 6.1092, "step": 85 }, { "epoch": 0.01152, "grad_norm": 7.291418552398682, "learning_rate": 9.999845272899393e-06, "loss": 5.7668, "step": 90 }, { "epoch": 0.01216, "grad_norm": 3.522437810897827, "learning_rate": 9.999803056380517e-06, "loss": 6.1621, "step": 95 }, { "epoch": 0.0128, "grad_norm": 3.9014439582824707, "learning_rate": 9.999755753675216e-06, "loss": 6.0573, "step": 100 }, { "epoch": 0.0128, "eval_loss": 1.5072969198226929, "eval_runtime": 11.1161, "eval_samples_per_second": 89.96, "eval_steps_per_second": 11.245, "step": 100 }, { "epoch": 0.01344, "grad_norm": 3.7579081058502197, "learning_rate": 9.999703364831614e-06, "loss": 6.1671, "step": 105 }, { "epoch": 0.01408, "grad_norm": 3.7058262825012207, "learning_rate": 9.999645889903002e-06, "loss": 6.1348, "step": 110 }, { "epoch": 0.01472, "grad_norm": 5.018667697906494, "learning_rate": 9.99958332894785e-06, "loss": 5.9376, "step": 115 }, { "epoch": 0.01536, "grad_norm": 3.5420188903808594, "learning_rate": 9.999515682029798e-06, "loss": 5.9961, "step": 120 }, { "epoch": 0.016, "grad_norm": 3.5725393295288086, "learning_rate": 9.999442949217663e-06, "loss": 5.8439, "step": 125 }, { "epoch": 0.01664, "grad_norm": 3.8440959453582764, "learning_rate": 9.999365130585435e-06, "loss": 5.7857, "step": 130 }, { "epoch": 0.01728, "grad_norm": 3.4371285438537598, "learning_rate": 9.999282226212276e-06, "loss": 5.799, "step": 135 }, { "epoch": 0.01792, "grad_norm": 3.996847152709961, "learning_rate": 9.999194236182523e-06, "loss": 6.0022, "step": 140 }, { "epoch": 0.01856, "grad_norm": 3.720330238342285, "learning_rate": 9.999101160585687e-06, "loss": 5.925, "step": 145 }, { "epoch": 0.0192, "grad_norm": 3.8822953701019287, "learning_rate": 9.99900299951645e-06, "loss": 5.8085, "step": 150 }, { "epoch": 0.01984, "grad_norm": 3.599283456802368, "learning_rate": 9.99889975307467e-06, "loss": 5.6533, "step": 155 }, { "epoch": 0.02048, "grad_norm": 3.4847381114959717, "learning_rate": 9.998791421365376e-06, "loss": 5.9021, "step": 160 }, { "epoch": 0.02112, "grad_norm": 3.4302055835723877, "learning_rate": 9.998678004498774e-06, "loss": 5.962, "step": 165 }, { "epoch": 0.02176, "grad_norm": 4.561929702758789, "learning_rate": 9.99855950259024e-06, "loss": 5.9011, "step": 170 }, { "epoch": 0.0224, "grad_norm": 4.069271087646484, "learning_rate": 9.998435915760323e-06, "loss": 5.6782, "step": 175 }, { "epoch": 0.02304, "grad_norm": 3.5959055423736572, "learning_rate": 9.998307244134741e-06, "loss": 5.8107, "step": 180 }, { "epoch": 0.02368, "grad_norm": 3.5477242469787598, "learning_rate": 9.998173487844396e-06, "loss": 5.8335, "step": 185 }, { "epoch": 0.02432, "grad_norm": 4.488218307495117, "learning_rate": 9.998034647025349e-06, "loss": 5.8285, "step": 190 }, { "epoch": 0.02496, "grad_norm": 3.555074691772461, "learning_rate": 9.997890721818844e-06, "loss": 5.817, "step": 195 }, { "epoch": 0.0256, "grad_norm": 3.6248419284820557, "learning_rate": 9.99774171237129e-06, "loss": 5.8368, "step": 200 }, { "epoch": 0.0256, "eval_loss": 1.440572738647461, "eval_runtime": 6.6468, "eval_samples_per_second": 150.448, "eval_steps_per_second": 18.806, "step": 200 }, { "epoch": 0.02624, "grad_norm": 3.432421922683716, "learning_rate": 9.997587618834272e-06, "loss": 5.7842, "step": 205 }, { "epoch": 0.02688, "grad_norm": 3.333038806915283, "learning_rate": 9.997428441364546e-06, "loss": 5.7173, "step": 210 }, { "epoch": 0.02752, "grad_norm": 3.7716541290283203, "learning_rate": 9.997264180124038e-06, "loss": 5.719, "step": 215 }, { "epoch": 0.02816, "grad_norm": 3.345600128173828, "learning_rate": 9.99709483527985e-06, "loss": 5.8428, "step": 220 }, { "epoch": 0.0288, "grad_norm": 3.7677502632141113, "learning_rate": 9.99692040700425e-06, "loss": 5.7393, "step": 225 }, { "epoch": 0.02944, "grad_norm": 11.996383666992188, "learning_rate": 9.996740895474682e-06, "loss": 5.5566, "step": 230 }, { "epoch": 0.03008, "grad_norm": 3.6089084148406982, "learning_rate": 9.996556300873758e-06, "loss": 5.6939, "step": 235 }, { "epoch": 0.03072, "grad_norm": 3.834825038909912, "learning_rate": 9.996366623389263e-06, "loss": 5.8123, "step": 240 }, { "epoch": 0.03136, "grad_norm": 3.570263147354126, "learning_rate": 9.99617186321415e-06, "loss": 5.6839, "step": 245 }, { "epoch": 0.032, "grad_norm": 3.5728812217712402, "learning_rate": 9.995972020546545e-06, "loss": 5.7764, "step": 250 }, { "epoch": 0.03264, "grad_norm": 3.4725637435913086, "learning_rate": 9.995767095589743e-06, "loss": 5.6879, "step": 255 }, { "epoch": 0.03328, "grad_norm": 3.811537742614746, "learning_rate": 9.99555708855221e-06, "loss": 5.6418, "step": 260 }, { "epoch": 0.03392, "grad_norm": 3.494992971420288, "learning_rate": 9.99534199964758e-06, "loss": 5.6927, "step": 265 }, { "epoch": 0.03456, "grad_norm": 3.8107383251190186, "learning_rate": 9.995121829094662e-06, "loss": 5.5658, "step": 270 }, { "epoch": 0.0352, "grad_norm": 3.570551633834839, "learning_rate": 9.994896577117425e-06, "loss": 5.8131, "step": 275 }, { "epoch": 0.03584, "grad_norm": 3.540811538696289, "learning_rate": 9.994666243945018e-06, "loss": 5.6009, "step": 280 }, { "epoch": 0.03648, "grad_norm": 3.7275819778442383, "learning_rate": 9.99443082981175e-06, "loss": 5.6407, "step": 285 }, { "epoch": 0.03712, "grad_norm": 4.194495677947998, "learning_rate": 9.994190334957103e-06, "loss": 5.8319, "step": 290 }, { "epoch": 0.03776, "grad_norm": 3.5107626914978027, "learning_rate": 9.993944759625728e-06, "loss": 5.5765, "step": 295 }, { "epoch": 0.0384, "grad_norm": 3.4100208282470703, "learning_rate": 9.993694104067444e-06, "loss": 5.7473, "step": 300 }, { "epoch": 0.0384, "eval_loss": 1.407908320426941, "eval_runtime": 6.6542, "eval_samples_per_second": 150.281, "eval_steps_per_second": 18.785, "step": 300 }, { "epoch": 0.03904, "grad_norm": 3.7727818489074707, "learning_rate": 9.993438368537236e-06, "loss": 5.6802, "step": 305 }, { "epoch": 0.03968, "grad_norm": 3.445909023284912, "learning_rate": 9.993177553295258e-06, "loss": 5.7484, "step": 310 }, { "epoch": 0.04032, "grad_norm": 3.4199888706207275, "learning_rate": 9.992911658606832e-06, "loss": 5.7648, "step": 315 }, { "epoch": 0.04096, "grad_norm": 4.9640655517578125, "learning_rate": 9.992640684742445e-06, "loss": 5.7922, "step": 320 }, { "epoch": 0.0416, "grad_norm": 3.3730976581573486, "learning_rate": 9.992364631977754e-06, "loss": 5.677, "step": 325 }, { "epoch": 0.04224, "grad_norm": 3.540597915649414, "learning_rate": 9.99208350059358e-06, "loss": 5.5495, "step": 330 }, { "epoch": 0.04288, "grad_norm": 3.6853768825531006, "learning_rate": 9.991797290875915e-06, "loss": 5.4089, "step": 335 }, { "epoch": 0.04352, "grad_norm": 3.6380045413970947, "learning_rate": 9.991506003115911e-06, "loss": 5.4849, "step": 340 }, { "epoch": 0.04416, "grad_norm": 3.265488862991333, "learning_rate": 9.991209637609887e-06, "loss": 5.523, "step": 345 }, { "epoch": 0.0448, "grad_norm": 3.2634189128875732, "learning_rate": 9.990908194659332e-06, "loss": 5.5664, "step": 350 }, { "epoch": 0.04544, "grad_norm": 3.569810152053833, "learning_rate": 9.990601674570895e-06, "loss": 5.5059, "step": 355 }, { "epoch": 0.04608, "grad_norm": 3.580211877822876, "learning_rate": 9.990290077656393e-06, "loss": 5.4079, "step": 360 }, { "epoch": 0.04672, "grad_norm": 3.4860317707061768, "learning_rate": 9.989973404232805e-06, "loss": 5.6858, "step": 365 }, { "epoch": 0.04736, "grad_norm": 4.026730060577393, "learning_rate": 9.989651654622277e-06, "loss": 5.5662, "step": 370 }, { "epoch": 0.048, "grad_norm": 3.364692449569702, "learning_rate": 9.989324829152119e-06, "loss": 5.5304, "step": 375 }, { "epoch": 0.04864, "grad_norm": 3.611964464187622, "learning_rate": 9.9889929281548e-06, "loss": 5.3911, "step": 380 }, { "epoch": 0.04928, "grad_norm": 3.2946035861968994, "learning_rate": 9.988655951967958e-06, "loss": 5.4102, "step": 385 }, { "epoch": 0.04992, "grad_norm": 3.963909864425659, "learning_rate": 9.98831390093439e-06, "loss": 5.549, "step": 390 }, { "epoch": 0.05056, "grad_norm": 3.2876341342926025, "learning_rate": 9.987966775402056e-06, "loss": 5.5388, "step": 395 }, { "epoch": 0.0512, "grad_norm": 3.8467471599578857, "learning_rate": 9.98761457572408e-06, "loss": 5.454, "step": 400 }, { "epoch": 0.0512, "eval_loss": 1.3826359510421753, "eval_runtime": 7.0199, "eval_samples_per_second": 142.452, "eval_steps_per_second": 17.807, "step": 400 }, { "epoch": 0.05184, "grad_norm": 3.675231695175171, "learning_rate": 9.987257302258748e-06, "loss": 5.674, "step": 405 }, { "epoch": 0.05248, "grad_norm": 3.787940263748169, "learning_rate": 9.986894955369504e-06, "loss": 5.5466, "step": 410 }, { "epoch": 0.05312, "grad_norm": 3.677966833114624, "learning_rate": 9.986527535424956e-06, "loss": 5.4762, "step": 415 }, { "epoch": 0.05376, "grad_norm": 3.5083606243133545, "learning_rate": 9.986155042798874e-06, "loss": 5.3145, "step": 420 }, { "epoch": 0.0544, "grad_norm": 3.536379098892212, "learning_rate": 9.98577747787018e-06, "loss": 5.3769, "step": 425 }, { "epoch": 0.05504, "grad_norm": 3.5448412895202637, "learning_rate": 9.98539484102297e-06, "loss": 5.3996, "step": 430 }, { "epoch": 0.05568, "grad_norm": 3.359647274017334, "learning_rate": 9.985007132646489e-06, "loss": 5.3114, "step": 435 }, { "epoch": 0.05632, "grad_norm": 3.3419110774993896, "learning_rate": 9.984614353135143e-06, "loss": 5.4383, "step": 440 }, { "epoch": 0.05696, "grad_norm": 3.558025360107422, "learning_rate": 9.984216502888496e-06, "loss": 5.5239, "step": 445 }, { "epoch": 0.0576, "grad_norm": 3.6349422931671143, "learning_rate": 9.983813582311277e-06, "loss": 5.5639, "step": 450 }, { "epoch": 0.05824, "grad_norm": 3.2916922569274902, "learning_rate": 9.983405591813362e-06, "loss": 5.3886, "step": 455 }, { "epoch": 0.05888, "grad_norm": 3.32891845703125, "learning_rate": 9.982992531809796e-06, "loss": 5.526, "step": 460 }, { "epoch": 0.05952, "grad_norm": 3.8752880096435547, "learning_rate": 9.982574402720773e-06, "loss": 5.6599, "step": 465 }, { "epoch": 0.06016, "grad_norm": 3.604433536529541, "learning_rate": 9.982151204971646e-06, "loss": 5.4567, "step": 470 }, { "epoch": 0.0608, "grad_norm": 3.3058159351348877, "learning_rate": 9.981722938992926e-06, "loss": 5.4981, "step": 475 }, { "epoch": 0.06144, "grad_norm": 3.7341926097869873, "learning_rate": 9.981289605220276e-06, "loss": 5.3278, "step": 480 }, { "epoch": 0.06208, "grad_norm": 3.51798415184021, "learning_rate": 9.980851204094519e-06, "loss": 5.5029, "step": 485 }, { "epoch": 0.06272, "grad_norm": 3.6541428565979004, "learning_rate": 9.980407736061629e-06, "loss": 5.3987, "step": 490 }, { "epoch": 0.06336, "grad_norm": 3.420767307281494, "learning_rate": 9.979959201572736e-06, "loss": 5.405, "step": 495 }, { "epoch": 0.064, "grad_norm": 3.7169559001922607, "learning_rate": 9.979505601084124e-06, "loss": 5.498, "step": 500 }, { "epoch": 0.064, "eval_loss": 1.3493109941482544, "eval_runtime": 7.1309, "eval_samples_per_second": 140.234, "eval_steps_per_second": 17.529, "step": 500 }, { "epoch": 0.06464, "grad_norm": 4.536627769470215, "learning_rate": 9.97904693505723e-06, "loss": 5.5237, "step": 505 }, { "epoch": 0.06528, "grad_norm": 3.204948902130127, "learning_rate": 9.978583203958649e-06, "loss": 5.3746, "step": 510 }, { "epoch": 0.06592, "grad_norm": 3.4658005237579346, "learning_rate": 9.978114408260118e-06, "loss": 5.4567, "step": 515 }, { "epoch": 0.06656, "grad_norm": 4.932333469390869, "learning_rate": 9.977640548438534e-06, "loss": 5.1959, "step": 520 }, { "epoch": 0.0672, "grad_norm": 3.4697563648223877, "learning_rate": 9.977161624975948e-06, "loss": 5.4013, "step": 525 }, { "epoch": 0.06784, "grad_norm": 3.441819667816162, "learning_rate": 9.976677638359553e-06, "loss": 5.4899, "step": 530 }, { "epoch": 0.06848, "grad_norm": 3.4293930530548096, "learning_rate": 9.9761885890817e-06, "loss": 5.3569, "step": 535 }, { "epoch": 0.06912, "grad_norm": 3.5388574600219727, "learning_rate": 9.975694477639885e-06, "loss": 5.2739, "step": 540 }, { "epoch": 0.06976, "grad_norm": 3.735548973083496, "learning_rate": 9.97519530453676e-06, "loss": 5.4253, "step": 545 }, { "epoch": 0.0704, "grad_norm": 3.33503794670105, "learning_rate": 9.974691070280121e-06, "loss": 5.1569, "step": 550 }, { "epoch": 0.07104, "grad_norm": 3.5171401500701904, "learning_rate": 9.974181775382915e-06, "loss": 5.3242, "step": 555 }, { "epoch": 0.07168, "grad_norm": 3.565356969833374, "learning_rate": 9.973667420363233e-06, "loss": 5.3893, "step": 560 }, { "epoch": 0.07232, "grad_norm": 3.172163248062134, "learning_rate": 9.973148005744319e-06, "loss": 5.3824, "step": 565 }, { "epoch": 0.07296, "grad_norm": 3.517838716506958, "learning_rate": 9.972623532054564e-06, "loss": 5.2673, "step": 570 }, { "epoch": 0.0736, "grad_norm": 3.328416585922241, "learning_rate": 9.9720939998275e-06, "loss": 5.2649, "step": 575 }, { "epoch": 0.07424, "grad_norm": 3.475539445877075, "learning_rate": 9.971559409601807e-06, "loss": 5.3318, "step": 580 }, { "epoch": 0.07488, "grad_norm": 3.492013692855835, "learning_rate": 9.971019761921317e-06, "loss": 5.2735, "step": 585 }, { "epoch": 0.07552, "grad_norm": 3.474803924560547, "learning_rate": 9.970475057334997e-06, "loss": 5.3722, "step": 590 }, { "epoch": 0.07616, "grad_norm": 3.4162726402282715, "learning_rate": 9.96992529639696e-06, "loss": 5.3901, "step": 595 }, { "epoch": 0.0768, "grad_norm": 3.3643155097961426, "learning_rate": 9.969370479666473e-06, "loss": 5.2384, "step": 600 }, { "epoch": 0.0768, "eval_loss": 1.3373793363571167, "eval_runtime": 6.5847, "eval_samples_per_second": 151.867, "eval_steps_per_second": 18.983, "step": 600 }, { "epoch": 0.07744, "grad_norm": 3.44301176071167, "learning_rate": 9.968810607707933e-06, "loss": 5.2322, "step": 605 }, { "epoch": 0.07808, "grad_norm": 3.422262668609619, "learning_rate": 9.968245681090887e-06, "loss": 5.1708, "step": 610 }, { "epoch": 0.07872, "grad_norm": 3.2879252433776855, "learning_rate": 9.96767570039002e-06, "loss": 5.2291, "step": 615 }, { "epoch": 0.07936, "grad_norm": 3.6026480197906494, "learning_rate": 9.967100666185163e-06, "loss": 5.4241, "step": 620 }, { "epoch": 0.08, "grad_norm": 3.3642101287841797, "learning_rate": 9.966520579061286e-06, "loss": 5.4473, "step": 625 }, { "epoch": 0.08064, "grad_norm": 3.5968470573425293, "learning_rate": 9.965935439608493e-06, "loss": 5.3982, "step": 630 }, { "epoch": 0.08128, "grad_norm": 3.352083206176758, "learning_rate": 9.96534524842204e-06, "loss": 5.3953, "step": 635 }, { "epoch": 0.08192, "grad_norm": 3.3571720123291016, "learning_rate": 9.964750006102311e-06, "loss": 5.3159, "step": 640 }, { "epoch": 0.08256, "grad_norm": 3.486246109008789, "learning_rate": 9.964149713254833e-06, "loss": 5.211, "step": 645 }, { "epoch": 0.0832, "grad_norm": 3.674906015396118, "learning_rate": 9.96354437049027e-06, "loss": 5.3374, "step": 650 }, { "epoch": 0.08384, "grad_norm": 3.590810537338257, "learning_rate": 9.962933978424426e-06, "loss": 5.2194, "step": 655 }, { "epoch": 0.08448, "grad_norm": 3.551786184310913, "learning_rate": 9.962318537678238e-06, "loss": 5.1187, "step": 660 }, { "epoch": 0.08512, "grad_norm": 3.5391581058502197, "learning_rate": 9.961698048877776e-06, "loss": 5.2001, "step": 665 }, { "epoch": 0.08576, "grad_norm": 3.6105592250823975, "learning_rate": 9.961072512654255e-06, "loss": 5.2758, "step": 670 }, { "epoch": 0.0864, "grad_norm": 3.7463858127593994, "learning_rate": 9.960441929644017e-06, "loss": 5.2137, "step": 675 }, { "epoch": 0.08704, "grad_norm": 3.9237470626831055, "learning_rate": 9.959806300488538e-06, "loss": 5.2047, "step": 680 }, { "epoch": 0.08768, "grad_norm": 3.392827272415161, "learning_rate": 9.95916562583443e-06, "loss": 5.3071, "step": 685 }, { "epoch": 0.08832, "grad_norm": 3.221484661102295, "learning_rate": 9.958519906333438e-06, "loss": 5.183, "step": 690 }, { "epoch": 0.08896, "grad_norm": 3.5143983364105225, "learning_rate": 9.957869142642437e-06, "loss": 5.3171, "step": 695 }, { "epoch": 0.0896, "grad_norm": 3.497072696685791, "learning_rate": 9.957213335423433e-06, "loss": 5.1784, "step": 700 }, { "epoch": 0.0896, "eval_loss": 1.2988511323928833, "eval_runtime": 6.9763, "eval_samples_per_second": 143.342, "eval_steps_per_second": 17.918, "step": 700 }, { "epoch": 0.09024, "grad_norm": 3.3822438716888428, "learning_rate": 9.956552485343566e-06, "loss": 5.1732, "step": 705 }, { "epoch": 0.09088, "grad_norm": 3.3949694633483887, "learning_rate": 9.955886593075101e-06, "loss": 5.2725, "step": 710 }, { "epoch": 0.09152, "grad_norm": 3.2577288150787354, "learning_rate": 9.955215659295438e-06, "loss": 5.2207, "step": 715 }, { "epoch": 0.09216, "grad_norm": 3.769519567489624, "learning_rate": 9.954539684687103e-06, "loss": 5.2152, "step": 720 }, { "epoch": 0.0928, "grad_norm": 3.3824892044067383, "learning_rate": 9.953858669937746e-06, "loss": 5.2085, "step": 725 }, { "epoch": 0.09344, "grad_norm": 3.771742105484009, "learning_rate": 9.953172615740152e-06, "loss": 5.1575, "step": 730 }, { "epoch": 0.09408, "grad_norm": 3.7706689834594727, "learning_rate": 9.952481522792226e-06, "loss": 4.9608, "step": 735 }, { "epoch": 0.09472, "grad_norm": 3.8110334873199463, "learning_rate": 9.951785391797001e-06, "loss": 5.21, "step": 740 }, { "epoch": 0.09536, "grad_norm": 3.3012993335723877, "learning_rate": 9.951084223462636e-06, "loss": 5.2475, "step": 745 }, { "epoch": 0.096, "grad_norm": 3.6353518962860107, "learning_rate": 9.950378018502415e-06, "loss": 5.0985, "step": 750 }, { "epoch": 0.09664, "grad_norm": 3.369378089904785, "learning_rate": 9.949666777634743e-06, "loss": 5.1986, "step": 755 }, { "epoch": 0.09728, "grad_norm": 3.2247676849365234, "learning_rate": 9.948950501583147e-06, "loss": 5.3192, "step": 760 }, { "epoch": 0.09792, "grad_norm": 3.6966888904571533, "learning_rate": 9.948229191076284e-06, "loss": 5.1654, "step": 765 }, { "epoch": 0.09856, "grad_norm": 3.5823962688446045, "learning_rate": 9.947502846847921e-06, "loss": 5.1351, "step": 770 }, { "epoch": 0.0992, "grad_norm": 3.5258729457855225, "learning_rate": 9.946771469636955e-06, "loss": 5.1745, "step": 775 }, { "epoch": 0.09984, "grad_norm": 3.42067813873291, "learning_rate": 9.946035060187398e-06, "loss": 5.1569, "step": 780 }, { "epoch": 0.10048, "grad_norm": 3.9832825660705566, "learning_rate": 9.945293619248383e-06, "loss": 4.9796, "step": 785 }, { "epoch": 0.10112, "grad_norm": 3.742013692855835, "learning_rate": 9.944547147574162e-06, "loss": 5.1625, "step": 790 }, { "epoch": 0.10176, "grad_norm": 3.3150367736816406, "learning_rate": 9.943795645924104e-06, "loss": 5.099, "step": 795 }, { "epoch": 0.1024, "grad_norm": 3.359069585800171, "learning_rate": 9.943039115062691e-06, "loss": 5.1877, "step": 800 }, { "epoch": 0.1024, "eval_loss": 1.2946017980575562, "eval_runtime": 7.4306, "eval_samples_per_second": 134.579, "eval_steps_per_second": 16.822, "step": 800 }, { "epoch": 0.10304, "grad_norm": 3.703000545501709, "learning_rate": 9.94227755575953e-06, "loss": 5.1581, "step": 805 }, { "epoch": 0.10368, "grad_norm": 3.5370070934295654, "learning_rate": 9.941510968789334e-06, "loss": 5.2402, "step": 810 }, { "epoch": 0.10432, "grad_norm": 3.5010828971862793, "learning_rate": 9.940739354931936e-06, "loss": 5.1828, "step": 815 }, { "epoch": 0.10496, "grad_norm": 3.4637820720672607, "learning_rate": 9.93996271497228e-06, "loss": 5.1792, "step": 820 }, { "epoch": 0.1056, "grad_norm": 3.409712076187134, "learning_rate": 9.939181049700427e-06, "loss": 5.0721, "step": 825 }, { "epoch": 0.10624, "grad_norm": 3.589414596557617, "learning_rate": 9.938394359911545e-06, "loss": 5.234, "step": 830 }, { "epoch": 0.10688, "grad_norm": 3.444977045059204, "learning_rate": 9.937602646405918e-06, "loss": 4.9763, "step": 835 }, { "epoch": 0.10752, "grad_norm": 3.3560900688171387, "learning_rate": 9.936805909988935e-06, "loss": 5.2006, "step": 840 }, { "epoch": 0.10816, "grad_norm": 3.345703601837158, "learning_rate": 9.9360041514711e-06, "loss": 5.0287, "step": 845 }, { "epoch": 0.1088, "grad_norm": 3.492363691329956, "learning_rate": 9.935197371668024e-06, "loss": 5.0908, "step": 850 }, { "epoch": 0.10944, "grad_norm": 7.459951400756836, "learning_rate": 9.934385571400425e-06, "loss": 5.1735, "step": 855 }, { "epoch": 0.11008, "grad_norm": 3.5033841133117676, "learning_rate": 9.933568751494131e-06, "loss": 5.053, "step": 860 }, { "epoch": 0.11072, "grad_norm": 3.5542259216308594, "learning_rate": 9.93274691278007e-06, "loss": 5.1463, "step": 865 }, { "epoch": 0.11136, "grad_norm": 3.3819243907928467, "learning_rate": 9.931920056094285e-06, "loss": 5.0397, "step": 870 }, { "epoch": 0.112, "grad_norm": 3.406768798828125, "learning_rate": 9.931088182277915e-06, "loss": 5.179, "step": 875 }, { "epoch": 0.11264, "grad_norm": 5.960773944854736, "learning_rate": 9.930251292177206e-06, "loss": 5.217, "step": 880 }, { "epoch": 0.11328, "grad_norm": 3.5821049213409424, "learning_rate": 9.929409386643511e-06, "loss": 5.0374, "step": 885 }, { "epoch": 0.11392, "grad_norm": 3.3204903602600098, "learning_rate": 9.928562466533279e-06, "loss": 5.1856, "step": 890 }, { "epoch": 0.11456, "grad_norm": 4.022350788116455, "learning_rate": 9.927710532708064e-06, "loss": 5.1051, "step": 895 }, { "epoch": 0.1152, "grad_norm": 3.3810718059539795, "learning_rate": 9.926853586034515e-06, "loss": 5.1691, "step": 900 }, { "epoch": 0.1152, "eval_loss": 1.2660380601882935, "eval_runtime": 6.8853, "eval_samples_per_second": 145.238, "eval_steps_per_second": 18.155, "step": 900 } ], "logging_steps": 5, "max_steps": 15624, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.141806761967616e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }