{ |
|
"best_metric": 1.1752163171768188, |
|
"best_model_checkpoint": "./bert_mlm_finetuned/checkpoint-1600", |
|
"epoch": 0.2048, |
|
"eval_steps": 100, |
|
"global_step": 1600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00064, |
|
"grad_norm": 7.650606632232666, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 6.29, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00128, |
|
"grad_norm": 4.541823387145996, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 6.3815, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00192, |
|
"grad_norm": 4.245054721832275, |
|
"learning_rate": 3e-06, |
|
"loss": 6.2854, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00256, |
|
"grad_norm": 4.5587897300720215, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 6.0674, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 3.7703804969787598, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 6.2961, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.00384, |
|
"grad_norm": 3.8425862789154053, |
|
"learning_rate": 5.8e-06, |
|
"loss": 6.3326, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00448, |
|
"grad_norm": 4.413463115692139, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 6.183, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.00512, |
|
"grad_norm": 4.1980509757995605, |
|
"learning_rate": 7.800000000000002e-06, |
|
"loss": 6.2654, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00576, |
|
"grad_norm": 3.9166719913482666, |
|
"learning_rate": 8.8e-06, |
|
"loss": 6.0916, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 3.4706904888153076, |
|
"learning_rate": 9.800000000000001e-06, |
|
"loss": 6.103, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00704, |
|
"grad_norm": 5.138203144073486, |
|
"learning_rate": 9.999998372356185e-06, |
|
"loss": 6.2379, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.00768, |
|
"grad_norm": 3.7806520462036133, |
|
"learning_rate": 9.999991760055e-06, |
|
"loss": 6.1776, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00832, |
|
"grad_norm": 3.5731871128082275, |
|
"learning_rate": 9.999980061375427e-06, |
|
"loss": 6.2082, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.00896, |
|
"grad_norm": 3.661797285079956, |
|
"learning_rate": 9.999963276329369e-06, |
|
"loss": 6.0704, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 3.6181113719940186, |
|
"learning_rate": 9.999941404933902e-06, |
|
"loss": 6.2081, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.01024, |
|
"grad_norm": 3.3162803649902344, |
|
"learning_rate": 9.99991444721127e-06, |
|
"loss": 5.8807, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01088, |
|
"grad_norm": 3.6022472381591797, |
|
"learning_rate": 9.999882403188902e-06, |
|
"loss": 6.1092, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.01152, |
|
"grad_norm": 7.291418552398682, |
|
"learning_rate": 9.999845272899393e-06, |
|
"loss": 5.7668, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01216, |
|
"grad_norm": 3.522437810897827, |
|
"learning_rate": 9.999803056380517e-06, |
|
"loss": 6.1621, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 3.9014439582824707, |
|
"learning_rate": 9.999755753675216e-06, |
|
"loss": 6.0573, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"eval_loss": 1.5072969198226929, |
|
"eval_runtime": 11.1161, |
|
"eval_samples_per_second": 89.96, |
|
"eval_steps_per_second": 11.245, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01344, |
|
"grad_norm": 3.7579081058502197, |
|
"learning_rate": 9.999703364831614e-06, |
|
"loss": 6.1671, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.01408, |
|
"grad_norm": 3.7058262825012207, |
|
"learning_rate": 9.999645889903002e-06, |
|
"loss": 6.1348, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01472, |
|
"grad_norm": 5.018667697906494, |
|
"learning_rate": 9.99958332894785e-06, |
|
"loss": 5.9376, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.01536, |
|
"grad_norm": 3.5420188903808594, |
|
"learning_rate": 9.999515682029798e-06, |
|
"loss": 5.9961, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 3.5725393295288086, |
|
"learning_rate": 9.999442949217663e-06, |
|
"loss": 5.8439, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.01664, |
|
"grad_norm": 3.8440959453582764, |
|
"learning_rate": 9.999365130585435e-06, |
|
"loss": 5.7857, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.01728, |
|
"grad_norm": 3.4371285438537598, |
|
"learning_rate": 9.999282226212276e-06, |
|
"loss": 5.799, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.01792, |
|
"grad_norm": 3.996847152709961, |
|
"learning_rate": 9.999194236182523e-06, |
|
"loss": 6.0022, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.01856, |
|
"grad_norm": 3.720330238342285, |
|
"learning_rate": 9.999101160585687e-06, |
|
"loss": 5.925, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 3.8822953701019287, |
|
"learning_rate": 9.99900299951645e-06, |
|
"loss": 5.8085, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01984, |
|
"grad_norm": 3.599283456802368, |
|
"learning_rate": 9.99889975307467e-06, |
|
"loss": 5.6533, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.02048, |
|
"grad_norm": 3.4847381114959717, |
|
"learning_rate": 9.998791421365376e-06, |
|
"loss": 5.9021, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.02112, |
|
"grad_norm": 3.4302055835723877, |
|
"learning_rate": 9.998678004498774e-06, |
|
"loss": 5.962, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.02176, |
|
"grad_norm": 4.561929702758789, |
|
"learning_rate": 9.99855950259024e-06, |
|
"loss": 5.9011, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0224, |
|
"grad_norm": 4.069271087646484, |
|
"learning_rate": 9.998435915760323e-06, |
|
"loss": 5.6782, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.02304, |
|
"grad_norm": 3.5959055423736572, |
|
"learning_rate": 9.998307244134741e-06, |
|
"loss": 5.8107, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.02368, |
|
"grad_norm": 3.5477242469787598, |
|
"learning_rate": 9.998173487844396e-06, |
|
"loss": 5.8335, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.02432, |
|
"grad_norm": 4.488218307495117, |
|
"learning_rate": 9.998034647025349e-06, |
|
"loss": 5.8285, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.02496, |
|
"grad_norm": 3.555074691772461, |
|
"learning_rate": 9.997890721818844e-06, |
|
"loss": 5.817, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 3.6248419284820557, |
|
"learning_rate": 9.99774171237129e-06, |
|
"loss": 5.8368, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"eval_loss": 1.440572738647461, |
|
"eval_runtime": 6.6468, |
|
"eval_samples_per_second": 150.448, |
|
"eval_steps_per_second": 18.806, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02624, |
|
"grad_norm": 3.432421922683716, |
|
"learning_rate": 9.997587618834272e-06, |
|
"loss": 5.7842, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.02688, |
|
"grad_norm": 3.333038806915283, |
|
"learning_rate": 9.997428441364546e-06, |
|
"loss": 5.7173, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.02752, |
|
"grad_norm": 3.7716541290283203, |
|
"learning_rate": 9.997264180124038e-06, |
|
"loss": 5.719, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.02816, |
|
"grad_norm": 3.345600128173828, |
|
"learning_rate": 9.99709483527985e-06, |
|
"loss": 5.8428, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0288, |
|
"grad_norm": 3.7677502632141113, |
|
"learning_rate": 9.99692040700425e-06, |
|
"loss": 5.7393, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.02944, |
|
"grad_norm": 11.996383666992188, |
|
"learning_rate": 9.996740895474682e-06, |
|
"loss": 5.5566, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03008, |
|
"grad_norm": 3.6089084148406982, |
|
"learning_rate": 9.996556300873758e-06, |
|
"loss": 5.6939, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.03072, |
|
"grad_norm": 3.834825038909912, |
|
"learning_rate": 9.996366623389263e-06, |
|
"loss": 5.8123, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.03136, |
|
"grad_norm": 3.570263147354126, |
|
"learning_rate": 9.99617186321415e-06, |
|
"loss": 5.6839, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 3.5728812217712402, |
|
"learning_rate": 9.995972020546545e-06, |
|
"loss": 5.7764, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.03264, |
|
"grad_norm": 3.4725637435913086, |
|
"learning_rate": 9.995767095589743e-06, |
|
"loss": 5.6879, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.03328, |
|
"grad_norm": 3.811537742614746, |
|
"learning_rate": 9.99555708855221e-06, |
|
"loss": 5.6418, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.03392, |
|
"grad_norm": 3.494992971420288, |
|
"learning_rate": 9.99534199964758e-06, |
|
"loss": 5.6927, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.03456, |
|
"grad_norm": 3.8107383251190186, |
|
"learning_rate": 9.995121829094662e-06, |
|
"loss": 5.5658, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0352, |
|
"grad_norm": 3.570551633834839, |
|
"learning_rate": 9.994896577117425e-06, |
|
"loss": 5.8131, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.03584, |
|
"grad_norm": 3.540811538696289, |
|
"learning_rate": 9.994666243945018e-06, |
|
"loss": 5.6009, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03648, |
|
"grad_norm": 3.7275819778442383, |
|
"learning_rate": 9.99443082981175e-06, |
|
"loss": 5.6407, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.03712, |
|
"grad_norm": 4.194495677947998, |
|
"learning_rate": 9.994190334957103e-06, |
|
"loss": 5.8319, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03776, |
|
"grad_norm": 3.5107626914978027, |
|
"learning_rate": 9.993944759625728e-06, |
|
"loss": 5.5765, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 3.4100208282470703, |
|
"learning_rate": 9.993694104067444e-06, |
|
"loss": 5.7473, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"eval_loss": 1.407908320426941, |
|
"eval_runtime": 6.6542, |
|
"eval_samples_per_second": 150.281, |
|
"eval_steps_per_second": 18.785, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03904, |
|
"grad_norm": 3.7727818489074707, |
|
"learning_rate": 9.993438368537236e-06, |
|
"loss": 5.6802, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.03968, |
|
"grad_norm": 3.445909023284912, |
|
"learning_rate": 9.993177553295258e-06, |
|
"loss": 5.7484, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.04032, |
|
"grad_norm": 3.4199888706207275, |
|
"learning_rate": 9.992911658606832e-06, |
|
"loss": 5.7648, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.04096, |
|
"grad_norm": 4.9640655517578125, |
|
"learning_rate": 9.992640684742445e-06, |
|
"loss": 5.7922, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0416, |
|
"grad_norm": 3.3730976581573486, |
|
"learning_rate": 9.992364631977754e-06, |
|
"loss": 5.677, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.04224, |
|
"grad_norm": 3.540597915649414, |
|
"learning_rate": 9.99208350059358e-06, |
|
"loss": 5.5495, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.04288, |
|
"grad_norm": 3.6853768825531006, |
|
"learning_rate": 9.991797290875915e-06, |
|
"loss": 5.4089, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.04352, |
|
"grad_norm": 3.6380045413970947, |
|
"learning_rate": 9.991506003115911e-06, |
|
"loss": 5.4849, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.04416, |
|
"grad_norm": 3.265488862991333, |
|
"learning_rate": 9.991209637609887e-06, |
|
"loss": 5.523, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 3.2634189128875732, |
|
"learning_rate": 9.990908194659332e-06, |
|
"loss": 5.5664, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.04544, |
|
"grad_norm": 3.569810152053833, |
|
"learning_rate": 9.990601674570895e-06, |
|
"loss": 5.5059, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.04608, |
|
"grad_norm": 3.580211877822876, |
|
"learning_rate": 9.990290077656393e-06, |
|
"loss": 5.4079, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.04672, |
|
"grad_norm": 3.4860317707061768, |
|
"learning_rate": 9.989973404232805e-06, |
|
"loss": 5.6858, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.04736, |
|
"grad_norm": 4.026730060577393, |
|
"learning_rate": 9.989651654622277e-06, |
|
"loss": 5.5662, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 3.364692449569702, |
|
"learning_rate": 9.989324829152119e-06, |
|
"loss": 5.5304, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.04864, |
|
"grad_norm": 3.611964464187622, |
|
"learning_rate": 9.9889929281548e-06, |
|
"loss": 5.3911, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.04928, |
|
"grad_norm": 3.2946035861968994, |
|
"learning_rate": 9.988655951967958e-06, |
|
"loss": 5.4102, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.04992, |
|
"grad_norm": 3.963909864425659, |
|
"learning_rate": 9.98831390093439e-06, |
|
"loss": 5.549, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.05056, |
|
"grad_norm": 3.2876341342926025, |
|
"learning_rate": 9.987966775402056e-06, |
|
"loss": 5.5388, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 3.8467471599578857, |
|
"learning_rate": 9.98761457572408e-06, |
|
"loss": 5.454, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"eval_loss": 1.3826359510421753, |
|
"eval_runtime": 7.0199, |
|
"eval_samples_per_second": 142.452, |
|
"eval_steps_per_second": 17.807, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05184, |
|
"grad_norm": 3.675231695175171, |
|
"learning_rate": 9.987257302258748e-06, |
|
"loss": 5.674, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.05248, |
|
"grad_norm": 3.787940263748169, |
|
"learning_rate": 9.986894955369504e-06, |
|
"loss": 5.5466, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.05312, |
|
"grad_norm": 3.677966833114624, |
|
"learning_rate": 9.986527535424956e-06, |
|
"loss": 5.4762, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.05376, |
|
"grad_norm": 3.5083606243133545, |
|
"learning_rate": 9.986155042798874e-06, |
|
"loss": 5.3145, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.0544, |
|
"grad_norm": 3.536379098892212, |
|
"learning_rate": 9.98577747787018e-06, |
|
"loss": 5.3769, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.05504, |
|
"grad_norm": 3.5448412895202637, |
|
"learning_rate": 9.98539484102297e-06, |
|
"loss": 5.3996, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05568, |
|
"grad_norm": 3.359647274017334, |
|
"learning_rate": 9.985007132646489e-06, |
|
"loss": 5.3114, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.05632, |
|
"grad_norm": 3.3419110774993896, |
|
"learning_rate": 9.984614353135143e-06, |
|
"loss": 5.4383, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.05696, |
|
"grad_norm": 3.558025360107422, |
|
"learning_rate": 9.984216502888496e-06, |
|
"loss": 5.5239, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 3.6349422931671143, |
|
"learning_rate": 9.983813582311277e-06, |
|
"loss": 5.5639, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.05824, |
|
"grad_norm": 3.2916922569274902, |
|
"learning_rate": 9.983405591813362e-06, |
|
"loss": 5.3886, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.05888, |
|
"grad_norm": 3.32891845703125, |
|
"learning_rate": 9.982992531809796e-06, |
|
"loss": 5.526, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.05952, |
|
"grad_norm": 3.8752880096435547, |
|
"learning_rate": 9.982574402720773e-06, |
|
"loss": 5.6599, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.06016, |
|
"grad_norm": 3.604433536529541, |
|
"learning_rate": 9.982151204971646e-06, |
|
"loss": 5.4567, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0608, |
|
"grad_norm": 3.3058159351348877, |
|
"learning_rate": 9.981722938992926e-06, |
|
"loss": 5.4981, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.06144, |
|
"grad_norm": 3.7341926097869873, |
|
"learning_rate": 9.981289605220276e-06, |
|
"loss": 5.3278, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.06208, |
|
"grad_norm": 3.51798415184021, |
|
"learning_rate": 9.980851204094519e-06, |
|
"loss": 5.5029, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.06272, |
|
"grad_norm": 3.6541428565979004, |
|
"learning_rate": 9.980407736061629e-06, |
|
"loss": 5.3987, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.06336, |
|
"grad_norm": 3.420767307281494, |
|
"learning_rate": 9.979959201572736e-06, |
|
"loss": 5.405, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 3.7169559001922607, |
|
"learning_rate": 9.979505601084124e-06, |
|
"loss": 5.498, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"eval_loss": 1.3493109941482544, |
|
"eval_runtime": 7.1309, |
|
"eval_samples_per_second": 140.234, |
|
"eval_steps_per_second": 17.529, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06464, |
|
"grad_norm": 4.536627769470215, |
|
"learning_rate": 9.97904693505723e-06, |
|
"loss": 5.5237, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.06528, |
|
"grad_norm": 3.204948902130127, |
|
"learning_rate": 9.978583203958649e-06, |
|
"loss": 5.3746, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.06592, |
|
"grad_norm": 3.4658005237579346, |
|
"learning_rate": 9.978114408260118e-06, |
|
"loss": 5.4567, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.06656, |
|
"grad_norm": 4.932333469390869, |
|
"learning_rate": 9.977640548438534e-06, |
|
"loss": 5.1959, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.0672, |
|
"grad_norm": 3.4697563648223877, |
|
"learning_rate": 9.977161624975948e-06, |
|
"loss": 5.4013, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.06784, |
|
"grad_norm": 3.441819667816162, |
|
"learning_rate": 9.976677638359553e-06, |
|
"loss": 5.4899, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.06848, |
|
"grad_norm": 3.4293930530548096, |
|
"learning_rate": 9.9761885890817e-06, |
|
"loss": 5.3569, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.06912, |
|
"grad_norm": 3.5388574600219727, |
|
"learning_rate": 9.975694477639885e-06, |
|
"loss": 5.2739, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.06976, |
|
"grad_norm": 3.735548973083496, |
|
"learning_rate": 9.97519530453676e-06, |
|
"loss": 5.4253, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 3.33503794670105, |
|
"learning_rate": 9.974691070280121e-06, |
|
"loss": 5.1569, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.07104, |
|
"grad_norm": 3.5171401500701904, |
|
"learning_rate": 9.974181775382915e-06, |
|
"loss": 5.3242, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.07168, |
|
"grad_norm": 3.565356969833374, |
|
"learning_rate": 9.973667420363233e-06, |
|
"loss": 5.3893, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.07232, |
|
"grad_norm": 3.172163248062134, |
|
"learning_rate": 9.973148005744319e-06, |
|
"loss": 5.3824, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.07296, |
|
"grad_norm": 3.517838716506958, |
|
"learning_rate": 9.972623532054564e-06, |
|
"loss": 5.2673, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.0736, |
|
"grad_norm": 3.328416585922241, |
|
"learning_rate": 9.9720939998275e-06, |
|
"loss": 5.2649, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.07424, |
|
"grad_norm": 3.475539445877075, |
|
"learning_rate": 9.971559409601807e-06, |
|
"loss": 5.3318, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.07488, |
|
"grad_norm": 3.492013692855835, |
|
"learning_rate": 9.971019761921317e-06, |
|
"loss": 5.2735, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.07552, |
|
"grad_norm": 3.474803924560547, |
|
"learning_rate": 9.970475057334997e-06, |
|
"loss": 5.3722, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.07616, |
|
"grad_norm": 3.4162726402282715, |
|
"learning_rate": 9.96992529639696e-06, |
|
"loss": 5.3901, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 3.3643155097961426, |
|
"learning_rate": 9.969370479666473e-06, |
|
"loss": 5.2384, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"eval_loss": 1.3373793363571167, |
|
"eval_runtime": 6.5847, |
|
"eval_samples_per_second": 151.867, |
|
"eval_steps_per_second": 18.983, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07744, |
|
"grad_norm": 3.44301176071167, |
|
"learning_rate": 9.968810607707933e-06, |
|
"loss": 5.2322, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.07808, |
|
"grad_norm": 3.422262668609619, |
|
"learning_rate": 9.968245681090887e-06, |
|
"loss": 5.1708, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.07872, |
|
"grad_norm": 3.2879252433776855, |
|
"learning_rate": 9.96767570039002e-06, |
|
"loss": 5.2291, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.07936, |
|
"grad_norm": 3.6026480197906494, |
|
"learning_rate": 9.967100666185163e-06, |
|
"loss": 5.4241, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.3642101287841797, |
|
"learning_rate": 9.966520579061286e-06, |
|
"loss": 5.4473, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.08064, |
|
"grad_norm": 3.5968470573425293, |
|
"learning_rate": 9.965935439608493e-06, |
|
"loss": 5.3982, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.08128, |
|
"grad_norm": 3.352083206176758, |
|
"learning_rate": 9.96534524842204e-06, |
|
"loss": 5.3953, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.08192, |
|
"grad_norm": 3.3571720123291016, |
|
"learning_rate": 9.964750006102311e-06, |
|
"loss": 5.3159, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.08256, |
|
"grad_norm": 3.486246109008789, |
|
"learning_rate": 9.964149713254833e-06, |
|
"loss": 5.211, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 3.674906015396118, |
|
"learning_rate": 9.96354437049027e-06, |
|
"loss": 5.3374, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.08384, |
|
"grad_norm": 3.590810537338257, |
|
"learning_rate": 9.962933978424426e-06, |
|
"loss": 5.2194, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.08448, |
|
"grad_norm": 3.551786184310913, |
|
"learning_rate": 9.962318537678238e-06, |
|
"loss": 5.1187, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.08512, |
|
"grad_norm": 3.5391581058502197, |
|
"learning_rate": 9.961698048877776e-06, |
|
"loss": 5.2001, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.08576, |
|
"grad_norm": 3.6105592250823975, |
|
"learning_rate": 9.961072512654255e-06, |
|
"loss": 5.2758, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.0864, |
|
"grad_norm": 3.7463858127593994, |
|
"learning_rate": 9.960441929644017e-06, |
|
"loss": 5.2137, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.08704, |
|
"grad_norm": 3.9237470626831055, |
|
"learning_rate": 9.959806300488538e-06, |
|
"loss": 5.2047, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.08768, |
|
"grad_norm": 3.392827272415161, |
|
"learning_rate": 9.95916562583443e-06, |
|
"loss": 5.3071, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.08832, |
|
"grad_norm": 3.221484661102295, |
|
"learning_rate": 9.958519906333438e-06, |
|
"loss": 5.183, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.08896, |
|
"grad_norm": 3.5143983364105225, |
|
"learning_rate": 9.957869142642437e-06, |
|
"loss": 5.3171, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 3.497072696685791, |
|
"learning_rate": 9.957213335423433e-06, |
|
"loss": 5.1784, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"eval_loss": 1.2988511323928833, |
|
"eval_runtime": 6.9763, |
|
"eval_samples_per_second": 143.342, |
|
"eval_steps_per_second": 17.918, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.09024, |
|
"grad_norm": 3.3822438716888428, |
|
"learning_rate": 9.956552485343566e-06, |
|
"loss": 5.1732, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.09088, |
|
"grad_norm": 3.3949694633483887, |
|
"learning_rate": 9.955886593075101e-06, |
|
"loss": 5.2725, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.09152, |
|
"grad_norm": 3.2577288150787354, |
|
"learning_rate": 9.955215659295438e-06, |
|
"loss": 5.2207, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.09216, |
|
"grad_norm": 3.769519567489624, |
|
"learning_rate": 9.954539684687103e-06, |
|
"loss": 5.2152, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.0928, |
|
"grad_norm": 3.3824892044067383, |
|
"learning_rate": 9.953858669937746e-06, |
|
"loss": 5.2085, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.09344, |
|
"grad_norm": 3.771742105484009, |
|
"learning_rate": 9.953172615740152e-06, |
|
"loss": 5.1575, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.09408, |
|
"grad_norm": 3.7706689834594727, |
|
"learning_rate": 9.952481522792226e-06, |
|
"loss": 4.9608, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.09472, |
|
"grad_norm": 3.8110334873199463, |
|
"learning_rate": 9.951785391797001e-06, |
|
"loss": 5.21, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.09536, |
|
"grad_norm": 3.3012993335723877, |
|
"learning_rate": 9.951084223462636e-06, |
|
"loss": 5.2475, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 3.6353518962860107, |
|
"learning_rate": 9.950378018502415e-06, |
|
"loss": 5.0985, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.09664, |
|
"grad_norm": 3.369378089904785, |
|
"learning_rate": 9.949666777634743e-06, |
|
"loss": 5.1986, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.09728, |
|
"grad_norm": 3.2247676849365234, |
|
"learning_rate": 9.948950501583147e-06, |
|
"loss": 5.3192, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.09792, |
|
"grad_norm": 3.6966888904571533, |
|
"learning_rate": 9.948229191076284e-06, |
|
"loss": 5.1654, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.09856, |
|
"grad_norm": 3.5823962688446045, |
|
"learning_rate": 9.947502846847921e-06, |
|
"loss": 5.1351, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.0992, |
|
"grad_norm": 3.5258729457855225, |
|
"learning_rate": 9.946771469636955e-06, |
|
"loss": 5.1745, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.09984, |
|
"grad_norm": 3.42067813873291, |
|
"learning_rate": 9.946035060187398e-06, |
|
"loss": 5.1569, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.10048, |
|
"grad_norm": 3.9832825660705566, |
|
"learning_rate": 9.945293619248383e-06, |
|
"loss": 4.9796, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.10112, |
|
"grad_norm": 3.742013692855835, |
|
"learning_rate": 9.944547147574162e-06, |
|
"loss": 5.1625, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.10176, |
|
"grad_norm": 3.3150367736816406, |
|
"learning_rate": 9.943795645924104e-06, |
|
"loss": 5.099, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 3.359069585800171, |
|
"learning_rate": 9.943039115062691e-06, |
|
"loss": 5.1877, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"eval_loss": 1.2946017980575562, |
|
"eval_runtime": 7.4306, |
|
"eval_samples_per_second": 134.579, |
|
"eval_steps_per_second": 16.822, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.10304, |
|
"grad_norm": 3.703000545501709, |
|
"learning_rate": 9.94227755575953e-06, |
|
"loss": 5.1581, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.10368, |
|
"grad_norm": 3.5370070934295654, |
|
"learning_rate": 9.941510968789334e-06, |
|
"loss": 5.2402, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.10432, |
|
"grad_norm": 3.5010828971862793, |
|
"learning_rate": 9.940739354931936e-06, |
|
"loss": 5.1828, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.10496, |
|
"grad_norm": 3.4637820720672607, |
|
"learning_rate": 9.93996271497228e-06, |
|
"loss": 5.1792, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.1056, |
|
"grad_norm": 3.409712076187134, |
|
"learning_rate": 9.939181049700427e-06, |
|
"loss": 5.0721, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.10624, |
|
"grad_norm": 3.589414596557617, |
|
"learning_rate": 9.938394359911545e-06, |
|
"loss": 5.234, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.10688, |
|
"grad_norm": 3.444977045059204, |
|
"learning_rate": 9.937602646405918e-06, |
|
"loss": 4.9763, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.10752, |
|
"grad_norm": 3.3560900688171387, |
|
"learning_rate": 9.936805909988935e-06, |
|
"loss": 5.2006, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.10816, |
|
"grad_norm": 3.345703601837158, |
|
"learning_rate": 9.9360041514711e-06, |
|
"loss": 5.0287, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 3.492363691329956, |
|
"learning_rate": 9.935197371668024e-06, |
|
"loss": 5.0908, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.10944, |
|
"grad_norm": 7.459951400756836, |
|
"learning_rate": 9.934385571400425e-06, |
|
"loss": 5.1735, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.11008, |
|
"grad_norm": 3.5033841133117676, |
|
"learning_rate": 9.933568751494131e-06, |
|
"loss": 5.053, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.11072, |
|
"grad_norm": 3.5542259216308594, |
|
"learning_rate": 9.93274691278007e-06, |
|
"loss": 5.1463, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.11136, |
|
"grad_norm": 3.3819243907928467, |
|
"learning_rate": 9.931920056094285e-06, |
|
"loss": 5.0397, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 3.406768798828125, |
|
"learning_rate": 9.931088182277915e-06, |
|
"loss": 5.179, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.11264, |
|
"grad_norm": 5.960773944854736, |
|
"learning_rate": 9.930251292177206e-06, |
|
"loss": 5.217, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.11328, |
|
"grad_norm": 3.5821049213409424, |
|
"learning_rate": 9.929409386643511e-06, |
|
"loss": 5.0374, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.11392, |
|
"grad_norm": 3.3204903602600098, |
|
"learning_rate": 9.928562466533279e-06, |
|
"loss": 5.1856, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.11456, |
|
"grad_norm": 4.022350788116455, |
|
"learning_rate": 9.927710532708064e-06, |
|
"loss": 5.1051, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 3.3810718059539795, |
|
"learning_rate": 9.926853586034515e-06, |
|
"loss": 5.1691, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"eval_loss": 1.2660380601882935, |
|
"eval_runtime": 6.8853, |
|
"eval_samples_per_second": 145.238, |
|
"eval_steps_per_second": 18.155, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.11584, |
|
"grad_norm": 3.5757713317871094, |
|
"learning_rate": 9.92599162738439e-06, |
|
"loss": 5.1505, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.11648, |
|
"grad_norm": 3.38582706451416, |
|
"learning_rate": 9.925124657634537e-06, |
|
"loss": 5.0915, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.11712, |
|
"grad_norm": 3.4189300537109375, |
|
"learning_rate": 9.924252677666905e-06, |
|
"loss": 5.1992, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.11776, |
|
"grad_norm": 3.4118812084198, |
|
"learning_rate": 9.92337568836854e-06, |
|
"loss": 5.1334, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.1184, |
|
"grad_norm": 3.5167789459228516, |
|
"learning_rate": 9.922493690631583e-06, |
|
"loss": 5.1003, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.11904, |
|
"grad_norm": 3.546893358230591, |
|
"learning_rate": 9.921606685353268e-06, |
|
"loss": 5.1346, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.11968, |
|
"grad_norm": 3.1576385498046875, |
|
"learning_rate": 9.920714673435931e-06, |
|
"loss": 4.9601, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.12032, |
|
"grad_norm": 3.4227495193481445, |
|
"learning_rate": 9.91981765578699e-06, |
|
"loss": 5.0087, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.12096, |
|
"grad_norm": 3.4890694618225098, |
|
"learning_rate": 9.918915633318964e-06, |
|
"loss": 5.1319, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 3.7377865314483643, |
|
"learning_rate": 9.918008606949459e-06, |
|
"loss": 5.0618, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.12224, |
|
"grad_norm": 3.793402671813965, |
|
"learning_rate": 9.917096577601172e-06, |
|
"loss": 4.9998, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.12288, |
|
"grad_norm": 3.404918909072876, |
|
"learning_rate": 9.916179546201889e-06, |
|
"loss": 5.0865, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.12352, |
|
"grad_norm": 3.6076908111572266, |
|
"learning_rate": 9.915257513684488e-06, |
|
"loss": 5.0004, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.12416, |
|
"grad_norm": 3.631777286529541, |
|
"learning_rate": 9.914330480986932e-06, |
|
"loss": 5.2806, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.1248, |
|
"grad_norm": 3.323333501815796, |
|
"learning_rate": 9.913398449052266e-06, |
|
"loss": 5.07, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.12544, |
|
"grad_norm": 3.6380035877227783, |
|
"learning_rate": 9.912461418828628e-06, |
|
"loss": 5.0559, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.12608, |
|
"grad_norm": 3.7685458660125732, |
|
"learning_rate": 9.911519391269238e-06, |
|
"loss": 5.0497, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.12672, |
|
"grad_norm": 3.4882941246032715, |
|
"learning_rate": 9.910572367332397e-06, |
|
"loss": 5.0388, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.12736, |
|
"grad_norm": 3.27787184715271, |
|
"learning_rate": 9.909620347981493e-06, |
|
"loss": 5.0285, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 3.388284921646118, |
|
"learning_rate": 9.908663334184994e-06, |
|
"loss": 5.1426, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"eval_loss": 1.2478246688842773, |
|
"eval_runtime": 9.3123, |
|
"eval_samples_per_second": 107.384, |
|
"eval_steps_per_second": 13.423, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12864, |
|
"grad_norm": 3.4602177143096924, |
|
"learning_rate": 9.907701326916448e-06, |
|
"loss": 4.8852, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.12928, |
|
"grad_norm": 3.7464816570281982, |
|
"learning_rate": 9.906734327154481e-06, |
|
"loss": 4.9129, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.12992, |
|
"grad_norm": 6.138649940490723, |
|
"learning_rate": 9.905762335882804e-06, |
|
"loss": 5.1037, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.13056, |
|
"grad_norm": 3.5933375358581543, |
|
"learning_rate": 9.904785354090198e-06, |
|
"loss": 4.9644, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.1312, |
|
"grad_norm": 3.6777257919311523, |
|
"learning_rate": 9.903803382770528e-06, |
|
"loss": 5.0575, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.13184, |
|
"grad_norm": 3.4429285526275635, |
|
"learning_rate": 9.902816422922727e-06, |
|
"loss": 4.8722, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.13248, |
|
"grad_norm": 3.7400121688842773, |
|
"learning_rate": 9.90182447555081e-06, |
|
"loss": 4.9521, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.13312, |
|
"grad_norm": 3.2183690071105957, |
|
"learning_rate": 9.900827541663862e-06, |
|
"loss": 5.0314, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.13376, |
|
"grad_norm": 3.563539505004883, |
|
"learning_rate": 9.899825622276041e-06, |
|
"loss": 4.9471, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 3.3289413452148438, |
|
"learning_rate": 9.898818718406578e-06, |
|
"loss": 5.0223, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.13504, |
|
"grad_norm": 3.3363258838653564, |
|
"learning_rate": 9.89780683107977e-06, |
|
"loss": 4.8883, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.13568, |
|
"grad_norm": 3.5950427055358887, |
|
"learning_rate": 9.896789961324991e-06, |
|
"loss": 4.9488, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.13632, |
|
"grad_norm": 3.2444112300872803, |
|
"learning_rate": 9.895768110176677e-06, |
|
"loss": 4.9408, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.13696, |
|
"grad_norm": 3.2985880374908447, |
|
"learning_rate": 9.894741278674337e-06, |
|
"loss": 4.9875, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.1376, |
|
"grad_norm": 3.474818229675293, |
|
"learning_rate": 9.89370946786254e-06, |
|
"loss": 5.0526, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.13824, |
|
"grad_norm": 4.721025466918945, |
|
"learning_rate": 9.892672678790926e-06, |
|
"loss": 5.1362, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.13888, |
|
"grad_norm": 3.84086012840271, |
|
"learning_rate": 9.891630912514197e-06, |
|
"loss": 4.9631, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.13952, |
|
"grad_norm": 3.487732172012329, |
|
"learning_rate": 9.890584170092115e-06, |
|
"loss": 4.9211, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.14016, |
|
"grad_norm": 3.398810625076294, |
|
"learning_rate": 9.889532452589512e-06, |
|
"loss": 4.9814, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 3.3263680934906006, |
|
"learning_rate": 9.888475761076273e-06, |
|
"loss": 4.9985, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"eval_loss": 1.2442607879638672, |
|
"eval_runtime": 6.5582, |
|
"eval_samples_per_second": 152.481, |
|
"eval_steps_per_second": 19.06, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.14144, |
|
"grad_norm": 3.4481613636016846, |
|
"learning_rate": 9.887414096627348e-06, |
|
"loss": 5.0169, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.14208, |
|
"grad_norm": 3.2736401557922363, |
|
"learning_rate": 9.886347460322744e-06, |
|
"loss": 5.0703, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.14272, |
|
"grad_norm": 3.2973997592926025, |
|
"learning_rate": 9.885275853247526e-06, |
|
"loss": 4.9957, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.14336, |
|
"grad_norm": 3.6516940593719482, |
|
"learning_rate": 9.884199276491817e-06, |
|
"loss": 5.0162, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 3.1835155487060547, |
|
"learning_rate": 9.883117731150792e-06, |
|
"loss": 4.9765, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.14464, |
|
"grad_norm": 3.21928334236145, |
|
"learning_rate": 9.882031218324681e-06, |
|
"loss": 5.0611, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.14528, |
|
"grad_norm": 4.601723670959473, |
|
"learning_rate": 9.880939739118772e-06, |
|
"loss": 5.0637, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.14592, |
|
"grad_norm": 3.2973368167877197, |
|
"learning_rate": 9.879843294643402e-06, |
|
"loss": 4.9621, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.14656, |
|
"grad_norm": 3.4781899452209473, |
|
"learning_rate": 9.878741886013959e-06, |
|
"loss": 4.9482, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 3.5175704956054688, |
|
"learning_rate": 9.877635514350878e-06, |
|
"loss": 4.8594, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.14784, |
|
"grad_norm": 3.4302468299865723, |
|
"learning_rate": 9.87652418077965e-06, |
|
"loss": 4.8865, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.14848, |
|
"grad_norm": 3.464651346206665, |
|
"learning_rate": 9.875407886430806e-06, |
|
"loss": 4.9922, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.14912, |
|
"grad_norm": 4.064827919006348, |
|
"learning_rate": 9.87428663243993e-06, |
|
"loss": 4.9592, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.14976, |
|
"grad_norm": 3.654902458190918, |
|
"learning_rate": 9.873160419947645e-06, |
|
"loss": 4.9286, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.1504, |
|
"grad_norm": 3.395596981048584, |
|
"learning_rate": 9.872029250099626e-06, |
|
"loss": 5.0057, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.15104, |
|
"grad_norm": 3.745281457901001, |
|
"learning_rate": 9.870893124046582e-06, |
|
"loss": 4.8671, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.15168, |
|
"grad_norm": 3.449518918991089, |
|
"learning_rate": 9.869752042944271e-06, |
|
"loss": 4.8306, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.15232, |
|
"grad_norm": 3.1926662921905518, |
|
"learning_rate": 9.868606007953487e-06, |
|
"loss": 5.0347, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.15296, |
|
"grad_norm": 3.4620425701141357, |
|
"learning_rate": 9.86745502024007e-06, |
|
"loss": 4.857, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 3.5597681999206543, |
|
"learning_rate": 9.866299080974886e-06, |
|
"loss": 4.9225, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"eval_loss": 1.2185124158859253, |
|
"eval_runtime": 7.9383, |
|
"eval_samples_per_second": 125.972, |
|
"eval_steps_per_second": 15.746, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.15424, |
|
"grad_norm": 3.5934455394744873, |
|
"learning_rate": 9.865138191333852e-06, |
|
"loss": 4.7654, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.15488, |
|
"grad_norm": 3.8588831424713135, |
|
"learning_rate": 9.863972352497912e-06, |
|
"loss": 4.9993, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.15552, |
|
"grad_norm": 3.58868408203125, |
|
"learning_rate": 9.86280156565305e-06, |
|
"loss": 4.8217, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.15616, |
|
"grad_norm": 3.5407521724700928, |
|
"learning_rate": 9.861625831990278e-06, |
|
"loss": 4.875, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.1568, |
|
"grad_norm": 3.4974656105041504, |
|
"learning_rate": 9.860445152705644e-06, |
|
"loss": 5.0627, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.15744, |
|
"grad_norm": 3.655677556991577, |
|
"learning_rate": 9.859259529000228e-06, |
|
"loss": 4.8015, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.15808, |
|
"grad_norm": 3.55148983001709, |
|
"learning_rate": 9.858068962080136e-06, |
|
"loss": 5.1209, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.15872, |
|
"grad_norm": 3.4331536293029785, |
|
"learning_rate": 9.856873453156506e-06, |
|
"loss": 4.9739, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.15936, |
|
"grad_norm": 3.374394655227661, |
|
"learning_rate": 9.855673003445502e-06, |
|
"loss": 4.8138, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.5296385288238525, |
|
"learning_rate": 9.854467614168315e-06, |
|
"loss": 5.0274, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.16064, |
|
"grad_norm": 3.6533989906311035, |
|
"learning_rate": 9.85325728655116e-06, |
|
"loss": 4.9979, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.16128, |
|
"grad_norm": 3.3504199981689453, |
|
"learning_rate": 9.852042021825272e-06, |
|
"loss": 4.8317, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.16192, |
|
"grad_norm": 3.614529609680176, |
|
"learning_rate": 9.850821821226918e-06, |
|
"loss": 4.9413, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.16256, |
|
"grad_norm": 3.4821839332580566, |
|
"learning_rate": 9.849596685997376e-06, |
|
"loss": 4.904, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.1632, |
|
"grad_norm": 3.3400087356567383, |
|
"learning_rate": 9.848366617382951e-06, |
|
"loss": 4.9039, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.16384, |
|
"grad_norm": 4.062397003173828, |
|
"learning_rate": 9.847131616634963e-06, |
|
"loss": 4.7378, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.16448, |
|
"grad_norm": 3.689796209335327, |
|
"learning_rate": 9.845891685009751e-06, |
|
"loss": 4.8799, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.16512, |
|
"grad_norm": 3.509657621383667, |
|
"learning_rate": 9.84464682376867e-06, |
|
"loss": 4.8513, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.16576, |
|
"grad_norm": 3.4828646183013916, |
|
"learning_rate": 9.843397034178088e-06, |
|
"loss": 5.0151, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 3.394510507583618, |
|
"learning_rate": 9.842142317509387e-06, |
|
"loss": 4.7585, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"eval_loss": 1.2179418802261353, |
|
"eval_runtime": 6.7952, |
|
"eval_samples_per_second": 147.163, |
|
"eval_steps_per_second": 18.395, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.16704, |
|
"grad_norm": 3.4089293479919434, |
|
"learning_rate": 9.840882675038962e-06, |
|
"loss": 4.7646, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.16768, |
|
"grad_norm": 3.1607353687286377, |
|
"learning_rate": 9.83961810804822e-06, |
|
"loss": 4.9528, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.16832, |
|
"grad_norm": 3.30869197845459, |
|
"learning_rate": 9.838348617823573e-06, |
|
"loss": 5.0086, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.16896, |
|
"grad_norm": 3.6550564765930176, |
|
"learning_rate": 9.837074205656452e-06, |
|
"loss": 4.8675, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.1696, |
|
"grad_norm": 3.6141419410705566, |
|
"learning_rate": 9.835794872843281e-06, |
|
"loss": 4.8885, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.17024, |
|
"grad_norm": 3.4006361961364746, |
|
"learning_rate": 9.834510620685497e-06, |
|
"loss": 4.7784, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.17088, |
|
"grad_norm": 3.4397149085998535, |
|
"learning_rate": 9.833221450489543e-06, |
|
"loss": 4.929, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.17152, |
|
"grad_norm": 3.613502025604248, |
|
"learning_rate": 9.83192736356686e-06, |
|
"loss": 4.8763, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.17216, |
|
"grad_norm": 3.613837957382202, |
|
"learning_rate": 9.830628361233896e-06, |
|
"loss": 4.8765, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 3.775621175765991, |
|
"learning_rate": 9.829324444812096e-06, |
|
"loss": 4.8103, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.17344, |
|
"grad_norm": 3.6856908798217773, |
|
"learning_rate": 9.828015615627904e-06, |
|
"loss": 4.8867, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.17408, |
|
"grad_norm": 3.3510427474975586, |
|
"learning_rate": 9.826701875012763e-06, |
|
"loss": 4.7708, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.17472, |
|
"grad_norm": 3.342366933822632, |
|
"learning_rate": 9.82538322430311e-06, |
|
"loss": 4.8404, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.17536, |
|
"grad_norm": 3.5898385047912598, |
|
"learning_rate": 9.824059664840378e-06, |
|
"loss": 4.8205, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 3.1588313579559326, |
|
"learning_rate": 9.822731197970998e-06, |
|
"loss": 4.7214, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.17664, |
|
"grad_norm": 3.431478261947632, |
|
"learning_rate": 9.821397825046387e-06, |
|
"loss": 4.8892, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.17728, |
|
"grad_norm": 3.7104616165161133, |
|
"learning_rate": 9.820059547422952e-06, |
|
"loss": 4.8027, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.17792, |
|
"grad_norm": 3.189239263534546, |
|
"learning_rate": 9.818716366462098e-06, |
|
"loss": 4.8692, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.17856, |
|
"grad_norm": 3.3543105125427246, |
|
"learning_rate": 9.81736828353021e-06, |
|
"loss": 4.9076, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 3.2962117195129395, |
|
"learning_rate": 9.816015299998663e-06, |
|
"loss": 4.93, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"eval_loss": 1.2212570905685425, |
|
"eval_runtime": 7.0462, |
|
"eval_samples_per_second": 141.92, |
|
"eval_steps_per_second": 17.74, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.17984, |
|
"grad_norm": 3.2857654094696045, |
|
"learning_rate": 9.814657417243814e-06, |
|
"loss": 4.7544, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.18048, |
|
"grad_norm": 3.31211256980896, |
|
"learning_rate": 9.813294636647009e-06, |
|
"loss": 4.9007, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.18112, |
|
"grad_norm": 3.3026342391967773, |
|
"learning_rate": 9.81192695959457e-06, |
|
"loss": 4.8136, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.18176, |
|
"grad_norm": 3.6015031337738037, |
|
"learning_rate": 9.810554387477812e-06, |
|
"loss": 4.8296, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.1824, |
|
"grad_norm": 3.5558950901031494, |
|
"learning_rate": 9.809176921693013e-06, |
|
"loss": 4.9049, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.18304, |
|
"grad_norm": 3.272860288619995, |
|
"learning_rate": 9.807794563641442e-06, |
|
"loss": 4.868, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.18368, |
|
"grad_norm": 3.427809715270996, |
|
"learning_rate": 9.806407314729341e-06, |
|
"loss": 4.7899, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.18432, |
|
"grad_norm": 3.545553207397461, |
|
"learning_rate": 9.805015176367924e-06, |
|
"loss": 4.9774, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.18496, |
|
"grad_norm": 3.5434036254882812, |
|
"learning_rate": 9.803618149973383e-06, |
|
"loss": 4.8174, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 3.5401341915130615, |
|
"learning_rate": 9.802216236966882e-06, |
|
"loss": 4.8138, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.18624, |
|
"grad_norm": 3.339459180831909, |
|
"learning_rate": 9.800809438774557e-06, |
|
"loss": 4.9385, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.18688, |
|
"grad_norm": 3.541703224182129, |
|
"learning_rate": 9.799397756827508e-06, |
|
"loss": 4.8764, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.18752, |
|
"grad_norm": 3.3053269386291504, |
|
"learning_rate": 9.79798119256181e-06, |
|
"loss": 4.5765, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.18816, |
|
"grad_norm": 3.461660146713257, |
|
"learning_rate": 9.7965597474185e-06, |
|
"loss": 4.6125, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.1888, |
|
"grad_norm": 3.564030885696411, |
|
"learning_rate": 9.795133422843583e-06, |
|
"loss": 4.8758, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.18944, |
|
"grad_norm": 3.635293483734131, |
|
"learning_rate": 9.793702220288028e-06, |
|
"loss": 4.7954, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.19008, |
|
"grad_norm": 3.4663326740264893, |
|
"learning_rate": 9.792266141207763e-06, |
|
"loss": 4.8442, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.19072, |
|
"grad_norm": 3.556608200073242, |
|
"learning_rate": 9.790825187063677e-06, |
|
"loss": 4.8431, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.19136, |
|
"grad_norm": 3.726987838745117, |
|
"learning_rate": 9.789379359321624e-06, |
|
"loss": 4.8309, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 3.535627603530884, |
|
"learning_rate": 9.78792865945241e-06, |
|
"loss": 4.8779, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"eval_loss": 1.2059489488601685, |
|
"eval_runtime": 6.8452, |
|
"eval_samples_per_second": 146.087, |
|
"eval_steps_per_second": 18.261, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.19264, |
|
"grad_norm": 3.3409645557403564, |
|
"learning_rate": 9.7864730889318e-06, |
|
"loss": 4.8398, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.19328, |
|
"grad_norm": 3.240247964859009, |
|
"learning_rate": 9.78501264924051e-06, |
|
"loss": 4.689, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.19392, |
|
"grad_norm": 3.6355326175689697, |
|
"learning_rate": 9.783547341864216e-06, |
|
"loss": 4.7737, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.19456, |
|
"grad_norm": 3.4650771617889404, |
|
"learning_rate": 9.78207716829354e-06, |
|
"loss": 4.7844, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.1952, |
|
"grad_norm": 3.281463146209717, |
|
"learning_rate": 9.780602130024055e-06, |
|
"loss": 4.6872, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.19584, |
|
"grad_norm": 3.264622926712036, |
|
"learning_rate": 9.779122228556289e-06, |
|
"loss": 4.7438, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.19648, |
|
"grad_norm": 3.598848342895508, |
|
"learning_rate": 9.777637465395706e-06, |
|
"loss": 4.6983, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.19712, |
|
"grad_norm": 3.3951942920684814, |
|
"learning_rate": 9.776147842052725e-06, |
|
"loss": 4.8429, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.19776, |
|
"grad_norm": 3.088014841079712, |
|
"learning_rate": 9.774653360042706e-06, |
|
"loss": 4.8207, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 3.4452457427978516, |
|
"learning_rate": 9.773154020885953e-06, |
|
"loss": 4.8426, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.19904, |
|
"grad_norm": 3.3782291412353516, |
|
"learning_rate": 9.771649826107707e-06, |
|
"loss": 4.9081, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.19968, |
|
"grad_norm": 3.420620918273926, |
|
"learning_rate": 9.770140777238153e-06, |
|
"loss": 4.8296, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.20032, |
|
"grad_norm": 3.3439509868621826, |
|
"learning_rate": 9.76862687581241e-06, |
|
"loss": 4.626, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.20096, |
|
"grad_norm": 3.2657105922698975, |
|
"learning_rate": 9.76710812337054e-06, |
|
"loss": 4.8035, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.2016, |
|
"grad_norm": 3.4240477085113525, |
|
"learning_rate": 9.765584521457533e-06, |
|
"loss": 4.776, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.20224, |
|
"grad_norm": 3.7116453647613525, |
|
"learning_rate": 9.764056071623314e-06, |
|
"loss": 4.8099, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.20288, |
|
"grad_norm": 3.3470919132232666, |
|
"learning_rate": 9.762522775422741e-06, |
|
"loss": 4.6686, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.20352, |
|
"grad_norm": 3.552156925201416, |
|
"learning_rate": 9.760984634415602e-06, |
|
"loss": 4.7256, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.20416, |
|
"grad_norm": 3.144547939300537, |
|
"learning_rate": 9.759441650166612e-06, |
|
"loss": 4.6914, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 3.3078038692474365, |
|
"learning_rate": 9.757893824245414e-06, |
|
"loss": 4.7828, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"eval_loss": 1.1752163171768188, |
|
"eval_runtime": 6.6642, |
|
"eval_samples_per_second": 150.056, |
|
"eval_steps_per_second": 18.757, |
|
"step": 1600 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 15624, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.585434243497984e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |