|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.558665720038641, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.9869178533554077, |
|
"learning_rate": 9.99997860104108e-06, |
|
"loss": 1.4113, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.7985073924064636, |
|
"learning_rate": 9.999914404347487e-06, |
|
"loss": 1.3054, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5533589720726013, |
|
"learning_rate": 9.999807410468713e-06, |
|
"loss": 1.2698, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.527658224105835, |
|
"learning_rate": 9.999657620320587e-06, |
|
"loss": 1.1817, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5116757750511169, |
|
"learning_rate": 9.999465035185248e-06, |
|
"loss": 1.1681, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.40034493803977966, |
|
"learning_rate": 9.999229656711143e-06, |
|
"loss": 1.1709, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.41553106904029846, |
|
"learning_rate": 9.998951486913015e-06, |
|
"loss": 1.133, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.38187628984451294, |
|
"learning_rate": 9.998630528171882e-06, |
|
"loss": 1.0915, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.39334940910339355, |
|
"learning_rate": 9.998266783235018e-06, |
|
"loss": 1.1154, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.38754093647003174, |
|
"learning_rate": 9.997860255215926e-06, |
|
"loss": 1.0616, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3934289216995239, |
|
"learning_rate": 9.997410947594317e-06, |
|
"loss": 1.1031, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.44751420617103577, |
|
"learning_rate": 9.996918864216081e-06, |
|
"loss": 1.0547, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.4995039701461792, |
|
"learning_rate": 9.99638400929324e-06, |
|
"loss": 1.0634, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.519430935382843, |
|
"learning_rate": 9.995806387403935e-06, |
|
"loss": 1.0366, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.4063739478588104, |
|
"learning_rate": 9.995186003492366e-06, |
|
"loss": 1.0329, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.47834908962249756, |
|
"learning_rate": 9.994522862868763e-06, |
|
"loss": 1.0168, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.4666057825088501, |
|
"learning_rate": 9.993816971209332e-06, |
|
"loss": 1.044, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.5252623558044434, |
|
"learning_rate": 9.99306833455621e-06, |
|
"loss": 1.0311, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.41999587416648865, |
|
"learning_rate": 9.992276959317419e-06, |
|
"loss": 1.0273, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.4322344660758972, |
|
"learning_rate": 9.9914428522668e-06, |
|
"loss": 1.0174, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.44230663776397705, |
|
"learning_rate": 9.99056602054396e-06, |
|
"loss": 0.9849, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.48284780979156494, |
|
"learning_rate": 9.989646471654216e-06, |
|
"loss": 1.0219, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.478462815284729, |
|
"learning_rate": 9.988684213468521e-06, |
|
"loss": 1.01, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.4927475154399872, |
|
"learning_rate": 9.987679254223405e-06, |
|
"loss": 1.0199, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4281039237976074, |
|
"learning_rate": 9.986631602520904e-06, |
|
"loss": 1.0072, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5018512010574341, |
|
"learning_rate": 9.985541267328479e-06, |
|
"loss": 0.9704, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4972495138645172, |
|
"learning_rate": 9.98440825797894e-06, |
|
"loss": 0.9858, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5815178751945496, |
|
"learning_rate": 9.98323258417038e-06, |
|
"loss": 0.9752, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6393250823020935, |
|
"learning_rate": 9.982014255966078e-06, |
|
"loss": 1.0034, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5562496781349182, |
|
"learning_rate": 9.980753283794414e-06, |
|
"loss": 1.0004, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.48535385727882385, |
|
"learning_rate": 9.979449678448785e-06, |
|
"loss": 0.995, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6010926365852356, |
|
"learning_rate": 9.978103451087511e-06, |
|
"loss": 0.9669, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5188761949539185, |
|
"learning_rate": 9.976714613233736e-06, |
|
"loss": 0.978, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7140072584152222, |
|
"learning_rate": 9.975283176775332e-06, |
|
"loss": 0.9778, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5040972232818604, |
|
"learning_rate": 9.973809153964803e-06, |
|
"loss": 0.984, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.557113528251648, |
|
"learning_rate": 9.97229255741917e-06, |
|
"loss": 0.9871, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6509271860122681, |
|
"learning_rate": 9.970733400119865e-06, |
|
"loss": 0.9797, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6498264670372009, |
|
"learning_rate": 9.969131695412629e-06, |
|
"loss": 0.9665, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6095420718193054, |
|
"learning_rate": 9.967487457007382e-06, |
|
"loss": 0.9589, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5704444646835327, |
|
"learning_rate": 9.965800698978126e-06, |
|
"loss": 0.9666, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5877268314361572, |
|
"learning_rate": 9.964071435762802e-06, |
|
"loss": 0.9999, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5909533500671387, |
|
"learning_rate": 9.962299682163185e-06, |
|
"loss": 0.9381, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5712386965751648, |
|
"learning_rate": 9.96048545334475e-06, |
|
"loss": 0.972, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.737307608127594, |
|
"learning_rate": 9.95862876483654e-06, |
|
"loss": 0.9754, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.7787168025970459, |
|
"learning_rate": 9.956729632531032e-06, |
|
"loss": 1.0197, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5988175868988037, |
|
"learning_rate": 9.954788072684011e-06, |
|
"loss": 0.9596, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.981139600276947, |
|
"learning_rate": 9.952804101914418e-06, |
|
"loss": 0.9876, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6938815712928772, |
|
"learning_rate": 9.95077773720422e-06, |
|
"loss": 0.9533, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.671062707901001, |
|
"learning_rate": 9.948708995898251e-06, |
|
"loss": 0.9812, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6615872979164124, |
|
"learning_rate": 9.94659789570408e-06, |
|
"loss": 0.9522, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5831172466278076, |
|
"learning_rate": 9.94444445469184e-06, |
|
"loss": 0.9454, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6584324836730957, |
|
"learning_rate": 9.942248691294092e-06, |
|
"loss": 0.951, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6748599410057068, |
|
"learning_rate": 9.940010624305658e-06, |
|
"loss": 0.927, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5425313115119934, |
|
"learning_rate": 9.937730272883458e-06, |
|
"loss": 0.9542, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.551030158996582, |
|
"learning_rate": 9.93540765654635e-06, |
|
"loss": 0.933, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.743243396282196, |
|
"learning_rate": 9.933042795174964e-06, |
|
"loss": 0.9573, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6725818514823914, |
|
"learning_rate": 9.930635709011524e-06, |
|
"loss": 0.9745, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.743486225605011, |
|
"learning_rate": 9.928186418659693e-06, |
|
"loss": 0.9705, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6531779170036316, |
|
"learning_rate": 9.925694945084369e-06, |
|
"loss": 0.9526, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7471463680267334, |
|
"learning_rate": 9.923161309611534e-06, |
|
"loss": 0.9718, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6551310420036316, |
|
"learning_rate": 9.920585533928049e-06, |
|
"loss": 0.9738, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.835099458694458, |
|
"learning_rate": 9.917967640081482e-06, |
|
"loss": 0.9559, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.6788685321807861, |
|
"learning_rate": 9.915307650479915e-06, |
|
"loss": 0.9448, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.640802800655365, |
|
"learning_rate": 9.912605587891748e-06, |
|
"loss": 0.9632, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7521352767944336, |
|
"learning_rate": 9.909861475445517e-06, |
|
"loss": 0.9947, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.6148037314414978, |
|
"learning_rate": 9.907075336629679e-06, |
|
"loss": 0.8992, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7000744938850403, |
|
"learning_rate": 9.90424719529242e-06, |
|
"loss": 0.9978, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6232642531394958, |
|
"learning_rate": 9.901377075641457e-06, |
|
"loss": 0.9916, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6425381898880005, |
|
"learning_rate": 9.898465002243813e-06, |
|
"loss": 0.9478, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6344175934791565, |
|
"learning_rate": 9.89551100002563e-06, |
|
"loss": 0.9749, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.677696704864502, |
|
"learning_rate": 9.892515094271931e-06, |
|
"loss": 0.9815, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6174888610839844, |
|
"learning_rate": 9.889477310626426e-06, |
|
"loss": 0.9444, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7845975756645203, |
|
"learning_rate": 9.886397675091275e-06, |
|
"loss": 0.9751, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.844460666179657, |
|
"learning_rate": 9.883276214026877e-06, |
|
"loss": 0.9412, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6752843260765076, |
|
"learning_rate": 9.880112954151639e-06, |
|
"loss": 0.95, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7257810831069946, |
|
"learning_rate": 9.876907922541748e-06, |
|
"loss": 0.9468, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6597475409507751, |
|
"learning_rate": 9.87366114663094e-06, |
|
"loss": 0.9219, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6881552934646606, |
|
"learning_rate": 9.870372654210265e-06, |
|
"loss": 0.9571, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8410021662712097, |
|
"learning_rate": 9.867042473427848e-06, |
|
"loss": 0.9429, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6908501982688904, |
|
"learning_rate": 9.863670632788652e-06, |
|
"loss": 0.9618, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6102294921875, |
|
"learning_rate": 9.860257161154224e-06, |
|
"loss": 0.9539, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8168625831604004, |
|
"learning_rate": 9.856802087742463e-06, |
|
"loss": 0.9384, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6453863978385925, |
|
"learning_rate": 9.85330544212736e-06, |
|
"loss": 0.9341, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6028039455413818, |
|
"learning_rate": 9.849767254238741e-06, |
|
"loss": 0.95, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6181658506393433, |
|
"learning_rate": 9.846187554362026e-06, |
|
"loss": 0.9184, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7346929907798767, |
|
"learning_rate": 9.842566373137949e-06, |
|
"loss": 0.9714, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6204361915588379, |
|
"learning_rate": 9.83890374156232e-06, |
|
"loss": 0.9161, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7579419016838074, |
|
"learning_rate": 9.835199690985737e-06, |
|
"loss": 0.943, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6947230100631714, |
|
"learning_rate": 9.831454253113328e-06, |
|
"loss": 0.9092, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.887786865234375, |
|
"learning_rate": 9.827667460004487e-06, |
|
"loss": 0.9149, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7788590788841248, |
|
"learning_rate": 9.823839344072582e-06, |
|
"loss": 0.9187, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6010196208953857, |
|
"learning_rate": 9.81996993808469e-06, |
|
"loss": 0.9445, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6567085385322571, |
|
"learning_rate": 9.816059275161318e-06, |
|
"loss": 0.9492, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8992276191711426, |
|
"learning_rate": 9.812107388776113e-06, |
|
"loss": 0.8785, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6781167387962341, |
|
"learning_rate": 9.808114312755574e-06, |
|
"loss": 0.902, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6629174947738647, |
|
"learning_rate": 9.804080081278768e-06, |
|
"loss": 0.9416, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8017804026603699, |
|
"learning_rate": 9.800004728877042e-06, |
|
"loss": 0.9273, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6197686195373535, |
|
"learning_rate": 9.795888290433709e-06, |
|
"loss": 0.9335, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7496291995048523, |
|
"learning_rate": 9.791730801183772e-06, |
|
"loss": 0.9036, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6641804575920105, |
|
"learning_rate": 9.787532296713605e-06, |
|
"loss": 0.9768, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7570294737815857, |
|
"learning_rate": 9.78329281296066e-06, |
|
"loss": 0.9162, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7247979044914246, |
|
"learning_rate": 9.779012386213151e-06, |
|
"loss": 0.9261, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7128037214279175, |
|
"learning_rate": 9.77469105310975e-06, |
|
"loss": 0.9403, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7633296847343445, |
|
"learning_rate": 9.770328850639268e-06, |
|
"loss": 0.9184, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7282195687294006, |
|
"learning_rate": 9.76592581614034e-06, |
|
"loss": 0.9144, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7485966086387634, |
|
"learning_rate": 9.761481987301111e-06, |
|
"loss": 0.9307, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7050462961196899, |
|
"learning_rate": 9.756997402158904e-06, |
|
"loss": 0.9269, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.6218549609184265, |
|
"learning_rate": 9.752472099099897e-06, |
|
"loss": 0.9497, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.019995927810669, |
|
"learning_rate": 9.747906116858805e-06, |
|
"loss": 0.9218, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.6491205096244812, |
|
"learning_rate": 9.743299494518532e-06, |
|
"loss": 0.9447, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6476668119430542, |
|
"learning_rate": 9.738652271509846e-06, |
|
"loss": 0.9189, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6570378541946411, |
|
"learning_rate": 9.733964487611044e-06, |
|
"loss": 0.938, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6387359499931335, |
|
"learning_rate": 9.729236182947597e-06, |
|
"loss": 0.933, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5747265815734863, |
|
"learning_rate": 9.724467397991833e-06, |
|
"loss": 0.9155, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6549525260925293, |
|
"learning_rate": 9.719658173562558e-06, |
|
"loss": 0.9293, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6779614686965942, |
|
"learning_rate": 9.714808550824735e-06, |
|
"loss": 0.9377, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6967174410820007, |
|
"learning_rate": 9.709918571289114e-06, |
|
"loss": 0.9253, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7676512598991394, |
|
"learning_rate": 9.704988276811883e-06, |
|
"loss": 0.9454, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.655295193195343, |
|
"learning_rate": 9.70001770959431e-06, |
|
"loss": 0.9125, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7563107013702393, |
|
"learning_rate": 9.695006912182379e-06, |
|
"loss": 0.9405, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.8399559259414673, |
|
"learning_rate": 9.68995592746643e-06, |
|
"loss": 0.9427, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7510868906974792, |
|
"learning_rate": 9.684864798680789e-06, |
|
"loss": 0.9303, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.9126793146133423, |
|
"learning_rate": 9.679733569403398e-06, |
|
"loss": 0.9218, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6488207578659058, |
|
"learning_rate": 9.674562283555445e-06, |
|
"loss": 0.943, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.9493321776390076, |
|
"learning_rate": 9.669350985400979e-06, |
|
"loss": 0.9429, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7507259845733643, |
|
"learning_rate": 9.664099719546547e-06, |
|
"loss": 0.9041, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7052815556526184, |
|
"learning_rate": 9.658808530940794e-06, |
|
"loss": 0.9214, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7954320311546326, |
|
"learning_rate": 9.653477464874093e-06, |
|
"loss": 0.9178, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0493558645248413, |
|
"learning_rate": 9.648106566978149e-06, |
|
"loss": 0.9019, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.767572820186615, |
|
"learning_rate": 9.64269588322561e-06, |
|
"loss": 0.9551, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7655388712882996, |
|
"learning_rate": 9.637245459929678e-06, |
|
"loss": 0.9281, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2685116529464722, |
|
"learning_rate": 9.631755343743707e-06, |
|
"loss": 0.9443, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6689302325248718, |
|
"learning_rate": 9.626225581660802e-06, |
|
"loss": 0.9128, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.8155603408813477, |
|
"learning_rate": 9.620656221013428e-06, |
|
"loss": 0.9437, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7464457154273987, |
|
"learning_rate": 9.61504730947299e-06, |
|
"loss": 0.891, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.629400908946991, |
|
"learning_rate": 9.609398895049435e-06, |
|
"loss": 0.8955, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7425062656402588, |
|
"learning_rate": 9.60371102609084e-06, |
|
"loss": 0.8861, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6443877816200256, |
|
"learning_rate": 9.597983751282993e-06, |
|
"loss": 0.9051, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7057684063911438, |
|
"learning_rate": 9.592217119648982e-06, |
|
"loss": 0.8885, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.8697768449783325, |
|
"learning_rate": 9.586411180548771e-06, |
|
"loss": 0.914, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7217336893081665, |
|
"learning_rate": 9.580565983678784e-06, |
|
"loss": 0.9314, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.609015703201294, |
|
"learning_rate": 9.57468157907147e-06, |
|
"loss": 0.9134, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7643072605133057, |
|
"learning_rate": 9.568758017094884e-06, |
|
"loss": 0.9209, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6023324728012085, |
|
"learning_rate": 9.562795348452245e-06, |
|
"loss": 0.929, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6700206398963928, |
|
"learning_rate": 9.55679362418152e-06, |
|
"loss": 0.9361, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6590617895126343, |
|
"learning_rate": 9.550752895654963e-06, |
|
"loss": 0.8946, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6463683843612671, |
|
"learning_rate": 9.544673214578699e-06, |
|
"loss": 0.9069, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6586316823959351, |
|
"learning_rate": 9.538554632992265e-06, |
|
"loss": 0.9218, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6628296375274658, |
|
"learning_rate": 9.532397203268172e-06, |
|
"loss": 0.8845, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6902598142623901, |
|
"learning_rate": 9.526200978111452e-06, |
|
"loss": 0.9196, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6551186442375183, |
|
"learning_rate": 9.519966010559217e-06, |
|
"loss": 0.9147, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6625112295150757, |
|
"learning_rate": 9.513692353980186e-06, |
|
"loss": 0.912, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7705367207527161, |
|
"learning_rate": 9.507380062074252e-06, |
|
"loss": 0.9298, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6840114593505859, |
|
"learning_rate": 9.501029188872005e-06, |
|
"loss": 0.9199, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6210880875587463, |
|
"learning_rate": 9.49463978873427e-06, |
|
"loss": 0.9077, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7556228637695312, |
|
"learning_rate": 9.488211916351656e-06, |
|
"loss": 0.9089, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.735604465007782, |
|
"learning_rate": 9.481745626744071e-06, |
|
"loss": 0.9305, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7971885800361633, |
|
"learning_rate": 9.475240975260266e-06, |
|
"loss": 0.9319, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6014077067375183, |
|
"learning_rate": 9.468698017577344e-06, |
|
"loss": 0.9292, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7462084889411926, |
|
"learning_rate": 9.4621168097003e-06, |
|
"loss": 0.8942, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7277187705039978, |
|
"learning_rate": 9.455497407961533e-06, |
|
"loss": 0.8978, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7430045008659363, |
|
"learning_rate": 9.448839869020364e-06, |
|
"loss": 0.9175, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.693151593208313, |
|
"learning_rate": 9.442144249862555e-06, |
|
"loss": 0.9416, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6412799954414368, |
|
"learning_rate": 9.435410607799818e-06, |
|
"loss": 0.8838, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.8170580863952637, |
|
"learning_rate": 9.428639000469326e-06, |
|
"loss": 0.924, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.699052095413208, |
|
"learning_rate": 9.421829485833214e-06, |
|
"loss": 0.9344, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7370940446853638, |
|
"learning_rate": 9.414982122178095e-06, |
|
"loss": 0.935, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7777985334396362, |
|
"learning_rate": 9.40809696811455e-06, |
|
"loss": 0.933, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7101726531982422, |
|
"learning_rate": 9.40117408257663e-06, |
|
"loss": 0.9002, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6398364305496216, |
|
"learning_rate": 9.394213524821351e-06, |
|
"loss": 0.9449, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.8372628688812256, |
|
"learning_rate": 9.387215354428192e-06, |
|
"loss": 0.916, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6209415793418884, |
|
"learning_rate": 9.380179631298573e-06, |
|
"loss": 0.9391, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6791089177131653, |
|
"learning_rate": 9.373106415655359e-06, |
|
"loss": 0.8643, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.681890070438385, |
|
"learning_rate": 9.365995768042327e-06, |
|
"loss": 0.9468, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7921962738037109, |
|
"learning_rate": 9.35884774932366e-06, |
|
"loss": 0.9127, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7955833077430725, |
|
"learning_rate": 9.351662420683421e-06, |
|
"loss": 0.9091, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7748980522155762, |
|
"learning_rate": 9.344439843625034e-06, |
|
"loss": 0.897, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.8075698614120483, |
|
"learning_rate": 9.337180079970747e-06, |
|
"loss": 0.9111, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6480504274368286, |
|
"learning_rate": 9.329883191861116e-06, |
|
"loss": 0.9003, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6438207626342773, |
|
"learning_rate": 9.322549241754465e-06, |
|
"loss": 0.8922, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7411364912986755, |
|
"learning_rate": 9.31517829242635e-06, |
|
"loss": 0.8988, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.864782989025116, |
|
"learning_rate": 9.307770406969032e-06, |
|
"loss": 0.9395, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7258287668228149, |
|
"learning_rate": 9.300325648790922e-06, |
|
"loss": 0.9025, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7146714925765991, |
|
"learning_rate": 9.29284408161605e-06, |
|
"loss": 0.9223, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6053746342658997, |
|
"learning_rate": 9.285325769483517e-06, |
|
"loss": 0.9332, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8638799786567688, |
|
"learning_rate": 9.277770776746944e-06, |
|
"loss": 0.942, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8022944927215576, |
|
"learning_rate": 9.270179168073923e-06, |
|
"loss": 0.909, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7210090756416321, |
|
"learning_rate": 9.26255100844546e-06, |
|
"loss": 0.9314, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.650475263595581, |
|
"learning_rate": 9.254886363155429e-06, |
|
"loss": 0.9355, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8607627153396606, |
|
"learning_rate": 9.247185297809997e-06, |
|
"loss": 0.9138, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.9340272545814514, |
|
"learning_rate": 9.239447878327081e-06, |
|
"loss": 0.9417, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7494912147521973, |
|
"learning_rate": 9.231674170935767e-06, |
|
"loss": 0.8857, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7654171586036682, |
|
"learning_rate": 9.223864242175756e-06, |
|
"loss": 0.8908, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7642711400985718, |
|
"learning_rate": 9.21601815889678e-06, |
|
"loss": 0.8765, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.696348249912262, |
|
"learning_rate": 9.20813598825805e-06, |
|
"loss": 0.9323, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6663697957992554, |
|
"learning_rate": 9.200217797727663e-06, |
|
"loss": 0.8775, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7656813859939575, |
|
"learning_rate": 9.192263655082033e-06, |
|
"loss": 0.8997, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6377949714660645, |
|
"learning_rate": 9.184273628405304e-06, |
|
"loss": 0.8842, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7093709707260132, |
|
"learning_rate": 9.176247786088783e-06, |
|
"loss": 0.9002, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.875128984451294, |
|
"learning_rate": 9.168186196830336e-06, |
|
"loss": 0.9209, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7550728917121887, |
|
"learning_rate": 9.16008892963381e-06, |
|
"loss": 0.9035, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.0108016729354858, |
|
"learning_rate": 9.15195605380844e-06, |
|
"loss": 0.875, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7439371943473816, |
|
"learning_rate": 9.143787638968255e-06, |
|
"loss": 0.8722, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6964495182037354, |
|
"learning_rate": 9.135583755031486e-06, |
|
"loss": 0.8726, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.9313227534294128, |
|
"learning_rate": 9.127344472219964e-06, |
|
"loss": 0.9107, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4056764841079712, |
|
"learning_rate": 9.119069861058516e-06, |
|
"loss": 0.9061, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.680668830871582, |
|
"learning_rate": 9.110759992374369e-06, |
|
"loss": 0.9099, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7828279733657837, |
|
"learning_rate": 9.102414937296542e-06, |
|
"loss": 0.8803, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6008317470550537, |
|
"learning_rate": 9.094034767255225e-06, |
|
"loss": 0.8987, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8024834990501404, |
|
"learning_rate": 9.085619553981186e-06, |
|
"loss": 0.9235, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8021408915519714, |
|
"learning_rate": 9.07716936950515e-06, |
|
"loss": 0.8767, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7997848391532898, |
|
"learning_rate": 9.068684286157174e-06, |
|
"loss": 0.8941, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.8140115141868591, |
|
"learning_rate": 9.060164376566037e-06, |
|
"loss": 0.8886, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7906537652015686, |
|
"learning_rate": 9.051609713658619e-06, |
|
"loss": 0.8951, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6860573291778564, |
|
"learning_rate": 9.04302037065927e-06, |
|
"loss": 0.8922, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7657009363174438, |
|
"learning_rate": 9.034396421089192e-06, |
|
"loss": 0.9203, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.9350467920303345, |
|
"learning_rate": 9.025737938765803e-06, |
|
"loss": 0.9592, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8163906931877136, |
|
"learning_rate": 9.0170449978021e-06, |
|
"loss": 0.9156, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6962751150131226, |
|
"learning_rate": 9.008317672606044e-06, |
|
"loss": 0.8692, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6299543976783752, |
|
"learning_rate": 8.9995560378799e-06, |
|
"loss": 0.9052, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6928566694259644, |
|
"learning_rate": 8.990760168619616e-06, |
|
"loss": 0.9231, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8014935851097107, |
|
"learning_rate": 8.981930140114167e-06, |
|
"loss": 0.9165, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7766875624656677, |
|
"learning_rate": 8.97306602794492e-06, |
|
"loss": 0.9003, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7637115120887756, |
|
"learning_rate": 8.964167907984989e-06, |
|
"loss": 0.8775, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8758956789970398, |
|
"learning_rate": 8.955235856398568e-06, |
|
"loss": 0.8945, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8250870704650879, |
|
"learning_rate": 8.946269949640306e-06, |
|
"loss": 0.8865, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6754716634750366, |
|
"learning_rate": 8.937270264454624e-06, |
|
"loss": 0.8994, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6988954544067383, |
|
"learning_rate": 8.928236877875084e-06, |
|
"loss": 0.9221, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7391093373298645, |
|
"learning_rate": 8.919169867223713e-06, |
|
"loss": 0.9124, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7323642373085022, |
|
"learning_rate": 8.910069310110346e-06, |
|
"loss": 0.887, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.9379021525382996, |
|
"learning_rate": 8.900935284431962e-06, |
|
"loss": 0.8925, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7654469609260559, |
|
"learning_rate": 8.891767868372016e-06, |
|
"loss": 0.9569, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6094922423362732, |
|
"learning_rate": 8.882567140399775e-06, |
|
"loss": 0.878, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7786396741867065, |
|
"learning_rate": 8.873333179269635e-06, |
|
"loss": 0.8933, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6561703681945801, |
|
"learning_rate": 8.864066064020462e-06, |
|
"loss": 0.8821, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6152118444442749, |
|
"learning_rate": 8.854765873974898e-06, |
|
"loss": 0.8905, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.8001115322113037, |
|
"learning_rate": 8.845432688738703e-06, |
|
"loss": 0.9002, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.709892988204956, |
|
"learning_rate": 8.836066588200052e-06, |
|
"loss": 0.887, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6732577085494995, |
|
"learning_rate": 8.826667652528866e-06, |
|
"loss": 0.8866, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.9450706839561462, |
|
"learning_rate": 8.817235962176121e-06, |
|
"loss": 0.8808, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.9471026659011841, |
|
"learning_rate": 8.807771597873159e-06, |
|
"loss": 0.8943, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7772449851036072, |
|
"learning_rate": 8.798274640630997e-06, |
|
"loss": 0.8859, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.8146265745162964, |
|
"learning_rate": 8.788745171739632e-06, |
|
"loss": 0.8869, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7327932715415955, |
|
"learning_rate": 8.779183272767353e-06, |
|
"loss": 0.874, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6848121881484985, |
|
"learning_rate": 8.76958902556003e-06, |
|
"loss": 0.8677, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.679011881351471, |
|
"learning_rate": 8.759962512240426e-06, |
|
"loss": 0.9292, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.8341182470321655, |
|
"learning_rate": 8.750303815207487e-06, |
|
"loss": 0.9098, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.874434769153595, |
|
"learning_rate": 8.740613017135634e-06, |
|
"loss": 0.9081, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7139870524406433, |
|
"learning_rate": 8.730890200974065e-06, |
|
"loss": 0.896, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7836432456970215, |
|
"learning_rate": 8.721135449946037e-06, |
|
"loss": 0.8939, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6674822568893433, |
|
"learning_rate": 8.711348847548157e-06, |
|
"loss": 0.9039, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6287670135498047, |
|
"learning_rate": 8.701530477549666e-06, |
|
"loss": 0.8786, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7027100324630737, |
|
"learning_rate": 8.691680423991722e-06, |
|
"loss": 0.8887, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.713030993938446, |
|
"learning_rate": 8.681798771186683e-06, |
|
"loss": 0.9048, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7505326867103577, |
|
"learning_rate": 8.67188560371738e-06, |
|
"loss": 0.868, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6765055060386658, |
|
"learning_rate": 8.661941006436401e-06, |
|
"loss": 0.8686, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7267132997512817, |
|
"learning_rate": 8.651965064465355e-06, |
|
"loss": 0.9243, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6484735608100891, |
|
"learning_rate": 8.641957863194153e-06, |
|
"loss": 0.8846, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6731644868850708, |
|
"learning_rate": 8.631919488280267e-06, |
|
"loss": 0.914, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6963881850242615, |
|
"learning_rate": 8.621850025648008e-06, |
|
"loss": 0.8604, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.9081583619117737, |
|
"learning_rate": 8.611749561487785e-06, |
|
"loss": 0.9331, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7570102214813232, |
|
"learning_rate": 8.601618182255364e-06, |
|
"loss": 0.8996, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.820863664150238, |
|
"learning_rate": 8.59145597467113e-06, |
|
"loss": 0.8707, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.8625741004943848, |
|
"learning_rate": 8.581263025719352e-06, |
|
"loss": 0.8806, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.1321467161178589, |
|
"learning_rate": 8.571039422647423e-06, |
|
"loss": 0.9222, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7913120985031128, |
|
"learning_rate": 8.560785252965131e-06, |
|
"loss": 0.9212, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7801851034164429, |
|
"learning_rate": 8.5505006044439e-06, |
|
"loss": 0.9001, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.689361035823822, |
|
"learning_rate": 8.540185565116034e-06, |
|
"loss": 0.9105, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6706151366233826, |
|
"learning_rate": 8.52984022327398e-06, |
|
"loss": 0.8955, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8290680050849915, |
|
"learning_rate": 8.51946466746955e-06, |
|
"loss": 0.9029, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.74539715051651, |
|
"learning_rate": 8.509058986513185e-06, |
|
"loss": 0.9072, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6989156603813171, |
|
"learning_rate": 8.498623269473178e-06, |
|
"loss": 0.9111, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.8335017561912537, |
|
"learning_rate": 8.488157605674924e-06, |
|
"loss": 0.9043, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.8350189924240112, |
|
"learning_rate": 8.477662084700145e-06, |
|
"loss": 0.9496, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7352144122123718, |
|
"learning_rate": 8.46713679638613e-06, |
|
"loss": 0.8805, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.8180497884750366, |
|
"learning_rate": 8.456581830824966e-06, |
|
"loss": 0.8715, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8565731644630432, |
|
"learning_rate": 8.445997278362759e-06, |
|
"loss": 0.881, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7524616122245789, |
|
"learning_rate": 8.435383229598872e-06, |
|
"loss": 0.873, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8238971829414368, |
|
"learning_rate": 8.424739775385145e-06, |
|
"loss": 0.8766, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7271744608879089, |
|
"learning_rate": 8.414067006825108e-06, |
|
"loss": 0.8659, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6863881945610046, |
|
"learning_rate": 8.403365015273223e-06, |
|
"loss": 0.9122, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7746860384941101, |
|
"learning_rate": 8.392633892334074e-06, |
|
"loss": 0.8661, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7054237723350525, |
|
"learning_rate": 8.381873729861609e-06, |
|
"loss": 0.8884, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6736589074134827, |
|
"learning_rate": 8.371084619958337e-06, |
|
"loss": 0.8776, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.671820878982544, |
|
"learning_rate": 8.360266654974543e-06, |
|
"loss": 0.866, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.8050073981285095, |
|
"learning_rate": 8.349419927507505e-06, |
|
"loss": 0.8884, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.8177492618560791, |
|
"learning_rate": 8.338544530400693e-06, |
|
"loss": 0.8678, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8012502193450928, |
|
"learning_rate": 8.327640556742978e-06, |
|
"loss": 0.8792, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6939083337783813, |
|
"learning_rate": 8.316708099867834e-06, |
|
"loss": 0.8928, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7548695206642151, |
|
"learning_rate": 8.305747253352534e-06, |
|
"loss": 0.8564, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7304048538208008, |
|
"learning_rate": 8.294758111017367e-06, |
|
"loss": 0.8836, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8671132922172546, |
|
"learning_rate": 8.28374076692481e-06, |
|
"loss": 0.8864, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7430797219276428, |
|
"learning_rate": 8.272695315378744e-06, |
|
"loss": 0.874, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6934476494789124, |
|
"learning_rate": 8.261621850923634e-06, |
|
"loss": 0.8583, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7404267191886902, |
|
"learning_rate": 8.250520468343722e-06, |
|
"loss": 0.8734, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6465054154396057, |
|
"learning_rate": 8.239391262662221e-06, |
|
"loss": 0.8734, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6825598478317261, |
|
"learning_rate": 8.228234329140498e-06, |
|
"loss": 0.8666, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6966167688369751, |
|
"learning_rate": 8.217049763277257e-06, |
|
"loss": 0.8823, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.8441101908683777, |
|
"learning_rate": 8.205837660807726e-06, |
|
"loss": 0.8876, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.78465336561203, |
|
"learning_rate": 8.194598117702828e-06, |
|
"loss": 0.8991, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10738, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"total_flos": 1.0789441600455967e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|