{
  "best_metric": 1.4924039840698242,
  "best_model_checkpoint": "./output/checkpoint-3900",
  "epoch": 0.34580599397056216,
  "eval_steps": 150,
  "global_step": 3900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0008866820358219542, "grad_norm": 12.221611976623535, "learning_rate": 1.0000000000000004e-05, "loss": 1.5409, "step": 10 },
    { "epoch": 0.0017733640716439084, "grad_norm": 10.172511100769043, "learning_rate": 2.000000000000001e-05, "loss": 1.5034, "step": 20 },
    { "epoch": 0.0026600461074658627, "grad_norm": 7.491011619567871, "learning_rate": 3.0000000000000008e-05, "loss": 1.5275, "step": 30 },
    { "epoch": 0.003546728143287817, "grad_norm": 8.695981979370117, "learning_rate": 4.000000000000002e-05, "loss": 1.6105, "step": 40 },
    { "epoch": 0.004433410179109771, "grad_norm": 9.496545791625977, "learning_rate": 5.0000000000000016e-05, "loss": 1.5488, "step": 50 },
    { "epoch": 0.005320092214931725, "grad_norm": 8.968175888061523, "learning_rate": 6.0000000000000015e-05, "loss": 1.5128, "step": 60 },
    { "epoch": 0.00620677425075368, "grad_norm": 9.554357528686523, "learning_rate": 7.000000000000002e-05, "loss": 1.5392, "step": 70 },
    { "epoch": 0.007093456286575634, "grad_norm": 10.577646255493164, "learning_rate": 8.000000000000003e-05, "loss": 1.7024, "step": 80 },
    { "epoch": 0.007980138322397589, "grad_norm": 7.967463493347168, "learning_rate": 9.000000000000003e-05, "loss": 1.6036, "step": 90 },
    { "epoch": 0.008866820358219542, "grad_norm": 10.403460502624512, "learning_rate": 0.00010000000000000003, "loss": 1.7425, "step": 100 },
    { "epoch": 0.009753502394041496, "grad_norm": 7.898177623748779, "learning_rate": 9.999897234791833e-05, "loss": 1.6833, "step": 110 },
    { "epoch": 0.01064018442986345, "grad_norm": 9.69663143157959, "learning_rate": 9.9995889433916e-05, "loss": 1.5979, "step": 120 },
    { "epoch": 0.011526866465685405, "grad_norm": 10.984465599060059, "learning_rate": 9.999075138471954e-05, "loss": 1.8234, "step": 130 },
    { "epoch": 0.01241354850150736, "grad_norm": 7.519535541534424, "learning_rate": 9.998355841153402e-05, "loss": 1.5598, "step": 140 },
    { "epoch": 0.013300230537329314, "grad_norm": 10.765931129455566, "learning_rate": 9.997431081003442e-05, "loss": 1.6027, "step": 150 },
    { "epoch": 0.013300230537329314, "eval_loss": 1.7432395219802856, "eval_runtime": 59.4356, "eval_samples_per_second": 8.412, "eval_steps_per_second": 8.412, "step": 150 },
    { "epoch": 0.014186912573151267, "grad_norm": 7.04581356048584, "learning_rate": 9.996300896035342e-05, "loss": 1.625, "step": 160 },
    { "epoch": 0.015073594608973222, "grad_norm": 10.549965858459473, "learning_rate": 9.994965332706576e-05, "loss": 1.7696, "step": 170 },
    { "epoch": 0.015960276644795178, "grad_norm": 17.5423641204834, "learning_rate": 9.993424445916925e-05, "loss": 1.6977, "step": 180 },
    { "epoch": 0.01684695868061713, "grad_norm": 15.295858383178711, "learning_rate": 9.991678299006208e-05, "loss": 1.7496, "step": 190 },
    { "epoch": 0.017733640716439084, "grad_norm": 14.050594329833984, "learning_rate": 9.989726963751685e-05, "loss": 1.6698, "step": 200 },
    { "epoch": 0.01862032275226104, "grad_norm": 9.636832237243652, "learning_rate": 9.987570520365106e-05, "loss": 1.7278, "step": 210 },
    { "epoch": 0.019507004788082993, "grad_norm": 10.37330150604248, "learning_rate": 9.985209057489412e-05, "loss": 1.6985, "step": 220 },
    { "epoch": 0.02039368682390495, "grad_norm": 8.965282440185547, "learning_rate": 9.982642672195095e-05, "loss": 1.652, "step": 230 },
    { "epoch": 0.0212803688597269, "grad_norm": 11.881516456604004, "learning_rate": 9.979871469976199e-05, "loss": 1.6963, "step": 240 },
    { "epoch": 0.022167050895548854, "grad_norm": 8.806026458740234, "learning_rate": 9.976895564745994e-05, "loss": 1.6385, "step": 250 },
    { "epoch": 0.02305373293137081, "grad_norm": 8.204315185546875, "learning_rate": 9.97371507883229e-05, "loss": 1.8237, "step": 260 },
    { "epoch": 0.023940414967192764, "grad_norm": 7.4131879806518555, "learning_rate": 9.970330142972404e-05, "loss": 1.6853, "step": 270 },
    { "epoch": 0.02482709700301472, "grad_norm": 8.07067584991455, "learning_rate": 9.966740896307794e-05, "loss": 1.6961, "step": 280 },
    { "epoch": 0.025713779038836673, "grad_norm": 10.194974899291992, "learning_rate": 9.962947486378328e-05, "loss": 1.6454, "step": 290 },
    { "epoch": 0.02660046107465863, "grad_norm": 9.990214347839355, "learning_rate": 9.958950069116233e-05, "loss": 1.7213, "step": 300 },
    { "epoch": 0.02660046107465863, "eval_loss": 1.710451364517212, "eval_runtime": 59.6513, "eval_samples_per_second": 8.382, "eval_steps_per_second": 8.382, "step": 300 },
    { "epoch": 0.02748714311048058, "grad_norm": 9.65303897857666, "learning_rate": 9.954748808839677e-05, "loss": 1.6927, "step": 310 },
    { "epoch": 0.028373825146302534, "grad_norm": 11.48330307006836, "learning_rate": 9.950343878246013e-05, "loss": 1.6804, "step": 320 },
    { "epoch": 0.02926050718212449, "grad_norm": 8.232380867004395, "learning_rate": 9.945735458404684e-05, "loss": 1.6682, "step": 330 },
    { "epoch": 0.030147189217946443, "grad_norm": 8.22367000579834, "learning_rate": 9.940923738749781e-05, "loss": 1.705, "step": 340 },
    { "epoch": 0.0310338712537684, "grad_norm": 8.59586238861084, "learning_rate": 9.935908917072255e-05, "loss": 1.7646, "step": 350 },
    { "epoch": 0.031920553289590356, "grad_norm": 11.29736328125, "learning_rate": 9.930691199511777e-05, "loss": 1.5999, "step": 360 },
    { "epoch": 0.032807235325412305, "grad_norm": 7.492193222045898, "learning_rate": 9.925270800548287e-05, "loss": 1.7898, "step": 370 },
    { "epoch": 0.03369391736123426, "grad_norm": 9.845335006713867, "learning_rate": 9.919647942993151e-05, "loss": 1.7107, "step": 380 },
    { "epoch": 0.03458059939705622, "grad_norm": 6.399631500244141, "learning_rate": 9.913822857980023e-05, "loss": 1.8382, "step": 390 },
    { "epoch": 0.03546728143287817, "grad_norm": 14.003453254699707, "learning_rate": 9.90779578495533e-05, "loss": 1.5307, "step": 400 },
    { "epoch": 0.03635396346870012, "grad_norm": 7.186993598937988, "learning_rate": 9.90156697166844e-05, "loss": 1.6269, "step": 410 },
    { "epoch": 0.03724064550452208, "grad_norm": 10.279706001281738, "learning_rate": 9.895136674161468e-05, "loss": 1.7234, "step": 420 },
    { "epoch": 0.038127327540344036, "grad_norm": 15.368351936340332, "learning_rate": 9.888505156758762e-05, "loss": 1.6987, "step": 430 },
    { "epoch": 0.039014009576165985, "grad_norm": 12.989547729492188, "learning_rate": 9.881672692056024e-05, "loss": 1.7605, "step": 440 },
    { "epoch": 0.03990069161198794, "grad_norm": 10.625384330749512, "learning_rate": 9.87463956090912e-05, "loss": 1.7047, "step": 450 },
    { "epoch": 0.03990069161198794, "eval_loss": 1.7071540355682373, "eval_runtime": 59.4581, "eval_samples_per_second": 8.409, "eval_steps_per_second": 8.409, "step": 450 },
    { "epoch": 0.0407873736478099, "grad_norm": 13.621452331542969, "learning_rate": 9.867406052422526e-05, "loss": 1.6615, "step": 460 },
    { "epoch": 0.04167405568363185, "grad_norm": 11.978480339050293, "learning_rate": 9.859972463937443e-05, "loss": 1.712, "step": 470 },
    { "epoch": 0.0425607377194538, "grad_norm": 8.805001258850098, "learning_rate": 9.852339101019577e-05, "loss": 1.6285, "step": 480 },
    { "epoch": 0.04344741975527576, "grad_norm": 6.610208034515381, "learning_rate": 9.84450627744658e-05, "loss": 1.6863, "step": 490 },
    { "epoch": 0.04433410179109771, "grad_norm": 8.630401611328125, "learning_rate": 9.83647431519515e-05, "loss": 1.7646, "step": 500 },
    { "epoch": 0.045220783826919665, "grad_norm": 9.726890563964844, "learning_rate": 9.828243544427798e-05, "loss": 1.7782, "step": 510 },
    { "epoch": 0.04610746586274162, "grad_norm": 9.516313552856445, "learning_rate": 9.81981430347927e-05, "loss": 1.7227, "step": 520 },
    { "epoch": 0.04699414789856358, "grad_norm": 8.171401023864746, "learning_rate": 9.811186938842648e-05, "loss": 1.6726, "step": 530 },
    { "epoch": 0.04788082993438553, "grad_norm": 9.201889991760254, "learning_rate": 9.8023618051551e-05, "loss": 1.4651, "step": 540 },
    { "epoch": 0.04876751197020748, "grad_norm": 8.409120559692383, "learning_rate": 9.793339265183306e-05, "loss": 1.5729, "step": 550 },
    { "epoch": 0.04965419400602944, "grad_norm": 8.624395370483398, "learning_rate": 9.784119689808547e-05, "loss": 1.7279, "step": 560 },
    { "epoch": 0.05054087604185139, "grad_norm": 9.849170684814453, "learning_rate": 9.774703458011455e-05, "loss": 1.7016, "step": 570 },
    { "epoch": 0.051427558077673345, "grad_norm": 10.26559829711914, "learning_rate": 9.765090956856439e-05, "loss": 1.6263, "step": 580 },
    { "epoch": 0.0523142401134953, "grad_norm": 8.199737548828125, "learning_rate": 9.755282581475771e-05, "loss": 1.6802, "step": 590 },
    { "epoch": 0.05320092214931726, "grad_norm": 8.513672828674316, "learning_rate": 9.745278735053346e-05, "loss": 1.8307, "step": 600 },
    { "epoch": 0.05320092214931726, "eval_loss": 1.7056752443313599, "eval_runtime": 59.3846, "eval_samples_per_second": 8.42, "eval_steps_per_second": 8.42, "step": 600 },
    { "epoch": 0.05408760418513921, "grad_norm": 6.43255615234375, "learning_rate": 9.73507982880811e-05, "loss": 1.6386, "step": 610 },
    { "epoch": 0.05497428622096116, "grad_norm": 7.917320728302002, "learning_rate": 9.724686281977149e-05, "loss": 1.4825, "step": 620 },
    { "epoch": 0.05586096825678312, "grad_norm": 7.494490146636963, "learning_rate": 9.714098521798468e-05, "loss": 1.6621, "step": 630 },
    { "epoch": 0.05674765029260507, "grad_norm": 7.454612731933594, "learning_rate": 9.703316983493417e-05, "loss": 1.6406, "step": 640 },
    { "epoch": 0.057634332328427025, "grad_norm": 8.165127754211426, "learning_rate": 9.692342110248805e-05, "loss": 1.7403, "step": 650 },
    { "epoch": 0.05852101436424898, "grad_norm": 7.547806739807129, "learning_rate": 9.68117435319869e-05, "loss": 1.8081, "step": 660 },
    { "epoch": 0.05940769640007094, "grad_norm": 6.669929504394531, "learning_rate": 9.669814171405819e-05, "loss": 1.5977, "step": 670 },
    { "epoch": 0.06029437843589289, "grad_norm": 10.013628005981445, "learning_rate": 9.658262031842773e-05, "loss": 1.7354, "step": 680 },
    { "epoch": 0.06118106047171484, "grad_norm": 7.262237071990967, "learning_rate": 9.646518409372763e-05, "loss": 1.73, "step": 690 },
    { "epoch": 0.0620677425075368, "grad_norm": 10.55190658569336, "learning_rate": 9.634583786730112e-05, "loss": 1.5776, "step": 700 },
    { "epoch": 0.06295442454335876, "grad_norm": 6.680520534515381, "learning_rate": 9.62245865450041e-05, "loss": 1.6498, "step": 710 },
    { "epoch": 0.06384110657918071, "grad_norm": 8.451497077941895, "learning_rate": 9.610143511100356e-05, "loss": 1.5667, "step": 720 },
    { "epoch": 0.06472778861500265, "grad_norm": 6.46457576751709, "learning_rate": 9.597638862757256e-05, "loss": 1.6396, "step": 730 },
    { "epoch": 0.06561447065082461, "grad_norm": 7.483574867248535, "learning_rate": 9.58494522348823e-05, "loss": 1.7103, "step": 740 },
    { "epoch": 0.06650115268664657, "grad_norm": 7.17427921295166, "learning_rate": 9.572063115079066e-05, "loss": 1.7465, "step": 750 },
    { "epoch": 0.06650115268664657, "eval_loss": 1.7024177312850952, "eval_runtime": 59.3886, "eval_samples_per_second": 8.419, "eval_steps_per_second": 8.419, "step": 750 },
    { "epoch": 0.06738783472246852, "grad_norm": 10.202959060668945, "learning_rate": 9.558993067062788e-05, "loss": 1.5595, "step": 760 },
    { "epoch": 0.06827451675829048, "grad_norm": 8.259611129760742, "learning_rate": 9.545735616697878e-05, "loss": 1.6273, "step": 770 },
    { "epoch": 0.06916119879411244, "grad_norm": 10.105010032653809, "learning_rate": 9.532291308946193e-05, "loss": 1.6441, "step": 780 },
    { "epoch": 0.07004788082993439, "grad_norm": 11.567327499389648, "learning_rate": 9.518660696450571e-05, "loss": 1.7721, "step": 790 },
    { "epoch": 0.07093456286575633, "grad_norm": 7.6176838874816895, "learning_rate": 9.504844339512098e-05, "loss": 1.5159, "step": 800 },
    { "epoch": 0.07182124490157829, "grad_norm": 8.693132400512695, "learning_rate": 9.490842806067098e-05, "loss": 1.7085, "step": 810 },
    { "epoch": 0.07270792693740025, "grad_norm": 8.85315227508545, "learning_rate": 9.476656671663768e-05, "loss": 1.6169, "step": 820 },
    { "epoch": 0.0735946089732222, "grad_norm": 8.727082252502441, "learning_rate": 9.462286519438532e-05, "loss": 1.5426, "step": 830 },
    { "epoch": 0.07448129100904416, "grad_norm": 9.754756927490234, "learning_rate": 9.447732940092063e-05, "loss": 1.5564, "step": 840 },
    { "epoch": 0.07536797304486612, "grad_norm": 9.421085357666016, "learning_rate": 9.432996531865004e-05, "loss": 1.6749, "step": 850 },
    { "epoch": 0.07625465508068807, "grad_norm": 8.565560340881348, "learning_rate": 9.418077900513379e-05, "loss": 1.5869, "step": 860 },
    { "epoch": 0.07714133711651001, "grad_norm": 6.376341342926025, "learning_rate": 9.402977659283693e-05, "loss": 1.6504, "step": 870 },
    { "epoch": 0.07802801915233197, "grad_norm": 8.475419044494629, "learning_rate": 9.387696428887718e-05, "loss": 1.6349, "step": 880 },
    { "epoch": 0.07891470118815393, "grad_norm": 8.961620330810547, "learning_rate": 9.372234837476981e-05, "loss": 1.6964, "step": 890 },
    { "epoch": 0.07980138322397588, "grad_norm": 10.981645584106445, "learning_rate": 9.35659352061695e-05, "loss": 1.8223, "step": 900 },
    { "epoch": 0.07980138322397588, "eval_loss": 1.7124860286712646, "eval_runtime": 59.2388, "eval_samples_per_second": 8.44, "eval_steps_per_second": 8.44, "step": 900 },
    { "epoch": 0.08068806525979784, "grad_norm": 7.960820198059082, "learning_rate": 9.340773121260896e-05, "loss": 1.7361, "step": 910 },
    { "epoch": 0.0815747472956198, "grad_norm": 7.7724785804748535, "learning_rate": 9.32477428972347e-05, "loss": 1.7118, "step": 920 },
    { "epoch": 0.08246142933144175, "grad_norm": 12.896247863769531, "learning_rate": 9.308597683653978e-05, "loss": 1.6849, "step": 930 },
    { "epoch": 0.0833481113672637, "grad_norm": 10.572025299072266, "learning_rate": 9.292243968009333e-05, "loss": 1.7373, "step": 940 },
    { "epoch": 0.08423479340308565, "grad_norm": 15.746474266052246, "learning_rate": 9.275713815026734e-05, "loss": 1.5829, "step": 950 },
    { "epoch": 0.0851214754389076, "grad_norm": 8.268464088439941, "learning_rate": 9.259007904196024e-05, "loss": 1.659, "step": 960 },
    { "epoch": 0.08600815747472956, "grad_norm": 6.80485200881958, "learning_rate": 9.242126922231766e-05, "loss": 1.5132, "step": 970 },
    { "epoch": 0.08689483951055152, "grad_norm": 9.059598922729492, "learning_rate": 9.225071563045009e-05, "loss": 1.5896, "step": 980 },
    { "epoch": 0.08778152154637348, "grad_norm": 6.6695170402526855, "learning_rate": 9.207842527714768e-05, "loss": 1.7439, "step": 990 },
    { "epoch": 0.08866820358219542, "grad_norm": 8.449771881103516, "learning_rate": 9.190440524459205e-05, "loss": 1.9, "step": 1000 },
    { "epoch": 0.08955488561801737, "grad_norm": 45.434688568115234, "learning_rate": 9.172866268606516e-05, "loss": 1.6689, "step": 1010 },
    { "epoch": 0.09044156765383933, "grad_norm": 7.402377605438232, "learning_rate": 9.155120482565522e-05, "loss": 1.7466, "step": 1020 },
    { "epoch": 0.09132824968966129, "grad_norm": 8.143401145935059, "learning_rate": 9.137203895795986e-05, "loss": 1.5923, "step": 1030 },
    { "epoch": 0.09221493172548324, "grad_norm": 7.547423839569092, "learning_rate": 9.11911724477861e-05, "loss": 1.6487, "step": 1040 },
    { "epoch": 0.0931016137613052, "grad_norm": 6.291411399841309, "learning_rate": 9.100861272984782e-05, "loss": 1.7453, "step": 1050 },
    { "epoch": 0.0931016137613052, "eval_loss": 1.6903132200241089, "eval_runtime": 59.3722, "eval_samples_per_second": 8.421, "eval_steps_per_second": 8.421, "step": 1050 },
    { "epoch": 0.09398829579712716, "grad_norm": 6.51155948638916, "learning_rate": 9.082436730845996e-05, "loss": 1.5121, "step": 1060 },
    { "epoch": 0.0948749778329491, "grad_norm": 7.315041542053223, "learning_rate": 9.063844375723016e-05, "loss": 1.45, "step": 1070 },
    { "epoch": 0.09576165986877105, "grad_norm": 9.287749290466309, "learning_rate": 9.045084971874741e-05, "loss": 1.6892, "step": 1080 },
    { "epoch": 0.09664834190459301, "grad_norm": 7.6157097816467285, "learning_rate": 9.026159290426783e-05, "loss": 1.832, "step": 1090 },
    { "epoch": 0.09753502394041497, "grad_norm": 6.081124782562256, "learning_rate": 9.007068109339786e-05, "loss": 1.6911, "step": 1100 },
    { "epoch": 0.09842170597623692, "grad_norm": 7.2468671798706055, "learning_rate": 8.987812213377425e-05, "loss": 1.6959, "step": 1110 },
    { "epoch": 0.09930838801205888, "grad_norm": 7.454516887664795, "learning_rate": 8.968392394074165e-05, "loss": 1.5169, "step": 1120 },
    { "epoch": 0.10019507004788084, "grad_norm": 10.253645896911621, "learning_rate": 8.948809449702714e-05, "loss": 1.6779, "step": 1130 },
    { "epoch": 0.10108175208370278, "grad_norm": 8.075345993041992, "learning_rate": 8.929064185241216e-05, "loss": 1.6622, "step": 1140 },
    { "epoch": 0.10196843411952473, "grad_norm": 11.007535934448242, "learning_rate": 8.909157412340152e-05, "loss": 1.7568, "step": 1150 },
    { "epoch": 0.10285511615534669, "grad_norm": 8.019722938537598, "learning_rate": 8.889089949288989e-05, "loss": 1.6177, "step": 1160 },
    { "epoch": 0.10374179819116865, "grad_norm": 8.618474960327148, "learning_rate": 8.868862620982537e-05, "loss": 1.5605, "step": 1170 },
    { "epoch": 0.1046284802269906, "grad_norm": 8.008125305175781, "learning_rate": 8.848476258887034e-05, "loss": 1.5995, "step": 1180 },
    { "epoch": 0.10551516226281256, "grad_norm": 11.63944149017334, "learning_rate": 8.827931701005976e-05, "loss": 1.5778, "step": 1190 },
    { "epoch": 0.10640184429863452, "grad_norm": 9.485556602478027, "learning_rate": 8.807229791845674e-05, "loss": 1.547, "step": 1200 },
    { "epoch": 0.10640184429863452, "eval_loss": 1.6847599744796753, "eval_runtime": 59.4403, "eval_samples_per_second": 8.412, "eval_steps_per_second": 8.412, "step": 1200 },
    { "epoch": 0.10728852633445646, "grad_norm": 8.38836669921875, "learning_rate": 8.786371382380529e-05, "loss": 1.6206, "step": 1210 },
    { "epoch": 0.10817520837027841, "grad_norm": 6.360514163970947, "learning_rate": 8.765357330018059e-05, "loss": 1.5409, "step": 1220 },
    { "epoch": 0.10906189040610037, "grad_norm": 8.692633628845215, "learning_rate": 8.744188498563644e-05, "loss": 1.549, "step": 1230 },
    { "epoch": 0.10994857244192233, "grad_norm": 6.637635707855225, "learning_rate": 8.722865758185038e-05, "loss": 1.6354, "step": 1240 },
    { "epoch": 0.11083525447774428, "grad_norm": 9.428290367126465, "learning_rate": 8.70138998537658e-05, "loss": 1.84, "step": 1250 },
    { "epoch": 0.11172193651356624, "grad_norm": 7.926419734954834, "learning_rate": 8.679762062923178e-05, "loss": 1.6915, "step": 1260 },
    { "epoch": 0.1126086185493882, "grad_norm": 9.914402961730957, "learning_rate": 8.65798287986401e-05, "loss": 1.6808, "step": 1270 },
    { "epoch": 0.11349530058521014, "grad_norm": 11.937222480773926, "learning_rate": 8.636053331455989e-05, "loss": 1.6807, "step": 1280 },
    { "epoch": 0.1143819826210321, "grad_norm": 9.676164627075195, "learning_rate": 8.613974319136961e-05, "loss": 1.6505, "step": 1290 },
    { "epoch": 0.11526866465685405, "grad_norm": 8.247169494628906, "learning_rate": 8.59174675048864e-05, "loss": 1.6287, "step": 1300 },
    { "epoch": 0.116155346692676, "grad_norm": 7.142345905303955, "learning_rate": 8.569371539199318e-05, "loss": 1.6104, "step": 1310 },
    { "epoch": 0.11704202872849796, "grad_norm": 7.208193778991699, "learning_rate": 8.546849605026292e-05, "loss": 1.5853, "step": 1320 },
    { "epoch": 0.11792871076431992, "grad_norm": 7.444331645965576, "learning_rate": 8.524181873758061e-05, "loss": 1.5583, "step": 1330 },
    { "epoch": 0.11881539280014188, "grad_norm": 9.676839828491211, "learning_rate": 8.501369277176277e-05, "loss": 1.7519, "step": 1340 },
    { "epoch": 0.11970207483596382, "grad_norm": 8.323569297790527, "learning_rate": 8.478412753017435e-05, "loss": 1.6618, "step": 1350 },
    { "epoch": 0.11970207483596382, "eval_loss": 1.6926313638687134, "eval_runtime": 59.2541, "eval_samples_per_second": 8.438, "eval_steps_per_second": 8.438, "step": 1350 },
    { "epoch": 0.12058875687178577, "grad_norm": 9.658554077148438, "learning_rate": 8.455313244934327e-05, "loss": 1.8027, "step": 1360 },
    { "epoch": 0.12147543890760773, "grad_norm": 10.467687606811523, "learning_rate": 8.432071702457255e-05, "loss": 1.6109, "step": 1370 },
    { "epoch": 0.12236212094342969, "grad_norm": 8.080743789672852, "learning_rate": 8.408689080955001e-05, "loss": 1.5724, "step": 1380 },
    { "epoch": 0.12324880297925164, "grad_norm": 7.568329334259033, "learning_rate": 8.38516634159555e-05, "loss": 1.5831, "step": 1390 },
    { "epoch": 0.1241354850150736, "grad_norm": 6.801577091217041, "learning_rate": 8.361504451306586e-05, "loss": 1.6606, "step": 1400 },
    { "epoch": 0.12502216705089556, "grad_norm": 8.583120346069336, "learning_rate": 8.337704382735743e-05, "loss": 1.8597, "step": 1410 },
    { "epoch": 0.1259088490867175, "grad_norm": 9.193146705627441, "learning_rate": 8.313767114210618e-05, "loss": 1.6648, "step": 1420 },
    { "epoch": 0.12679553112253947, "grad_norm": 7.931524753570557, "learning_rate": 8.289693629698566e-05, "loss": 1.6386, "step": 1430 },
    { "epoch": 0.12768221315836142, "grad_norm": 8.280115127563477, "learning_rate": 8.265484918766245e-05, "loss": 1.5829, "step": 1440 },
    { "epoch": 0.12856889519418335, "grad_norm": 7.822704315185547, "learning_rate": 8.241141976538945e-05, "loss": 1.7449, "step": 1450 },
    { "epoch": 0.1294555772300053, "grad_norm": 6.792067050933838, "learning_rate": 8.216665803659673e-05, "loss": 1.8019, "step": 1460 },
    { "epoch": 0.13034225926582726, "grad_norm": 6.622004508972168, "learning_rate": 8.19205740624803e-05, "loss": 1.6184, "step": 1470 },
    { "epoch": 0.13122894130164922, "grad_norm": 8.733943939208984, "learning_rate": 8.167317795858853e-05, "loss": 1.7708, "step": 1480 },
    { "epoch": 0.13211562333747118, "grad_norm": 12.745617866516113, "learning_rate": 8.142447989440621e-05, "loss": 1.4564, "step": 1490 },
    { "epoch": 0.13300230537329313, "grad_norm": 16.666255950927734, "learning_rate": 8.117449009293671e-05, "loss": 1.7387, "step": 1500 },
    { "epoch": 0.13300230537329313, "eval_loss": 1.6664111614227295, "eval_runtime": 59.3291, "eval_samples_per_second": 8.428, "eval_steps_per_second": 8.428, "step": 1500 },
    { "epoch": 0.1338889874091151, "grad_norm": 9.83285140991211, "learning_rate": 8.09232188302816e-05, "loss": 1.545, "step": 1510 },
    { "epoch": 0.13477566944493705, "grad_norm": 12.203434944152832, "learning_rate": 8.067067643521836e-05, "loss": 1.8037, "step": 1520 },
    { "epoch": 0.135662351480759, "grad_norm": 6.172882556915283, "learning_rate": 8.041687328877568e-05, "loss": 1.5307, "step": 1530 },
    { "epoch": 0.13654903351658096, "grad_norm": 8.186355590820312, "learning_rate": 8.016181982380684e-05, "loss": 1.4897, "step": 1540 },
    { "epoch": 0.13743571555240292, "grad_norm": 6.983214378356934, "learning_rate": 7.990552652456082e-05, "loss": 1.5217, "step": 1550 },
    { "epoch": 0.13832239758822487, "grad_norm": 7.637870788574219, "learning_rate": 7.964800392625131e-05, "loss": 1.4999, "step": 1560 },
    { "epoch": 0.13920907962404683, "grad_norm": 6.830051422119141, "learning_rate": 7.938926261462369e-05, "loss": 1.6, "step": 1570 },
    { "epoch": 0.14009576165986878, "grad_norm": 8.118239402770996, "learning_rate": 7.912931322551983e-05, "loss": 1.5677, "step": 1580 },
    { "epoch": 0.1409824436956907, "grad_norm": 6.605192184448242, "learning_rate": 7.8868166444441e-05, "loss": 1.701, "step": 1590 },
    { "epoch": 0.14186912573151267, "grad_norm": 8.868054389953613, "learning_rate": 7.86058330061085e-05, "loss": 1.5491, "step": 1600 },
    { "epoch": 0.14275580776733462, "grad_norm": 9.033772468566895, "learning_rate": 7.834232369402252e-05, "loss": 1.6596, "step": 1610 },
    { "epoch": 0.14364248980315658, "grad_norm": 10.658461570739746, "learning_rate": 7.807764934001877e-05, "loss": 1.4879, "step": 1620 },
    { "epoch": 0.14452917183897854, "grad_norm": 8.768953323364258, "learning_rate": 7.781182082382326e-05, "loss": 1.6105, "step": 1630 },
    { "epoch": 0.1454158538748005, "grad_norm": 6.057528018951416, "learning_rate": 7.754484907260515e-05, "loss": 1.3959, "step": 1640 },
    { "epoch": 0.14630253591062245, "grad_norm": 8.636338233947754, "learning_rate": 7.727674506052746e-05, "loss": 1.4808, "step": 1650 },
    { "epoch": 0.14630253591062245, "eval_loss": 1.6922165155410767, "eval_runtime": 59.3594, "eval_samples_per_second": 8.423, "eval_steps_per_second": 8.423, "step": 1650 },
    { "epoch": 0.1471892179464444, "grad_norm": 8.238250732421875, "learning_rate": 7.700751980829603e-05, "loss": 1.6605, "step": 1660 },
    { "epoch": 0.14807589998226636, "grad_norm": 7.141147136688232, "learning_rate": 7.67371843827065e-05, "loss": 1.7022, "step": 1670 },
    { "epoch": 0.14896258201808832, "grad_norm": 7.410867691040039, "learning_rate": 7.64657498961894e-05, "loss": 1.676, "step": 1680 },
    { "epoch": 0.14984926405391027, "grad_norm": 7.417080402374268, "learning_rate": 7.61932275063533e-05, "loss": 1.6914, "step": 1690 },
    { "epoch": 0.15073594608973223, "grad_norm": 29.780086517333984, "learning_rate": 7.591962841552628e-05, "loss": 1.7629, "step": 1700 },
    { "epoch": 0.1516226281255542, "grad_norm": 7.910691738128662, "learning_rate": 7.564496387029534e-05, "loss": 1.4673, "step": 1710 },
    { "epoch": 0.15250931016137614, "grad_norm": 8.581075668334961, "learning_rate": 7.536924516104414e-05, "loss": 1.588, "step": 1720 },
    { "epoch": 0.15339599219719807, "grad_norm": 9.336210250854492, "learning_rate": 7.50924836214889e-05, "loss": 1.7385, "step": 1730 },
    { "epoch": 0.15428267423302003, "grad_norm": 7.002266883850098, "learning_rate": 7.481469062821253e-05, "loss": 1.6644, "step": 1740 },
    { "epoch": 0.15516935626884198, "grad_norm": 13.515497207641602, "learning_rate": 7.453587760019692e-05, "loss": 1.7763, "step": 1750 },
    { "epoch": 0.15605603830466394, "grad_norm": 8.133156776428223, "learning_rate": 7.425605599835362e-05, "loss": 1.5641, "step": 1760 },
    { "epoch": 0.1569427203404859, "grad_norm": 6.456133842468262, "learning_rate": 7.397523732505273e-05, "loss": 1.7309, "step": 1770 },
    { "epoch": 0.15782940237630785, "grad_norm": 14.745148658752441, "learning_rate": 7.369343312364996e-05, "loss": 1.6363, "step": 1780 },
    { "epoch": 0.1587160844121298, "grad_norm": 7.444967746734619, "learning_rate": 7.341065497801231e-05, "loss": 1.581, "step": 1790 },
    { "epoch": 0.15960276644795177, "grad_norm": 7.5540595054626465, "learning_rate": 7.31269145120418e-05, "loss": 1.7818, "step": 1800 },
    { "epoch": 0.15960276644795177, "eval_loss": 1.6818472146987915, "eval_runtime": 59.3071, "eval_samples_per_second": 8.431, "eval_steps_per_second": 8.431, "step": 1800 },
    { "epoch": 0.16048944848377372, "grad_norm": 9.014618873596191, "learning_rate": 7.284222338919761e-05, "loss": 1.7272, "step": 1810 },
    { "epoch": 0.16137613051959568, "grad_norm": 7.285683631896973, "learning_rate": 7.255659331201675e-05, "loss": 1.7136, "step": 1820 },
    { "epoch": 0.16226281255541763, "grad_norm": 8.174327850341797, "learning_rate": 7.227003602163298e-05, "loss": 1.6277, "step": 1830 },
    { "epoch": 0.1631494945912396, "grad_norm": 8.313567161560059, "learning_rate": 7.198256329729413e-05, "loss": 1.5496, "step": 1840 },
    { "epoch": 0.16403617662706155, "grad_norm": 6.585691452026367, "learning_rate": 7.169418695587792e-05, "loss": 1.4631, "step": 1850 },
    { "epoch": 0.1649228586628835, "grad_norm": 6.994291305541992, "learning_rate": 7.140491885140631e-05, "loss": 1.6103, "step": 1860 },
    { "epoch": 0.16580954069870543, "grad_norm": 10.888928413391113, "learning_rate": 7.111477087455802e-05, "loss": 1.5637, "step": 1870 },
    { "epoch": 0.1666962227345274, "grad_norm": 9.979802131652832, "learning_rate": 7.082375495217998e-05, "loss": 1.7041, "step": 1880 },
    { "epoch": 0.16758290477034934, "grad_norm": 7.943565368652344, "learning_rate": 7.053188304679693e-05, "loss": 1.5222, "step": 1890 },
    { "epoch": 0.1684695868061713, "grad_norm": 7.496694087982178, "learning_rate": 7.02391671561197e-05, "loss": 1.4436, "step": 1900 },
    { "epoch": 0.16935626884199326, "grad_norm": 10.703848838806152, "learning_rate": 6.994561931255211e-05, "loss": 1.6628, "step": 1910 },
    { "epoch": 0.1702429508778152, "grad_norm": 9.519083976745605, "learning_rate": 6.96512515826962e-05, "loss": 1.5567, "step": 1920 },
    { "epoch": 0.17112963291363717, "grad_norm": 9.946447372436523, "learning_rate": 6.935607606685643e-05, "loss": 1.5579, "step": 1930 },
    { "epoch": 0.17201631494945913, "grad_norm": 10.239102363586426, "learning_rate": 6.90601048985421e-05, "loss": 1.8462, "step": 1940 },
    { "epoch": 0.17290299698528108, "grad_norm": 7.735941410064697, "learning_rate": 6.876335024396873e-05, "loss": 1.7393, "step": 1950 },
    { "epoch": 0.17290299698528108, "eval_loss": 1.6459492444992065, "eval_runtime": 59.3323, "eval_samples_per_second": 8.427, "eval_steps_per_second": 8.427, "step": 1950 },
    { "epoch": 0.17378967902110304, "grad_norm": 7.9266228675842285, "learning_rate": 6.846582430155784e-05, "loss": 1.6871, "step": 1960 },
    { "epoch": 0.174676361056925, "grad_norm": 7.106112003326416, "learning_rate": 6.81675393014356e-05, "loss": 1.4537, "step": 1970 },
    { "epoch": 0.17556304309274695, "grad_norm": 7.142345428466797, "learning_rate": 6.786850750493007e-05, "loss": 1.7198, "step": 1980 },
    { "epoch": 0.1764497251285689, "grad_norm": 9.443406105041504, "learning_rate": 6.756874120406716e-05, "loss": 1.8101, "step": 1990 },
    { "epoch": 0.17733640716439084, "grad_norm": 9.909912109375, "learning_rate": 6.72682527210654e-05, "loss": 1.5669, "step": 2000 },
    { "epoch": 0.1782230892002128, "grad_norm": 7.394705772399902, "learning_rate": 6.696705440782941e-05, "loss": 1.7875, "step": 2010 },
    { "epoch": 0.17910977123603475, "grad_norm": 6.871555805206299, "learning_rate": 6.66651586454421e-05, "loss": 1.6649, "step": 2020 },
    { "epoch": 0.1799964532718567, "grad_norm": 8.4517183303833, "learning_rate": 6.636257784365587e-05, "loss": 1.5413, "step": 2030 },
    { "epoch": 0.18088313530767866, "grad_norm": 7.050330638885498, "learning_rate": 6.60593244403823e-05, "loss": 1.591, "step": 2040 },
    { "epoch": 0.18176981734350062, "grad_norm": 8.818882942199707, "learning_rate": 6.575541090118106e-05, "loss": 1.5542, "step": 2050 },
    { "epoch": 0.18265649937932257, "grad_norm": 9.987334251403809, "learning_rate": 6.54508497187474e-05, "loss": 1.6636, "step": 2060 },
    { "epoch": 0.18354318141514453, "grad_norm": 9.73635196685791, "learning_rate": 6.514565341239862e-05, "loss": 1.8135, "step": 2070 },
    { "epoch": 0.18442986345096649, "grad_norm": 6.916482925415039, "learning_rate": 6.483983452755954e-05, "loss": 1.7043, "step": 2080 },
    { "epoch": 0.18531654548678844, "grad_norm": 13.21588134765625, "learning_rate": 6.45334056352467e-05, "loss": 1.5767, "step": 2090 },
    { "epoch": 0.1862032275226104, "grad_norm": 7.8419904708862305, "learning_rate": 6.422637933155163e-05, "loss": 1.5591, "step": 2100 },
    { "epoch": 0.1862032275226104, "eval_loss": 1.6301392316818237, "eval_runtime": 59.4431, "eval_samples_per_second": 8.411, "eval_steps_per_second": 8.411, "step": 2100 },
    { "epoch": 0.18708990955843235, "grad_norm": 8.962486267089844, "learning_rate": 6.391876823712319e-05, "loss": 1.4843, "step": 2110 },
    { "epoch": 0.1879765915942543, "grad_norm": 10.67493724822998, "learning_rate": 6.361058499664857e-05, "loss": 1.6638, "step": 2120 },
    { "epoch": 0.18886327363007627, "grad_norm": 8.06369686126709, "learning_rate": 6.330184227833377e-05, "loss": 1.6439, "step": 2130 },
    { "epoch": 0.1897499556658982, "grad_norm": 9.005534172058105, "learning_rate": 6.299255277338267e-05, "loss": 1.5289, "step": 2140 },
    { "epoch": 0.19063663770172015, "grad_norm": 9.255204200744629, "learning_rate": 6.268272919547539e-05, "loss": 1.46, "step": 2150 },
    { "epoch": 0.1915233197375421, "grad_norm": 7.344980239868164, "learning_rate": 6.237238428024573e-05, "loss": 1.4932, "step": 2160 },
    { "epoch": 0.19241000177336406, "grad_norm": 8.692234992980957, "learning_rate": 6.206153078475765e-05, "loss": 1.6582, "step": 2170 },
    { "epoch": 0.19329668380918602, "grad_norm": 7.381601333618164, "learning_rate": 6.175018148698078e-05, "loss": 1.5007, "step": 2180 },
    { "epoch": 0.19418336584500798, "grad_norm": 7.794239044189453, "learning_rate": 6.143834918526529e-05, "loss": 1.6501, "step": 2190 },
    { "epoch": 0.19507004788082993, "grad_norm": 8.13096809387207, "learning_rate": 6.112604669781574e-05, "loss": 1.6862, "step": 2200 },
    { "epoch": 0.1959567299166519, "grad_norm": 6.846219539642334, "learning_rate": 6.081328686216419e-05, "loss": 1.5702, "step": 2210 },
    { "epoch": 0.19684341195247385, "grad_norm": 8.771533966064453, "learning_rate": 6.0500082534642485e-05, "loss": 1.6259, "step": 2220 },
    { "epoch": 0.1977300939882958, "grad_norm": 6.50418758392334, "learning_rate": 6.01864465898538e-05, "loss": 1.6948, "step": 2230 },
    { "epoch": 0.19861677602411776, "grad_norm": 8.83719539642334, "learning_rate": 5.987239192014337e-05, "loss": 1.643, "step": 2240 },
    { "epoch": 0.19950345805993971, "grad_norm": 7.24541711807251, "learning_rate": 5.955793143506864e-05, "loss": 1.624, "step": 2250 },
    { "epoch": 0.19950345805993971, "eval_loss": 1.5997846126556396, "eval_runtime": 59.4561, "eval_samples_per_second": 8.41, "eval_steps_per_second": 8.41, "step": 2250 },
    { "epoch": 0.20039014009576167, "grad_norm": 13.114813804626465, "learning_rate": 5.9243078060868454e-05, "loss": 1.5787, "step": 2260 },
    { "epoch": 0.20127682213158363, "grad_norm": 6.7087321281433105, "learning_rate": 5.8927844739931854e-05, "loss": 1.3785, "step": 2270 },
    { "epoch": 0.20216350416740556, "grad_norm": 6.644030570983887, "learning_rate": 5.8612244430265966e-05, "loss": 1.5126, "step": 2280 },
    { "epoch": 0.2030501862032275, "grad_norm": 10.291509628295898, "learning_rate": 5.829629010496342e-05, "loss": 1.4863, "step": 2290 },
    { "epoch": 0.20393686823904947, "grad_norm": 6.426754951477051, "learning_rate": 5.797999475166898e-05, "loss": 1.5586, "step": 2300 },
    { "epoch": 0.20482355027487142, "grad_norm": 9.044095039367676, "learning_rate": 5.766337137204581e-05, "loss": 1.5063, "step": 2310 },
    { "epoch": 0.20571023231069338, "grad_norm": 8.852991104125977, "learning_rate": 5.734643298124092e-05, "loss": 1.7211, "step": 2320 },
    { "epoch": 0.20659691434651534, "grad_norm": 73.65837860107422, "learning_rate": 5.702919260735016e-05, "loss": 1.5191, "step": 2330 },
    { "epoch": 0.2074835963823373, "grad_norm": 8.413342475891113, "learning_rate": 5.671166329088279e-05, "loss": 1.5013, "step": 2340 },
    { "epoch": 0.20837027841815925, "grad_norm": 6.938820838928223, "learning_rate": 5.639385808422532e-05, "loss": 1.5099, "step": 2350 },
    { "epoch": 0.2092569604539812, "grad_norm": 7.757599353790283, "learning_rate": 5.6075790051105044e-05, "loss": 1.5848, "step": 2360 },
    { "epoch": 0.21014364248980316, "grad_norm": 7.502821445465088, "learning_rate": 5.5757472266052994e-05, "loss": 1.7166, "step": 2370 },
    { "epoch": 0.21103032452562512, "grad_norm": 11.332352638244629, "learning_rate": 5.543891781386657e-05, "loss": 1.671, "step": 2380 },
    { "epoch": 0.21191700656144707, "grad_norm": 7.515905380249023, "learning_rate": 5.512013978907158e-05, "loss": 1.6298, "step": 2390 },
    { "epoch": 0.21280368859726903, "grad_norm": 6.094747543334961, "learning_rate": 5.4801151295384105e-05, "loss": 1.5135, "step": 2400 },
    { "epoch": 0.21280368859726903, "eval_loss": 1.5888803005218506, "eval_runtime": 59.4453, "eval_samples_per_second": 8.411, "eval_steps_per_second": 8.411, "step": 2400 },
    { "epoch": 0.213690370633091, "grad_norm": 7.49708366394043, "learning_rate": 5.448196544517169e-05, "loss": 1.5031, "step": 2410 },
    { "epoch": 0.21457705266891292, "grad_norm": 8.41457748413086, "learning_rate": 5.4162595358914485e-05, "loss": 1.5116, "step": 2420 },
    { "epoch": 0.21546373470473487, "grad_norm": 7.308359146118164, "learning_rate": 5.3843054164665855e-05, "loss": 1.4185, "step": 2430 },
    { "epoch": 0.21635041674055683, "grad_norm": 13.086946487426758, "learning_rate": 5.352335499751271e-05, "loss": 1.6723, "step": 2440 },
    { "epoch": 0.21723709877637878, "grad_norm": 7.7518157958984375, "learning_rate": 5.3203510999035666e-05, "loss": 1.4357, "step": 2450 },
    { "epoch": 0.21812378081220074, "grad_norm": 7.657406806945801, "learning_rate": 5.2883535316768745e-05, "loss": 1.5464, "step": 2460 },
    { "epoch": 0.2190104628480227, "grad_norm": 6.197967529296875, "learning_rate": 5.2563441103658975e-05, "loss": 1.3296, "step": 2470 },
    { "epoch": 0.21989714488384465, "grad_norm": 10.529012680053711, "learning_rate": 5.224324151752577e-05, "loss": 1.6508, "step": 2480 },
    { "epoch": 0.2207838269196666, "grad_norm": 7.516609191894531, "learning_rate": 5.1922949720519935e-05, "loss": 1.5441, "step": 2490 },
    { "epoch": 0.22167050895548857, "grad_norm": 5.759303569793701, "learning_rate": 5.160257887858279e-05, "loss": 1.5121, "step": 2500 },
    { "epoch": 0.22255719099131052, "grad_norm": 9.412184715270996, "learning_rate": 5.1282142160904794e-05, "loss": 1.5882, "step": 2510 },
    { "epoch": 0.22344387302713248, "grad_norm": 6.849535942077637, "learning_rate": 5.096165273938437e-05, "loss": 1.4593, "step": 2520 },
    { "epoch": 0.22433055506295443, "grad_norm": 6.8701300621032715, "learning_rate": 5.064112378808638e-05, "loss": 1.6232, "step": 2530 },
    { "epoch": 0.2252172370987764, "grad_norm": 10.939626693725586, "learning_rate": 5.032056848270057e-05, "loss": 1.605, "step": 2540 },
    { "epoch": 0.22610391913459835, "grad_norm": 9.180508613586426, "learning_rate": 5.0000000000000016e-05, "loss": 1.572, "step": 2550 },
    { "epoch": 0.22610391913459835, "eval_loss": 1.5821589231491089, "eval_runtime": 59.3605, "eval_samples_per_second": 8.423, "eval_steps_per_second": 8.423, "step": 2550 },
    { "epoch": 0.22699060117042028, "grad_norm": 7.859452247619629, "learning_rate": 4.967943151729946e-05, "loss": 1.4249, "step": 2560 },
    { "epoch": 0.22787728320624223, "grad_norm": 8.19686222076416, "learning_rate": 4.935887621191365e-05, "loss": 1.5235, "step": 2570 },
    { "epoch": 0.2287639652420642, "grad_norm": 6.515259265899658, "learning_rate": 4.903834726061566e-05, "loss": 1.6435, "step": 2580 },
    { "epoch": 0.22965064727788614, "grad_norm": 7.381898403167725, "learning_rate": 4.8717857839095245e-05, "loss": 1.5029, "step": 2590 },
    { "epoch": 0.2305373293137081, "grad_norm": 7.736898422241211, "learning_rate": 4.8397421121417256e-05, "loss": 1.4735, "step": 2600 },
    { "epoch": 0.23142401134953006, "grad_norm": 5.957932949066162, "learning_rate": 4.807705027948009e-05, "loss": 1.4774, "step": 2610 },
    { "epoch": 0.232310693385352, "grad_norm": 6.916577339172363, "learning_rate": 4.7756758482474285e-05, "loss": 1.5927, "step": 2620 },
    { "epoch": 0.23319737542117397, "grad_norm": 11.942724227905273, "learning_rate": 4.7436558896341064e-05, "loss": 1.4889, "step": 2630 },
    { "epoch": 0.23408405745699593, "grad_norm": 6.2363057136535645, "learning_rate": 4.71164646832313e-05, "loss": 1.5574, "step": 2640 },
    { "epoch": 0.23497073949281788, "grad_norm": 7.307931423187256, "learning_rate": 4.679648900096437e-05, "loss": 1.4904, "step": 2650 },
    { "epoch": 0.23585742152863984, "grad_norm": 6.648471355438232, "learning_rate": 4.6476645002487314e-05, "loss": 1.5208, "step": 2660 },
    { "epoch": 0.2367441035644618, "grad_norm": 7.0004167556762695, "learning_rate": 4.615694583533419e-05, "loss": 1.5019, "step": 2670 },
    { "epoch": 0.23763078560028375, "grad_norm": 7.872572422027588, "learning_rate": 4.583740464108555e-05, "loss": 1.5381, "step": 2680 },
    { "epoch": 0.23851746763610568, "grad_norm": 8.477245330810547, "learning_rate": 4.5518034554828346e-05, "loss": 1.5339, "step": 2690 },
    { "epoch": 0.23940414967192764, "grad_norm": 9.522911071777344, "learning_rate": 4.519884870461593e-05, "loss": 1.5553, "step": 2700 },
    { "epoch": 0.23940414967192764, "eval_loss": 1.5684587955474854, "eval_runtime": 59.3936, "eval_samples_per_second": 8.418, "eval_steps_per_second": 8.418, "step": 2700 },
    { "epoch": 0.2402908317077496, "grad_norm": 6.57583475112915, "learning_rate": 4.487986021092845e-05, "loss": 1.584, "step": 2710 },
    { "epoch": 0.24117751374357155, "grad_norm": 10.243136405944824, "learning_rate": 4.456108218613348e-05, "loss": 1.5983, "step": 2720 },
    { "epoch": 0.2420641957793935, "grad_norm": 7.099559307098389, "learning_rate": 4.424252773394705e-05, "loss": 1.5817, "step": 2730 },
    { "epoch": 0.24295087781521546, "grad_norm": 7.89496374130249, "learning_rate": 4.3924209948894995e-05, "loss": 1.6713, "step": 2740 },
    { "epoch": 0.24383755985103742, "grad_norm": 7.425642490386963, "learning_rate": 4.360614191577471e-05, "loss": 1.4357, "step": 2750 },
    { "epoch": 0.24472424188685937, "grad_norm": 7.661828994750977, "learning_rate": 4.3288336709117256e-05, "loss": 1.4964, "step": 2760 },
    { "epoch": 0.24561092392268133, "grad_norm": 9.707741737365723, "learning_rate": 4.297080739264988e-05, "loss": 1.3392, "step": 2770 },
    { "epoch": 0.24649760595850329, "grad_norm": 6.905391693115234, "learning_rate": 4.2653567018759114e-05, "loss": 1.4212, "step": 2780 },
    { "epoch": 0.24738428799432524, "grad_norm": 7.846536636352539, "learning_rate": 4.233662862795421e-05, "loss": 1.6355, "step": 2790 },
    { "epoch": 0.2482709700301472, "grad_norm": 6.390925884246826, "learning_rate": 4.202000524833106e-05, "loss": 1.4986, "step": 2800 },
    { "epoch": 0.24915765206596915, "grad_norm": 7.581704616546631, "learning_rate": 4.170370989503664e-05, "loss": 1.6026, "step": 2810 },
    { "epoch": 0.2500443341017911, "grad_norm": 4.918117523193359, "learning_rate": 4.138775556973407e-05, "loss": 1.4725, "step": 2820 },
    { "epoch": 0.25093101613761304, "grad_norm": 12.692071914672852, "learning_rate": 4.1072155260068185e-05, "loss": 1.6113, "step": 2830 },
    { "epoch": 0.251817698173435, "grad_norm": 6.600620746612549, "learning_rate": 4.075692193913158e-05, "loss": 1.5967, "step": 2840 },
    { "epoch": 0.25270438020925695, "grad_norm": 6.879825115203857, "learning_rate": 4.0442068564931405e-05, "loss": 1.4693, "step": 2850 },
    { "epoch": 0.25270438020925695, "eval_loss": 1.5608752965927124, "eval_runtime": 59.3283, "eval_samples_per_second": 8.428, "eval_steps_per_second": 8.428, "step": 2850 },
    { "epoch": 0.25359106224507894, "grad_norm": 7.351998329162598, "learning_rate": 4.012760807985666e-05, "loss": 1.5118, "step": 2860 },
    { "epoch": 0.25447774428090086, "grad_norm": 9.371225357055664, "learning_rate": 3.9813553410146234e-05, "loss": 1.5299, "step": 2870 },
    { "epoch": 0.25536442631672285, "grad_norm": 7.500007152557373, "learning_rate": 3.949991746535754e-05, "loss": 1.5855, "step": 2880 },
    { "epoch": 0.2562511083525448, "grad_norm": 7.6006903648376465, "learning_rate": 3.918671313783584e-05, "loss": 1.5459, "step": 2890 },
    { "epoch": 0.2571377903883667, "grad_norm": 6.81592321395874, "learning_rate": 3.8873953302184295e-05, "loss": 1.361, "step": 2900 },
    { "epoch": 0.2580244724241887, "grad_norm": 6.851174831390381, "learning_rate": 3.856165081473475e-05, "loss": 1.2751, "step": 2910 },
    { "epoch": 0.2589111544600106, "grad_norm": 8.746306419372559, "learning_rate": 3.824981851301925e-05, "loss": 1.3964, "step": 2920 },
    { "epoch": 0.2597978364958326, "grad_norm": 8.92397689819336, "learning_rate": 3.7938469215242386e-05, "loss": 1.5833, "step": 2930 },
    { "epoch": 0.26068451853165453, "grad_norm": 12.532337188720703, "learning_rate": 3.762761571975431e-05, "loss": 1.754, "step": 2940 },
    { "epoch": 0.2615712005674765, "grad_norm": 7.304866313934326, "learning_rate": 3.731727080452465e-05, "loss": 1.5328, "step": 2950 },
    { "epoch": 0.26245788260329844, "grad_norm": 7.864557266235352, "learning_rate": 3.700744722661737e-05, "loss": 1.5286, "step": 2960 },
    { "epoch": 0.2633445646391204, "grad_norm": 6.201906204223633, "learning_rate": 3.669815772166626e-05, "loss": 1.5775, "step": 2970 },
    { "epoch": 0.26423124667494235, "grad_norm": 8.181777954101562, "learning_rate": 3.6389415003351454e-05, "loss": 1.6203, "step": 2980 },
    { "epoch": 0.26511792871076434, "grad_norm": 8.13985824584961, "learning_rate": 3.608123176287686e-05, "loss": 1.4212, "step": 2990 },
    { "epoch": 0.26600461074658627, "grad_norm": 7.873915672302246, "learning_rate": 3.577362066844839e-05, "loss": 1.4327, "step": 3000 },
    { "epoch": 0.26600461074658627, "eval_loss": 1.5497733354568481, "eval_runtime": 59.3515, "eval_samples_per_second": 8.424, "eval_steps_per_second": 8.424, "step": 3000 },
    { "epoch": 0.26689129278240825, "grad_norm": 9.156728744506836, "learning_rate": 3.546659436475333e-05, "loss": 1.4905, "step": 3010 },
    { "epoch": 0.2677779748182302, "grad_norm": 9.707823753356934, "learning_rate": 3.516016547244048e-05, "loss": 1.537, "step": 3020 },
    { "epoch": 0.26866465685405216, "grad_norm": 7.352388858795166, "learning_rate": 3.485434658760141e-05, "loss": 1.4339, "step": 3030 },
    { "epoch": 0.2695513388898741, "grad_norm": 6.827575206756592, "learning_rate": 3.454915028125264e-05, "loss": 1.4186, "step": 3040 },
    { "epoch": 0.270438020925696, "grad_norm": 10.17138671875, "learning_rate": 3.424458909881898e-05, "loss": 1.7247, "step": 3050 },
    { "epoch": 0.271324702961518, "grad_norm": 6.4703850746154785, "learning_rate": 3.394067555961773e-05, "loss": 1.5231, "step": 3060 },
    { "epoch": 0.27221138499733993, "grad_norm": 10.700502395629883, "learning_rate": 3.363742215634417e-05, "loss": 1.6463, "step": 3070 },
    { "epoch": 0.2730980670331619, "grad_norm": 7.374842643737793, "learning_rate": 3.333484135455793e-05, "loss": 1.4507, "step": 3080 },
    { "epoch": 0.27398474906898385, "grad_norm": 20.006195068359375, "learning_rate": 3.303294559217064e-05, "loss": 1.5481, "step": 3090 },
    { "epoch": 0.27487143110480583, "grad_norm": 9.058706283569336, "learning_rate": 3.273174727893464e-05, "loss": 1.4567, "step": 3100 },
    { "epoch": 0.27575811314062776, "grad_norm": 6.159706115722656, "learning_rate": 3.243125879593287e-05, "loss": 1.6317, "step": 3110 },
    { "epoch": 0.27664479517644974, "grad_norm": 7.272984981536865, "learning_rate": 3.213149249506998e-05, "loss": 1.4127, "step": 3120 },
    { "epoch": 0.27753147721227167, "grad_norm": 7.003290176391602, "learning_rate": 3.183246069856444e-05, "loss": 1.4704, "step": 3130 },
    { "epoch": 0.27841815924809366, "grad_norm": 9.395560264587402, "learning_rate": 3.15341756984422e-05, "loss": 1.4897, "step": 3140 },
    { "epoch": 0.2793048412839156, "grad_norm": 9.366192817687988, "learning_rate": 3.123664975603131e-05, "loss": 1.4859, "step": 3150 },
    { "epoch": 0.2793048412839156, "eval_loss": 1.5420976877212524, "eval_runtime": 59.3868, "eval_samples_per_second": 8.419, "eval_steps_per_second": 8.419, "step": 3150 },
    { "epoch": 0.28019152331973757, "grad_norm": 9.653646469116211, "learning_rate": 3.093989510145792e-05, "loss": 1.5305, "step": 3160 },
    { "epoch": 0.2810782053555595, "grad_norm": 7.124954700469971, "learning_rate": 3.0643923933143614e-05, "loss": 1.5319, "step": 3170 },
    { "epoch": 0.2819648873913814, "grad_norm": 7.334105968475342, "learning_rate": 3.0348748417303834e-05, "loss": 1.4719, "step": 3180 },
    { "epoch": 0.2828515694272034, "grad_norm": 8.093483924865723, "learning_rate": 3.005438068744793e-05, "loss": 1.6516, "step": 3190 },
    { "epoch": 0.28373825146302534, "grad_norm": 7.165219306945801, "learning_rate": 2.9760832843880317e-05, "loss": 1.5383, "step": 3200 },
    { "epoch": 0.2846249334988473, "grad_norm": 8.381077766418457, "learning_rate": 2.9468116953203113e-05, "loss": 1.5084, "step": 3210 },
    { "epoch": 0.28551161553466925, "grad_norm": 8.025052070617676, "learning_rate": 2.917624504782007e-05, "loss": 1.4437, "step": 3220 },
    { "epoch": 0.28639829757049123, "grad_norm": 6.2781453132629395, "learning_rate": 2.8885229125442027e-05, "loss": 1.6153, "step": 3230 },
    { "epoch": 0.28728497960631316, "grad_norm": 7.848852157592773, "learning_rate": 2.8595081148593748e-05, "loss": 1.4542, "step": 3240 },
    { "epoch": 0.28817166164213515, "grad_norm": 9.784353256225586, "learning_rate": 2.8305813044122107e-05, "loss": 1.4358, "step": 3250 },
    { "epoch": 0.2890583436779571, "grad_norm": 8.469407081604004, "learning_rate": 2.8017436702705908e-05, "loss": 1.4224, "step": 3260 },
    { "epoch": 0.28994502571377906, "grad_norm": 7.443441390991211, "learning_rate": 2.7729963978367048e-05, "loss": 1.5289, "step": 3270 },
    { "epoch": 0.290831707749601, "grad_norm": 9.007468223571777, "learning_rate": 2.7443406687983272e-05, "loss": 1.4351, "step": 3280 },
    { "epoch": 0.29171838978542297, "grad_norm": 9.351861953735352, "learning_rate": 2.715777661080242e-05, "loss": 1.5687, "step": 3290 },
    { "epoch": 0.2926050718212449, "grad_norm": 7.077907562255859, "learning_rate": 2.6873085487958257e-05, "loss": 1.7491, "step": 3300 },
    { "epoch": 0.2926050718212449, "eval_loss": 1.5311921834945679, "eval_runtime": 59.4179, "eval_samples_per_second": 8.415, "eval_steps_per_second": 8.415, "step": 3300 },
    { "epoch": 0.29349175385706683, "grad_norm": 5.730030059814453, "learning_rate": 2.6589345021987728e-05, "loss": 1.5217, "step": 3310 },
    { "epoch": 0.2943784358928888, "grad_norm": 6.85732889175415, "learning_rate": 2.6306566876350076e-05, "loss": 1.4184, "step": 3320 },
    { "epoch": 0.29526511792871074, "grad_norm": 9.774615287780762, "learning_rate": 2.602476267494732e-05, "loss": 1.4287, "step": 3330 },
    { "epoch": 0.2961517999645327, "grad_norm": 6.502627372741699, "learning_rate": 2.5743944001646398e-05, "loss": 1.6562, "step": 3340 },
    { "epoch": 0.29703848200035465, "grad_norm": 10.487425804138184, "learning_rate": 2.546412239980313e-05, "loss": 1.5361, "step": 3350 },
    { "epoch": 0.29792516403617664, "grad_norm": 6.752458095550537, "learning_rate": 2.518530937178752e-05, "loss": 1.473, "step": 3360 },
    { "epoch": 0.29881184607199857, "grad_norm": 9.102508544921875, "learning_rate": 2.4907516378511142e-05, "loss": 1.626, "step": 3370 },
    { "epoch": 0.29969852810782055, "grad_norm": 11.789603233337402, "learning_rate": 2.4630754838955908e-05, "loss": 1.5676, "step": 3380 },
    { "epoch": 0.3005852101436425, "grad_norm": 4.690525531768799, "learning_rate": 2.4355036129704707e-05, "loss": 1.3247, "step": 3390 },
    { "epoch": 0.30147189217946446, "grad_norm": 8.3900785446167, "learning_rate": 2.4080371584473755e-05, "loss": 1.5134, "step": 3400 },
    { "epoch": 0.3023585742152864, "grad_norm": 6.29799747467041,
|
"learning_rate": 2.380677249364673e-05, |
|
"loss": 1.4641, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.3032452562511084, |
|
"grad_norm": 6.937891960144043, |
|
"learning_rate": 2.3534250103810636e-05, |
|
"loss": 1.5212, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.3041319382869303, |
|
"grad_norm": 8.222491264343262, |
|
"learning_rate": 2.326281561729352e-05, |
|
"loss": 1.452, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.3050186203227523, |
|
"grad_norm": 7.146228313446045, |
|
"learning_rate": 2.299248019170401e-05, |
|
"loss": 1.4393, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.3059053023585742, |
|
"grad_norm": 8.970130920410156, |
|
"learning_rate": 2.2723254939472577e-05, |
|
"loss": 1.5834, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.3059053023585742, |
|
"eval_loss": 1.5162127017974854, |
|
"eval_runtime": 59.4395, |
|
"eval_samples_per_second": 8.412, |
|
"eval_steps_per_second": 8.412, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.30679198439439614, |
|
"grad_norm": 8.676182746887207, |
|
"learning_rate": 2.2455150927394888e-05, |
|
"loss": 1.4935, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.30767866643021813, |
|
"grad_norm": 7.3444719314575195, |
|
"learning_rate": 2.2188179176176773e-05, |
|
"loss": 1.4348, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.30856534846604006, |
|
"grad_norm": 8.14013957977295, |
|
"learning_rate": 2.1922350659981268e-05, |
|
"loss": 1.486, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.30945203050186204, |
|
"grad_norm": 8.495216369628906, |
|
"learning_rate": 2.1657676305977525e-05, |
|
"loss": 1.4254, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.31033871253768397, |
|
"grad_norm": 6.765787601470947, |
|
"learning_rate": 2.1394166993891536e-05, |
|
"loss": 1.4532, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.31122539457350595, |
|
"grad_norm": 7.284378528594971, |
|
"learning_rate": 2.1131833555559044e-05, |
|
"loss": 1.4648, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.3121120766093279, |
|
"grad_norm": 5.678081512451172, |
|
"learning_rate": 2.0870686774480203e-05, |
|
"loss": 1.4927, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.31299875864514987, |
|
"grad_norm": 7.111896514892578, |
|
"learning_rate": 2.0610737385376356e-05, |
|
"loss": 1.3698, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.3138854406809718, |
|
"grad_norm": 8.32049560546875, |
|
"learning_rate": 2.035199607374872e-05, |
|
"loss": 1.3701, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.3147721227167938, |
|
"grad_norm": 5.531152248382568, |
|
"learning_rate": 2.009447347543921e-05, |
|
"loss": 1.4623, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.3156588047526157, |
|
"grad_norm": 11.240205764770508, |
|
"learning_rate": 1.983818017619318e-05, |
|
"loss": 1.4952, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.3165454867884377, |
|
"grad_norm": 9.017026901245117, |
|
"learning_rate": 1.9583126711224347e-05, |
|
"loss": 1.3575, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.3174321688242596, |
|
"grad_norm": 7.459463596343994, |
|
"learning_rate": 1.9329323564781685e-05, |
|
"loss": 1.5883, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.31831885086008155, |
|
"grad_norm": 8.614239692687988, |
|
"learning_rate": 1.907678116971843e-05, |
|
"loss": 1.3795, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.31920553289590353, |
|
"grad_norm": 7.115685939788818, |
|
"learning_rate": 1.882550990706333e-05, |
|
"loss": 1.4851, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.31920553289590353, |
|
"eval_loss": 1.507421612739563, |
|
"eval_runtime": 59.4282, |
|
"eval_samples_per_second": 8.414, |
|
"eval_steps_per_second": 8.414, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.32009221493172546, |
|
"grad_norm": 7.0822882652282715, |
|
"learning_rate": 1.8575520105593824e-05, |
|
"loss": 1.4626, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.32097889696754744, |
|
"grad_norm": 8.020169258117676, |
|
"learning_rate": 1.8326822041411528e-05, |
|
"loss": 1.4378, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.3218655790033694, |
|
"grad_norm": 7.411886692047119, |
|
"learning_rate": 1.8079425937519732e-05, |
|
"loss": 1.4582, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.32275226103919136, |
|
"grad_norm": 8.050488471984863, |
|
"learning_rate": 1.7833341963403314e-05, |
|
"loss": 1.5404, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.3236389430750133, |
|
"grad_norm": 7.69305944442749, |
|
"learning_rate": 1.7588580234610594e-05, |
|
"loss": 1.3396, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.32452562511083527, |
|
"grad_norm": 10.572087287902832, |
|
"learning_rate": 1.7345150812337567e-05, |
|
"loss": 1.5023, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.3254123071466572, |
|
"grad_norm": 9.625089645385742, |
|
"learning_rate": 1.7103063703014376e-05, |
|
"loss": 1.3099, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.3262989891824792, |
|
"grad_norm": 7.519534587860107, |
|
"learning_rate": 1.686232885789386e-05, |
|
"loss": 1.4512, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.3271856712183011, |
|
"grad_norm": 8.976761817932129, |
|
"learning_rate": 1.6622956172642604e-05, |
|
"loss": 1.5594, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.3280723532541231, |
|
"grad_norm": 6.362197399139404, |
|
"learning_rate": 1.638495548693416e-05, |
|
"loss": 1.3491, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.328959035289945, |
|
"grad_norm": 7.0223469734191895, |
|
"learning_rate": 1.6148336584044543e-05, |
|
"loss": 1.5594, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.329845717325767, |
|
"grad_norm": 7.8357625007629395, |
|
"learning_rate": 1.5913109190450035e-05, |
|
"loss": 1.6264, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.33073239936158894, |
|
"grad_norm": 7.040388107299805, |
|
"learning_rate": 1.5679282975427494e-05, |
|
"loss": 1.4702, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.33161908139741086, |
|
"grad_norm": 8.075628280639648, |
|
"learning_rate": 1.5446867550656772e-05, |
|
"loss": 1.5274, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.33250576343323285, |
|
"grad_norm": 10.086247444152832, |
|
"learning_rate": 1.5215872469825685e-05, |
|
"loss": 1.5678, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.33250576343323285, |
|
"eval_loss": 1.499360203742981, |
|
"eval_runtime": 59.3955, |
|
"eval_samples_per_second": 8.418, |
|
"eval_steps_per_second": 8.418, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3333924454690548, |
|
"grad_norm": 6.615363597869873, |
|
"learning_rate": 1.4986307228237271e-05, |
|
"loss": 1.4275, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.33427912750487676, |
|
"grad_norm": 10.221020698547363, |
|
"learning_rate": 1.4758181262419428e-05, |
|
"loss": 1.5383, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.3351658095406987, |
|
"grad_norm": 7.772680759429932, |
|
"learning_rate": 1.4531503949737111e-05, |
|
"loss": 1.4759, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.3360524915765207, |
|
"grad_norm": 12.315176963806152, |
|
"learning_rate": 1.4306284608006841e-05, |
|
"loss": 1.6371, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.3369391736123426, |
|
"grad_norm": 5.77493953704834, |
|
"learning_rate": 1.408253249511363e-05, |
|
"loss": 1.5273, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3378258556481646, |
|
"grad_norm": 7.274715423583984, |
|
"learning_rate": 1.3860256808630431e-05, |
|
"loss": 1.5488, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.3387125376839865, |
|
"grad_norm": 5.848362922668457, |
|
"learning_rate": 1.3639466685440138e-05, |
|
"loss": 1.3308, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.3395992197198085, |
|
"grad_norm": 7.360718727111816, |
|
"learning_rate": 1.3420171201359936e-05, |
|
"loss": 1.3528, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.3404859017556304, |
|
"grad_norm": 7.868961334228516, |
|
"learning_rate": 1.3202379370768256e-05, |
|
"loss": 1.4614, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.3413725837914524, |
|
"grad_norm": 9.383559226989746, |
|
"learning_rate": 1.2986100146234235e-05, |
|
"loss": 1.4874, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.34225926582727434, |
|
"grad_norm": 11.13818645477295, |
|
"learning_rate": 1.277134241814966e-05, |
|
"loss": 1.3849, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.34314594786309627, |
|
"grad_norm": 10.342170715332031, |
|
"learning_rate": 1.2558115014363595e-05, |
|
"loss": 1.3115, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.34403262989891825, |
|
"grad_norm": 6.494439125061035, |
|
"learning_rate": 1.2346426699819462e-05, |
|
"loss": 1.4533, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3449193119347402, |
|
"grad_norm": 9.364208221435547, |
|
"learning_rate": 1.2136286176194748e-05, |
|
"loss": 1.4229, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.34580599397056216, |
|
"grad_norm": 6.627419471740723, |
|
"learning_rate": 1.1927702081543282e-05, |
|
"loss": 1.2769, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.34580599397056216, |
|
"eval_loss": 1.4924039840698242, |
|
"eval_runtime": 59.4072, |
|
"eval_samples_per_second": 8.416, |
|
"eval_steps_per_second": 8.416, |
|
"step": 3900 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3131363193066783e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|