{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9993739889922224,
  "eval_steps": 500,
  "global_step": 1559,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006410352719642222,
      "grad_norm": 0.10498046875,
      "learning_rate": 1.282051282051282e-06,
      "loss": 1.8493,
      "step": 1
    },
    {
      "epoch": 0.003205176359821111,
      "grad_norm": 0.1103515625,
      "learning_rate": 6.41025641025641e-06,
      "loss": 1.8865,
      "step": 5
    },
    {
      "epoch": 0.006410352719642222,
      "grad_norm": 0.1005859375,
      "learning_rate": 1.282051282051282e-05,
      "loss": 1.8383,
      "step": 10
    },
    {
      "epoch": 0.009615529079463333,
      "grad_norm": 0.10693359375,
      "learning_rate": 1.923076923076923e-05,
      "loss": 1.8385,
      "step": 15
    },
    {
      "epoch": 0.012820705439284444,
      "grad_norm": 0.1103515625,
      "learning_rate": 2.564102564102564e-05,
      "loss": 1.8346,
      "step": 20
    },
    {
      "epoch": 0.016025881799105555,
      "grad_norm": 0.1298828125,
      "learning_rate": 3.205128205128206e-05,
      "loss": 1.8127,
      "step": 25
    },
    {
      "epoch": 0.019231058158926666,
      "grad_norm": 0.1435546875,
      "learning_rate": 3.846153846153846e-05,
      "loss": 1.7981,
      "step": 30
    },
    {
      "epoch": 0.022436234518747777,
      "grad_norm": 0.1494140625,
      "learning_rate": 4.4871794871794874e-05,
      "loss": 1.7907,
      "step": 35
    },
    {
      "epoch": 0.025641410878568888,
      "grad_norm": 0.1416015625,
      "learning_rate": 5.128205128205128e-05,
      "loss": 1.7468,
      "step": 40
    },
    {
      "epoch": 0.02884658723839,
      "grad_norm": 0.1328125,
      "learning_rate": 5.769230769230769e-05,
      "loss": 1.7105,
      "step": 45
    },
    {
      "epoch": 0.03205176359821111,
      "grad_norm": 0.126953125,
      "learning_rate": 6.410256410256412e-05,
      "loss": 1.6887,
      "step": 50
    },
    {
      "epoch": 0.035256939958032224,
      "grad_norm": 0.107421875,
      "learning_rate": 7.051282051282052e-05,
      "loss": 1.6757,
      "step": 55
    },
    {
      "epoch": 0.03846211631785333,
      "grad_norm": 0.10009765625,
      "learning_rate": 7.692307692307693e-05,
      "loss": 1.6736,
      "step": 60
    },
    {
      "epoch": 0.041667292677674446,
      "grad_norm": 0.078125,
      "learning_rate": 8.333333333333334e-05,
      "loss": 1.6252,
      "step": 65
    },
    {
      "epoch": 0.04487246903749555,
      "grad_norm": 0.06201171875,
      "learning_rate": 8.974358974358975e-05,
      "loss": 1.5655,
      "step": 70
    },
    {
      "epoch": 0.04807764539731667,
      "grad_norm": 0.049072265625,
      "learning_rate": 9.615384615384617e-05,
      "loss": 1.5646,
      "step": 75
    },
    {
      "epoch": 0.051282821757137775,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.00010256410256410256,
      "loss": 1.5861,
      "step": 80
    },
    {
      "epoch": 0.05448799811695889,
      "grad_norm": 0.039794921875,
      "learning_rate": 0.00010897435897435896,
      "loss": 1.5379,
      "step": 85
    },
    {
      "epoch": 0.05769317447678,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.00011538461538461538,
      "loss": 1.5557,
      "step": 90
    },
    {
      "epoch": 0.06089835083660111,
      "grad_norm": 0.037841796875,
      "learning_rate": 0.00012179487179487179,
      "loss": 1.5102,
      "step": 95
    },
    {
      "epoch": 0.06410352719642222,
      "grad_norm": 0.038330078125,
      "learning_rate": 0.00012820512820512823,
      "loss": 1.5048,
      "step": 100
    },
    {
      "epoch": 0.06730870355624333,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.00013461538461538464,
      "loss": 1.5127,
      "step": 105
    },
    {
      "epoch": 0.07051387991606445,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.00014102564102564104,
      "loss": 1.5161,
      "step": 110
    },
    {
      "epoch": 0.07371905627588556,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.00014743589743589745,
      "loss": 1.4948,
      "step": 115
    },
    {
      "epoch": 0.07692423263570666,
      "grad_norm": 0.03125,
      "learning_rate": 0.00015384615384615385,
      "loss": 1.4584,
      "step": 120
    },
    {
      "epoch": 0.08012940899552777,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.00016025641025641028,
      "loss": 1.4704,
      "step": 125
    },
    {
      "epoch": 0.08333458535534889,
      "grad_norm": 0.0279541015625,
      "learning_rate": 0.0001666666666666667,
      "loss": 1.4411,
      "step": 130
    },
    {
      "epoch": 0.08653976171517,
      "grad_norm": 0.0263671875,
      "learning_rate": 0.0001730769230769231,
      "loss": 1.4723,
      "step": 135
    },
    {
      "epoch": 0.0897449380749911,
      "grad_norm": 0.02685546875,
      "learning_rate": 0.0001794871794871795,
      "loss": 1.4505,
      "step": 140
    },
    {
      "epoch": 0.09295011443481223,
      "grad_norm": 0.0291748046875,
      "learning_rate": 0.0001858974358974359,
      "loss": 1.4367,
      "step": 145
    },
    {
      "epoch": 0.09615529079463334,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.00019230769230769233,
      "loss": 1.4291,
      "step": 150
    },
    {
      "epoch": 0.09936046715445444,
      "grad_norm": 0.0390625,
      "learning_rate": 0.00019871794871794874,
      "loss": 1.4075,
      "step": 155
    },
    {
      "epoch": 0.10256564351427555,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.00019999598882613538,
      "loss": 1.4203,
      "step": 160
    },
    {
      "epoch": 0.10577081987409667,
      "grad_norm": 0.029541015625,
      "learning_rate": 0.00019997969398381457,
      "loss": 1.4188,
      "step": 165
    },
    {
      "epoch": 0.10897599623391778,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.00019995086681563726,
      "loss": 1.4512,
      "step": 170
    },
    {
      "epoch": 0.11218117259373889,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.0001999095109350519,
      "loss": 1.417,
      "step": 175
    },
    {
      "epoch": 0.11538634895356,
      "grad_norm": 0.02734375,
      "learning_rate": 0.0001998556315259648,
      "loss": 1.4309,
      "step": 180
    },
    {
      "epoch": 0.11859152531338112,
      "grad_norm": 0.0255126953125,
      "learning_rate": 0.00019978923534209054,
      "loss": 1.4201,
      "step": 185
    },
    {
      "epoch": 0.12179670167320222,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.00019971033070610518,
      "loss": 1.4187,
      "step": 190
    },
    {
      "epoch": 0.12500187803302334,
      "grad_norm": 0.030517578125,
      "learning_rate": 0.0001996189275086033,
      "loss": 1.4153,
      "step": 195
    },
    {
      "epoch": 0.12820705439284444,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.00019951503720685784,
      "loss": 1.4279,
      "step": 200
    },
    {
      "epoch": 0.13141223075266556,
      "grad_norm": 0.0267333984375,
      "learning_rate": 0.0001993986728233844,
      "loss": 1.4052,
      "step": 205
    },
    {
      "epoch": 0.13461740711248665,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.0001992698489443085,
      "loss": 1.3943,
      "step": 210
    },
    {
      "epoch": 0.13782258347230777,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.0001991285817175375,
      "loss": 1.3931,
      "step": 215
    },
    {
      "epoch": 0.1410277598321289,
      "grad_norm": 0.0291748046875,
      "learning_rate": 0.0001989748888507363,
      "loss": 1.3931,
      "step": 220
    },
    {
      "epoch": 0.14423293619195,
      "grad_norm": 0.03125,
      "learning_rate": 0.00019880878960910772,
      "loss": 1.3899,
      "step": 225
    },
    {
      "epoch": 0.1474381125517711,
      "grad_norm": 0.0322265625,
      "learning_rate": 0.0001986303048129778,
      "loss": 1.4305,
      "step": 230
    },
    {
      "epoch": 0.15064328891159223,
      "grad_norm": 0.033203125,
      "learning_rate": 0.0001984394568351858,
      "loss": 1.4028,
      "step": 235
    },
    {
      "epoch": 0.15384846527141333,
      "grad_norm": 0.03369140625,
      "learning_rate": 0.00019823626959827997,
      "loss": 1.3758,
      "step": 240
    },
    {
      "epoch": 0.15705364163123445,
      "grad_norm": 0.041015625,
      "learning_rate": 0.0001980207685715186,
      "loss": 1.407,
      "step": 245
    },
    {
      "epoch": 0.16025881799105554,
      "grad_norm": 0.034912109375,
      "learning_rate": 0.00019779298076767795,
      "loss": 1.3923,
      "step": 250
    },
    {
      "epoch": 0.16346399435087666,
      "grad_norm": 0.047119140625,
      "learning_rate": 0.00019755293473966572,
      "loss": 1.3967,
      "step": 255
    },
    {
      "epoch": 0.16666917071069778,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.00019730066057694235,
      "loss": 1.4007,
      "step": 260
    },
    {
      "epoch": 0.16987434707051888,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00019703618990174918,
      "loss": 1.3978,
      "step": 265
    },
    {
      "epoch": 0.17307952343034,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.00019675955586514468,
      "loss": 1.3744,
      "step": 270
    },
    {
      "epoch": 0.17628469979016112,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.00019647079314284897,
      "loss": 1.3929,
      "step": 275
    },
    {
      "epoch": 0.1794898761499822,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.0001961699379308974,
      "loss": 1.4031,
      "step": 280
    },
    {
      "epoch": 0.18269505250980334,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.0001958570279411032,
      "loss": 1.3813,
      "step": 285
    },
    {
      "epoch": 0.18590022886962446,
      "grad_norm": 0.052734375,
      "learning_rate": 0.00019553210239633056,
      "loss": 1.3956,
      "step": 290
    },
    {
      "epoch": 0.18910540522944555,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.00019519520202557797,
      "loss": 1.3988,
      "step": 295
    },
    {
      "epoch": 0.19231058158926667,
      "grad_norm": 0.037109375,
      "learning_rate": 0.00019484636905887296,
      "loss": 1.3925,
      "step": 300
    },
    {
      "epoch": 0.19551575794908777,
      "grad_norm": 0.036865234375,
      "learning_rate": 0.00019448564722197853,
      "loss": 1.376,
      "step": 305
    },
    {
      "epoch": 0.1987209343089089,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.00019411308173091228,
      "loss": 1.3974,
      "step": 310
    },
    {
      "epoch": 0.20192611066873,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.0001937287192862787,
      "loss": 1.3765,
      "step": 315
    },
    {
      "epoch": 0.2051312870285511,
      "grad_norm": 0.059326171875,
      "learning_rate": 0.00019333260806741502,
      "loss": 1.3769,
      "step": 320
    },
    {
      "epoch": 0.20833646338837222,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.00019292479772635237,
      "loss": 1.3792,
      "step": 325
    },
    {
      "epoch": 0.21154163974819334,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.00019250533938159166,
      "loss": 1.3968,
      "step": 330
    },
    {
      "epoch": 0.21474681610801444,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.00019207428561169608,
      "loss": 1.38,
      "step": 335
    },
    {
      "epoch": 0.21795199246783556,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.0001916316904487005,
      "loss": 1.3737,
      "step": 340
    },
    {
      "epoch": 0.22115716882765665,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.00019117760937133844,
      "loss": 1.4065,
      "step": 345
    },
    {
      "epoch": 0.22436234518747777,
      "grad_norm": 0.038330078125,
      "learning_rate": 0.00019071209929808806,
      "loss": 1.4012,
      "step": 350
    },
    {
      "epoch": 0.2275675215472989,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.00019023521858003742,
      "loss": 1.3941,
      "step": 355
    },
    {
      "epoch": 0.23077269790712,
      "grad_norm": 0.037841796875,
      "learning_rate": 0.00018974702699357029,
      "loss": 1.4072,
      "step": 360
    },
    {
      "epoch": 0.2339778742669411,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.00018924758573287315,
      "loss": 1.3531,
      "step": 365
    },
    {
      "epoch": 0.23718305062676223,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.00018873695740226468,
      "loss": 1.3682,
      "step": 370
    },
    {
      "epoch": 0.24038822698658333,
      "grad_norm": 0.047607421875,
      "learning_rate": 0.0001882152060083484,
      "loss": 1.3796,
      "step": 375
    },
    {
      "epoch": 0.24359340334640445,
      "grad_norm": 0.041015625,
      "learning_rate": 0.00018768239695198945,
      "loss": 1.3835,
      "step": 380
    },
    {
      "epoch": 0.24679857970622554,
      "grad_norm": 0.04541015625,
      "learning_rate": 0.0001871385970201168,
      "loss": 1.3678,
      "step": 385
    },
    {
      "epoch": 0.2500037560660467,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.00018658387437735135,
      "loss": 1.3778,
      "step": 390
    },
    {
      "epoch": 0.2532089324258678,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.00018601829855746185,
      "loss": 1.3811,
      "step": 395
    },
    {
      "epoch": 0.2564141087856889,
      "grad_norm": 0.057373046875,
      "learning_rate": 0.00018544194045464886,
      "loss": 1.3851,
      "step": 400
    },
    {
      "epoch": 0.25961928514551,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.0001848548723146581,
      "loss": 1.3865,
      "step": 405
    },
    {
      "epoch": 0.2628244615053311,
      "grad_norm": 0.047119140625,
      "learning_rate": 0.00018425716772572473,
      "loss": 1.3638,
      "step": 410
    },
    {
      "epoch": 0.2660296378651522,
      "grad_norm": 0.04443359375,
      "learning_rate": 0.00018364890160934904,
      "loss": 1.3918,
      "step": 415
    },
    {
      "epoch": 0.2692348142249733,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.00018303015021090525,
      "loss": 1.3794,
      "step": 420
    },
    {
      "epoch": 0.27243999058479446,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.00018240099109008412,
      "loss": 1.3836,
      "step": 425
    },
    {
      "epoch": 0.27564516694461555,
      "grad_norm": 0.05419921875,
      "learning_rate": 0.000181761503111171,
      "loss": 1.3676,
      "step": 430
    },
    {
      "epoch": 0.27885034330443664,
      "grad_norm": 0.04443359375,
      "learning_rate": 0.0001811117664331604,
      "loss": 1.3513,
      "step": 435
    },
    {
      "epoch": 0.2820555196642578,
      "grad_norm": 0.047607421875,
      "learning_rate": 0.00018045186249970784,
      "loss": 1.3602,
      "step": 440
    },
    {
      "epoch": 0.2852606960240789,
      "grad_norm": 0.043212890625,
      "learning_rate": 0.00017978187402892148,
      "loss": 1.3468,
      "step": 445
    },
    {
      "epoch": 0.2884658723839,
      "grad_norm": 0.05078125,
      "learning_rate": 0.00017910188500299304,
      "loss": 1.3651,
      "step": 450
    },
    {
      "epoch": 0.29167104874372113,
      "grad_norm": 0.04296875,
      "learning_rate": 0.00017841198065767107,
      "loss": 1.3763,
      "step": 455
    },
    {
      "epoch": 0.2948762251035422,
      "grad_norm": 0.044921875,
      "learning_rate": 0.00017771224747157652,
      "loss": 1.3597,
      "step": 460
    },
    {
      "epoch": 0.2980814014633633,
      "grad_norm": 0.0654296875,
      "learning_rate": 0.00017700277315536305,
      "loss": 1.3558,
      "step": 465
    },
    {
      "epoch": 0.30128657782318446,
      "grad_norm": 0.052978515625,
      "learning_rate": 0.00017628364664072218,
      "loss": 1.3534,
      "step": 470
    },
    {
      "epoch": 0.30449175418300556,
      "grad_norm": 0.04248046875,
      "learning_rate": 0.00017555495806923635,
      "loss": 1.3525,
      "step": 475
    },
    {
      "epoch": 0.30769693054282665,
      "grad_norm": 0.044189453125,
      "learning_rate": 0.00017481679878107926,
      "loss": 1.3715,
      "step": 480
    },
    {
      "epoch": 0.3109021069026478,
      "grad_norm": 0.058837890625,
      "learning_rate": 0.00017406926130356692,
      "loss": 1.3689,
      "step": 485
    },
    {
      "epoch": 0.3141072832624689,
      "grad_norm": 0.095703125,
      "learning_rate": 0.00017331243933955918,
      "loss": 1.3686,
      "step": 490
    },
    {
      "epoch": 0.31731245962229,
      "grad_norm": 0.059326171875,
      "learning_rate": 0.00017254642775571438,
      "loss": 1.3784,
      "step": 495
    },
    {
      "epoch": 0.3205176359821111,
      "grad_norm": 0.07373046875,
      "learning_rate": 0.00017177132257059787,
      "loss": 1.3488,
      "step": 500
    },
    {
      "epoch": 0.32372281234193223,
      "grad_norm": 0.0439453125,
      "learning_rate": 0.00017098722094264617,
      "loss": 1.3789,
      "step": 505
    },
    {
      "epoch": 0.3269279887017533,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.00017019422115798833,
      "loss": 1.3414,
      "step": 510
    },
    {
      "epoch": 0.3301331650615744,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.0001693924226181259,
      "loss": 1.3667,
      "step": 515
    },
    {
      "epoch": 0.33333834142139557,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.00016858192582747304,
      "loss": 1.3749,
      "step": 520
    },
    {
      "epoch": 0.33654351778121666,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.00016776283238075851,
      "loss": 1.3929,
      "step": 525
    },
    {
      "epoch": 0.33974869414103775,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.00016693524495029068,
      "loss": 1.3527,
      "step": 530
    },
    {
      "epoch": 0.3429538705008589,
      "grad_norm": 0.059814453125,
      "learning_rate": 0.00016609926727308806,
      "loss": 1.3577,
      "step": 535
    },
    {
      "epoch": 0.34615904686068,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.00016525500413787554,
      "loss": 1.3639,
      "step": 540
    },
    {
      "epoch": 0.3493642232205011,
      "grad_norm": 0.0595703125,
      "learning_rate": 0.00016440256137194965,
      "loss": 1.3608,
      "step": 545
    },
    {
      "epoch": 0.35256939958032224,
      "grad_norm": 0.052978515625,
      "learning_rate": 0.0001635420458279131,
      "loss": 1.3324,
      "step": 550
    },
    {
      "epoch": 0.35577457594014333,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.0001626735653702809,
      "loss": 1.3283,
      "step": 555
    },
    {
      "epoch": 0.3589797522999644,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.00016179722886195967,
      "loss": 1.3287,
      "step": 560
    },
    {
      "epoch": 0.3621849286597856,
      "grad_norm": 0.0703125,
      "learning_rate": 0.00016091314615060195,
      "loss": 1.3799,
      "step": 565
    },
    {
      "epoch": 0.36539010501960667,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.00016002142805483685,
      "loss": 1.3399,
      "step": 570
    },
    {
      "epoch": 0.36859528137942776,
      "grad_norm": 0.05908203125,
      "learning_rate": 0.00015912218635037896,
      "loss": 1.3698,
      "step": 575
    },
    {
      "epoch": 0.3718004577392489,
      "grad_norm": 0.05078125,
      "learning_rate": 0.0001582155337560177,
      "loss": 1.3378,
      "step": 580
    },
    {
      "epoch": 0.37500563409907,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.00015730158391948784,
      "loss": 1.337,
      "step": 585
    },
    {
      "epoch": 0.3782108104588911,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.0001563804514032242,
      "loss": 1.3527,
      "step": 590
    },
    {
      "epoch": 0.3814159868187122,
      "grad_norm": 0.052734375,
      "learning_rate": 0.0001554522516700011,
      "loss": 1.3583,
      "step": 595
    },
    {
      "epoch": 0.38462116317853334,
      "grad_norm": 0.06201171875,
      "learning_rate": 0.00015451710106845955,
      "loss": 1.3421,
      "step": 600
    },
    {
      "epoch": 0.38782633953835444,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.0001535751168185228,
      "loss": 1.3577,
      "step": 605
    },
    {
      "epoch": 0.39103151589817553,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.00015262641699670328,
      "loss": 1.3706,
      "step": 610
    },
    {
      "epoch": 0.3942366922579967,
      "grad_norm": 0.054931640625,
      "learning_rate": 0.0001516711205213016,
      "loss": 1.3439,
      "step": 615
    },
    {
      "epoch": 0.3974418686178178,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.00015070934713750042,
      "loss": 1.3353,
      "step": 620
    },
    {
      "epoch": 0.40064704497763887,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.00014974121740235456,
      "loss": 1.3489,
      "step": 625
    },
    {
      "epoch": 0.40385222133746,
      "grad_norm": 0.057373046875,
      "learning_rate": 0.00014876685266967924,
      "loss": 1.3481,
      "step": 630
    },
    {
      "epoch": 0.4070573976972811,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.00014778637507483866,
      "loss": 1.3533,
      "step": 635
    },
    {
      "epoch": 0.4102625740571022,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0001467999075194363,
      "loss": 1.3522,
      "step": 640
    },
    {
      "epoch": 0.41346775041692335,
      "grad_norm": 0.06689453125,
      "learning_rate": 0.00014580757365590963,
      "loss": 1.3712,
      "step": 645
    },
    {
      "epoch": 0.41667292677674445,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.00014480949787203014,
      "loss": 1.3606,
      "step": 650
    },
    {
      "epoch": 0.41987810313656554,
      "grad_norm": 0.046875,
      "learning_rate": 0.0001438058052753118,
      "loss": 1.3488,
      "step": 655
    },
    {
      "epoch": 0.4230832794963867,
      "grad_norm": 0.058837890625,
      "learning_rate": 0.00014279662167732867,
      "loss": 1.342,
      "step": 660
    },
    {
      "epoch": 0.4262884558562078,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.00014178207357794486,
      "loss": 1.3712,
      "step": 665
    },
    {
      "epoch": 0.4294936322160289,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00014076228814945778,
      "loss": 1.3227,
      "step": 670
    },
    {
      "epoch": 0.43269880857585,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.00013973739322065728,
      "loss": 1.3201,
      "step": 675
    },
    {
      "epoch": 0.4359039849356711,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00013870751726080256,
      "loss": 1.3406,
      "step": 680
    },
    {
      "epoch": 0.4391091612954922,
      "grad_norm": 0.06201171875,
      "learning_rate": 0.00013767278936351854,
      "loss": 1.3636,
      "step": 685
    },
    {
      "epoch": 0.4423143376553133,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.0001366333392306143,
      "loss": 1.3576,
      "step": 690
    },
    {
      "epoch": 0.44551951401513445,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.00013558929715582515,
      "loss": 1.3517,
      "step": 695
    },
    {
      "epoch": 0.44872469037495555,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.00013454079400848027,
      "loss": 1.3376,
      "step": 700
    },
    {
      "epoch": 0.45192986673477664,
      "grad_norm": 0.059326171875,
      "learning_rate": 0.00013348796121709862,
      "loss": 1.3633,
      "step": 705
    },
    {
      "epoch": 0.4551350430945978,
      "grad_norm": 0.05078125,
      "learning_rate": 0.00013243093075291444,
      "loss": 1.3217,
      "step": 710
    },
    {
      "epoch": 0.4583402194544189,
      "grad_norm": 0.056884765625,
      "learning_rate": 0.00013136983511333482,
      "loss": 1.3265,
      "step": 715
    },
    {
      "epoch": 0.46154539581424,
      "grad_norm": 0.05859375,
      "learning_rate": 0.00013030480730533145,
      "loss": 1.3451,
      "step": 720
    },
    {
      "epoch": 0.4647505721740611,
      "grad_norm": 0.054443359375,
      "learning_rate": 0.00012923598082876812,
      "loss": 1.376,
      "step": 725
    },
    {
      "epoch": 0.4679557485338822,
      "grad_norm": 0.058349609375,
      "learning_rate": 0.0001281634896596669,
      "loss": 1.3524,
      "step": 730
    },
    {
      "epoch": 0.4711609248937033,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.00012708746823341446,
      "loss": 1.3599,
      "step": 735
    },
    {
      "epoch": 0.47436610125352446,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.00012600805142791042,
      "loss": 1.3416,
      "step": 740
    },
    {
      "epoch": 0.47757127761334556,
      "grad_norm": 0.055419921875,
      "learning_rate": 0.000124925374546661,
      "loss": 1.3574,
      "step": 745
    },
    {
      "epoch": 0.48077645397316665,
      "grad_norm": 0.052978515625,
      "learning_rate": 0.0001238395733018187,
      "loss": 1.3574,
      "step": 750
    },
    {
      "epoch": 0.4839816303329878,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.00012275078379717089,
      "loss": 1.3341,
      "step": 755
    },
    {
      "epoch": 0.4871868066928089,
      "grad_norm": 0.0556640625,
      "learning_rate": 0.00012165914251107952,
      "loss": 1.3241,
      "step": 760
    },
    {
      "epoch": 0.49039198305263,
      "grad_norm": 0.054443359375,
      "learning_rate": 0.00012056478627937365,
      "loss": 1.3788,
      "step": 765
    },
    {
      "epoch": 0.4935971594124511,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.00011946785227819726,
      "loss": 1.3581,
      "step": 770
    },
    {
      "epoch": 0.49680233577227223,
      "grad_norm": 0.05615234375,
      "learning_rate": 0.00011836847800681443,
      "loss": 1.3328,
      "step": 775
    },
    {
      "epoch": 0.5000075121320934,
      "grad_norm": 0.0556640625,
      "learning_rate": 0.00011726680127037401,
      "loss": 1.3533,
      "step": 780
    },
    {
      "epoch": 0.5032126884919145,
      "grad_norm": 0.05419921875,
      "learning_rate": 0.00011616296016263582,
      "loss": 1.3622,
      "step": 785
    },
    {
      "epoch": 0.5064178648517356,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.00011505709304866084,
      "loss": 1.3446,
      "step": 790
    },
    {
      "epoch": 0.5096230412115567,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.00011394933854746733,
      "loss": 1.3384,
      "step": 795
    },
    {
      "epoch": 0.5128282175713778,
      "grad_norm": 0.055908203125,
      "learning_rate": 0.00011283983551465511,
      "loss": 1.3378,
      "step": 800
    },
    {
      "epoch": 0.5160333939311988,
      "grad_norm": 0.060791015625,
      "learning_rate": 0.00011172872302500017,
      "loss": 1.3656,
      "step": 805
    },
    {
      "epoch": 0.51923857029102,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.00011061614035502193,
      "loss": 1.3521,
      "step": 810
    },
    {
      "epoch": 0.5224437466508411,
      "grad_norm": 0.05859375,
      "learning_rate": 0.00010950222696552486,
      "loss": 1.3614,
      "step": 815
    },
    {
      "epoch": 0.5256489230106622,
      "grad_norm": 0.08203125,
      "learning_rate": 0.00010838712248411753,
      "loss": 1.3314,
      "step": 820
    },
    {
      "epoch": 0.5288540993704833,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.00010727096668771036,
      "loss": 1.338,
      "step": 825
    },
    {
      "epoch": 0.5320592757303044,
      "grad_norm": 0.0556640625,
      "learning_rate": 0.0001061538994849946,
      "loss": 1.3611,
      "step": 830
    },
    {
      "epoch": 0.5352644520901255,
      "grad_norm": 0.06201171875,
      "learning_rate": 0.00010503606089890529,
      "loss": 1.3175,
      "step": 835
    },
    {
      "epoch": 0.5384696284499466,
      "grad_norm": 0.05712890625,
      "learning_rate": 0.00010391759104906928,
      "loss": 1.3525,
      "step": 840
    },
    {
      "epoch": 0.5416748048097678,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.00010279863013424154,
      "loss": 1.3313,
      "step": 845
    },
    {
      "epoch": 0.5448799811695889,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.00010167931841473142,
      "loss": 1.3349,
      "step": 850
    },
    {
      "epoch": 0.54808515752941,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.00010055979619482112,
      "loss": 1.3408,
      "step": 855
    },
    {
      "epoch": 0.5512903338892311,
      "grad_norm": 0.058837890625,
      "learning_rate": 9.944020380517889e-05,
      "loss": 1.3175,
      "step": 860
    },
    {
      "epoch": 0.5544955102490522,
      "grad_norm": 0.050048828125,
      "learning_rate": 9.832068158526862e-05,
      "loss": 1.3375,
      "step": 865
    },
    {
      "epoch": 0.5577006866088733,
      "grad_norm": 0.0498046875,
      "learning_rate": 9.720136986575848e-05,
      "loss": 1.3475,
      "step": 870
    },
    {
      "epoch": 0.5609058629686945,
      "grad_norm": 0.051513671875,
      "learning_rate": 9.608240895093076e-05,
      "loss": 1.3295,
      "step": 875
    },
    {
      "epoch": 0.5641110393285156,
      "grad_norm": 0.046142578125,
      "learning_rate": 9.496393910109472e-05,
      "loss": 1.3429,
      "step": 880
    },
    {
      "epoch": 0.5673162156883367,
      "grad_norm": 0.04443359375,
      "learning_rate": 9.384610051500545e-05,
      "loss": 1.3293,
      "step": 885
    },
    {
      "epoch": 0.5705213920481578,
      "grad_norm": 0.052734375,
      "learning_rate": 9.272903331228968e-05,
      "loss": 1.3498,
      "step": 890
    },
    {
      "epoch": 0.5737265684079789,
      "grad_norm": 0.062255859375,
      "learning_rate": 9.161287751588248e-05,
      "loss": 1.3351,
      "step": 895
    },
    {
      "epoch": 0.5769317447678,
      "grad_norm": 0.064453125,
      "learning_rate": 9.049777303447516e-05,
      "loss": 1.353,
      "step": 900
    },
    {
      "epoch": 0.5801369211276212,
      "grad_norm": 0.0556640625,
      "learning_rate": 8.938385964497808e-05,
      "loss": 1.3363,
      "step": 905
    },
    {
      "epoch": 0.5833420974874423,
      "grad_norm": 0.06201171875,
      "learning_rate": 8.827127697499984e-05,
      "loss": 1.3696,
      "step": 910
    },
    {
      "epoch": 0.5865472738472634,
      "grad_norm": 0.080078125,
      "learning_rate": 8.71601644853449e-05,
      "loss": 1.3481,
      "step": 915
    },
    {
      "epoch": 0.5897524502070844,
      "grad_norm": 0.06884765625,
      "learning_rate": 8.605066145253268e-05,
      "loss": 1.3256,
      "step": 920
    },
    {
      "epoch": 0.5929576265669055,
      "grad_norm": 0.051513671875,
      "learning_rate": 8.494290695133917e-05,
      "loss": 1.3544,
      "step": 925
    },
    {
      "epoch": 0.5961628029267266,
      "grad_norm": 0.05810546875,
      "learning_rate": 8.383703983736419e-05,
      "loss": 1.3443,
      "step": 930
    },
    {
      "epoch": 0.5993679792865477,
      "grad_norm": 0.06103515625,
      "learning_rate": 8.2733198729626e-05,
      "loss": 1.3816,
      "step": 935
    },
    {
      "epoch": 0.6025731556463689,
      "grad_norm": 0.046142578125,
      "learning_rate": 8.163152199318558e-05,
      "loss": 1.3247,
      "step": 940
    },
    {
      "epoch": 0.60577833200619,
      "grad_norm": 0.053466796875,
      "learning_rate": 8.053214772180277e-05,
      "loss": 1.3532,
      "step": 945
    },
    {
      "epoch": 0.6089835083660111,
      "grad_norm": 0.05419921875,
      "learning_rate": 7.94352137206264e-05,
      "loss": 1.3443,
      "step": 950
    },
    {
      "epoch": 0.6121886847258322,
      "grad_norm": 0.047119140625,
      "learning_rate": 7.83408574889205e-05,
      "loss": 1.3327,
      "step": 955
    },
    {
      "epoch": 0.6153938610856533,
      "grad_norm": 0.0537109375,
      "learning_rate": 7.724921620282916e-05,
      "loss": 1.334,
      "step": 960
    },
    {
      "epoch": 0.6185990374454744,
      "grad_norm": 0.0703125,
      "learning_rate": 7.616042669818133e-05,
      "loss": 1.3572,
      "step": 965
    },
    {
      "epoch": 0.6218042138052956,
      "grad_norm": 0.055419921875,
      "learning_rate": 7.507462545333903e-05,
      "loss": 1.3322,
      "step": 970
    },
    {
      "epoch": 0.6250093901651167,
      "grad_norm": 0.07958984375,
      "learning_rate": 7.399194857208961e-05,
      "loss": 1.3222,
      "step": 975
    },
    {
      "epoch": 0.6282145665249378,
      "grad_norm": 0.05078125,
      "learning_rate": 7.291253176658561e-05,
      "loss": 1.3375,
      "step": 980
    },
    {
      "epoch": 0.6314197428847589,
      "grad_norm": 0.08251953125,
      "learning_rate": 7.183651034033313e-05,
      "loss": 1.3397,
      "step": 985
    },
    {
      "epoch": 0.63462491924458,
      "grad_norm": 0.04931640625,
      "learning_rate": 7.07640191712319e-05,
      "loss": 1.34,
      "step": 990
    },
    {
      "epoch": 0.6378300956044011,
      "grad_norm": 0.049072265625,
      "learning_rate": 6.969519269466857e-05,
      "loss": 1.3344,
      "step": 995
    },
    {
      "epoch": 0.6410352719642222,
      "grad_norm": 0.052490234375,
      "learning_rate": 6.863016488666517e-05,
      "loss": 1.3475,
      "step": 1000
    },
    {
      "epoch": 0.6442404483240434,
      "grad_norm": 0.04736328125,
      "learning_rate": 6.756906924708558e-05,
      "loss": 1.3317,
      "step": 1005
    },
    {
      "epoch": 0.6474456246838645,
      "grad_norm": 0.050537109375,
      "learning_rate": 6.651203878290139e-05,
      "loss": 1.3243,
      "step": 1010
    },
    {
      "epoch": 0.6506508010436856,
      "grad_norm": 0.053955078125,
      "learning_rate": 6.545920599151975e-05,
      "loss": 1.3351,
      "step": 1015
    },
    {
      "epoch": 0.6538559774035066,
      "grad_norm": 0.058837890625,
      "learning_rate": 6.441070284417487e-05,
      "loss": 1.3536,
      "step": 1020
    },
    {
      "epoch": 0.6570611537633277,
      "grad_norm": 0.060791015625,
      "learning_rate": 6.336666076938572e-05,
      "loss": 1.3064,
      "step": 1025
    },
    {
      "epoch": 0.6602663301231488,
      "grad_norm": 0.056396484375,
      "learning_rate": 6.232721063648148e-05,
      "loss": 1.3496,
      "step": 1030
    },
    {
      "epoch": 0.66347150648297,
      "grad_norm": 0.0478515625,
      "learning_rate": 6.12924827391975e-05,
      "loss": 1.3487,
      "step": 1035
    },
    {
      "epoch": 0.6666766828427911,
      "grad_norm": 0.05126953125,
      "learning_rate": 6.026260677934272e-05,
      "loss": 1.3241,
      "step": 1040
    },
    {
      "epoch": 0.6698818592026122,
      "grad_norm": 0.0478515625,
      "learning_rate": 5.9237711850542246e-05,
      "loss": 1.3454,
      "step": 1045
    },
    {
      "epoch": 0.6730870355624333,
      "grad_norm": 0.046142578125,
      "learning_rate": 5.8217926422055126e-05,
      "loss": 1.3364,
      "step": 1050
    },
    {
      "epoch": 0.6762922119222544,
      "grad_norm": 0.054443359375,
      "learning_rate": 5.7203378322671355e-05,
      "loss": 1.3152,
      "step": 1055
    },
    {
      "epoch": 0.6794973882820755,
      "grad_norm": 0.0546875,
      "learning_rate": 5.619419472468823e-05,
      "loss": 1.3486,
      "step": 1060
    },
    {
      "epoch": 0.6827025646418967,
      "grad_norm": 0.05029296875,
      "learning_rate": 5.519050212796986e-05,
      "loss": 1.3301,
      "step": 1065
    },
    {
      "epoch": 0.6859077410017178,
      "grad_norm": 0.051513671875,
      "learning_rate": 5.419242634409039e-05,
      "loss": 1.3279,
      "step": 1070
    },
    {
      "epoch": 0.6891129173615389,
      "grad_norm": 0.0478515625,
      "learning_rate": 5.32000924805637e-05,
      "loss": 1.3415,
      "step": 1075
    },
    {
      "epoch": 0.69231809372136,
      "grad_norm": 0.04638671875,
      "learning_rate": 5.2213624925161386e-05,
      "loss": 1.3449,
      "step": 1080
    },
    {
      "epoch": 0.6955232700811811,
      "grad_norm": 0.04541015625,
      "learning_rate": 5.123314733032074e-05,
      "loss": 1.3442,
      "step": 1085
    },
    {
      "epoch": 0.6987284464410022,
      "grad_norm": 0.04736328125,
      "learning_rate": 5.0258782597645446e-05,
      "loss": 1.3309,
      "step": 1090
    },
    {
      "epoch": 0.7019336228008233,
      "grad_norm": 0.0478515625,
      "learning_rate": 4.929065286249959e-05,
      "loss": 1.3564,
      "step": 1095
    },
    {
      "epoch": 0.7051387991606445,
      "grad_norm": 0.048095703125,
      "learning_rate": 4.832887947869841e-05,
      "loss": 1.3578,
      "step": 1100
    },
    {
      "epoch": 0.7083439755204656,
      "grad_norm": 0.047119140625,
      "learning_rate": 4.737358300329673e-05,
      "loss": 1.3417,
      "step": 1105
    },
    {
      "epoch": 0.7115491518802867,
      "grad_norm": 0.05029296875,
      "learning_rate": 4.642488318147723e-05,
      "loss": 1.3259,
      "step": 1110
    },
    {
      "epoch": 0.7147543282401078,
      "grad_norm": 0.052001953125,
      "learning_rate": 4.548289893154051e-05,
      "loss": 1.3568,
      "step": 1115
    },
    {
      "epoch": 0.7179595045999289,
      "grad_norm": 0.047607421875,
      "learning_rate": 4.4547748329998925e-05,
      "loss": 1.3211,
      "step": 1120
    },
    {
      "epoch": 0.72116468095975,
      "grad_norm": 0.05126953125,
      "learning_rate": 4.361954859677584e-05,
      "loss": 1.3398,
      "step": 1125
    },
    {
      "epoch": 0.7243698573195712,
      "grad_norm": 0.048095703125,
      "learning_rate": 4.2698416080512204e-05,
      "loss": 1.3266,
      "step": 1130
    },
    {
      "epoch": 0.7275750336793922,
      "grad_norm": 0.050048828125,
      "learning_rate": 4.1784466243982324e-05,
      "loss": 1.3447,
      "step": 1135
    },
    {
      "epoch": 0.7307802100392133,
      "grad_norm": 0.052001953125,
      "learning_rate": 4.0877813649621076e-05,
      "loss": 1.3385,
      "step": 1140
    },
    {
      "epoch": 0.7339853863990344,
      "grad_norm": 0.04638671875,
      "learning_rate": 3.997857194516319e-05,
      "loss": 1.3403,
      "step": 1145
    },
    {
      "epoch": 0.7371905627588555,
      "grad_norm": 0.05078125,
      "learning_rate": 3.9086853849398065e-05,
      "loss": 1.3503,
      "step": 1150
    },
    {
      "epoch": 0.7403957391186766,
      "grad_norm": 0.06396484375,
      "learning_rate": 3.8202771138040336e-05,
      "loss": 1.3354,
      "step": 1155
    },
    {
      "epoch": 0.7436009154784978,
      "grad_norm": 0.05078125,
      "learning_rate": 3.732643462971912e-05,
      "loss": 1.3258,
      "step": 1160
    },
    {
      "epoch": 0.7468060918383189,
      "grad_norm": 0.049560546875,
      "learning_rate": 3.6457954172086896e-05,
      "loss": 1.3493,
      "step": 1165
    },
    {
      "epoch": 0.75001126819814,
      "grad_norm": 0.046875,
      "learning_rate": 3.559743862805034e-05,
      "loss": 1.3275,
      "step": 1170
    },
    {
      "epoch": 0.7532164445579611,
      "grad_norm": 0.045654296875,
      "learning_rate": 3.47449958621245e-05,
      "loss": 1.3148,
      "step": 1175
    },
    {
      "epoch": 0.7564216209177822,
      "grad_norm": 0.051513671875,
      "learning_rate": 3.390073272691198e-05,
      "loss": 1.3338,
      "step": 1180
    },
    {
      "epoch": 0.7596267972776033,
      "grad_norm": 0.049072265625,
      "learning_rate": 3.306475504970931e-05,
      "loss": 1.2935,
      "step": 1185
    },
    {
      "epoch": 0.7628319736374244,
      "grad_norm": 0.04833984375,
      "learning_rate": 3.2237167619241495e-05,
      "loss": 1.3275,
      "step": 1190
    },
    {
      "epoch": 0.7660371499972456,
      "grad_norm": 0.056396484375,
      "learning_rate": 3.141807417252697e-05,
      "loss": 1.3461,
      "step": 1195
    },
    {
      "epoch": 0.7692423263570667,
      "grad_norm": 0.04345703125,
      "learning_rate": 3.060757738187409e-05,
      "loss": 1.3394,
      "step": 1200
    },
    {
      "epoch": 0.7724475027168878,
      "grad_norm": 0.053955078125,
      "learning_rate": 2.980577884201169e-05,
      "loss": 1.3511,
      "step": 1205
    },
    {
      "epoch": 0.7756526790767089,
      "grad_norm": 0.04736328125,
      "learning_rate": 2.9012779057353855e-05,
      "loss": 1.3213,
      "step": 1210
    },
    {
      "epoch": 0.77885785543653,
      "grad_norm": 0.0576171875,
      "learning_rate": 2.822867742940214e-05,
      "loss": 1.3384,
      "step": 1215
    },
    {
      "epoch": 0.7820630317963511,
      "grad_norm": 0.04833984375,
      "learning_rate": 2.745357224428563e-05,
      "loss": 1.343,
      "step": 1220
    },
    {
      "epoch": 0.7852682081561723,
      "grad_norm": 0.049560546875,
      "learning_rate": 2.6687560660440858e-05,
      "loss": 1.3541,
      "step": 1225
    },
    {
      "epoch": 0.7884733845159934,
      "grad_norm": 0.047607421875,
      "learning_rate": 2.593073869643312e-05,
      "loss": 1.3491,
      "step": 1230
    },
    {
      "epoch": 0.7916785608758145,
      "grad_norm": 0.04248046875,
      "learning_rate": 2.518320121892076e-05,
      "loss": 1.3439,
      "step": 1235
    },
    {
      "epoch": 0.7948837372356355,
      "grad_norm": 0.04736328125,
      "learning_rate": 2.4445041930763678e-05,
      "loss": 1.3236,
      "step": 1240
    },
    {
      "epoch": 0.7980889135954566,
      "grad_norm": 0.0478515625,
      "learning_rate": 2.371635335927781e-05,
      "loss": 1.3505,
      "step": 1245
    },
    {
      "epoch": 0.8012940899552777,
      "grad_norm": 0.0517578125,
      "learning_rate": 2.2997226844636977e-05,
      "loss": 1.3223,
      "step": 1250
    },
    {
      "epoch": 0.8044992663150989,
      "grad_norm": 0.046630859375,
      "learning_rate": 2.2287752528423468e-05,
      "loss": 1.3282,
      "step": 1255
    },
    {
      "epoch": 0.80770444267492,
      "grad_norm": 0.046875,
      "learning_rate": 2.1588019342328968e-05,
      "loss": 1.3294,
      "step": 1260
    },
    {
      "epoch": 0.8109096190347411,
      "grad_norm": 0.0439453125,
      "learning_rate": 2.089811499700699e-05,
      "loss": 1.3356,
      "step": 1265
    },
    {
      "epoch": 0.8141147953945622,
      "grad_norm": 0.045654296875,
      "learning_rate": 2.021812597107855e-05,
      "loss": 1.3486,
      "step": 1270
    },
    {
      "epoch": 0.8173199717543833,
      "grad_norm": 0.04931640625,
      "learning_rate": 1.954813750029216e-05,
      "loss": 1.3492,
      "step": 1275
    },
    {
      "epoch": 0.8205251481142044,
      "grad_norm": 0.05126953125,
      "learning_rate": 1.8888233566839653e-05,
      "loss": 1.329,
      "step": 1280
    },
    {
      "epoch": 0.8237303244740255,
      "grad_norm": 0.048095703125,
      "learning_rate": 1.8238496888828982e-05,
      "loss": 1.317,
      "step": 1285
    },
    {
      "epoch": 0.8269355008338467,
      "grad_norm": 0.051513671875,
      "learning_rate": 1.759900890991589e-05,
      "loss": 1.3177,
      "step": 1290
    },
    {
      "epoch": 0.8301406771936678,
      "grad_norm": 0.0458984375,
      "learning_rate": 1.696984978909476e-05,
      "loss": 1.323,
      "step": 1295
    },
    {
      "epoch": 0.8333458535534889,
      "grad_norm": 0.0439453125,
      "learning_rate": 1.6351098390650966e-05,
      "loss": 1.3517,
      "step": 1300
    },
    {
      "epoch": 0.83655102991331,
      "grad_norm": 0.052978515625,
      "learning_rate": 1.5742832274275288e-05,
      "loss": 1.35,
      "step": 1305
    },
    {
      "epoch": 0.8397562062731311,
      "grad_norm": 0.049072265625,
      "learning_rate": 1.514512768534193e-05,
      "loss": 1.3614,
      "step": 1310
    },
    {
      "epoch": 0.8429613826329522,
      "grad_norm": 0.0439453125,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 1.3389,
      "step": 1315
    },
    {
      "epoch": 0.8461665589927734,
      "grad_norm": 0.04541015625,
      "learning_rate": 1.3981701442538153e-05,
      "loss": 1.3272,
      "step": 1320
    },
    {
      "epoch": 0.8493717353525945,
      "grad_norm": 0.048583984375,
      "learning_rate": 1.3416125622648668e-05,
      "loss": 1.3324,
      "step": 1325
    },
    {
      "epoch": 0.8525769117124156,
      "grad_norm": 0.04541015625,
      "learning_rate": 1.286140297988323e-05,
      "loss": 1.3352,
      "step": 1330
    },
    {
      "epoch": 0.8557820880722367,
      "grad_norm": 0.04443359375,
      "learning_rate": 1.231760304801054e-05,
      "loss": 1.3361,
      "step": 1335
    },
    {
      "epoch": 0.8589872644320578,
      "grad_norm": 0.047119140625,
      "learning_rate": 1.1784793991651621e-05,
      "loss": 1.3252,
      "step": 1340
    },
    {
      "epoch": 0.8621924407918788,
      "grad_norm": 0.044189453125,
      "learning_rate": 1.1263042597735362e-05,
      "loss": 1.3468,
      "step": 1345
    },
    {
      "epoch": 0.8653976171517,
      "grad_norm": 0.046630859375,
      "learning_rate": 1.0752414267126875e-05,
      "loss": 1.3301,
      "step": 1350
    },
    {
      "epoch": 0.8686027935115211,
      "grad_norm": 0.05029296875,
      "learning_rate": 1.0252973006429733e-05,
      "loss": 1.36,
      "step": 1355
    },
    {
      "epoch": 0.8718079698713422,
      "grad_norm": 0.047119140625,
      "learning_rate": 9.764781419962577e-06,
      "loss": 1.3482,
      "step": 1360
    },
    {
      "epoch": 0.8750131462311633,
      "grad_norm": 0.04638671875,
      "learning_rate": 9.287900701911944e-06,
      "loss": 1.3232,
      "step": 1365
    },
    {
      "epoch": 0.8782183225909844,
      "grad_norm": 0.04931640625,
      "learning_rate": 8.822390628661582e-06,
      "loss": 1.3571,
      "step": 1370
    },
    {
      "epoch": 0.8814234989508055,
      "grad_norm": 0.044921875,
      "learning_rate": 8.368309551299536e-06,
      "loss": 1.3274,
      "step": 1375
    },
    {
      "epoch": 0.8846286753106266,
      "grad_norm": 0.04541015625,
      "learning_rate": 7.92571438830394e-06,
      "loss": 1.3656,
      "step": 1380
    },
    {
      "epoch": 0.8878338516704478,
      "grad_norm": 0.046142578125,
      "learning_rate": 7.494660618408378e-06,
      "loss": 1.3659,
      "step": 1385
    },
    {
      "epoch": 0.8910390280302689,
      "grad_norm": 0.04541015625,
      "learning_rate": 7.075202273647652e-06,
      "loss": 1.3305,
      "step": 1390
    },
    {
      "epoch": 0.89424420439009,
      "grad_norm": 0.046875,
      "learning_rate": 6.667391932584999e-06,
      "loss": 1.36,
      "step": 1395
    },
    {
      "epoch": 0.8974493807499111,
      "grad_norm": 0.0458984375,
      "learning_rate": 6.271280713721317e-06,
      "loss": 1.3382,
      "step": 1400
    },
    {
      "epoch": 0.9006545571097322,
      "grad_norm": 0.04638671875,
      "learning_rate": 5.886918269087716e-06,
      "loss": 1.326,
      "step": 1405
    },
    {
      "epoch": 0.9038597334695533,
      "grad_norm": 0.046875,
      "learning_rate": 5.514352778021492e-06,
      "loss": 1.3602,
      "step": 1410
    },
    {
      "epoch": 0.9070649098293745,
      "grad_norm": 0.046142578125,
      "learning_rate": 5.153630941127063e-06,
      "loss": 1.3407,
      "step": 1415
    },
    {
      "epoch": 0.9102700861891956,
      "grad_norm": 0.046875,
      "learning_rate": 4.804797974422026e-06,
      "loss": 1.3241,
      "step": 1420
    },
    {
      "epoch": 0.9134752625490167,
      "grad_norm": 0.050537109375,
      "learning_rate": 4.4678976036694355e-06,
      "loss": 1.3324,
      "step": 1425
    },
    {
      "epoch": 0.9166804389088378,
      "grad_norm": 0.04443359375,
      "learning_rate": 4.142972058896811e-06,
      "loss": 1.3267,
      "step": 1430
    },
    {
      "epoch": 0.9198856152686589,
      "grad_norm": 0.044921875,
      "learning_rate": 3.830062069102602e-06,
      "loss": 1.3496,
      "step": 1435
    },
    {
      "epoch": 0.92309079162848,
      "grad_norm": 0.046630859375,
      "learning_rate": 3.529206857151035e-06,
      "loss": 1.3481,
      "step": 1440
    },
    {
      "epoch": 0.9262959679883012,
      "grad_norm": 0.04345703125,
      "learning_rate": 3.240444134855347e-06,
      "loss": 1.3433,
      "step": 1445
    },
    {
      "epoch": 0.9295011443481223,
      "grad_norm": 0.045654296875,
      "learning_rate": 2.963810098250841e-06,
      "loss": 1.3555,
      "step": 1450
    },
    {
      "epoch": 0.9327063207079433,
      "grad_norm": 0.044921875,
      "learning_rate": 2.6993394230576674e-06,
      "loss": 1.3218,
      "step": 1455
    },
    {
      "epoch": 0.9359114970677644,
      "grad_norm": 0.04638671875,
      "learning_rate": 2.4470652603343023e-06,
      "loss": 1.346,
      "step": 1460
    },
    {
      "epoch": 0.9391166734275855,
      "grad_norm": 0.044677734375,
      "learning_rate": 2.2070192323220607e-06,
      "loss": 1.3551,
      "step": 1465
    },
    {
      "epoch": 0.9423218497874066,
      "grad_norm": 0.0439453125,
      "learning_rate": 1.9792314284813986e-06,
      "loss": 1.3262,
      "step": 1470
    },
    {
      "epoch": 0.9455270261472277,
      "grad_norm": 0.04736328125,
      "learning_rate": 1.763730401720065e-06,
      "loss": 1.3257,
      "step": 1475
    },
    {
      "epoch": 0.9487322025070489,
      "grad_norm": 0.046142578125,
      "learning_rate": 1.5605431648141878e-06,
      "loss": 1.3158,
      "step": 1480
    },
    {
      "epoch": 0.95193737886687,
      "grad_norm": 0.044677734375,
      "learning_rate": 1.3696951870222018e-06,
      "loss": 1.3637,
      "step": 1485
    },
    {
      "epoch": 0.9551425552266911,
      "grad_norm": 0.053466796875,
      "learning_rate": 1.1912103908922945e-06,
      "loss": 1.3337,
      "step": 1490
    },
    {
      "epoch": 0.9583477315865122,
      "grad_norm": 0.050048828125,
      "learning_rate": 1.0251111492637244e-06,
      "loss": 1.3557,
      "step": 1495
    },
    {
      "epoch": 0.9615529079463333,
      "grad_norm": 0.05126953125,
      "learning_rate": 8.714182824624883e-07,
      "loss": 1.3373,
      "step": 1500
    },
    {
      "epoch": 0.9647580843061544,
      "grad_norm": 0.0458984375,
      "learning_rate": 7.301510556914859e-07,
      "loss": 1.3274,
      "step": 1505
    },
    {
      "epoch": 0.9679632606659756,
      "grad_norm": 0.05224609375,
      "learning_rate": 6.01327176615607e-07,
      "loss": 1.3894,
      "step": 1510
    },
    {
      "epoch": 0.9711684370257967,
      "grad_norm": 0.045166015625,
      "learning_rate": 4.84962793142163e-07,
      "loss": 1.3419,
      "step": 1515
    },
    {
      "epoch": 0.9743736133856178,
      "grad_norm": 0.044677734375,
      "learning_rate": 3.8107249139672783e-07,
      "loss": 1.3321,
      "step": 1520
    },
    {
      "epoch": 0.9775787897454389,
      "grad_norm": 0.0478515625,
      "learning_rate": 2.89669293894812e-07,
      "loss": 1.3497,
      "step": 1525
    },
    {
      "epoch": 0.98078396610526,
      "grad_norm": 0.049072265625,
      "learning_rate": 2.1076465790946798e-07,
      "loss": 1.3518,
      "step": 1530
    },
    {
      "epoch": 0.9839891424650811,
      "grad_norm": 0.04638671875,
      "learning_rate": 1.443684740351947e-07,
      "loss": 1.3224,
      "step": 1535
    },
    {
      "epoch": 0.9871943188249022,
      "grad_norm": 0.049072265625,
      "learning_rate": 9.048906494811826e-08,
      "loss": 1.3513,
      "step": 1540
    },
    {
      "epoch": 0.9903994951847234,
      "grad_norm": 0.050048828125,
      "learning_rate": 4.9133184362748497e-08,
      "loss": 1.3494,
      "step": 1545
    },
    {
      "epoch": 0.9936046715445445,
      "grad_norm": 0.04443359375,
      "learning_rate": 2.0306016185456244e-08,
      "loss": 1.3344,
      "step": 1550
    },
    {
      "epoch": 0.9968098479043656,
      "grad_norm": 0.047119140625,
      "learning_rate": 4.011173864637563e-09,
      "loss": 1.3662,
      "step": 1555
    },
    {
      "epoch": 0.9993739889922224,
      "eval_loss": 1.4191993474960327,
      "eval_runtime": 1938.5869,
      "eval_samples_per_second": 7.3,
      "eval_steps_per_second": 7.3,
      "step": 1559
    },
    {
      "epoch": 0.9993739889922224,
      "step": 1559,
      "total_flos": 3.232184148701479e+18,
      "train_loss": 0.016414370117774753,
      "train_runtime": 2971.8566,
      "train_samples_per_second": 67.189,
      "train_steps_per_second": 0.525
    }
  ],
  "logging_steps": 5,
  "max_steps": 1559,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 3.232184148701479e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}