{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 3665,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013642564802182811,
      "grad_norm": 5.18676233291626,
      "learning_rate": 7.999880967748602e-05,
      "loss": 2.6446,
      "step": 10
    },
    {
      "epoch": 0.027285129604365622,
      "grad_norm": 5.224745273590088,
      "learning_rate": 7.999469507332807e-05,
      "loss": 2.5833,
      "step": 20
    },
    {
      "epoch": 0.040927694406548434,
      "grad_norm": 5.343475341796875,
      "learning_rate": 7.998764179444167e-05,
      "loss": 2.3323,
      "step": 30
    },
    {
      "epoch": 0.054570259208731244,
      "grad_norm": 3.960702657699585,
      "learning_rate": 7.997765035907784e-05,
      "loss": 2.2764,
      "step": 40
    },
    {
      "epoch": 0.06821282401091405,
      "grad_norm": 5.80894136428833,
      "learning_rate": 7.99661465457664e-05,
      "loss": 2.3653,
      "step": 50
    },
    {
      "epoch": 0.08185538881309687,
      "grad_norm": 3.9454762935638428,
      "learning_rate": 7.995057481402883e-05,
      "loss": 2.3655,
      "step": 60
    },
    {
      "epoch": 0.09549795361527967,
      "grad_norm": 4.658463478088379,
      "learning_rate": 7.993206764937005e-05,
      "loss": 2.2008,
      "step": 70
    },
    {
      "epoch": 0.10914051841746249,
      "grad_norm": 7.062836647033691,
      "learning_rate": 7.99106264116338e-05,
      "loss": 2.4619,
      "step": 80
    },
    {
      "epoch": 0.12278308321964529,
      "grad_norm": 5.339938640594482,
      "learning_rate": 7.988625267624962e-05,
      "loss": 2.4662,
      "step": 90
    },
    {
      "epoch": 0.1364256480218281,
      "grad_norm": 6.298858165740967,
      "learning_rate": 7.9858948234117e-05,
      "loss": 2.3909,
      "step": 100
    },
    {
      "epoch": 0.15006821282401092,
      "grad_norm": 5.622934818267822,
      "learning_rate": 7.98287150914739e-05,
      "loss": 2.4736,
      "step": 110
    },
    {
      "epoch": 0.16371077762619374,
      "grad_norm": 3.229802370071411,
      "learning_rate": 7.979555546974922e-05,
      "loss": 2.3796,
      "step": 120
    },
    {
      "epoch": 0.17735334242837653,
      "grad_norm": 5.506136894226074,
      "learning_rate": 7.975947180539966e-05,
      "loss": 2.2357,
      "step": 130
    },
    {
      "epoch": 0.19099590723055934,
      "grad_norm": 5.04240608215332,
      "learning_rate": 7.972046674973065e-05,
      "loss": 2.3435,
      "step": 140
    },
    {
      "epoch": 0.20463847203274216,
      "grad_norm": 6.470039367675781,
      "learning_rate": 7.967854316870156e-05,
      "loss": 2.4652,
      "step": 150
    },
    {
      "epoch": 0.21828103683492497,
      "grad_norm": 3.5564420223236084,
      "learning_rate": 7.963370414271514e-05,
      "loss": 2.2549,
      "step": 160
    },
    {
      "epoch": 0.23192360163710776,
      "grad_norm": 3.998383045196533,
      "learning_rate": 7.958595296639115e-05,
      "loss": 2.3102,
      "step": 170
    },
    {
      "epoch": 0.24556616643929058,
      "grad_norm": 4.615527629852295,
      "learning_rate": 7.953529314832426e-05,
      "loss": 2.2444,
      "step": 180
    },
    {
      "epoch": 0.2592087312414734,
      "grad_norm": 6.488475322723389,
      "learning_rate": 7.948172841082635e-05,
      "loss": 2.2928,
      "step": 190
    },
    {
      "epoch": 0.2728512960436562,
      "grad_norm": 4.134075164794922,
      "learning_rate": 7.942526268965287e-05,
      "loss": 2.2113,
      "step": 200
    },
    {
      "epoch": 0.286493860845839,
      "grad_norm": 5.373613357543945,
      "learning_rate": 7.936590013371378e-05,
      "loss": 2.0795,
      "step": 210
    },
    {
      "epoch": 0.30013642564802184,
      "grad_norm": 5.965483665466309,
      "learning_rate": 7.93036451047686e-05,
      "loss": 2.3991,
      "step": 220
    },
    {
      "epoch": 0.31377899045020463,
      "grad_norm": 4.945356369018555,
      "learning_rate": 7.923850217710604e-05,
      "loss": 2.3399,
      "step": 230
    },
    {
      "epoch": 0.3274215552523875,
      "grad_norm": 5.210105895996094,
      "learning_rate": 7.917047613720773e-05,
      "loss": 2.3305,
      "step": 240
    },
    {
      "epoch": 0.34106412005457026,
      "grad_norm": 3.3301730155944824,
      "learning_rate": 7.909957198339669e-05,
      "loss": 2.2404,
      "step": 250
    },
    {
      "epoch": 0.35470668485675305,
      "grad_norm": 3.685528039932251,
      "learning_rate": 7.902579492546998e-05,
      "loss": 2.1853,
      "step": 260
    },
    {
      "epoch": 0.3683492496589359,
      "grad_norm": 5.30728006362915,
      "learning_rate": 7.894915038431595e-05,
      "loss": 2.3177,
      "step": 270
    },
    {
      "epoch": 0.3819918144611187,
      "grad_norm": 2.644773006439209,
      "learning_rate": 7.886964399151586e-05,
      "loss": 2.3621,
      "step": 280
    },
    {
      "epoch": 0.3956343792633015,
      "grad_norm": 4.921834468841553,
      "learning_rate": 7.878728158893015e-05,
      "loss": 2.3687,
      "step": 290
    },
    {
      "epoch": 0.4092769440654843,
      "grad_norm": 3.9288289546966553,
      "learning_rate": 7.870206922826921e-05,
      "loss": 2.2021,
      "step": 300
    },
    {
      "epoch": 0.4229195088676671,
      "grad_norm": 6.143318176269531,
      "learning_rate": 7.862294656001264e-05,
      "loss": 2.0236,
      "step": 310
    },
    {
      "epoch": 0.43656207366984995,
      "grad_norm": 4.3533806800842285,
      "learning_rate": 7.853233670108533e-05,
      "loss": 2.4181,
      "step": 320
    },
    {
      "epoch": 0.45020463847203274,
      "grad_norm": 3.7804017066955566,
      "learning_rate": 7.843889561656962e-05,
      "loss": 2.2359,
      "step": 330
    },
    {
      "epoch": 0.4638472032742155,
      "grad_norm": 2.7896721363067627,
      "learning_rate": 7.83426301721999e-05,
      "loss": 2.2156,
      "step": 340
    },
    {
      "epoch": 0.47748976807639837,
      "grad_norm": 4.168877601623535,
      "learning_rate": 7.824354744123498e-05,
      "loss": 2.256,
      "step": 350
    },
    {
      "epoch": 0.49113233287858116,
      "grad_norm": 7.003695011138916,
      "learning_rate": 7.814165470393832e-05,
      "loss": 2.2627,
      "step": 360
    },
    {
      "epoch": 0.504774897680764,
      "grad_norm": 9.064947128295898,
      "learning_rate": 7.803695944704313e-05,
      "loss": 2.1303,
      "step": 370
    },
    {
      "epoch": 0.5184174624829468,
      "grad_norm": 6.936222553253174,
      "learning_rate": 7.79294693632023e-05,
      "loss": 2.307,
      "step": 380
    },
    {
      "epoch": 0.5320600272851296,
      "grad_norm": 4.441155433654785,
      "learning_rate": 7.781919235042309e-05,
      "loss": 2.2182,
      "step": 390
    },
    {
      "epoch": 0.5457025920873124,
      "grad_norm": 7.236617088317871,
      "learning_rate": 7.770613651148688e-05,
      "loss": 2.2309,
      "step": 400
    },
    {
      "epoch": 0.5593451568894953,
      "grad_norm": 4.184124946594238,
      "learning_rate": 7.75903101533538e-05,
      "loss": 2.0949,
      "step": 410
    },
    {
      "epoch": 0.572987721691678,
      "grad_norm": 2.9721992015838623,
      "learning_rate": 7.747172178655232e-05,
      "loss": 2.0541,
      "step": 420
    },
    {
      "epoch": 0.5866302864938608,
      "grad_norm": 2.36011004447937,
      "learning_rate": 7.735038012455398e-05,
      "loss": 2.0429,
      "step": 430
    },
    {
      "epoch": 0.6002728512960437,
      "grad_norm": 5.892980575561523,
      "learning_rate": 7.722629408313309e-05,
      "loss": 2.1867,
      "step": 440
    },
    {
      "epoch": 0.6139154160982264,
      "grad_norm": 5.935312747955322,
      "learning_rate": 7.709947277971168e-05,
      "loss": 2.2556,
      "step": 450
    },
    {
      "epoch": 0.6275579809004093,
      "grad_norm": 5.781426906585693,
      "learning_rate": 7.69699255326896e-05,
      "loss": 2.3728,
      "step": 460
    },
    {
      "epoch": 0.6412005457025921,
      "grad_norm": 6.5159149169921875,
      "learning_rate": 7.683766186075973e-05,
      "loss": 2.4319,
      "step": 470
    },
    {
      "epoch": 0.654843110504775,
      "grad_norm": 3.946726083755493,
      "learning_rate": 7.670269148220874e-05,
      "loss": 2.2417,
      "step": 480
    },
    {
      "epoch": 0.6684856753069577,
      "grad_norm": 5.711483001708984,
      "learning_rate": 7.656502431420286e-05,
      "loss": 2.1867,
      "step": 490
    },
    {
      "epoch": 0.6821282401091405,
      "grad_norm": 1.9001113176345825,
      "learning_rate": 7.64246704720593e-05,
      "loss": 2.2426,
      "step": 500
    },
    {
      "epoch": 0.6957708049113234,
      "grad_norm": 6.293882846832275,
      "learning_rate": 7.628164026850302e-05,
      "loss": 2.1057,
      "step": 510
    },
    {
      "epoch": 0.7094133697135061,
      "grad_norm": 2.980394124984741,
      "learning_rate": 7.613594421290888e-05,
      "loss": 2.0724,
      "step": 520
    },
    {
      "epoch": 0.723055934515689,
      "grad_norm": 7.354122161865234,
      "learning_rate": 7.59875930105296e-05,
      "loss": 2.1513,
      "step": 530
    },
    {
      "epoch": 0.7366984993178718,
      "grad_norm": 3.451127290725708,
      "learning_rate": 7.583659756170904e-05,
      "loss": 1.996,
      "step": 540
    },
    {
      "epoch": 0.7503410641200545,
      "grad_norm": 4.8892951011657715,
      "learning_rate": 7.568296896108135e-05,
      "loss": 2.2927,
      "step": 550
    },
    {
      "epoch": 0.7639836289222374,
      "grad_norm": 5.78281831741333,
      "learning_rate": 7.552671849675571e-05,
      "loss": 2.2445,
      "step": 560
    },
    {
      "epoch": 0.7776261937244202,
      "grad_norm": 3.0643134117126465,
      "learning_rate": 7.536785764948705e-05,
      "loss": 2.1723,
      "step": 570
    },
    {
      "epoch": 0.791268758526603,
      "grad_norm": 5.0908732414245605,
      "learning_rate": 7.520639809183234e-05,
      "loss": 2.1945,
      "step": 580
    },
    {
      "epoch": 0.8049113233287858,
      "grad_norm": 4.066890716552734,
      "learning_rate": 7.5042351687293e-05,
      "loss": 2.385,
      "step": 590
    },
    {
      "epoch": 0.8185538881309686,
      "grad_norm": 6.034420967102051,
      "learning_rate": 7.487573048944318e-05,
      "loss": 1.8776,
      "step": 600
    },
    {
      "epoch": 0.8321964529331515,
      "grad_norm": 2.5823731422424316,
      "learning_rate": 7.470654674104415e-05,
      "loss": 2.1586,
      "step": 610
    },
    {
      "epoch": 0.8458390177353342,
      "grad_norm": 4.781266212463379,
      "learning_rate": 7.453481287314469e-05,
      "loss": 2.1229,
      "step": 620
    },
    {
      "epoch": 0.859481582537517,
      "grad_norm": 4.008852481842041,
      "learning_rate": 7.436054150416777e-05,
      "loss": 2.1562,
      "step": 630
    },
    {
      "epoch": 0.8731241473396999,
      "grad_norm": 7.812346458435059,
      "learning_rate": 7.418374543898328e-05,
      "loss": 2.2563,
      "step": 640
    },
    {
      "epoch": 0.8867667121418826,
      "grad_norm": 5.588919639587402,
      "learning_rate": 7.400443766796728e-05,
      "loss": 2.0738,
      "step": 650
    },
    {
      "epoch": 0.9004092769440655,
      "grad_norm": 3.1923656463623047,
      "learning_rate": 7.382263136604744e-05,
      "loss": 1.9363,
      "step": 660
    },
    {
      "epoch": 0.9140518417462483,
      "grad_norm": 6.451751708984375,
      "learning_rate": 7.363833989173504e-05,
      "loss": 2.3014,
      "step": 670
    },
    {
      "epoch": 0.927694406548431,
      "grad_norm": 4.7484517097473145,
      "learning_rate": 7.34515767861434e-05,
      "loss": 2.1957,
      "step": 680
    },
    {
      "epoch": 0.9413369713506139,
      "grad_norm": 4.468317031860352,
      "learning_rate": 7.326235577199292e-05,
      "loss": 2.4058,
      "step": 690
    },
    {
      "epoch": 0.9549795361527967,
      "grad_norm": 4.4900078773498535,
      "learning_rate": 7.307069075260283e-05,
      "loss": 2.0714,
      "step": 700
    },
    {
      "epoch": 0.9686221009549796,
      "grad_norm": 4.51573371887207,
      "learning_rate": 7.287659581086957e-05,
      "loss": 2.0244,
      "step": 710
    },
    {
      "epoch": 0.9822646657571623,
      "grad_norm": 2.892160654067993,
      "learning_rate": 7.268008520823203e-05,
      "loss": 2.1891,
      "step": 720
    },
    {
      "epoch": 0.9959072305593452,
      "grad_norm": 5.765126705169678,
      "learning_rate": 7.248117338362371e-05,
      "loss": 2.0008,
      "step": 730
    },
    {
      "epoch": 1.009549795361528,
      "grad_norm": 4.076498031616211,
      "learning_rate": 7.227987495241174e-05,
      "loss": 1.7823,
      "step": 740
    },
    {
      "epoch": 1.0231923601637107,
      "grad_norm": 5.407172203063965,
      "learning_rate": 7.207620470532302e-05,
      "loss": 1.7272,
      "step": 750
    },
    {
      "epoch": 1.0368349249658937,
      "grad_norm": 2.3942642211914062,
      "learning_rate": 7.187017760735748e-05,
      "loss": 1.7809,
      "step": 760
    },
    {
      "epoch": 1.0504774897680764,
      "grad_norm": 6.462409019470215,
      "learning_rate": 7.166180879668843e-05,
      "loss": 1.5226,
      "step": 770
    },
    {
      "epoch": 1.0641200545702592,
      "grad_norm": 10.609193801879883,
      "learning_rate": 7.145111358355031e-05,
      "loss": 1.6743,
      "step": 780
    },
    {
      "epoch": 1.077762619372442,
      "grad_norm": 5.816223621368408,
      "learning_rate": 7.123810744911376e-05,
      "loss": 1.3974,
      "step": 790
    },
    {
      "epoch": 1.0914051841746248,
      "grad_norm": 8.911855697631836,
      "learning_rate": 7.102280604434805e-05,
      "loss": 1.4882,
      "step": 800
    },
    {
      "epoch": 1.1050477489768076,
      "grad_norm": 8.37972640991211,
      "learning_rate": 7.080522518887116e-05,
      "loss": 1.8217,
      "step": 810
    },
    {
      "epoch": 1.1186903137789905,
      "grad_norm": 5.856902599334717,
      "learning_rate": 7.058538086978738e-05,
      "loss": 1.5443,
      "step": 820
    },
    {
      "epoch": 1.1323328785811733,
      "grad_norm": 8.661952018737793,
      "learning_rate": 7.036328924051266e-05,
      "loss": 1.426,
      "step": 830
    },
    {
      "epoch": 1.145975443383356,
      "grad_norm": 7.178607940673828,
      "learning_rate": 7.013896661958766e-05,
      "loss": 1.6086,
      "step": 840
    },
    {
      "epoch": 1.159618008185539,
      "grad_norm": 6.399686813354492,
      "learning_rate": 6.991242948947879e-05,
      "loss": 1.64,
      "step": 850
    },
    {
      "epoch": 1.1732605729877217,
      "grad_norm": 5.770147800445557,
      "learning_rate": 6.968369449536705e-05,
      "loss": 1.6026,
      "step": 860
    },
    {
      "epoch": 1.1869031377899044,
      "grad_norm": 7.86605167388916,
      "learning_rate": 6.945277844392504e-05,
      "loss": 1.5026,
      "step": 870
    },
    {
      "epoch": 1.2005457025920874,
      "grad_norm": 3.953367233276367,
      "learning_rate": 6.921969830208212e-05,
      "loss": 1.6705,
      "step": 880
    },
    {
      "epoch": 1.21418826739427,
      "grad_norm": 10.93819808959961,
      "learning_rate": 6.898447119577764e-05,
      "loss": 1.6982,
      "step": 890
    },
    {
      "epoch": 1.2278308321964528,
      "grad_norm": 4.714999675750732,
      "learning_rate": 6.874711440870264e-05,
      "loss": 1.2631,
      "step": 900
    },
    {
      "epoch": 1.2414733969986358,
      "grad_norm": 4.0259480476379395,
      "learning_rate": 6.85076453810299e-05,
      "loss": 1.5005,
      "step": 910
    },
    {
      "epoch": 1.2551159618008185,
      "grad_norm": 7.3578643798828125,
      "learning_rate": 6.826608170813243e-05,
      "loss": 1.6076,
      "step": 920
    },
    {
      "epoch": 1.2687585266030013,
      "grad_norm": 5.368468284606934,
      "learning_rate": 6.802244113929075e-05,
      "loss": 1.7727,
      "step": 930
    },
    {
      "epoch": 1.2824010914051842,
      "grad_norm": 7.0559916496276855,
      "learning_rate": 6.777674157638862e-05,
      "loss": 1.6125,
      "step": 940
    },
    {
      "epoch": 1.296043656207367,
      "grad_norm": 6.349229335784912,
      "learning_rate": 6.75290010725977e-05,
      "loss": 1.4289,
      "step": 950
    },
    {
      "epoch": 1.30968622100955,
      "grad_norm": 4.133677959442139,
      "learning_rate": 6.727923783105111e-05,
      "loss": 1.6043,
      "step": 960
    },
    {
      "epoch": 1.3233287858117326,
      "grad_norm": 5.688299655914307,
      "learning_rate": 6.702747020350586e-05,
      "loss": 1.6637,
      "step": 970
    },
    {
      "epoch": 1.3369713506139154,
      "grad_norm": 8.215998649597168,
      "learning_rate": 6.677371668899448e-05,
      "loss": 1.717,
      "step": 980
    },
    {
      "epoch": 1.350613915416098,
      "grad_norm": 6.267200946807861,
      "learning_rate": 6.651799593246569e-05,
      "loss": 1.572,
      "step": 990
    },
    {
      "epoch": 1.364256480218281,
      "grad_norm": 3.5424532890319824,
      "learning_rate": 6.626032672341458e-05,
      "loss": 1.6264,
      "step": 1000
    },
    {
      "epoch": 1.3778990450204638,
      "grad_norm": 6.2543206214904785,
      "learning_rate": 6.600072799450186e-05,
      "loss": 1.5057,
      "step": 1010
    },
    {
      "epoch": 1.3915416098226467,
      "grad_norm": 5.060370445251465,
      "learning_rate": 6.573921882016284e-05,
      "loss": 1.6963,
      "step": 1020
    },
    {
      "epoch": 1.4051841746248295,
      "grad_norm": 7.101504325866699,
      "learning_rate": 6.547581841520589e-05,
      "loss": 1.3956,
      "step": 1030
    },
    {
      "epoch": 1.4188267394270122,
      "grad_norm": 7.740041732788086,
      "learning_rate": 6.521054613340064e-05,
      "loss": 1.469,
      "step": 1040
    },
    {
      "epoch": 1.4324693042291952,
      "grad_norm": 7.723188877105713,
      "learning_rate": 6.494342146605581e-05,
      "loss": 1.5518,
      "step": 1050
    },
    {
      "epoch": 1.446111869031378,
      "grad_norm": 5.523282527923584,
      "learning_rate": 6.467446404058722e-05,
      "loss": 1.5123,
      "step": 1060
    },
    {
      "epoch": 1.4597544338335606,
      "grad_norm": 10.065855026245117,
      "learning_rate": 6.44036936190755e-05,
      "loss": 1.5065,
      "step": 1070
    },
    {
      "epoch": 1.4733969986357436,
      "grad_norm": 8.757994651794434,
      "learning_rate": 6.413113009681411e-05,
      "loss": 1.5362,
      "step": 1080
    },
    {
      "epoch": 1.4870395634379263,
      "grad_norm": 10.307364463806152,
      "learning_rate": 6.385679350084743e-05,
      "loss": 1.7138,
      "step": 1090
    },
    {
      "epoch": 1.500682128240109,
      "grad_norm": 7.119322299957275,
      "learning_rate": 6.358070398849938e-05,
      "loss": 1.6659,
      "step": 1100
    },
    {
      "epoch": 1.514324693042292,
      "grad_norm": 12.579729080200195,
      "learning_rate": 6.330288184589216e-05,
      "loss": 1.6374,
      "step": 1110
    },
    {
      "epoch": 1.5279672578444747,
      "grad_norm": 8.232572555541992,
      "learning_rate": 6.30233474864558e-05,
      "loss": 1.6296,
      "step": 1120
    },
    {
      "epoch": 1.5416098226466577,
      "grad_norm": 10.136465072631836,
      "learning_rate": 6.274212144942824e-05,
      "loss": 1.6491,
      "step": 1130
    },
    {
      "epoch": 1.5552523874488404,
      "grad_norm": 5.127601623535156,
      "learning_rate": 6.245922439834612e-05,
      "loss": 1.6309,
      "step": 1140
    },
    {
      "epoch": 1.5688949522510232,
      "grad_norm": 6.231171607971191,
      "learning_rate": 6.217467711952658e-05,
      "loss": 1.5365,
      "step": 1150
    },
    {
      "epoch": 1.5825375170532059,
      "grad_norm": 7.796588897705078,
      "learning_rate": 6.188850052053985e-05,
      "loss": 1.493,
      "step": 1160
    },
    {
      "epoch": 1.5961800818553888,
      "grad_norm": 8.250170707702637,
      "learning_rate": 6.160071562867316e-05,
      "loss": 1.6771,
      "step": 1170
    },
    {
      "epoch": 1.6098226466575716,
      "grad_norm": 4.563484191894531,
      "learning_rate": 6.131134358938559e-05,
      "loss": 1.5381,
      "step": 1180
    },
    {
      "epoch": 1.6234652114597545,
      "grad_norm": 7.776663303375244,
      "learning_rate": 6.1020405664754455e-05,
      "loss": 1.4791,
      "step": 1190
    },
    {
      "epoch": 1.6371077762619373,
      "grad_norm": 6.122801780700684,
      "learning_rate": 6.0727923231913035e-05,
      "loss": 1.5296,
      "step": 1200
    },
    {
      "epoch": 1.65075034106412,
      "grad_norm": 5.5693278312683105,
      "learning_rate": 6.04339177814798e-05,
      "loss": 1.4165,
      "step": 1210
    },
    {
      "epoch": 1.6643929058663027,
      "grad_norm": 8.107104301452637,
      "learning_rate": 6.013841091597947e-05,
      "loss": 1.5694,
      "step": 1220
    },
    {
      "epoch": 1.6780354706684857,
      "grad_norm": 12.389050483703613,
      "learning_rate": 5.9841424348255596e-05,
      "loss": 1.6024,
      "step": 1230
    },
    {
      "epoch": 1.6916780354706686,
      "grad_norm": 6.175624370574951,
      "learning_rate": 5.954297989987526e-05,
      "loss": 1.499,
      "step": 1240
    },
    {
      "epoch": 1.7053206002728514,
      "grad_norm": 4.447775840759277,
      "learning_rate": 5.924309949952565e-05,
      "loss": 1.5177,
      "step": 1250
    },
    {
      "epoch": 1.718963165075034,
      "grad_norm": 6.788479804992676,
      "learning_rate": 5.8941805181402886e-05,
      "loss": 1.4564,
      "step": 1260
    },
    {
      "epoch": 1.7326057298772168,
      "grad_norm": 6.172934532165527,
      "learning_rate": 5.8639119083592954e-05,
      "loss": 1.3263,
      "step": 1270
    },
    {
      "epoch": 1.7462482946793996,
      "grad_norm": 7.947272777557373,
      "learning_rate": 5.833506344644507e-05,
      "loss": 1.6518,
      "step": 1280
    },
    {
      "epoch": 1.7598908594815825,
      "grad_norm": 5.520730018615723,
      "learning_rate": 5.802966061093762e-05,
      "loss": 1.4169,
      "step": 1290
    },
    {
      "epoch": 1.7735334242837655,
      "grad_norm": 6.108306407928467,
      "learning_rate": 5.7722933017036515e-05,
      "loss": 1.4631,
      "step": 1300
    },
    {
      "epoch": 1.7871759890859482,
      "grad_norm": 3.4584460258483887,
      "learning_rate": 5.741490320204644e-05,
      "loss": 1.5939,
      "step": 1310
    },
    {
      "epoch": 1.800818553888131,
      "grad_norm": 7.870245456695557,
      "learning_rate": 5.7105593798954895e-05,
      "loss": 1.4775,
      "step": 1320
    },
    {
      "epoch": 1.8144611186903137,
      "grad_norm": 6.256468772888184,
      "learning_rate": 5.679502753476913e-05,
      "loss": 1.6513,
      "step": 1330
    },
    {
      "epoch": 1.8281036834924966,
      "grad_norm": 8.314029693603516,
      "learning_rate": 5.648322722884635e-05,
      "loss": 1.6092,
      "step": 1340
    },
    {
      "epoch": 1.8417462482946794,
      "grad_norm": 7.979041576385498,
      "learning_rate": 5.6170215791216926e-05,
      "loss": 1.5116,
      "step": 1350
    },
    {
      "epoch": 1.8553888130968623,
      "grad_norm": 7.900625228881836,
      "learning_rate": 5.58560162209011e-05,
      "loss": 1.3667,
      "step": 1360
    },
    {
      "epoch": 1.869031377899045,
      "grad_norm": 7.304281234741211,
      "learning_rate": 5.554065160421907e-05,
      "loss": 1.4584,
      "step": 1370
    },
    {
      "epoch": 1.8826739427012278,
      "grad_norm": 5.6456618309021,
      "learning_rate": 5.522414511309472e-05,
      "loss": 1.7889,
      "step": 1380
    },
    {
      "epoch": 1.8963165075034105,
      "grad_norm": 5.90806770324707,
      "learning_rate": 5.490652000335297e-05,
      "loss": 1.5623,
      "step": 1390
    },
    {
      "epoch": 1.9099590723055935,
      "grad_norm": 6.941085338592529,
      "learning_rate": 5.461972027286809e-05,
      "loss": 1.5508,
      "step": 1400
    },
    {
      "epoch": 1.9236016371077762,
      "grad_norm": 7.18194580078125,
      "learning_rate": 5.430003415054097e-05,
      "loss": 1.4951,
      "step": 1410
    },
    {
      "epoch": 1.9372442019099592,
      "grad_norm": 9.118647575378418,
      "learning_rate": 5.397929731013993e-05,
      "loss": 1.601,
      "step": 1420
    },
    {
      "epoch": 1.950886766712142,
      "grad_norm": 7.178833961486816,
      "learning_rate": 5.365753331832165e-05,
      "loss": 1.5427,
      "step": 1430
    },
    {
      "epoch": 1.9645293315143246,
      "grad_norm": 5.578968524932861,
      "learning_rate": 5.3334765817214407e-05,
      "loss": 1.4017,
      "step": 1440
    },
    {
      "epoch": 1.9781718963165074,
      "grad_norm": 10.137267112731934,
      "learning_rate": 5.301101852268093e-05,
      "loss": 1.6255,
      "step": 1450
    },
    {
      "epoch": 1.9918144611186903,
      "grad_norm": 7.634779930114746,
      "learning_rate": 5.268631522257586e-05,
      "loss": 1.5628,
      "step": 1460
    },
    {
      "epoch": 2.0054570259208733,
      "grad_norm": 4.639617919921875,
      "learning_rate": 5.23606797749979e-05,
      "loss": 1.2903,
      "step": 1470
    },
    {
      "epoch": 2.019099590723056,
      "grad_norm": 9.048364639282227,
      "learning_rate": 5.2034136106536784e-05,
      "loss": 0.8694,
      "step": 1480
    },
    {
      "epoch": 2.0327421555252387,
      "grad_norm": 12.236319541931152,
      "learning_rate": 5.1706708210515225e-05,
      "loss": 1.0926,
      "step": 1490
    },
    {
      "epoch": 2.0463847203274215,
      "grad_norm": 8.067400932312012,
      "learning_rate": 5.1378420145226e-05,
      "loss": 0.9019,
      "step": 1500
    },
    {
      "epoch": 2.060027285129604,
      "grad_norm": 5.9596028327941895,
      "learning_rate": 5.104929603216422e-05,
      "loss": 0.9077,
      "step": 1510
    },
    {
      "epoch": 2.0736698499317874,
      "grad_norm": 16.68425941467285,
      "learning_rate": 5.0719360054254925e-05,
      "loss": 0.8833,
      "step": 1520
    },
    {
      "epoch": 2.08731241473397,
      "grad_norm": 10.05809497833252,
      "learning_rate": 5.0388636454076256e-05,
      "loss": 0.8182,
      "step": 1530
    },
    {
      "epoch": 2.100954979536153,
      "grad_norm": 7.299218654632568,
      "learning_rate": 5.0057149532078165e-05,
      "loss": 0.7097,
      "step": 1540
    },
    {
      "epoch": 2.1145975443383356,
      "grad_norm": 7.193046569824219,
      "learning_rate": 4.9724923644796904e-05,
      "loss": 0.8243,
      "step": 1550
    },
    {
      "epoch": 2.1282401091405183,
      "grad_norm": 6.852187633514404,
      "learning_rate": 4.939198320306537e-05,
      "loss": 0.8468,
      "step": 1560
    },
    {
      "epoch": 2.141882673942701,
      "grad_norm": 8.87316608428955,
      "learning_rate": 4.9058352670219576e-05,
      "loss": 0.8348,
      "step": 1570
    },
    {
      "epoch": 2.155525238744884,
      "grad_norm": 7.688050270080566,
      "learning_rate": 4.872405656030099e-05,
      "loss": 0.9292,
      "step": 1580
    },
    {
      "epoch": 2.169167803547067,
      "grad_norm": 9.83198356628418,
      "learning_rate": 4.83891194362555e-05,
      "loss": 0.7706,
      "step": 1590
    },
    {
      "epoch": 2.1828103683492497,
      "grad_norm": 10.323474884033203,
      "learning_rate": 4.805356590812852e-05,
      "loss": 0.8487,
      "step": 1600
    },
    {
      "epoch": 2.1964529331514324,
      "grad_norm": 8.7017183303833,
      "learning_rate": 4.771742063125674e-05,
      "loss": 0.8509,
      "step": 1610
    },
    {
      "epoch": 2.210095497953615,
      "grad_norm": 6.59673547744751,
      "learning_rate": 4.7380708304456554e-05,
      "loss": 0.8833,
      "step": 1620
    },
    {
      "epoch": 2.223738062755798,
      "grad_norm": 15.534720420837402,
      "learning_rate": 4.704345366820927e-05,
      "loss": 0.8968,
      "step": 1630
    },
    {
      "epoch": 2.237380627557981,
      "grad_norm": 11.261545181274414,
      "learning_rate": 4.670568150284323e-05,
      "loss": 0.9044,
      "step": 1640
    },
    {
      "epoch": 2.251023192360164,
      "grad_norm": 8.76096248626709,
      "learning_rate": 4.636741662671308e-05,
      "loss": 0.974,
      "step": 1650
    },
    {
      "epoch": 2.2646657571623465,
      "grad_norm": 5.140685081481934,
      "learning_rate": 4.602868389437622e-05,
      "loss": 0.8121,
      "step": 1660
    },
    {
      "epoch": 2.2783083219645293,
      "grad_norm": 9.906394004821777,
      "learning_rate": 4.568950819476648e-05,
      "loss": 1.1324,
      "step": 1670
    },
    {
      "epoch": 2.291950886766712,
      "grad_norm": 13.181589126586914,
      "learning_rate": 4.5349914449365435e-05,
      "loss": 0.8856,
      "step": 1680
    },
    {
      "epoch": 2.305593451568895,
      "grad_norm": 12.309771537780762,
      "learning_rate": 4.500992761037129e-05,
      "loss": 0.8276,
      "step": 1690
    },
    {
      "epoch": 2.319236016371078,
      "grad_norm": 10.110664367675781,
      "learning_rate": 4.4669572658865405e-05,
      "loss": 0.7492,
      "step": 1700
    },
    {
      "epoch": 2.3328785811732606,
      "grad_norm": 8.248074531555176,
      "learning_rate": 4.4328874602976786e-05,
      "loss": 0.9167,
      "step": 1710
    },
    {
      "epoch": 2.3465211459754434,
      "grad_norm": 8.356489181518555,
      "learning_rate": 4.3987858476044617e-05,
      "loss": 0.9453,
      "step": 1720
    },
    {
      "epoch": 2.360163710777626,
      "grad_norm": 5.316605091094971,
      "learning_rate": 4.364654933477886e-05,
      "loss": 0.8714,
      "step": 1730
    },
    {
      "epoch": 2.373806275579809,
      "grad_norm": 9.405285835266113,
      "learning_rate": 4.330497225741917e-05,
      "loss": 0.9251,
      "step": 1740
    },
    {
      "epoch": 2.3874488403819916,
      "grad_norm": 9.250997543334961,
      "learning_rate": 4.296315234189223e-05,
      "loss": 0.9479,
      "step": 1750
    },
    {
      "epoch": 2.4010914051841747,
      "grad_norm": 7.570913791656494,
      "learning_rate": 4.262111470396766e-05,
      "loss": 0.8346,
      "step": 1760
    },
    {
      "epoch": 2.4147339699863575,
      "grad_norm": 7.284623146057129,
      "learning_rate": 4.2278884475412585e-05,
      "loss": 0.754,
      "step": 1770
    },
    {
      "epoch": 2.42837653478854,
      "grad_norm": 8.737554550170898,
      "learning_rate": 4.193648680214505e-05,
      "loss": 0.7743,
      "step": 1780
    },
    {
      "epoch": 2.442019099590723,
      "grad_norm": 9.590819358825684,
      "learning_rate": 4.159394684238635e-05,
      "loss": 0.9273,
      "step": 1790
    },
    {
      "epoch": 2.4556616643929057,
      "grad_norm": 9.176666259765625,
      "learning_rate": 4.1251289764812495e-05,
      "loss": 1.0037,
      "step": 1800
    },
    {
      "epoch": 2.469304229195089,
      "grad_norm": 9.587228775024414,
      "learning_rate": 4.090854074670495e-05,
      "loss": 0.7728,
      "step": 1810
    },
    {
      "epoch": 2.4829467939972716,
      "grad_norm": 3.603977918624878,
      "learning_rate": 4.056572497210066e-05,
      "loss": 0.8098,
      "step": 1820
    },
    {
      "epoch": 2.4965893587994543,
      "grad_norm": 10.71971321105957,
      "learning_rate": 4.0222867629941554e-05,
      "loss": 0.8404,
      "step": 1830
    },
    {
      "epoch": 2.510231923601637,
      "grad_norm": 8.777016639709473,
      "learning_rate": 3.987999391222389e-05,
      "loss": 0.7021,
      "step": 1840
    },
    {
      "epoch": 2.52387448840382,
      "grad_norm": 9.952046394348145,
      "learning_rate": 3.953712901214707e-05,
      "loss": 0.9656,
      "step": 1850
    },
    {
      "epoch": 2.5375170532060025,
      "grad_norm": 10.772456169128418,
      "learning_rate": 3.9194298122262666e-05,
      "loss": 0.7487,
      "step": 1860
    },
    {
      "epoch": 2.5511596180081857,
      "grad_norm": 7.986364364624023,
      "learning_rate": 3.8851526432623254e-05,
      "loss": 0.8693,
      "step": 1870
    },
    {
      "epoch": 2.5648021828103684,
      "grad_norm": 8.993766784667969,
      "learning_rate": 3.850883912893158e-05,
      "loss": 0.8933,
      "step": 1880
    },
    {
      "epoch": 2.578444747612551,
      "grad_norm": 8.045419692993164,
      "learning_rate": 3.816626139069004e-05,
      "loss": 0.7098,
      "step": 1890
    },
    {
      "epoch": 2.592087312414734,
      "grad_norm": 7.299800872802734,
      "learning_rate": 3.782381838935047e-05,
      "loss": 0.8742,
      "step": 1900
    },
    {
      "epoch": 2.6057298772169166,
      "grad_norm": 8.76268482208252,
      "learning_rate": 3.748153528646472e-05,
      "loss": 0.7846,
      "step": 1910
    },
    {
      "epoch": 2.6193724420191,
      "grad_norm": 13.77757453918457,
      "learning_rate": 3.713943723183587e-05,
      "loss": 0.9032,
      "step": 1920
    },
    {
      "epoch": 2.6330150068212825,
      "grad_norm": 9.216800689697266,
      "learning_rate": 3.6797549361670257e-05,
      "loss": 0.8044,
      "step": 1930
    },
    {
      "epoch": 2.6466575716234653,
      "grad_norm": 12.811419486999512,
      "learning_rate": 3.6455896796730554e-05,
      "loss": 0.8528,
      "step": 1940
    },
    {
      "epoch": 2.660300136425648,
      "grad_norm": 8.257354736328125,
      "learning_rate": 3.611450464049005e-05,
      "loss": 0.9294,
      "step": 1950
    },
    {
      "epoch": 2.6739427012278307,
      "grad_norm": 6.599920272827148,
      "learning_rate": 3.577339797728805e-05,
      "loss": 0.9441,
      "step": 1960
    },
    {
      "epoch": 2.6875852660300135,
      "grad_norm": 10.485870361328125,
      "learning_rate": 3.5432601870486795e-05,
      "loss": 0.8572,
      "step": 1970
    },
    {
      "epoch": 2.701227830832196,
      "grad_norm": 5.167399883270264,
      "learning_rate": 3.509214136062993e-05,
      "loss": 0.9912,
      "step": 1980
    },
    {
      "epoch": 2.7148703956343794,
      "grad_norm": 9.000060081481934,
      "learning_rate": 3.475204146360254e-05,
      "loss": 0.9876,
      "step": 1990
    },
    {
      "epoch": 2.728512960436562,
      "grad_norm": 7.356120586395264,
      "learning_rate": 3.44123271687931e-05,
      "loss": 0.8796,
      "step": 2000
    },
    {
      "epoch": 2.742155525238745,
      "grad_norm": 10.706610679626465,
      "learning_rate": 3.407302343725737e-05,
      "loss": 0.8394,
      "step": 2010
    },
    {
      "epoch": 2.7557980900409276,
      "grad_norm": 6.853879928588867,
      "learning_rate": 3.3734155199884275e-05,
      "loss": 0.7603,
      "step": 2020
    },
    {
      "epoch": 2.7694406548431107,
      "grad_norm": 8.325788497924805,
      "learning_rate": 3.339574735556412e-05,
      "loss": 0.8944,
      "step": 2030
    },
    {
      "epoch": 2.7830832196452935,
      "grad_norm": 12.427726745605469,
      "learning_rate": 3.3057824769359104e-05,
      "loss": 0.9737,
      "step": 2040
    },
    {
      "epoch": 2.796725784447476,
      "grad_norm": 11.044093132019043,
      "learning_rate": 3.2720412270676275e-05,
      "loss": 0.9483,
      "step": 2050
    },
    {
      "epoch": 2.810368349249659,
      "grad_norm": 5.907009601593018,
      "learning_rate": 3.2383534651443206e-05,
      "loss": 0.9254,
      "step": 2060
    },
    {
      "epoch": 2.8240109140518417,
      "grad_norm": 5.986649036407471,
      "learning_rate": 3.204721666428631e-05,
      "loss": 0.7472,
      "step": 2070
    },
    {
      "epoch": 2.8376534788540244,
      "grad_norm": 9.0693359375,
      "learning_rate": 3.171148302071215e-05,
      "loss": 0.8689,
      "step": 2080
    },
    {
      "epoch": 2.851296043656207,
      "grad_norm": 7.178049087524414,
      "learning_rate": 3.137635838929169e-05,
      "loss": 0.7493,
      "step": 2090
    },
    {
      "epoch": 2.8649386084583903,
      "grad_norm": 8.808353424072266,
      "learning_rate": 3.1041867393847764e-05,
      "loss": 1.0867,
      "step": 2100
    },
    {
      "epoch": 2.878581173260573,
      "grad_norm": 11.312020301818848,
      "learning_rate": 3.070803461164575e-05,
      "loss": 0.9846,
      "step": 2110
    },
    {
      "epoch": 2.892223738062756,
      "grad_norm": 2.6739227771759033,
      "learning_rate": 3.0374884571587776e-05,
      "loss": 0.7862,
      "step": 2120
    },
    {
      "epoch": 2.9058663028649385,
      "grad_norm": 9.54980182647705,
      "learning_rate": 3.004244175241038e-05,
      "loss": 1.0586,
      "step": 2130
    },
    {
      "epoch": 2.9195088676671213,
      "grad_norm": 10.564549446105957,
      "learning_rate": 2.971073058088587e-05,
      "loss": 0.8757,
      "step": 2140
    },
    {
      "epoch": 2.9331514324693044,
      "grad_norm": 9.608818054199219,
      "learning_rate": 2.937977543002764e-05,
      "loss": 0.9026,
      "step": 2150
    },
    {
      "epoch": 2.946793997271487,
      "grad_norm": 4.548944473266602,
      "learning_rate": 2.9049600617299188e-05,
      "loss": 0.8864,
      "step": 2160
    },
    {
      "epoch": 2.96043656207367,
      "grad_norm": 7.556874752044678,
      "learning_rate": 2.872023040282739e-05,
      "loss": 0.8224,
      "step": 2170
    },
    {
      "epoch": 2.9740791268758526,
      "grad_norm": 9.024819374084473,
      "learning_rate": 2.8391688987620045e-05,
      "loss": 0.794,
      "step": 2180
    },
    {
      "epoch": 2.9877216916780354,
      "grad_norm": 5.675693035125732,
      "learning_rate": 2.8064000511787523e-05,
      "loss": 0.9792,
      "step": 2190
    },
    {
      "epoch": 3.001364256480218,
      "grad_norm": 6.426331520080566,
      "learning_rate": 2.77371890527691e-05,
      "loss": 0.7812,
      "step": 2200
    },
    {
      "epoch": 3.0150068212824013,
      "grad_norm": 7.24752140045166,
      "learning_rate": 2.741127862356389e-05,
      "loss": 0.496,
      "step": 2210
    },
    {
      "epoch": 3.028649386084584,
      "grad_norm": 8.4566650390625,
      "learning_rate": 2.7086293170966312e-05,
      "loss": 0.3416,
      "step": 2220
    },
    {
      "epoch": 3.0422919508867667,
      "grad_norm": 5.965997219085693,
      "learning_rate": 2.6762256573806664e-05,
      "loss": 0.3462,
      "step": 2230
    },
    {
      "epoch": 3.0559345156889495,
      "grad_norm": 10.749195098876953,
      "learning_rate": 2.6439192641196583e-05,
      "loss": 0.4756,
      "step": 2240
    },
    {
      "epoch": 3.069577080491132,
      "grad_norm": 7.666036605834961,
      "learning_rate": 2.611712511077959e-05,
      "loss": 0.357,
      "step": 2250
    },
    {
      "epoch": 3.083219645293315,
      "grad_norm": 7.038111209869385,
      "learning_rate": 2.5796077646986922e-05,
      "loss": 0.3361,
      "step": 2260
    },
    {
      "epoch": 3.096862210095498,
      "grad_norm": 10.389034271240234,
      "learning_rate": 2.5476073839298857e-05,
      "loss": 0.419,
      "step": 2270
    },
    {
      "epoch": 3.110504774897681,
      "grad_norm": 6.829967021942139,
      "learning_rate": 2.5157137200511253e-05,
      "loss": 0.4211,
      "step": 2280
    },
    {
      "epoch": 3.1241473396998636,
      "grad_norm": 4.85243558883667,
      "learning_rate": 2.4839291165008073e-05,
      "loss": 0.5248,
      "step": 2290
    },
    {
      "epoch": 3.1377899045020463,
      "grad_norm": 7.922482490539551,
      "learning_rate": 2.452255908703945e-05,
      "loss": 0.3983,
      "step": 2300
    },
    {
      "epoch": 3.151432469304229,
      "grad_norm": 4.674210071563721,
      "learning_rate": 2.420696423900567e-05,
      "loss": 0.3728,
      "step": 2310
    },
    {
      "epoch": 3.1650750341064118,
      "grad_norm": 16.18085289001465,
      "learning_rate": 2.3892529809747195e-05,
      "loss": 0.4099,
      "step": 2320
    },
    {
      "epoch": 3.178717598908595,
      "grad_norm": 10.127204895019531,
      "learning_rate": 2.35792789028409e-05,
      "loss": 0.5466,
      "step": 2330
    },
    {
      "epoch": 3.1923601637107777,
      "grad_norm": 6.371789932250977,
      "learning_rate": 2.32672345349024e-05,
      "loss": 0.3956,
      "step": 2340
    },
    {
      "epoch": 3.2060027285129604,
      "grad_norm": 6.170881748199463,
      "learning_rate": 2.2956419633894922e-05,
      "loss": 0.2686,
      "step": 2350
    },
    {
      "epoch": 3.219645293315143,
      "grad_norm": 8.651374816894531,
      "learning_rate": 2.264685703744466e-05,
      "loss": 0.3545,
      "step": 2360
    },
    {
      "epoch": 3.233287858117326,
      "grad_norm": 9.146707534790039,
      "learning_rate": 2.2338569491162688e-05,
      "loss": 0.4361,
      "step": 2370
    },
    {
      "epoch": 3.246930422919509,
      "grad_norm": 10.473475456237793,
      "learning_rate": 2.2031579646973662e-05,
      "loss": 0.4616,
      "step": 2380
    },
    {
      "epoch": 3.260572987721692,
      "grad_norm": 7.665262222290039,
      "learning_rate": 2.1725910061451582e-05,
      "loss": 0.4095,
      "step": 2390
    },
    {
      "epoch": 3.2742155525238745,
      "grad_norm": 14.176889419555664,
      "learning_rate": 2.1421583194162237e-05,
      "loss": 0.4428,
      "step": 2400
    },
    {
      "epoch": 3.2878581173260573,
      "grad_norm": 10.604450225830078,
      "learning_rate": 2.1118621406013045e-05,
      "loss": 0.4946,
      "step": 2410
    },
    {
      "epoch": 3.30150068212824,
      "grad_norm": 9.188895225524902,
      "learning_rate": 2.0817046957610073e-05,
      "loss": 0.3676,
      "step": 2420
    },
    {
      "epoch": 3.3151432469304227,
      "grad_norm": 5.148584842681885,
      "learning_rate": 2.0516882007622318e-05,
      "loss": 0.4319,
      "step": 2430
    },
    {
      "epoch": 3.328785811732606,
      "grad_norm": 10.153374671936035,
      "learning_rate": 2.0218148611153614e-05,
      "loss": 0.3973,
      "step": 2440
    },
    {
      "epoch": 3.3424283765347886,
      "grad_norm": 2.798651695251465,
      "learning_rate": 1.99208687181221e-05,
      "loss": 0.3728,
      "step": 2450
    },
    {
      "epoch": 3.3560709413369714,
      "grad_norm": 7.391672611236572,
      "learning_rate": 1.9625064171647403e-05,
      "loss": 0.4029,
      "step": 2460
    },
    {
      "epoch": 3.369713506139154,
      "grad_norm": 8.293444633483887,
      "learning_rate": 1.933075670644566e-05,
      "loss": 0.5182,
      "step": 2470
    },
    {
      "epoch": 3.383356070941337,
      "grad_norm": 12.008163452148438,
      "learning_rate": 1.903796794723261e-05,
      "loss": 0.3211,
      "step": 2480
    },
    {
      "epoch": 3.39699863574352,
      "grad_norm": 4.839990139007568,
      "learning_rate": 1.8746719407134558e-05,
      "loss": 0.3218,
      "step": 2490
    },
    {
      "epoch": 3.4106412005457027,
      "grad_norm": 5.815826892852783,
      "learning_rate": 1.8457032486107733e-05,
      "loss": 0.3561,
      "step": 2500
    },
    {
      "epoch": 3.4242837653478855,
      "grad_norm": 5.609132766723633,
      "learning_rate": 1.816892846936592e-05,
      "loss": 0.4223,
      "step": 2510
    },
    {
      "epoch": 3.437926330150068,
      "grad_norm": 6.10213041305542,
      "learning_rate": 1.7882428525816434e-05,
      "loss": 0.2911,
      "step": 2520
    },
    {
      "epoch": 3.451568894952251,
      "grad_norm": 6.413115501403809,
      "learning_rate": 1.759755370650472e-05,
      "loss": 0.3991,
      "step": 2530
    },
    {
      "epoch": 3.4652114597544337,
      "grad_norm": 10.340503692626953,
      "learning_rate": 1.7314324943067598e-05,
      "loss": 0.3823,
      "step": 2540
    },
    {
      "epoch": 3.4788540245566164,
      "grad_norm": 10.008712768554688,
      "learning_rate": 1.70327630461953e-05,
      "loss": 0.3564,
      "step": 2550
    },
    {
      "epoch": 3.4924965893587996,
      "grad_norm": 7.237100601196289,
      "learning_rate": 1.6752888704102304e-05,
      "loss": 0.4012,
      "step": 2560
    },
    {
      "epoch": 3.5061391541609823,
      "grad_norm": 7.369434833526611,
      "learning_rate": 1.6474722481007344e-05,
      "loss": 0.3622,
      "step": 2570
    },
    {
      "epoch": 3.519781718963165,
      "grad_norm": 10.581721305847168,
      "learning_rate": 1.619828481562229e-05,
      "loss": 0.3502,
      "step": 2580
    },
    {
      "epoch": 3.533424283765348,
      "grad_norm": 7.968098163604736,
      "learning_rate": 1.5923596019650517e-05,
      "loss": 0.4129,
      "step": 2590
    },
    {
      "epoch": 3.547066848567531,
      "grad_norm": 9.581936836242676,
      "learning_rate": 1.565067627629432e-05,
      "loss": 0.455,
      "step": 2600
    },
    {
      "epoch": 3.5607094133697137,
      "grad_norm": 7.407210350036621,
      "learning_rate": 1.5379545638772032e-05,
      "loss": 0.3905,
      "step": 2610
    },
    {
      "epoch": 3.5743519781718964,
      "grad_norm": 9.555901527404785,
      "learning_rate": 1.511022402884459e-05,
      "loss": 0.4603,
      "step": 2620
    },
    {
      "epoch": 3.587994542974079,
      "grad_norm": 5.677056312561035,
      "learning_rate": 1.4842731235351653e-05,
      "loss": 0.3654,
      "step": 2630
    },
    {
      "epoch": 3.601637107776262,
      "grad_norm": 8.808677673339844,
      "learning_rate": 1.4577086912757659e-05,
      "loss": 0.4481,
      "step": 2640
    },
    {
      "epoch": 3.6152796725784446,
      "grad_norm": 11.250635147094727,
      "learning_rate": 1.4313310579707697e-05,
      "loss": 0.4829,
      "step": 2650
    },
    {
      "epoch": 3.6289222373806274,
      "grad_norm": 8.87308406829834,
      "learning_rate": 1.405142161759327e-05,
      "loss": 0.3937,
      "step": 2660
    },
    {
      "epoch": 3.64256480218281,
      "grad_norm": 7.986633777618408,
      "learning_rate": 1.3791439269128274e-05,
      "loss": 0.3902,
      "step": 2670
    },
    {
      "epoch": 3.6562073669849933,
      "grad_norm": 6.5852155685424805,
      "learning_rate": 1.3533382636935092e-05,
      "loss": 0.4495,
      "step": 2680
    },
    {
      "epoch": 3.669849931787176,
      "grad_norm": 22.476903915405273,
      "learning_rate": 1.3277270682140996e-05,
      "loss": 0.4707,
      "step": 2690
    },
    {
      "epoch": 3.6834924965893587,
      "grad_norm": 6.103644371032715,
      "learning_rate": 1.3023122222984941e-05,
      "loss": 0.4189,
      "step": 2700
    },
    {
      "epoch": 3.6971350613915415,
      "grad_norm": 8.584049224853516,
      "learning_rate": 1.2770955933434906e-05,
      "loss": 0.3741,
      "step": 2710
    },
    {
      "epoch": 3.7107776261937246,
      "grad_norm": 8.539772033691406,
      "learning_rate": 1.2520790341815726e-05,
      "loss": 0.3724,
      "step": 2720
    },
    {
      "epoch": 3.7244201909959074,
      "grad_norm": 8.408806800842285,
      "learning_rate": 1.2272643829447723e-05,
      "loss": 0.4002,
      "step": 2730
    },
    {
      "epoch": 3.73806275579809,
      "grad_norm": 13.712347030639648,
      "learning_rate": 1.2026534629296168e-05,
      "loss": 0.4129,
      "step": 2740
    },
    {
      "epoch": 3.751705320600273,
      "grad_norm": 7.925304412841797,
      "learning_rate": 1.1782480824631478e-05,
      "loss": 0.3354,
      "step": 2750
    },
    {
      "epoch": 3.7653478854024556,
      "grad_norm": 6.753428936004639,
      "learning_rate": 1.154050034770057e-05,
      "loss": 0.3,
      "step": 2760
    },
    {
      "epoch": 3.7789904502046383,
      "grad_norm": 11.436506271362305,
      "learning_rate": 1.1300610978409301e-05,
      "loss": 0.4613,
      "step": 2770
    },
    {
      "epoch": 3.792633015006821,
      "grad_norm": 7.002927780151367,
      "learning_rate": 1.1062830343015998e-05,
      "loss": 0.3782,
      "step": 2780
    },
    {
      "epoch": 3.806275579809004,
      "grad_norm": 8.752483367919922,
      "learning_rate": 1.0827175912836352e-05,
      "loss": 0.3154,
      "step": 2790
    },
    {
      "epoch": 3.819918144611187,
      "grad_norm": 8.298131942749023,
      "learning_rate": 1.059366500295973e-05,
      "loss": 0.3941,
      "step": 2800
    },
    {
      "epoch": 3.8335607094133697,
      "grad_norm": 9.577731132507324,
      "learning_rate": 1.0362314770976858e-05,
      "loss": 0.4402,
      "step": 2810
    },
    {
      "epoch": 3.8472032742155524,
      "grad_norm": 8.360587120056152,
      "learning_rate": 1.0133142215719176e-05,
      "loss": 0.3588,
      "step": 2820
    },
    {
      "epoch": 3.8608458390177356,
      "grad_norm": 7.106631755828857,
      "learning_rate": 9.906164176009825e-06,
      "loss": 0.4171,
      "step": 2830
    },
    {
      "epoch": 3.8744884038199183,
      "grad_norm": 7.905004501342773,
      "learning_rate": 9.681397329426363e-06,
      "loss": 0.4008,
      "step": 2840
    },
    {
      "epoch": 3.888130968622101,
      "grad_norm": 7.430079460144043,
      "learning_rate": 9.458858191075358e-06,
      "loss": 0.2851,
      "step": 2850
    },
    {
      "epoch": 3.901773533424284,
      "grad_norm": 6.089357852935791,
      "learning_rate": 9.238563112378967e-06,
      "loss": 0.486,
      "step": 2860
    },
    {
      "epoch": 3.9154160982264665,
      "grad_norm": 10.433391571044922,
      "learning_rate": 9.02052827987339e-06,
      "loss": 0.3321,
      "step": 2870
    },
    {
      "epoch": 3.9290586630286493,
      "grad_norm": 11.072164535522461,
      "learning_rate": 8.804769714019619e-06,
      "loss": 0.2981,
      "step": 2880
    },
    {
      "epoch": 3.942701227830832,
      "grad_norm": 6.538973331451416,
      "learning_rate": 8.591303268026293e-06,
      "loss": 0.3412,
      "step": 2890
    },
    {
      "epoch": 3.956343792633015,
      "grad_norm": 6.748257637023926,
      "learning_rate": 8.380144626684829e-06,
      "loss": 0.3648,
      "step": 2900
    },
    {
      "epoch": 3.969986357435198,
      "grad_norm": 8.447965621948242,
      "learning_rate": 8.171309305216973e-06,
      "loss": 0.4028,
      "step": 2910
    },
    {
      "epoch": 3.9836289222373806,
      "grad_norm": 11.228553771972656,
      "learning_rate": 7.96481264813481e-06,
      "loss": 0.5365,
      "step": 2920
    },
    {
      "epoch": 3.9972714870395634,
      "grad_norm": 16.4576416015625,
      "learning_rate": 7.760669828113276e-06,
      "loss": 0.3312,
      "step": 2930
    },
    {
      "epoch": 4.0109140518417465,
      "grad_norm": 9.181668281555176,
      "learning_rate": 7.558895844875325e-06,
      "loss": 0.2586,
      "step": 2940
    },
    {
      "epoch": 4.024556616643929,
      "grad_norm": 6.005313873291016,
      "learning_rate": 7.359505524089843e-06,
      "loss": 0.174,
      "step": 2950
    },
    {
      "epoch": 4.038199181446112,
      "grad_norm": 9.592679977416992,
      "learning_rate": 7.162513516282236e-06,
      "loss": 0.1319,
      "step": 2960
    },
    {
      "epoch": 4.051841746248295,
      "grad_norm": 4.006359100341797,
      "learning_rate": 6.967934295758003e-06,
      "loss": 0.1098,
      "step": 2970
    },
    {
      "epoch": 4.0654843110504775,
      "grad_norm": 8.887368202209473,
      "learning_rate": 6.775782159539237e-06,
      "loss": 0.1351,
      "step": 2980
    },
    {
      "epoch": 4.07912687585266,
      "grad_norm": 4.626531600952148,
      "learning_rate": 6.586071226314046e-06,
      "loss": 0.2408,
      "step": 2990
    },
    {
      "epoch": 4.092769440654843,
      "grad_norm": 3.412660598754883,
      "learning_rate": 6.3988154353992285e-06,
      "loss": 0.1759,
      "step": 3000
    },
    {
      "epoch": 4.106412005457026,
      "grad_norm": 3.9307303428649902,
      "learning_rate": 6.214028545716071e-06,
      "loss": 0.2398,
      "step": 3010
    },
    {
      "epoch": 4.120054570259208,
      "grad_norm": 4.589839935302734,
      "learning_rate": 6.031724134779331e-06,
      "loss": 0.1491,
      "step": 3020
    },
    {
      "epoch": 4.133697135061391,
      "grad_norm": 5.797046184539795,
      "learning_rate": 5.851915597699638e-06,
      "loss": 0.1711,
      "step": 3030
    },
    {
      "epoch": 4.147339699863575,
      "grad_norm": 6.0500030517578125,
      "learning_rate": 5.674616146199277e-06,
      "loss": 0.1815,
      "step": 3040
    },
    {
      "epoch": 4.1609822646657575,
      "grad_norm": 5.424590587615967,
      "learning_rate": 5.499838807641413e-06,
      "loss": 0.2796,
      "step": 3050
    },
    {
      "epoch": 4.17462482946794,
      "grad_norm": 10.55384349822998,
      "learning_rate": 5.327596424072896e-06,
      "loss": 0.2307,
      "step": 3060
    },
    {
      "epoch": 4.188267394270123,
      "grad_norm": 11.20399284362793,
      "learning_rate": 5.157901651280672e-06,
      "loss": 0.1955,
      "step": 3070
    },
    {
      "epoch": 4.201909959072306,
      "grad_norm": 5.694499492645264,
      "learning_rate": 4.990766957861875e-06,
      "loss": 0.2332,
      "step": 3080
    },
    {
      "epoch": 4.215552523874488,
      "grad_norm": 3.623840093612671,
      "learning_rate": 4.826204624307665e-06,
      "loss": 0.1317,
      "step": 3090
    },
    {
      "epoch": 4.229195088676671,
      "grad_norm": 6.306783199310303,
      "learning_rate": 4.664226742100946e-06,
      "loss": 0.1772,
      "step": 3100
    },
    {
      "epoch": 4.242837653478854,
      "grad_norm": 5.530797004699707,
      "learning_rate": 4.504845212827848e-06,
      "loss": 0.1893,
      "step": 3110
    },
    {
      "epoch": 4.256480218281037,
      "grad_norm": 6.482370853424072,
      "learning_rate": 4.348071747303322e-06,
      "loss": 0.2459,
      "step": 3120
    },
    {
      "epoch": 4.270122783083219,
      "grad_norm": 6.513676643371582,
      "learning_rate": 4.193917864710599e-06,
      "loss": 0.1615,
      "step": 3130
    },
    {
      "epoch": 4.283765347885402,
      "grad_norm": 7.2312703132629395,
      "learning_rate": 4.042394891754846e-06,
      "loss": 0.1488,
      "step": 3140
    },
    {
      "epoch": 4.297407912687586,
      "grad_norm": 8.509649276733398,
      "learning_rate": 3.893513961830886e-06,
      "loss": 0.1507,
      "step": 3150
    },
    {
      "epoch": 4.311050477489768,
      "grad_norm": 8.615159034729004,
      "learning_rate": 3.74728601420518e-06,
      "loss": 0.1768,
      "step": 3160
    },
    {
      "epoch": 4.324693042291951,
      "grad_norm": 9.077027320861816,
      "learning_rate": 3.6037217932120272e-06,
      "loss": 0.3008,
      "step": 3170
    },
    {
      "epoch": 4.338335607094134,
      "grad_norm": 4.60724401473999,
      "learning_rate": 3.4628318474641344e-06,
      "loss": 0.1839,
      "step": 3180
    },
    {
      "epoch": 4.351978171896317,
      "grad_norm": 3.5988879203796387,
      "learning_rate": 3.3246265290775013e-06,
      "loss": 0.1681,
      "step": 3190
    },
    {
      "epoch": 4.365620736698499,
      "grad_norm": 3.6327638626098633,
      "learning_rate": 3.1891159929108074e-06,
      "loss": 0.2091,
      "step": 3200
    },
    {
      "epoch": 4.379263301500682,
      "grad_norm": 7.489198207855225,
      "learning_rate": 3.0563101958192677e-06,
      "loss": 0.175,
      "step": 3210
    },
    {
      "epoch": 4.392905866302865,
      "grad_norm": 7.076563835144043,
      "learning_rate": 2.9262188959230297e-06,
      "loss": 0.2715,
      "step": 3220
    },
    {
      "epoch": 4.406548431105048,
      "grad_norm": 4.808744430541992,
      "learning_rate": 2.7988516518901643e-06,
      "loss": 0.205,
      "step": 3230
    },
    {
      "epoch": 4.42019099590723,
      "grad_norm": 12.732096672058105,
      "learning_rate": 2.674217822234382e-06,
      "loss": 0.1804,
      "step": 3240
    },
    {
      "epoch": 4.433833560709413,
      "grad_norm": 4.898436546325684,
      "learning_rate": 2.5523265646273252e-06,
      "loss": 0.2301,
      "step": 3250
    },
    {
      "epoch": 4.447476125511596,
      "grad_norm": 3.7656116485595703,
      "learning_rate": 2.433186835225745e-06,
      "loss": 0.2665,
      "step": 3260
    },
    {
      "epoch": 4.461118690313779,
      "grad_norm": 4.772073745727539,
      "learning_rate": 2.316807388013431e-06,
      "loss": 0.1443,
      "step": 3270
    },
    {
      "epoch": 4.474761255115962,
      "grad_norm": 7.5682830810546875,
      "learning_rate": 2.203196774157972e-06,
      "loss": 0.2292,
      "step": 3280
    },
    {
      "epoch": 4.488403819918145,
      "grad_norm": 4.153563976287842,
      "learning_rate": 2.0923633413824663e-06,
      "loss": 0.1765,
      "step": 3290
    },
    {
      "epoch": 4.502046384720328,
      "grad_norm": 3.0278851985931396,
      "learning_rate": 1.98431523335215e-06,
      "loss": 0.1966,
      "step": 3300
    },
    {
      "epoch": 4.51568894952251,
      "grad_norm": 3.25349497795105,
      "learning_rate": 1.8790603890760328e-06,
      "loss": 0.2525,
      "step": 3310
    },
    {
      "epoch": 4.529331514324693,
      "grad_norm": 6.384403228759766,
      "learning_rate": 1.7766065423235624e-06,
      "loss": 0.1502,
      "step": 3320
    },
    {
      "epoch": 4.542974079126876,
      "grad_norm": 8.483981132507324,
      "learning_rate": 1.6769612210563834e-06,
      "loss": 0.2251,
      "step": 3330
    },
    {
      "epoch": 4.5566166439290585,
      "grad_norm": 5.477599620819092,
      "learning_rate": 1.5801317468751954e-06,
      "loss": 0.1724,
      "step": 3340
    },
    {
      "epoch": 4.570259208731241,
      "grad_norm": 6.450118541717529,
      "learning_rate": 1.4861252344817812e-06,
      "loss": 0.1416,
      "step": 3350
    },
    {
      "epoch": 4.583901773533424,
      "grad_norm": 8.463088035583496,
      "learning_rate": 1.3949485911562799e-06,
      "loss": 0.1507,
      "step": 3360
    },
    {
      "epoch": 4.597544338335607,
      "grad_norm": 5.667326927185059,
      "learning_rate": 1.3066085162496057e-06,
      "loss": 0.1486,
      "step": 3370
    },
    {
      "epoch": 4.61118690313779,
      "grad_norm": 8.173833847045898,
      "learning_rate": 1.2211115006912499e-06,
      "loss": 0.1823,
      "step": 3380
    },
    {
      "epoch": 4.624829467939973,
      "grad_norm": 7.300376892089844,
      "learning_rate": 1.1384638265123305e-06,
      "loss": 0.1146,
      "step": 3390
    },
    {
      "epoch": 4.638472032742156,
      "grad_norm": 5.938165664672852,
      "learning_rate": 1.0586715663840175e-06,
      "loss": 0.2206,
      "step": 3400
    },
    {
      "epoch": 4.6521145975443385,
      "grad_norm": 5.717035293579102,
      "learning_rate": 9.817405831713135e-07,
      "loss": 0.137,
      "step": 3410
    },
    {
      "epoch": 4.665757162346521,
      "grad_norm": 9.58558464050293,
      "learning_rate": 9.076765295022949e-07,
      "loss": 0.1969,
      "step": 3420
    },
    {
      "epoch": 4.679399727148704,
      "grad_norm": 5.858943462371826,
      "learning_rate": 8.364848473527698e-07,
      "loss": 0.1712,
      "step": 3430
    },
    {
      "epoch": 4.693042291950887,
      "grad_norm": 5.517107009887695,
      "learning_rate": 7.748725387928791e-07,
      "loss": 0.1606,
      "step": 3440
    },
    {
      "epoch": 4.7066848567530695,
      "grad_norm": 2.8044917583465576,
      "learning_rate": 7.09152598980536e-07,
      "loss": 0.1722,
      "step": 3450
    },
    {
      "epoch": 4.720327421555252,
      "grad_norm": 5.587821960449219,
      "learning_rate": 6.463196175536768e-07,
      "loss": 0.2592,
      "step": 3460
    },
    {
      "epoch": 4.733969986357435,
      "grad_norm": 8.284170150756836,
      "learning_rate": 5.863782112669647e-07,
      "loss": 0.2729,
      "step": 3470
    },
    {
      "epoch": 4.747612551159618,
      "grad_norm": 7.003023147583008,
      "learning_rate": 5.293327844118956e-07,
      "loss": 0.1888,
      "step": 3480
    },
    {
      "epoch": 4.7612551159618,
      "grad_norm": 1.9856537580490112,
      "learning_rate": 4.751875284932217e-07,
      "loss": 0.1367,
      "step": 3490
    },
    {
      "epoch": 4.774897680763983,
      "grad_norm": 6.205258846282959,
      "learning_rate": 4.2394642192095327e-07,
      "loss": 0.2003,
      "step": 3500
    },
    {
      "epoch": 4.788540245566167,
      "grad_norm": 5.503448009490967,
      "learning_rate": 3.7561322971803706e-07,
      "loss": 0.1503,
      "step": 3510
    },
    {
      "epoch": 4.8021828103683495,
      "grad_norm": 5.814334392547607,
      "learning_rate": 3.301915032437375e-07,
      "loss": 0.1425,
      "step": 3520
    },
    {
      "epoch": 4.815825375170532,
      "grad_norm": 7.47700309753418,
      "learning_rate": 2.8768457993266775e-07,
      "loss": 0.1265,
      "step": 3530
    },
    {
      "epoch": 4.829467939972715,
      "grad_norm": 6.920642375946045,
      "learning_rate": 2.480955830495679e-07,
      "loss": 0.2103,
      "step": 3540
    },
    {
      "epoch": 4.843110504774898,
      "grad_norm": 8.557290077209473,
      "learning_rate": 2.1142742145984442e-07,
      "loss": 0.2395,
      "step": 3550
    },
    {
      "epoch": 4.85675306957708,
      "grad_norm": 7.730421543121338,
      "learning_rate": 1.7768278941581617e-07,
      "loss": 0.1317,
      "step": 3560
    },
    {
      "epoch": 4.870395634379263,
      "grad_norm": 4.612334728240967,
      "learning_rate": 1.4686416635874445e-07,
      "loss": 0.1802,
      "step": 3570
    },
    {
      "epoch": 4.884038199181446,
      "grad_norm": 6.5652899742126465,
      "learning_rate": 1.1897381673666719e-07,
      "loss": 0.2484,
      "step": 3580
    },
    {
      "epoch": 4.897680763983629,
      "grad_norm": 8.53544807434082,
      "learning_rate": 9.40137898380078e-08,
      "loss": 0.1346,
      "step": 3590
    },
    {
      "epoch": 4.911323328785811,
      "grad_norm": 6.831415176391602,
      "learning_rate": 7.198591964099777e-08,
      "loss": 0.2097,
      "step": 3600
    },
    {
      "epoch": 4.924965893587995,
      "grad_norm": 11.197553634643555,
      "learning_rate": 5.289182467893561e-08,
      "loss": 0.1865,
      "step": 3610
    },
    {
      "epoch": 4.938608458390178,
      "grad_norm": 9.906291007995605,
      "learning_rate": 3.6732907921241956e-08,
      "loss": 0.2137,
      "step": 3620
    },
    {
      "epoch": 4.95225102319236,
      "grad_norm": 6.347635269165039,
      "learning_rate": 2.351035667038648e-08,
      "loss": 0.1699,
      "step": 3630
    },
    {
      "epoch": 4.965893587994543,
      "grad_norm": 2.5308990478515625,
      "learning_rate": 1.3225142474651009e-08,
      "loss": 0.247,
      "step": 3640
    },
    {
      "epoch": 4.979536152796726,
      "grad_norm": 8.235413551330566,
      "learning_rate": 5.878021056742178e-09,
      "loss": 0.1884,
      "step": 3650
    },
    {
      "epoch": 4.993178717598909,
      "grad_norm": 8.303121566772461,
      "learning_rate": 1.4695322582491956e-09,
      "loss": 0.186,
      "step": 3660
    }
  ],
  "logging_steps": 10,
  "max_steps": 3665,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.2102292765343744e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}