{
  "best_metric": 1.9684966802597046,
  "best_model_checkpoint": "./output/checkpoint-4950",
  "epoch": 0.17762945419313167,
  "eval_steps": 150,
  "global_step": 4950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003588473822083468,
      "grad_norm": 6.717320442199707,
      "learning_rate": 5.500000000000001e-06,
      "loss": 2.1469,
      "step": 10
    },
    {
      "epoch": 0.0007176947644166936,
      "grad_norm": 8.888245582580566,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 2.0737,
      "step": 20
    },
    {
      "epoch": 0.0010765421466250404,
      "grad_norm": 6.897828102111816,
      "learning_rate": 1.65e-05,
      "loss": 2.3197,
      "step": 30
    },
    {
      "epoch": 0.0014353895288333873,
      "grad_norm": 5.266988754272461,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 2.1048,
      "step": 40
    },
    {
      "epoch": 0.0017942369110417339,
      "grad_norm": 5.98182487487793,
      "learning_rate": 2.75e-05,
      "loss": 2.1213,
      "step": 50
    },
    {
      "epoch": 0.002153084293250081,
      "grad_norm": 5.487156867980957,
      "learning_rate": 3.3e-05,
      "loss": 2.0035,
      "step": 60
    },
    {
      "epoch": 0.0025119316754584277,
      "grad_norm": 7.080353260040283,
      "learning_rate": 3.85e-05,
      "loss": 1.9454,
      "step": 70
    },
    {
      "epoch": 0.0028707790576667745,
      "grad_norm": 6.8405046463012695,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 2.0665,
      "step": 80
    },
    {
      "epoch": 0.003229626439875121,
      "grad_norm": 5.570849895477295,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 2.0471,
      "step": 90
    },
    {
      "epoch": 0.0035884738220834677,
      "grad_norm": 8.464041709899902,
      "learning_rate": 5.5e-05,
      "loss": 2.2792,
      "step": 100
    },
    {
      "epoch": 0.0039473212042918145,
      "grad_norm": 6.397585868835449,
      "learning_rate": 5.4999434791355066e-05,
      "loss": 2.145,
      "step": 110
    },
    {
      "epoch": 0.004306168586500162,
      "grad_norm": 6.912049293518066,
      "learning_rate": 5.4997739188653784e-05,
      "loss": 2.2255,
      "step": 120
    },
    {
      "epoch": 0.004665015968708508,
      "grad_norm": 5.9162139892578125,
      "learning_rate": 5.4994913261595724e-05,
      "loss": 2.1511,
      "step": 130
    },
    {
      "epoch": 0.005023863350916855,
      "grad_norm": 6.609055995941162,
      "learning_rate": 5.49909571263437e-05,
      "loss": 2.2134,
      "step": 140
    },
    {
      "epoch": 0.005382710733125202,
      "grad_norm": 5.2473835945129395,
      "learning_rate": 5.498587094551892e-05,
      "loss": 2.043,
      "step": 150
    },
    {
      "epoch": 0.005382710733125202,
      "eval_loss": 2.19863224029541,
      "eval_runtime": 54.8159,
      "eval_samples_per_second": 9.121,
      "eval_steps_per_second": 9.121,
      "step": 150
    },
    {
      "epoch": 0.005741558115333549,
      "grad_norm": 7.073841094970703,
      "learning_rate": 5.497965492819436e-05,
      "loss": 2.2452,
      "step": 160
    },
    {
      "epoch": 0.006100405497541895,
      "grad_norm": 6.833587646484375,
      "learning_rate": 5.4972309329886156e-05,
      "loss": 2.0677,
      "step": 170
    },
    {
      "epoch": 0.006459252879750242,
      "grad_norm": 5.268569469451904,
      "learning_rate": 5.496383445254307e-05,
      "loss": 2.131,
      "step": 180
    },
    {
      "epoch": 0.006818100261958589,
      "grad_norm": 6.808563709259033,
      "learning_rate": 5.495423064453413e-05,
      "loss": 2.206,
      "step": 190
    },
    {
      "epoch": 0.007176947644166935,
      "grad_norm": 5.996222019195557,
      "learning_rate": 5.4943498300634254e-05,
      "loss": 2.2165,
      "step": 200
    },
    {
      "epoch": 0.007535795026375283,
      "grad_norm": 6.295897006988525,
      "learning_rate": 5.493163786200807e-05,
      "loss": 2.0522,
      "step": 210
    },
    {
      "epoch": 0.007894642408583629,
      "grad_norm": 8.73654556274414,
      "learning_rate": 5.491864981619175e-05,
      "loss": 2.1393,
      "step": 220
    },
    {
      "epoch": 0.008253489790791976,
      "grad_norm": 6.038145065307617,
      "learning_rate": 5.4904534697073e-05,
      "loss": 2.0871,
      "step": 230
    },
    {
      "epoch": 0.008612337173000324,
      "grad_norm": 7.730770111083984,
      "learning_rate": 5.488929308486908e-05,
      "loss": 2.0365,
      "step": 240
    },
    {
      "epoch": 0.008971184555208669,
      "grad_norm": 6.322446823120117,
      "learning_rate": 5.487292560610295e-05,
      "loss": 1.9943,
      "step": 250
    },
    {
      "epoch": 0.009330031937417016,
      "grad_norm": 5.963322639465332,
      "learning_rate": 5.485543293357758e-05,
      "loss": 2.2698,
      "step": 260
    },
    {
      "epoch": 0.009688879319625364,
      "grad_norm": 7.290161609649658,
      "learning_rate": 5.483681578634821e-05,
      "loss": 2.1537,
      "step": 270
    },
    {
      "epoch": 0.01004772670183371,
      "grad_norm": 6.640771389007568,
      "learning_rate": 5.481707492969285e-05,
      "loss": 2.3031,
      "step": 280
    },
    {
      "epoch": 0.010406574084042056,
      "grad_norm": 5.640336036682129,
      "learning_rate": 5.479621117508079e-05,
      "loss": 2.2397,
      "step": 290
    },
    {
      "epoch": 0.010765421466250404,
      "grad_norm": 6.066910743713379,
      "learning_rate": 5.477422538013927e-05,
      "loss": 1.8947,
      "step": 300
    },
    {
      "epoch": 0.010765421466250404,
      "eval_loss": 2.1935410499572754,
      "eval_runtime": 56.6355,
      "eval_samples_per_second": 8.828,
      "eval_steps_per_second": 8.828,
      "step": 300
    },
    {
      "epoch": 0.01112426884845875,
      "grad_norm": 5.537936687469482,
      "learning_rate": 5.475111844861821e-05,
      "loss": 2.3336,
      "step": 310
    },
    {
      "epoch": 0.011483116230667098,
      "grad_norm": 4.925268173217773,
      "learning_rate": 5.4726891330353056e-05,
      "loss": 2.0345,
      "step": 320
    },
    {
      "epoch": 0.011841963612875444,
      "grad_norm": 7.5510430335998535,
      "learning_rate": 5.4701545021225746e-05,
      "loss": 2.2388,
      "step": 330
    },
    {
      "epoch": 0.01220081099508379,
      "grad_norm": 6.070944309234619,
      "learning_rate": 5.4675080563123786e-05,
      "loss": 2.1464,
      "step": 340
    },
    {
      "epoch": 0.012559658377292138,
      "grad_norm": 5.479851245880127,
      "learning_rate": 5.4647499043897386e-05,
      "loss": 2.1711,
      "step": 350
    },
    {
      "epoch": 0.012918505759500484,
      "grad_norm": 5.476010799407959,
      "learning_rate": 5.461880159731476e-05,
      "loss": 2.1122,
      "step": 360
    },
    {
      "epoch": 0.01327735314170883,
      "grad_norm": 5.9396820068359375,
      "learning_rate": 5.4588989403015564e-05,
      "loss": 2.1398,
      "step": 370
    },
    {
      "epoch": 0.013636200523917178,
      "grad_norm": 5.315182209014893,
      "learning_rate": 5.4558063686462315e-05,
      "loss": 1.9519,
      "step": 380
    },
    {
      "epoch": 0.013995047906125525,
      "grad_norm": 6.243162631988525,
      "learning_rate": 5.4526025718890104e-05,
      "loss": 2.2456,
      "step": 390
    },
    {
      "epoch": 0.01435389528833387,
      "grad_norm": 7.608538627624512,
      "learning_rate": 5.44928768172543e-05,
      "loss": 2.0085,
      "step": 400
    },
    {
      "epoch": 0.014712742670542218,
      "grad_norm": 6.5568623542785645,
      "learning_rate": 5.44586183441764e-05,
      "loss": 1.9001,
      "step": 410
    },
    {
      "epoch": 0.015071590052750565,
      "grad_norm": 6.803008079528809,
      "learning_rate": 5.442325170788806e-05,
      "loss": 2.0497,
      "step": 420
    },
    {
      "epoch": 0.015430437434958913,
      "grad_norm": 6.07423734664917,
      "learning_rate": 5.438677836217317e-05,
      "loss": 2.157,
      "step": 430
    },
    {
      "epoch": 0.015789284817167258,
      "grad_norm": 6.988314628601074,
      "learning_rate": 5.434919980630811e-05,
      "loss": 2.1349,
      "step": 440
    },
    {
      "epoch": 0.016148132199375605,
      "grad_norm": 5.507582664489746,
      "learning_rate": 5.431051758500015e-05,
      "loss": 1.9094,
      "step": 450
    },
    {
      "epoch": 0.016148132199375605,
      "eval_loss": 2.174536943435669,
      "eval_runtime": 55.7176,
      "eval_samples_per_second": 8.974,
      "eval_steps_per_second": 8.974,
      "step": 450
    },
    {
      "epoch": 0.016506979581583953,
      "grad_norm": 8.126635551452637,
      "learning_rate": 5.427073328832388e-05,
      "loss": 1.9681,
      "step": 460
    },
    {
      "epoch": 0.0168658269637923,
      "grad_norm": 5.529940128326416,
      "learning_rate": 5.422984855165592e-05,
      "loss": 2.2576,
      "step": 470
    },
    {
      "epoch": 0.017224674346000647,
      "grad_norm": 8.195639610290527,
      "learning_rate": 5.418786505560766e-05,
      "loss": 1.9816,
      "step": 480
    },
    {
      "epoch": 0.017583521728208994,
      "grad_norm": 6.6597795486450195,
      "learning_rate": 5.414478452595617e-05,
      "loss": 2.1214,
      "step": 490
    },
    {
      "epoch": 0.017942369110417338,
      "grad_norm": 7.02098274230957,
      "learning_rate": 5.4100608733573315e-05,
      "loss": 2.166,
      "step": 500
    },
    {
      "epoch": 0.018301216492625685,
      "grad_norm": 5.826364517211914,
      "learning_rate": 5.4055339494352874e-05,
      "loss": 2.0209,
      "step": 510
    },
    {
      "epoch": 0.018660063874834033,
      "grad_norm": 6.55180025100708,
      "learning_rate": 5.400897866913597e-05,
      "loss": 2.0571,
      "step": 520
    },
    {
      "epoch": 0.01901891125704238,
      "grad_norm": 5.248165607452393,
      "learning_rate": 5.3961528163634546e-05,
      "loss": 2.152,
      "step": 530
    },
    {
      "epoch": 0.019377758639250727,
      "grad_norm": 6.659270286560059,
      "learning_rate": 5.391298992835303e-05,
      "loss": 2.2554,
      "step": 540
    },
    {
      "epoch": 0.019736606021459074,
      "grad_norm": 6.1025919914245605,
      "learning_rate": 5.386336595850817e-05,
      "loss": 2.0898,
      "step": 550
    },
    {
      "epoch": 0.02009545340366742,
      "grad_norm": 7.94935417175293,
      "learning_rate": 5.3812658293946995e-05,
      "loss": 2.124,
      "step": 560
    },
    {
      "epoch": 0.020454300785875765,
      "grad_norm": 5.731258869171143,
      "learning_rate": 5.376086901906299e-05,
      "loss": 2.1947,
      "step": 570
    },
    {
      "epoch": 0.020813148168084113,
      "grad_norm": 5.822957515716553,
      "learning_rate": 5.37080002627104e-05,
      "loss": 1.8958,
      "step": 580
    },
    {
      "epoch": 0.02117199555029246,
      "grad_norm": 6.069146156311035,
      "learning_rate": 5.365405419811673e-05,
      "loss": 2.1358,
      "step": 590
    },
    {
      "epoch": 0.021530842932500807,
      "grad_norm": 6.830340385437012,
      "learning_rate": 5.359903304279339e-05,
      "loss": 2.0506,
      "step": 600
    },
    {
      "epoch": 0.021530842932500807,
      "eval_loss": 2.1694562435150146,
      "eval_runtime": 54.4455,
      "eval_samples_per_second": 9.183,
      "eval_steps_per_second": 9.183,
      "step": 600
    },
    {
      "epoch": 0.021889690314709154,
      "grad_norm": 6.6071085929870605,
      "learning_rate": 5.354293905844459e-05,
      "loss": 2.1761,
      "step": 610
    },
    {
      "epoch": 0.0222485376969175,
      "grad_norm": 7.093519687652588,
      "learning_rate": 5.3485774550874306e-05,
      "loss": 1.9589,
      "step": 620
    },
    {
      "epoch": 0.02260738507912585,
      "grad_norm": 6.186059951782227,
      "learning_rate": 5.3427541869891556e-05,
      "loss": 2.1178,
      "step": 630
    },
    {
      "epoch": 0.022966232461334196,
      "grad_norm": 6.744318962097168,
      "learning_rate": 5.336824340921377e-05,
      "loss": 2.1708,
      "step": 640
    },
    {
      "epoch": 0.02332507984354254,
      "grad_norm": 5.510601997375488,
      "learning_rate": 5.330788160636841e-05,
      "loss": 2.0606,
      "step": 650
    },
    {
      "epoch": 0.023683927225750887,
      "grad_norm": 4.821707725524902,
      "learning_rate": 5.3246458942592776e-05,
      "loss": 2.0368,
      "step": 660
    },
    {
      "epoch": 0.024042774607959234,
      "grad_norm": 6.990603446960449,
      "learning_rate": 5.318397794273199e-05,
      "loss": 2.0595,
      "step": 670
    },
    {
      "epoch": 0.02440162199016758,
      "grad_norm": 6.442526817321777,
      "learning_rate": 5.312044117513524e-05,
      "loss": 2.052,
      "step": 680
    },
    {
      "epoch": 0.02476046937237593,
      "grad_norm": 6.1826934814453125,
      "learning_rate": 5.305585125155018e-05,
      "loss": 2.1528,
      "step": 690
    },
    {
      "epoch": 0.025119316754584276,
      "grad_norm": 6.038548469543457,
      "learning_rate": 5.29902108270156e-05,
      "loss": 2.2279,
      "step": 700
    },
    {
      "epoch": 0.025478164136792623,
      "grad_norm": 6.814423561096191,
      "learning_rate": 5.2923522599752245e-05,
      "loss": 2.0293,
      "step": 710
    },
    {
      "epoch": 0.025837011519000967,
      "grad_norm": 7.148327350616455,
      "learning_rate": 5.2855789311051945e-05,
      "loss": 2.1998,
      "step": 720
    },
    {
      "epoch": 0.026195858901209314,
      "grad_norm": 6.155678749084473,
      "learning_rate": 5.27870137451649e-05,
      "loss": 2.2251,
      "step": 730
    },
    {
      "epoch": 0.02655470628341766,
      "grad_norm": 4.626000881195068,
      "learning_rate": 5.2717198729185245e-05,
      "loss": 2.112,
      "step": 740
    },
    {
      "epoch": 0.02691355366562601,
      "grad_norm": 6.421958923339844,
      "learning_rate": 5.264634713293485e-05,
      "loss": 2.2201,
      "step": 750
    },
    {
      "epoch": 0.02691355366562601,
      "eval_loss": 2.1646759510040283,
      "eval_runtime": 54.8939,
      "eval_samples_per_second": 9.108,
      "eval_steps_per_second": 9.108,
      "step": 750
    },
    {
      "epoch": 0.027272401047834356,
      "grad_norm": 6.699018478393555,
      "learning_rate": 5.2574461868845316e-05,
      "loss": 1.9635,
      "step": 760
    },
    {
      "epoch": 0.027631248430042703,
      "grad_norm": 6.737663269042969,
      "learning_rate": 5.2501545891838315e-05,
      "loss": 2.0685,
      "step": 770
    },
    {
      "epoch": 0.02799009581225105,
      "grad_norm": 5.519904136657715,
      "learning_rate": 5.242760219920405e-05,
      "loss": 2.2582,
      "step": 780
    },
    {
      "epoch": 0.028348943194459398,
      "grad_norm": 6.503427505493164,
      "learning_rate": 5.235263383047812e-05,
      "loss": 2.1068,
      "step": 790
    },
    {
      "epoch": 0.02870779057666774,
      "grad_norm": 7.244829177856445,
      "learning_rate": 5.2276643867316525e-05,
      "loss": 2.0203,
      "step": 800
    },
    {
      "epoch": 0.02906663795887609,
      "grad_norm": 5.9835638999938965,
      "learning_rate": 5.219963543336902e-05,
      "loss": 2.1161,
      "step": 810
    },
    {
      "epoch": 0.029425485341084436,
      "grad_norm": 5.395669460296631,
      "learning_rate": 5.212161169415071e-05,
      "loss": 2.0434,
      "step": 820
    },
    {
      "epoch": 0.029784332723292783,
      "grad_norm": 6.662552356719971,
      "learning_rate": 5.204257585691191e-05,
      "loss": 1.9083,
      "step": 830
    },
    {
      "epoch": 0.03014318010550113,
      "grad_norm": 6.782001495361328,
      "learning_rate": 5.196253117050633e-05,
      "loss": 2.0935,
      "step": 840
    },
    {
      "epoch": 0.030502027487709478,
      "grad_norm": 6.106321334838867,
      "learning_rate": 5.188148092525751e-05,
      "loss": 2.1237,
      "step": 850
    },
    {
      "epoch": 0.030860874869917825,
      "grad_norm": 7.272143840789795,
      "learning_rate": 5.179942845282357e-05,
      "loss": 2.0028,
      "step": 860
    },
    {
      "epoch": 0.031219722252126172,
      "grad_norm": 9.065749168395996,
      "learning_rate": 5.17163771260603e-05,
      "loss": 2.0158,
      "step": 870
    },
    {
      "epoch": 0.031578569634334516,
      "grad_norm": 6.9140706062316895,
      "learning_rate": 5.163233035888244e-05,
      "loss": 2.0894,
      "step": 880
    },
    {
      "epoch": 0.03193741701654287,
      "grad_norm": 6.950376033782959,
      "learning_rate": 5.154729160612338e-05,
      "loss": 2.0526,
      "step": 890
    },
    {
      "epoch": 0.03229626439875121,
      "grad_norm": 5.954463005065918,
      "learning_rate": 5.146126436339321e-05,
      "loss": 2.2572,
      "step": 900
    },
    {
      "epoch": 0.03229626439875121,
      "eval_loss": 2.160017490386963,
      "eval_runtime": 55.1945,
      "eval_samples_per_second": 9.059,
      "eval_steps_per_second": 9.059,
      "step": 900
    },
    {
      "epoch": 0.032655111780959555,
      "grad_norm": 6.65114164352417,
      "learning_rate": 5.137425216693491e-05,
      "loss": 2.3123,
      "step": 910
    },
    {
      "epoch": 0.033013959163167905,
      "grad_norm": 6.134308338165283,
      "learning_rate": 5.128625859347907e-05,
      "loss": 2.0594,
      "step": 920
    },
    {
      "epoch": 0.03337280654537625,
      "grad_norm": 6.7907586097717285,
      "learning_rate": 5.1197287260096865e-05,
      "loss": 2.1689,
      "step": 930
    },
    {
      "epoch": 0.0337316539275846,
      "grad_norm": 7.259077072143555,
      "learning_rate": 5.110734182405132e-05,
      "loss": 2.1629,
      "step": 940
    },
    {
      "epoch": 0.034090501309792944,
      "grad_norm": 6.0704264640808105,
      "learning_rate": 5.1016425982647025e-05,
      "loss": 2.0007,
      "step": 950
    },
    {
      "epoch": 0.034449348692001294,
      "grad_norm": 7.288009166717529,
      "learning_rate": 5.092454347307812e-05,
      "loss": 2.1205,
      "step": 960
    },
    {
      "epoch": 0.03480819607420964,
      "grad_norm": 6.185722827911377,
      "learning_rate": 5.08316980722747e-05,
      "loss": 2.2027,
      "step": 970
    },
    {
      "epoch": 0.03516704345641799,
      "grad_norm": 5.903477191925049,
      "learning_rate": 5.0737893596747534e-05,
      "loss": 2.1436,
      "step": 980
    },
    {
      "epoch": 0.03552589083862633,
      "grad_norm": 5.645431995391846,
      "learning_rate": 5.064313390243121e-05,
      "loss": 2.1647,
      "step": 990
    },
    {
      "epoch": 0.035884738220834676,
      "grad_norm": 6.282730579376221,
      "learning_rate": 5.054742288452562e-05,
      "loss": 2.2418,
      "step": 1000
    },
    {
      "epoch": 0.03624358560304303,
      "grad_norm": 4.841719627380371,
      "learning_rate": 5.0450764477335825e-05,
      "loss": 2.1825,
      "step": 1010
    },
    {
      "epoch": 0.03660243298525137,
      "grad_norm": 5.957107067108154,
      "learning_rate": 5.035316265411036e-05,
      "loss": 1.9629,
      "step": 1020
    },
    {
      "epoch": 0.03696128036745972,
      "grad_norm": 6.100106239318848,
      "learning_rate": 5.02546214268779e-05,
      "loss": 2.1342,
      "step": 1030
    },
    {
      "epoch": 0.037320127749668065,
      "grad_norm": 6.175196170806885,
      "learning_rate": 5.0155144846282345e-05,
      "loss": 2.2566,
      "step": 1040
    },
    {
      "epoch": 0.037678975131876416,
      "grad_norm": 7.361276626586914,
      "learning_rate": 5.005473700141629e-05,
      "loss": 2.094,
      "step": 1050
    },
    {
      "epoch": 0.037678975131876416,
      "eval_loss": 2.147434711456299,
      "eval_runtime": 54.7074,
      "eval_samples_per_second": 9.14,
      "eval_steps_per_second": 9.14,
      "step": 1050
    },
    {
      "epoch": 0.03803782251408476,
      "grad_norm": 5.384734153747559,
      "learning_rate": 4.995340201965296e-05,
      "loss": 2.0639,
      "step": 1060
    },
    {
      "epoch": 0.038396669896293104,
      "grad_norm": 6.4175872802734375,
      "learning_rate": 4.985114406647658e-05,
      "loss": 2.0119,
      "step": 1070
    },
    {
      "epoch": 0.038755517278501454,
      "grad_norm": 7.1519646644592285,
      "learning_rate": 4.9747967345311055e-05,
      "loss": 1.9558,
      "step": 1080
    },
    {
      "epoch": 0.0391143646607098,
      "grad_norm": 6.3451151847839355,
      "learning_rate": 4.9643876097347296e-05,
      "loss": 2.0597,
      "step": 1090
    },
    {
      "epoch": 0.03947321204291815,
      "grad_norm": 6.151363849639893,
      "learning_rate": 4.953887460136881e-05,
      "loss": 2.1947,
      "step": 1100
    },
    {
      "epoch": 0.03983205942512649,
      "grad_norm": 5.386812210083008,
      "learning_rate": 4.943296717357583e-05,
      "loss": 1.9995,
      "step": 1110
    },
    {
      "epoch": 0.04019090680733484,
      "grad_norm": 6.169729709625244,
      "learning_rate": 4.93261581674079e-05,
      "loss": 2.0852,
      "step": 1120
    },
    {
      "epoch": 0.04054975418954319,
      "grad_norm": 5.84644079208374,
      "learning_rate": 4.921845197336491e-05,
      "loss": 2.0196,
      "step": 1130
    },
    {
      "epoch": 0.04090860157175153,
      "grad_norm": 6.577886581420898,
      "learning_rate": 4.910985301882667e-05,
      "loss": 1.9489,
      "step": 1140
    },
    {
      "epoch": 0.04126744895395988,
      "grad_norm": 7.484111309051514,
      "learning_rate": 4.9000365767870824e-05,
      "loss": 2.3324,
      "step": 1150
    },
    {
      "epoch": 0.041626296336168225,
      "grad_norm": 5.723704814910889,
      "learning_rate": 4.8889994721089426e-05,
      "loss": 2.0919,
      "step": 1160
    },
    {
      "epoch": 0.041985143718376576,
      "grad_norm": 7.106482982635498,
      "learning_rate": 4.877874441540394e-05,
      "loss": 2.2332,
      "step": 1170
    },
    {
      "epoch": 0.04234399110058492,
      "grad_norm": 6.120841979980469,
      "learning_rate": 4.866661942387867e-05,
      "loss": 2.0377,
      "step": 1180
    },
    {
      "epoch": 0.04270283848279327,
      "grad_norm": 6.847580432891846,
      "learning_rate": 4.855362435553285e-05,
      "loss": 2.0172,
      "step": 1190
    },
    {
      "epoch": 0.043061685865001614,
      "grad_norm": 5.777744770050049,
      "learning_rate": 4.84397638551512e-05,
      "loss": 2.0351,
      "step": 1200
    },
    {
      "epoch": 0.043061685865001614,
      "eval_loss": 2.1424458026885986,
      "eval_runtime": 53.9494,
      "eval_samples_per_second": 9.268,
      "eval_steps_per_second": 9.268,
      "step": 1200
    },
    {
      "epoch": 0.043420533247209965,
      "grad_norm": 7.368004322052002,
      "learning_rate": 4.83250426030929e-05,
      "loss": 2.209,
      "step": 1210
    },
    {
      "epoch": 0.04377938062941831,
      "grad_norm": 5.8113017082214355,
      "learning_rate": 4.82094653150993e-05,
      "loss": 1.626,
      "step": 1220
    },
    {
      "epoch": 0.04413822801162665,
      "grad_norm": 6.2266435623168945,
      "learning_rate": 4.8093036742100026e-05,
      "loss": 1.9042,
      "step": 1230
    },
    {
      "epoch": 0.044497075393835,
      "grad_norm": 6.476573467254639,
      "learning_rate": 4.79757616700177e-05,
      "loss": 2.1182,
      "step": 1240
    },
    {
      "epoch": 0.04485592277604335,
      "grad_norm": 16.597238540649414,
      "learning_rate": 4.7857644919571176e-05,
      "loss": 2.0605,
      "step": 1250
    },
    {
      "epoch": 0.0452147701582517,
      "grad_norm": 6.337398052215576,
      "learning_rate": 4.773869134607747e-05,
      "loss": 2.0734,
      "step": 1260
    },
    {
      "epoch": 0.04557361754046004,
      "grad_norm": 8.511456489562988,
      "learning_rate": 4.761890583925204e-05,
      "loss": 1.9335,
      "step": 1270
    },
    {
      "epoch": 0.04593246492266839,
      "grad_norm": 7.134552478790283,
      "learning_rate": 4.749829332300792e-05,
      "loss": 2.0861,
      "step": 1280
    },
    {
      "epoch": 0.046291312304876736,
      "grad_norm": 8.12893295288086,
      "learning_rate": 4.737685875525327e-05,
      "loss": 2.3456,
      "step": 1290
    },
    {
      "epoch": 0.04665015968708508,
      "grad_norm": 5.985340118408203,
      "learning_rate": 4.725460712768751e-05,
      "loss": 1.9234,
      "step": 1300
    },
    {
      "epoch": 0.04700900706929343,
      "grad_norm": 5.3571600914001465,
      "learning_rate": 4.7131543465596236e-05,
      "loss": 2.0328,
      "step": 1310
    },
    {
      "epoch": 0.047367854451501774,
      "grad_norm": 6.484771728515625,
      "learning_rate": 4.700767282764459e-05,
      "loss": 2.3337,
      "step": 1320
    },
    {
      "epoch": 0.047726701833710125,
      "grad_norm": 6.518951416015625,
      "learning_rate": 4.688300030566933e-05,
      "loss": 2.4661,
      "step": 1330
    },
    {
      "epoch": 0.04808554921591847,
      "grad_norm": 5.992385387420654,
      "learning_rate": 4.6757531024469514e-05,
      "loss": 2.2451,
      "step": 1340
    },
    {
      "epoch": 0.04844439659812682,
      "grad_norm": 4.671659469604492,
      "learning_rate": 4.663127014159588e-05,
      "loss": 1.9866,
      "step": 1350
    },
    {
      "epoch": 0.04844439659812682,
      "eval_loss": 2.134829044342041,
      "eval_runtime": 54.778,
      "eval_samples_per_second": 9.128,
      "eval_steps_per_second": 9.128,
      "step": 1350
    },
    {
      "epoch": 0.04880324398033516,
      "grad_norm": 6.9112372398376465,
      "learning_rate": 4.650422284713878e-05,
      "loss": 2.0803,
      "step": 1360
    },
    {
      "epoch": 0.04916209136254351,
      "grad_norm": 5.6216912269592285,
      "learning_rate": 4.637639436351489e-05,
      "loss": 2.2036,
      "step": 1370
    },
    {
      "epoch": 0.04952093874475186,
      "grad_norm": 6.844085216522217,
      "learning_rate": 4.624778994525249e-05,
      "loss": 2.131,
      "step": 1380
    },
    {
      "epoch": 0.0498797861269602,
      "grad_norm": 5.771690368652344,
      "learning_rate": 4.6118414878775514e-05,
      "loss": 2.2257,
      "step": 1390
    },
    {
      "epoch": 0.05023863350916855,
      "grad_norm": 7.582961559295654,
      "learning_rate": 4.5988274482186214e-05,
      "loss": 2.1688,
      "step": 1400
    },
    {
      "epoch": 0.050597480891376896,
      "grad_norm": 7.324771881103516,
      "learning_rate": 4.5857374105046574e-05,
      "loss": 1.9221,
      "step": 1410
    },
    {
      "epoch": 0.05095632827358525,
      "grad_norm": 5.763311386108398,
      "learning_rate": 4.572571912815838e-05,
      "loss": 2.0738,
      "step": 1420
    },
    {
      "epoch": 0.05131517565579359,
      "grad_norm": 6.5076584815979,
      "learning_rate": 4.55933149633421e-05,
      "loss": 2.1497,
      "step": 1430
    },
    {
      "epoch": 0.051674023038001934,
      "grad_norm": 5.565882682800293,
      "learning_rate": 4.5460167053214335e-05,
      "loss": 2.102,
      "step": 1440
    },
    {
      "epoch": 0.052032870420210285,
      "grad_norm": 6.655306339263916,
      "learning_rate": 4.532628087096419e-05,
      "loss": 2.0995,
      "step": 1450
    },
    {
      "epoch": 0.05239171780241863,
      "grad_norm": 6.302664279937744,
      "learning_rate": 4.5191661920128194e-05,
      "loss": 2.024,
      "step": 1460
    },
    {
      "epoch": 0.05275056518462698,
      "grad_norm": 8.187132835388184,
      "learning_rate": 4.5056315734364154e-05,
      "loss": 2.0004,
      "step": 1470
    },
    {
      "epoch": 0.05310941256683532,
      "grad_norm": 6.062857627868652,
      "learning_rate": 4.492024787722368e-05,
      "loss": 2.0406,
      "step": 1480
    },
    {
      "epoch": 0.053468259949043674,
      "grad_norm": 6.693349838256836,
      "learning_rate": 4.47834639419234e-05,
      "loss": 2.2014,
      "step": 1490
    },
    {
      "epoch": 0.05382710733125202,
      "grad_norm": 4.937839031219482,
      "learning_rate": 4.464596955111518e-05,
      "loss": 2.1102,
      "step": 1500
    },
    {
      "epoch": 0.05382710733125202,
      "eval_loss": 2.113711357116699,
      "eval_runtime": 54.1712,
      "eval_samples_per_second": 9.23,
      "eval_steps_per_second": 9.23,
      "step": 1500
    },
    {
      "epoch": 0.05418595471346037,
      "grad_norm": 6.417968273162842,
      "learning_rate": 4.450777035665487e-05,
      "loss": 2.0496,
      "step": 1510
    },
    {
      "epoch": 0.05454480209566871,
      "grad_norm": 6.756476402282715,
      "learning_rate": 4.436887203937009e-05,
      "loss": 1.8097,
      "step": 1520
    },
    {
      "epoch": 0.054903649477877056,
      "grad_norm": 6.680830955505371,
      "learning_rate": 4.422928030882661e-05,
      "loss": 1.9655,
      "step": 1530
    },
    {
      "epoch": 0.05526249686008541,
      "grad_norm": 6.578978061676025,
      "learning_rate": 4.4089000903093746e-05,
      "loss": 2.0555,
      "step": 1540
    },
    {
      "epoch": 0.05562134424229375,
      "grad_norm": 6.425287246704102,
      "learning_rate": 4.394803958850844e-05,
      "loss": 2.0528,
      "step": 1550
    },
    {
      "epoch": 0.0559801916245021,
      "grad_norm": 6.3165974617004395,
      "learning_rate": 4.380640215943821e-05,
      "loss": 2.0266,
      "step": 1560
    },
    {
      "epoch": 0.056339039006710445,
      "grad_norm": 5.450393199920654,
      "learning_rate": 4.366409443804301e-05,
      "loss": 2.1695,
      "step": 1570
    },
    {
      "epoch": 0.056697886388918796,
      "grad_norm": 5.4890031814575195,
      "learning_rate": 4.352112227403589e-05,
      "loss": 1.8961,
      "step": 1580
    },
    {
      "epoch": 0.05705673377112714,
      "grad_norm": 5.995747089385986,
      "learning_rate": 4.337749154444254e-05,
      "loss": 1.8865,
      "step": 1590
    },
    {
      "epoch": 0.05741558115333548,
      "grad_norm": 6.454551696777344,
      "learning_rate": 4.3233208153359665e-05,
      "loss": 2.0315,
      "step": 1600
    },
    {
      "epoch": 0.057774428535543834,
      "grad_norm": 8.371834754943848,
      "learning_rate": 4.308827803171238e-05,
      "loss": 2.1006,
      "step": 1610
    },
    {
      "epoch": 0.05813327591775218,
      "grad_norm": 5.792433261871338,
      "learning_rate": 4.294270713701031e-05,
      "loss": 2.0556,
      "step": 1620
    },
    {
      "epoch": 0.05849212329996053,
      "grad_norm": 5.559154033660889,
      "learning_rate": 4.2796501453102784e-05,
      "loss": 2.2598,
      "step": 1630
    },
    {
      "epoch": 0.05885097068216887,
      "grad_norm": 7.165154933929443,
      "learning_rate": 4.264966698993282e-05,
      "loss": 1.8554,
      "step": 1640
    },
    {
      "epoch": 0.05920981806437722,
      "grad_norm": 6.646023750305176,
      "learning_rate": 4.2502209783290085e-05,
      "loss": 2.0373,
      "step": 1650
    },
    {
      "epoch": 0.05920981806437722,
      "eval_loss": 2.1128716468811035,
      "eval_runtime": 54.1565,
      "eval_samples_per_second": 9.233,
      "eval_steps_per_second": 9.233,
      "step": 1650
    },
    {
      "epoch": 0.05956866544658557,
      "grad_norm": 6.937278747558594,
      "learning_rate": 4.235413589456281e-05,
      "loss": 2.0333,
      "step": 1660
    },
    {
      "epoch": 0.05992751282879391,
      "grad_norm": 6.656656742095947,
      "learning_rate": 4.2205451410488565e-05,
      "loss": 1.9593,
      "step": 1670
    },
    {
      "epoch": 0.06028636021100226,
      "grad_norm": 5.974368095397949,
      "learning_rate": 4.205616244290416e-05,
      "loss": 2.009,
      "step": 1680
    },
    {
      "epoch": 0.060645207593210605,
      "grad_norm": 5.182796478271484,
      "learning_rate": 4.1906275128494296e-05,
      "loss": 2.003,
      "step": 1690
    },
    {
      "epoch": 0.061004054975418956,
      "grad_norm": 5.904313564300537,
      "learning_rate": 4.175579562853945e-05,
      "loss": 2.2,
      "step": 1700
    },
    {
      "epoch": 0.0613629023576273,
      "grad_norm": 6.160686492919922,
      "learning_rate": 4.160473012866242e-05,
      "loss": 2.061,
      "step": 1710
    },
    {
      "epoch": 0.06172174973983565,
      "grad_norm": 7.759803771972656,
      "learning_rate": 4.145308483857426e-05,
      "loss": 2.084,
      "step": 1720
    },
    {
      "epoch": 0.062080597122043994,
      "grad_norm": 5.889579772949219,
      "learning_rate": 4.1300865991818885e-05,
      "loss": 2.1036,
      "step": 1730
    },
    {
      "epoch": 0.062439444504252345,
      "grad_norm": 6.886099815368652,
      "learning_rate": 4.114807984551688e-05,
      "loss": 1.9959,
      "step": 1740
    },
    {
      "epoch": 0.06279829188646069,
      "grad_norm": 6.403357982635498,
      "learning_rate": 4.0994732680108296e-05,
      "loss": 2.2174,
      "step": 1750
    },
    {
      "epoch": 0.06315713926866903,
      "grad_norm": 5.627391338348389,
      "learning_rate": 4.084083079909448e-05,
      "loss": 1.9977,
      "step": 1760
    },
    {
      "epoch": 0.06351598665087738,
      "grad_norm": 6.172947883605957,
      "learning_rate": 4.068638052877899e-05,
      "loss": 2.031,
      "step": 1770
    },
    {
      "epoch": 0.06387483403308573,
      "grad_norm": 6.240361213684082,
      "learning_rate": 4.0531388218007466e-05,
      "loss": 1.892,
      "step": 1780
    },
    {
      "epoch": 0.06423368141529408,
      "grad_norm": 6.1612467765808105,
      "learning_rate": 4.037586023790676e-05,
      "loss": 2.0618,
      "step": 1790
    },
    {
      "epoch": 0.06459252879750242,
      "grad_norm": 5.873776435852051,
      "learning_rate": 4.0219802981622975e-05,
      "loss": 2.0004,
      "step": 1800
    },
    {
      "epoch": 0.06459252879750242,
      "eval_loss": 2.101851463317871,
      "eval_runtime": 55.5537,
      "eval_samples_per_second": 9.0,
      "eval_steps_per_second": 9.0,
      "step": 1800
    },
    {
      "epoch": 0.06495137617971077,
      "grad_norm": 7.73934268951416,
      "learning_rate": 4.006322286405867e-05,
      "loss": 1.9404,
      "step": 1810
    },
    {
      "epoch": 0.06531022356191911,
      "grad_norm": 7.713481903076172,
      "learning_rate": 3.99061263216092e-05,
      "loss": 2.1536,
      "step": 1820
    },
    {
      "epoch": 0.06566907094412747,
      "grad_norm": 5.538480758666992,
      "learning_rate": 3.974851981189813e-05,
      "loss": 2.1028,
      "step": 1830
    },
    {
      "epoch": 0.06602791832633581,
      "grad_norm": 4.73122501373291,
      "learning_rate": 3.9590409813511765e-05,
      "loss": 2.1097,
      "step": 1840
    },
    {
      "epoch": 0.06638676570854415,
      "grad_norm": 6.530364036560059,
      "learning_rate": 3.943180282573285e-05,
      "loss": 1.9667,
      "step": 1850
    },
    {
      "epoch": 0.0667456130907525,
      "grad_norm": 6.589163303375244,
      "learning_rate": 3.927270536827346e-05,
      "loss": 2.0643,
      "step": 1860
    },
    {
      "epoch": 0.06710446047296086,
      "grad_norm": 7.016654968261719,
      "learning_rate": 3.91131239810069e-05,
      "loss": 1.8271,
      "step": 1870
    },
    {
      "epoch": 0.0674633078551692,
      "grad_norm": 6.05830717086792,
      "learning_rate": 3.895306522369898e-05,
      "loss": 2.0217,
      "step": 1880
    },
    {
      "epoch": 0.06782215523737754,
      "grad_norm": 7.59095573425293,
      "learning_rate": 3.87925356757383e-05,
      "loss": 2.1178,
      "step": 1890
    },
    {
      "epoch": 0.06818100261958589,
      "grad_norm": 6.172999382019043,
      "learning_rate": 3.863154193586583e-05,
      "loss": 2.0382,
      "step": 1900
    },
    {
      "epoch": 0.06853985000179423,
      "grad_norm": 5.4832763671875,
      "learning_rate": 3.847009062190365e-05,
      "loss": 2.0855,
      "step": 1910
    },
    {
      "epoch": 0.06889869738400259,
      "grad_norm": 7.419639587402344,
      "learning_rate": 3.83081883704829e-05,
      "loss": 2.0993,
      "step": 1920
    },
    {
      "epoch": 0.06925754476621093,
      "grad_norm": 6.414045333862305,
      "learning_rate": 3.814584183677102e-05,
      "loss": 1.7863,
      "step": 1930
    },
    {
      "epoch": 0.06961639214841928,
      "grad_norm": 6.358382701873779,
      "learning_rate": 3.7983057694198145e-05,
      "loss": 1.9655,
      "step": 1940
    },
    {
      "epoch": 0.06997523953062762,
      "grad_norm": 6.980108261108398,
      "learning_rate": 3.781984263418279e-05,
      "loss": 1.8557,
      "step": 1950
    },
    {
      "epoch": 0.06997523953062762,
      "eval_loss": 2.0883588790893555,
      "eval_runtime": 56.8739,
      "eval_samples_per_second": 8.791,
      "eval_steps_per_second": 8.791,
      "step": 1950
    },
    {
      "epoch": 0.07033408691283598,
      "grad_norm": 6.167099952697754,
      "learning_rate": 3.76562033658568e-05,
      "loss": 2.1926,
      "step": 1960
    },
    {
      "epoch": 0.07069293429504432,
      "grad_norm": 5.376810550689697,
      "learning_rate": 3.749214661578957e-05,
      "loss": 2.1606,
      "step": 1970
    },
    {
      "epoch": 0.07105178167725267,
      "grad_norm": 6.163499355316162,
      "learning_rate": 3.732767912771153e-05,
      "loss": 2.2241,
      "step": 1980
    },
    {
      "epoch": 0.07141062905946101,
      "grad_norm": 5.99036979675293,
      "learning_rate": 3.716280766223693e-05,
      "loss": 2.1552,
      "step": 1990
    },
    {
      "epoch": 0.07176947644166935,
      "grad_norm": 6.162662982940674,
      "learning_rate": 3.699753899658596e-05,
      "loss": 1.9543,
      "step": 2000
    },
    {
      "epoch": 0.07212832382387771,
      "grad_norm": 6.522611618041992,
      "learning_rate": 3.683187992430616e-05,
      "loss": 2.0151,
      "step": 2010
    },
    {
      "epoch": 0.07248717120608605,
      "grad_norm": 7.548040390014648,
      "learning_rate": 3.666583725499315e-05,
      "loss": 1.9932,
      "step": 2020
    },
    {
      "epoch": 0.0728460185882944,
      "grad_norm": 5.687351226806641,
      "learning_rate": 3.6499417814010715e-05,
      "loss": 2.0136,
      "step": 2030
    },
    {
      "epoch": 0.07320486597050274,
      "grad_norm": 5.9614715576171875,
      "learning_rate": 3.6332628442210255e-05,
      "loss": 2.127,
      "step": 2040
    },
    {
      "epoch": 0.07356371335271109,
      "grad_norm": 5.873690128326416,
      "learning_rate": 3.616547599564958e-05,
      "loss": 1.9534,
      "step": 2050
    },
    {
      "epoch": 0.07392256073491944,
      "grad_norm": 4.788761615753174,
      "learning_rate": 3.599796734531105e-05,
      "loss": 2.145,
      "step": 2060
    },
    {
      "epoch": 0.07428140811712779,
      "grad_norm": 6.241523742675781,
      "learning_rate": 3.5830109376819235e-05,
      "loss": 2.1061,
      "step": 2070
    },
    {
      "epoch": 0.07464025549933613,
      "grad_norm": 7.868603229522705,
      "learning_rate": 3.566190899015774e-05,
      "loss": 2.0651,
      "step": 2080
    },
    {
      "epoch": 0.07499910288154447,
      "grad_norm": 6.114261627197266,
      "learning_rate": 3.5493373099385677e-05,
      "loss": 1.905,
      "step": 2090
    },
    {
      "epoch": 0.07535795026375283,
      "grad_norm": 6.0507588386535645,
      "learning_rate": 3.5324508632353394e-05,
      "loss": 1.9759,
      "step": 2100
    },
    {
      "epoch": 0.07535795026375283,
      "eval_loss": 2.0796425342559814,
      "eval_runtime": 55.915,
      "eval_samples_per_second": 8.942,
      "eval_steps_per_second": 8.942,
      "step": 2100
    },
    {
      "epoch": 0.07571679764596118,
      "grad_norm": 6.28771448135376,
      "learning_rate": 3.515532253041774e-05,
      "loss": 1.9569,
      "step": 2110
    },
    {
      "epoch": 0.07607564502816952,
      "grad_norm": 5.322301864624023,
      "learning_rate": 3.498582174815671e-05,
      "loss": 2.061,
      "step": 2120
    },
    {
      "epoch": 0.07643449241037786,
      "grad_norm": 5.690662384033203,
      "learning_rate": 3.481601325308357e-05,
      "loss": 1.7273,
      "step": 2130
    },
    {
      "epoch": 0.07679333979258621,
      "grad_norm": 6.411470890045166,
      "learning_rate": 3.4645904025360455e-05,
      "loss": 1.8976,
      "step": 2140
    },
    {
      "epoch": 0.07715218717479456,
      "grad_norm": 7.68689489364624,
      "learning_rate": 3.447550105751145e-05,
      "loss": 2.3546,
      "step": 2150
    },
    {
      "epoch": 0.07751103455700291,
      "grad_norm": 6.074337005615234,
      "learning_rate": 3.4304811354135145e-05,
      "loss": 2.1717,
      "step": 2160
    },
    {
      "epoch": 0.07786988193921125,
      "grad_norm": 6.254448413848877,
      "learning_rate": 3.4133841931616696e-05,
      "loss": 2.1495,
      "step": 2170
    },
    {
      "epoch": 0.0782287293214196,
      "grad_norm": 6.312126636505127,
      "learning_rate": 3.396259981783942e-05,
      "loss": 1.8669,
      "step": 2180
    },
    {
      "epoch": 0.07858757670362795,
      "grad_norm": 6.91135311126709,
      "learning_rate": 3.37910920518959e-05,
      "loss": 1.9932,
      "step": 2190
    },
    {
      "epoch": 0.0789464240858363,
      "grad_norm": 5.6363935470581055,
      "learning_rate": 3.3619325683798646e-05,
      "loss": 2.0576,
      "step": 2200
    },
    {
      "epoch": 0.07930527146804464,
      "grad_norm": 5.843607425689697,
      "learning_rate": 3.3447307774190296e-05,
      "loss": 1.8834,
      "step": 2210
    },
    {
      "epoch": 0.07966411885025299,
      "grad_norm": 5.396796226501465,
      "learning_rate": 3.327504539405335e-05,
      "loss": 2.0703,
      "step": 2220
    },
    {
      "epoch": 0.08002296623246133,
      "grad_norm": 5.016238212585449,
      "learning_rate": 3.3102545624419583e-05,
      "loss": 2.0865,
      "step": 2230
    },
    {
      "epoch": 0.08038181361466969,
      "grad_norm": 6.115816116333008,
      "learning_rate": 3.292981555607884e-05,
      "loss": 2.2753,
      "step": 2240
    },
    {
      "epoch": 0.08074066099687803,
      "grad_norm": 8.006583213806152,
      "learning_rate": 3.2756862289287746e-05,
      "loss": 2.0162,
      "step": 2250
    },
    {
      "epoch": 0.08074066099687803,
      "eval_loss": 2.0666754245758057,
      "eval_runtime": 55.4275,
      "eval_samples_per_second": 9.021,
      "eval_steps_per_second": 9.021,
      "step": 2250
    },
    {
      "epoch": 0.08109950837908637,
      "grad_norm": 7.408049583435059,
      "learning_rate": 3.258369293347764e-05,
      "loss": 1.822,
      "step": 2260
    },
    {
      "epoch": 0.08145835576129472,
      "grad_norm": 6.273202896118164,
      "learning_rate": 3.241031460696251e-05,
      "loss": 2.0655,
      "step": 2270
    },
    {
      "epoch": 0.08181720314350306,
      "grad_norm": 6.0729756355285645,
      "learning_rate": 3.223673443664627e-05,
      "loss": 1.8968,
      "step": 2280
    },
    {
      "epoch": 0.08217605052571142,
      "grad_norm": 5.854898452758789,
      "learning_rate": 3.206295955772987e-05,
      "loss": 2.0447,
      "step": 2290
    },
    {
      "epoch": 0.08253489790791976,
      "grad_norm": 6.057408809661865,
      "learning_rate": 3.188899711341793e-05,
      "loss": 1.9488,
      "step": 2300
    },
    {
      "epoch": 0.0828937452901281,
      "grad_norm": 7.64241886138916,
      "learning_rate": 3.171485425462518e-05,
      "loss": 2.0667,
      "step": 2310
    },
    {
      "epoch": 0.08325259267233645,
      "grad_norm": 5.485935688018799,
      "learning_rate": 3.15405381396825e-05,
      "loss": 1.9668,
      "step": 2320
    },
    {
      "epoch": 0.08361144005454481,
      "grad_norm": 6.850480556488037,
      "learning_rate": 3.136605593404258e-05,
      "loss": 2.0711,
      "step": 2330
    },
    {
      "epoch": 0.08397028743675315,
      "grad_norm": 5.64699649810791,
      "learning_rate": 3.119141480998553e-05,
      "loss": 2.0573,
      "step": 2340
    },
    {
      "epoch": 0.0843291348189615,
      "grad_norm": 7.39286994934082,
      "learning_rate": 3.101662194632392e-05,
      "loss": 1.8136,
      "step": 2350
    },
    {
      "epoch": 0.08468798220116984,
      "grad_norm": 6.044868469238281,
      "learning_rate": 3.0841684528107766e-05,
      "loss": 1.6417,
      "step": 2360
    },
    {
      "epoch": 0.08504682958337818,
      "grad_norm": 5.207494258880615,
      "learning_rate": 3.066660974632914e-05,
      "loss": 1.8696,
      "step": 2370
    },
    {
      "epoch": 0.08540567696558654,
      "grad_norm": 4.362065315246582,
      "learning_rate": 3.0491404797626605e-05,
      "loss": 1.8506,
      "step": 2380
    },
    {
      "epoch": 0.08576452434779488,
      "grad_norm": 7.511015892028809,
      "learning_rate": 3.031607688398936e-05,
      "loss": 2.084,
      "step": 2390
    },
    {
      "epoch": 0.08612337173000323,
      "grad_norm": 6.082335472106934,
      "learning_rate": 3.0140633212461248e-05,
      "loss": 1.8544,
      "step": 2400
    },
    {
      "epoch": 0.08612337173000323,
      "eval_loss": 2.056474208831787,
      "eval_runtime": 55.8391,
      "eval_samples_per_second": 8.954,
      "eval_steps_per_second": 8.954,
      "step": 2400
    },
    {
      "epoch": 0.08648221911221157,
      "grad_norm": 6.311079502105713,
      "learning_rate": 2.9965080994844422e-05,
      "loss": 2.0455,
      "step": 2410
    },
    {
      "epoch": 0.08684106649441993,
      "grad_norm": 5.970705986022949,
      "learning_rate": 2.978942744740296e-05,
      "loss": 1.9218,
      "step": 2420
    },
    {
      "epoch": 0.08719991387662827,
      "grad_norm": 6.782005786895752,
      "learning_rate": 2.961367979056621e-05,
      "loss": 2.0712,
      "step": 2430
    },
    {
      "epoch": 0.08755876125883662,
      "grad_norm": 6.76627254486084,
      "learning_rate": 2.9437845248631984e-05,
      "loss": 1.984,
      "step": 2440
    },
    {
      "epoch": 0.08791760864104496,
      "grad_norm": 6.093632698059082,
      "learning_rate": 2.926193104946961e-05,
      "loss": 2.0155,
      "step": 2450
    },
    {
      "epoch": 0.0882764560232533,
      "grad_norm": 4.814499378204346,
      "learning_rate": 2.90859444242228e-05,
      "loss": 2.0989,
      "step": 2460
    },
    {
      "epoch": 0.08863530340546166,
      "grad_norm": 7.10281229019165,
      "learning_rate": 2.8909892607012427e-05,
      "loss": 1.9328,
      "step": 2470
    },
    {
      "epoch": 0.08899415078767,
      "grad_norm": 8.456049919128418,
      "learning_rate": 2.8733782834639165e-05,
      "loss": 1.8714,
      "step": 2480
    },
    {
      "epoch": 0.08935299816987835,
      "grad_norm": 5.278759479522705,
      "learning_rate": 2.8557622346285957e-05,
      "loss": 1.9494,
      "step": 2490
    },
    {
      "epoch": 0.0897118455520867,
      "grad_norm": 5.60235071182251,
      "learning_rate": 2.8381418383220526e-05,
      "loss": 2.1887,
      "step": 2500
    },
    {
      "epoch": 0.09007069293429504,
      "grad_norm": 5.99770975112915,
      "learning_rate": 2.8205178188497627e-05,
      "loss": 2.0496,
      "step": 2510
    },
    {
      "epoch": 0.0904295403165034,
      "grad_norm": 6.4976396560668945,
      "learning_rate": 2.8028909006661396e-05,
      "loss": 2.0247,
      "step": 2520
    },
    {
      "epoch": 0.09078838769871174,
      "grad_norm": 6.767058849334717,
      "learning_rate": 2.78526180834475e-05,
      "loss": 2.051,
      "step": 2530
    },
    {
      "epoch": 0.09114723508092008,
      "grad_norm": 7.581692218780518,
      "learning_rate": 2.7676312665485307e-05,
      "loss": 2.0146,
      "step": 2540
    },
    {
      "epoch": 0.09150608246312843,
      "grad_norm": 5.49759578704834,
      "learning_rate": 2.75e-05,
      "loss": 2.164,
      "step": 2550
    },
    {
      "epoch": 0.09150608246312843,
      "eval_loss": 2.0493671894073486,
      "eval_runtime": 55.2033,
      "eval_samples_per_second": 9.057,
      "eval_steps_per_second": 9.057,
      "step": 2550
    },
    {
      "epoch": 0.09186492984533678,
      "grad_norm": 6.024051189422607,
      "learning_rate": 2.7323687334514695e-05,
      "loss": 2.0757,
      "step": 2560
    },
    {
      "epoch": 0.09222377722754513,
      "grad_norm": 6.39860725402832,
      "learning_rate": 2.71473819165525e-05,
      "loss": 1.979,
      "step": 2570
    },
    {
      "epoch": 0.09258262460975347,
      "grad_norm": 6.058781147003174,
      "learning_rate": 2.6971090993338606e-05,
      "loss": 2.2082,
      "step": 2580
    },
    {
      "epoch": 0.09294147199196182,
      "grad_norm": 5.549355506896973,
      "learning_rate": 2.679482181150238e-05,
      "loss": 2.1154,
      "step": 2590
    },
    {
      "epoch": 0.09330031937417016,
      "grad_norm": 5.480435371398926,
      "learning_rate": 2.6618581616779483e-05,
      "loss": 2.0381,
      "step": 2600
    },
    {
      "epoch": 0.09365916675637852,
      "grad_norm": 4.98717737197876,
      "learning_rate": 2.644237765371404e-05,
      "loss": 1.9317,
      "step": 2610
    },
    {
      "epoch": 0.09401801413858686,
      "grad_norm": 7.349699020385742,
      "learning_rate": 2.626621716536085e-05,
      "loss": 2.135,
      "step": 2620
    },
    {
      "epoch": 0.0943768615207952,
      "grad_norm": 6.156417369842529,
      "learning_rate": 2.6090107392987575e-05,
      "loss": 1.8111,
      "step": 2630
    },
    {
      "epoch": 0.09473570890300355,
      "grad_norm": 5.92335844039917,
      "learning_rate": 2.591405557577721e-05,
      "loss": 2.0825,
      "step": 2640
    },
    {
      "epoch": 0.09509455628521189,
      "grad_norm": 6.328557014465332,
      "learning_rate": 2.5738068950530398e-05,
      "loss": 2.0139,
      "step": 2650
    },
    {
      "epoch": 0.09545340366742025,
      "grad_norm": 5.462120056152344,
      "learning_rate": 2.5562154751368014e-05,
      "loss": 2.0133,
      "step": 2660
    },
    {
      "epoch": 0.0958122510496286,
      "grad_norm": 5.9423933029174805,
      "learning_rate": 2.5386320209433798e-05,
      "loss": 2.0737,
      "step": 2670
    },
    {
      "epoch": 0.09617109843183694,
      "grad_norm": 5.581554889678955,
      "learning_rate": 2.5210572552597046e-05,
      "loss": 2.1384,
      "step": 2680
    },
    {
      "epoch": 0.09652994581404528,
      "grad_norm": 5.9095048904418945,
      "learning_rate": 2.5034919005155583e-05,
      "loss": 1.6144,
      "step": 2690
    },
    {
      "epoch": 0.09688879319625364,
      "grad_norm": 4.735531330108643,
      "learning_rate": 2.4859366787538754e-05,
      "loss": 1.8778,
      "step": 2700
    },
    {
      "epoch": 0.09688879319625364,
      "eval_loss": 2.0385708808898926,
      "eval_runtime": 55.0586,
      "eval_samples_per_second": 9.081,
      "eval_steps_per_second": 9.081,
      "step": 2700
    },
    {
      "epoch": 0.09724764057846198,
      "grad_norm": 8.026204109191895,
      "learning_rate": 2.468392311601064e-05,
      "loss": 1.7923,
      "step": 2710
    },
    {
      "epoch": 0.09760648796067033,
      "grad_norm": 4.735470771789551,
      "learning_rate": 2.4508595202373404e-05,
      "loss": 1.9045,
      "step": 2720
    },
    {
      "epoch": 0.09796533534287867,
      "grad_norm": 5.316372871398926,
      "learning_rate": 2.433339025367087e-05,
      "loss": 1.8494,
      "step": 2730
    },
    {
      "epoch": 0.09832418272508701,
      "grad_norm": 6.198282241821289,
      "learning_rate": 2.415831547189224e-05,
      "loss": 1.8475,
      "step": 2740
    },
    {
      "epoch": 0.09868303010729537,
      "grad_norm": 6.189024448394775,
      "learning_rate": 2.3983378053676083e-05,
      "loss": 1.8453,
      "step": 2750
    },
    {
      "epoch": 0.09904187748950372,
      "grad_norm": 6.285246849060059,
      "learning_rate": 2.3808585190014484e-05,
      "loss": 2.0555,
      "step": 2760
    },
    {
      "epoch": 0.09940072487171206,
      "grad_norm": 5.848416328430176,
      "learning_rate": 2.3633944065957427e-05,
      "loss": 1.8966,
      "step": 2770
    },
    {
      "epoch": 0.0997595722539204,
      "grad_norm": 4.883105278015137,
      "learning_rate": 2.345946186031751e-05,
      "loss": 1.9917,
      "step": 2780
    },
    {
      "epoch": 0.10011841963612876,
      "grad_norm": 7.421256065368652,
      "learning_rate": 2.328514574537481e-05,
      "loss": 2.087,
      "step": 2790
    },
    {
      "epoch": 0.1004772670183371,
      "grad_norm": 7.284880638122559,
      "learning_rate": 2.311100288658208e-05,
      "loss": 2.2515,
      "step": 2800
    },
    {
      "epoch": 0.10083611440054545,
      "grad_norm": 6.9479241371154785,
      "learning_rate": 2.2937040442270142e-05,
      "loss": 2.1164,
      "step": 2810
    },
    {
      "epoch": 0.10119496178275379,
      "grad_norm": 5.4754838943481445,
      "learning_rate": 2.2763265563353733e-05,
      "loss": 2.0222,
      "step": 2820
    },
    {
      "epoch": 0.10155380916496214,
      "grad_norm": 5.791376113891602,
      "learning_rate": 2.2589685393037495e-05,
      "loss": 1.8716,
      "step": 2830
    },
    {
      "epoch": 0.1019126565471705,
      "grad_norm": 6.261075496673584,
      "learning_rate": 2.241630706652236e-05,
      "loss": 1.8274,
      "step": 2840
    },
    {
      "epoch": 0.10227150392937884,
      "grad_norm": 6.414833068847656,
      "learning_rate": 2.2243137710712266e-05,
      "loss": 2.1204,
      "step": 2850
    },
    {
      "epoch": 0.10227150392937884,
      "eval_loss": 2.02933406829834,
      "eval_runtime": 56.1943,
      "eval_samples_per_second": 8.898,
      "eval_steps_per_second": 8.898,
      "step": 2850
    },
    {
      "epoch": 0.10263035131158718,
      "grad_norm": 4.790386199951172,
      "learning_rate": 2.2070184443921156e-05,
      "loss": 1.7387,
      "step": 2860
    },
    {
      "epoch": 0.10298919869379553,
      "grad_norm": 5.4755964279174805,
      "learning_rate": 2.1897454375580425e-05,
      "loss": 2.0086,
      "step": 2870
    },
    {
      "epoch": 0.10334804607600387,
      "grad_norm": 7.731913089752197,
      "learning_rate": 2.1724954605946642e-05,
      "loss": 1.927,
      "step": 2880
    },
    {
      "epoch": 0.10370689345821223,
      "grad_norm": 5.966332912445068,
      "learning_rate": 2.1552692225809706e-05,
      "loss": 1.8911,
      "step": 2890
    },
    {
      "epoch": 0.10406574084042057,
      "grad_norm": 5.681090354919434,
      "learning_rate": 2.1380674316201356e-05,
      "loss": 2.1041,
      "step": 2900
    },
    {
      "epoch": 0.10442458822262891,
      "grad_norm": 5.348470687866211,
      "learning_rate": 2.1208907948104105e-05,
      "loss": 2.0891,
      "step": 2910
    },
    {
      "epoch": 0.10478343560483726,
      "grad_norm": 5.933499336242676,
      "learning_rate": 2.1037400182160584e-05,
      "loss": 2.1604,
      "step": 2920
    },
    {
      "epoch": 0.10514228298704562,
      "grad_norm": 6.975137233734131,
      "learning_rate": 2.0866158068383306e-05,
      "loss": 1.9219,
      "step": 2930
    },
    {
      "epoch": 0.10550113036925396,
      "grad_norm": 5.7571702003479,
      "learning_rate": 2.069518864586486e-05,
      "loss": 2.1009,
      "step": 2940
    },
    {
      "epoch": 0.1058599777514623,
      "grad_norm": 6.262353897094727,
      "learning_rate": 2.052449894248855e-05,
      "loss": 1.9981,
      "step": 2950
    },
    {
      "epoch": 0.10621882513367065,
      "grad_norm": 6.145860195159912,
      "learning_rate": 2.035409597463955e-05,
      "loss": 1.9854,
      "step": 2960
    },
    {
      "epoch": 0.10657767251587899,
      "grad_norm": 8.0071382522583,
      "learning_rate": 2.0183986746916438e-05,
      "loss": 2.0659,
      "step": 2970
    },
    {
      "epoch": 0.10693651989808735,
      "grad_norm": 6.31511926651001,
      "learning_rate": 2.0014178251843294e-05,
      "loss": 2.0654,
      "step": 2980
    },
    {
      "epoch": 0.10729536728029569,
      "grad_norm": 4.879230976104736,
      "learning_rate": 1.9844677469582266e-05,
      "loss": 2.2041,
      "step": 2990
    },
    {
      "epoch": 0.10765421466250404,
      "grad_norm": 7.417413234710693,
      "learning_rate": 1.967549136764661e-05,
      "loss": 1.7006,
      "step": 3000
    },
    {
      "epoch": 0.10765421466250404,
      "eval_loss": 2.021873712539673,
      "eval_runtime": 56.6447,
      "eval_samples_per_second": 8.827,
      "eval_steps_per_second": 8.827,
      "step": 3000
    },
    {
      "epoch": 0.10801306204471238,
      "grad_norm": 6.9786696434021,
      "learning_rate": 1.950662690061433e-05,
      "loss": 1.8899,
      "step": 3010
    },
    {
      "epoch": 0.10837190942692074,
      "grad_norm": 6.570639133453369,
      "learning_rate": 1.9338091009842258e-05,
      "loss": 1.9747,
      "step": 3020
    },
    {
      "epoch": 0.10873075680912908,
      "grad_norm": 6.848568439483643,
      "learning_rate": 1.916989062318077e-05,
      "loss": 1.8844,
      "step": 3030
    },
    {
      "epoch": 0.10908960419133742,
      "grad_norm": 5.3527631759643555,
      "learning_rate": 1.900203265468895e-05,
      "loss": 2.0005,
      "step": 3040
    },
    {
      "epoch": 0.10944845157354577,
      "grad_norm": 6.555069446563721,
      "learning_rate": 1.8834524004350432e-05,
      "loss": 2.0391,
      "step": 3050
    },
    {
      "epoch": 0.10980729895575411,
      "grad_norm": 4.19492244720459,
      "learning_rate": 1.8667371557789747e-05,
      "loss": 2.0223,
      "step": 3060
    },
    {
      "epoch": 0.11016614633796247,
      "grad_norm": 7.25844144821167,
      "learning_rate": 1.8500582185989287e-05,
      "loss": 1.7081,
      "step": 3070
    },
    {
      "epoch": 0.11052499372017081,
      "grad_norm": 5.693355560302734,
      "learning_rate": 1.8334162745006857e-05,
      "loss": 1.8408,
      "step": 3080
    },
    {
      "epoch": 0.11088384110237916,
      "grad_norm": 5.997495174407959,
      "learning_rate": 1.8168120075693843e-05,
      "loss": 1.9224,
      "step": 3090
    },
    {
      "epoch": 0.1112426884845875,
      "grad_norm": 5.106387615203857,
      "learning_rate": 1.8002461003414043e-05,
      "loss": 2.3499,
      "step": 3100
    },
    {
      "epoch": 0.11160153586679585,
      "grad_norm": 6.284130573272705,
      "learning_rate": 1.7837192337763072e-05,
      "loss": 2.0203,
      "step": 3110
    },
    {
      "epoch": 0.1119603832490042,
      "grad_norm": 6.099617004394531,
      "learning_rate": 1.7672320872288483e-05,
      "loss": 2.0099,
      "step": 3120
    },
    {
      "epoch": 0.11231923063121255,
      "grad_norm": 6.717756748199463,
      "learning_rate": 1.750785338421044e-05,
      "loss": 2.1629,
      "step": 3130
    },
    {
      "epoch": 0.11267807801342089,
      "grad_norm": 7.799689769744873,
      "learning_rate": 1.7343796634143204e-05,
      "loss": 2.1267,
      "step": 3140
    },
    {
      "epoch": 0.11303692539562923,
      "grad_norm": 7.066042900085449,
      "learning_rate": 1.7180157365817214e-05,
      "loss": 2.1079,
      "step": 3150
    },
    {
      "epoch": 0.11303692539562923,
      "eval_loss": 2.0140340328216553,
      "eval_runtime": 54.1066,
      "eval_samples_per_second": 9.241,
      "eval_steps_per_second": 9.241,
      "step": 3150
    },
    {
      "epoch": 0.11339577277783759,
      "grad_norm": 5.944588661193848,
      "learning_rate": 1.7016942305801853e-05,
      "loss": 2.0258,
      "step": 3160
    },
    {
      "epoch": 0.11375462016004594,
      "grad_norm": 6.430848598480225,
      "learning_rate": 1.6854158163228982e-05,
      "loss": 2.0832,
      "step": 3170
    },
    {
      "epoch": 0.11411346754225428,
      "grad_norm": 5.41016149520874,
      "learning_rate": 1.6691811629517104e-05,
      "loss": 1.7936,
      "step": 3180
    },
    {
      "epoch": 0.11447231492446262,
      "grad_norm": 5.203214168548584,
      "learning_rate": 1.6529909378096355e-05,
      "loss": 2.0471,
      "step": 3190
    },
    {
      "epoch": 0.11483116230667097,
      "grad_norm": 7.248607635498047,
      "learning_rate": 1.636845806413417e-05,
      "loss": 2.0012,
      "step": 3200
    },
    {
      "epoch": 0.11519000968887932,
      "grad_norm": 7.401021480560303,
      "learning_rate": 1.6207464324261707e-05,
      "loss": 2.0099,
      "step": 3210
    },
    {
      "epoch": 0.11554885707108767,
      "grad_norm": 6.161318302154541,
      "learning_rate": 1.6046934776301034e-05,
      "loss": 1.9928,
      "step": 3220
    },
    {
      "epoch": 0.11590770445329601,
      "grad_norm": 6.958371162414551,
      "learning_rate": 1.588687601899311e-05,
      "loss": 1.8316,
      "step": 3230
    },
    {
      "epoch": 0.11626655183550436,
      "grad_norm": 6.18934440612793,
      "learning_rate": 1.5727294631726555e-05,
      "loss": 1.987,
      "step": 3240
    },
    {
      "epoch": 0.11662539921771271,
      "grad_norm": 6.302839756011963,
      "learning_rate": 1.5568197174267155e-05,
      "loss": 1.9393,
      "step": 3250
    },
    {
      "epoch": 0.11698424659992106,
      "grad_norm": 6.34058952331543,
      "learning_rate": 1.5409590186488247e-05,
      "loss": 1.7194,
      "step": 3260
    },
    {
      "epoch": 0.1173430939821294,
      "grad_norm": 5.862635612487793,
      "learning_rate": 1.5251480188101872e-05,
      "loss": 1.7798,
      "step": 3270
    },
    {
      "epoch": 0.11770194136433774,
      "grad_norm": 6.237195014953613,
      "learning_rate": 1.5093873678390796e-05,
      "loss": 2.0145,
      "step": 3280
    },
    {
      "epoch": 0.11806078874654609,
      "grad_norm": 7.536279678344727,
      "learning_rate": 1.4936777135941329e-05,
      "loss": 1.927,
      "step": 3290
    },
    {
      "epoch": 0.11841963612875445,
      "grad_norm": 4.670105457305908,
      "learning_rate": 1.4780197018377037e-05,
      "loss": 1.9629,
      "step": 3300
    },
    {
      "epoch": 0.11841963612875445,
      "eval_loss": 2.0052058696746826,
      "eval_runtime": 54.4162,
      "eval_samples_per_second": 9.188,
      "eval_steps_per_second": 9.188,
      "step": 3300
    },
    {
      "epoch": 0.11877848351096279,
      "grad_norm": 5.815291881561279,
      "learning_rate": 1.4624139762093247e-05,
      "loss": 1.902,
      "step": 3310
    },
    {
      "epoch": 0.11913733089317113,
      "grad_norm": 9.564926147460938,
      "learning_rate": 1.4468611781992537e-05,
      "loss": 1.7636,
      "step": 3320
    },
    {
      "epoch": 0.11949617827537948,
      "grad_norm": 6.171423435211182,
      "learning_rate": 1.4313619471221022e-05,
      "loss": 2.1226,
      "step": 3330
    },
    {
      "epoch": 0.11985502565758782,
      "grad_norm": 7.247731685638428,
      "learning_rate": 1.4159169200905515e-05,
      "loss": 1.9989,
      "step": 3340
    },
    {
      "epoch": 0.12021387303979618,
      "grad_norm": 6.01906156539917,
      "learning_rate": 1.4005267319891719e-05,
      "loss": 1.8619,
      "step": 3350
    },
    {
      "epoch": 0.12057272042200452,
      "grad_norm": 5.43905782699585,
      "learning_rate": 1.3851920154483133e-05,
      "loss": 2.022,
      "step": 3360
    },
    {
      "epoch": 0.12093156780421287,
      "grad_norm": 5.442080020904541,
      "learning_rate": 1.3699134008181126e-05,
      "loss": 1.8867,
      "step": 3370
    },
    {
      "epoch": 0.12129041518642121,
      "grad_norm": 6.583103179931641,
      "learning_rate": 1.3546915161425745e-05,
      "loss": 1.6851,
      "step": 3380
    },
    {
      "epoch": 0.12164926256862957,
      "grad_norm": 5.341058731079102,
      "learning_rate": 1.3395269871337586e-05,
      "loss": 1.8617,
      "step": 3390
    },
{ |
|
"epoch": 0.12200810995083791, |
|
"grad_norm": 6.2624993324279785, |
|
"learning_rate": 1.3244204371460562e-05, |
|
"loss": 1.8916, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.12236695733304626, |
|
"grad_norm": 5.550163745880127, |
|
"learning_rate": 1.3093724871505698e-05, |
|
"loss": 1.8684, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.1227258047152546, |
|
"grad_norm": 6.564154624938965, |
|
"learning_rate": 1.2943837557095845e-05, |
|
"loss": 2.0437, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.12308465209746294, |
|
"grad_norm": 5.661564826965332, |
|
"learning_rate": 1.2794548589511433e-05, |
|
"loss": 1.8696, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.1234434994796713, |
|
"grad_norm": 5.914830207824707, |
|
"learning_rate": 1.2645864105437201e-05, |
|
"loss": 1.8871, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.12380234686187964, |
|
"grad_norm": 5.326396942138672, |
|
"learning_rate": 1.2497790216709914e-05, |
|
"loss": 1.8341, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.12380234686187964, |
|
"eval_loss": 2.000833034515381, |
|
"eval_runtime": 54.9976, |
|
"eval_samples_per_second": 9.091, |
|
"eval_steps_per_second": 9.091, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.12416119424408799, |
|
"grad_norm": 6.134740829467773, |
|
"learning_rate": 1.2350333010067184e-05, |
|
"loss": 1.7077, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.12452004162629633, |
|
"grad_norm": 5.5340576171875, |
|
"learning_rate": 1.2203498546897221e-05, |
|
"loss": 1.9318, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.12487888900850469, |
|
"grad_norm": 6.9045305252075195, |
|
"learning_rate": 1.2057292862989693e-05, |
|
"loss": 2.0016, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.12523773639071303, |
|
"grad_norm": 7.031048774719238, |
|
"learning_rate": 1.1911721968287635e-05, |
|
"loss": 1.875, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.12559658377292138, |
|
"grad_norm": 5.962172508239746, |
|
"learning_rate": 1.176679184664034e-05, |
|
"loss": 1.9473, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.12595543115512972, |
|
"grad_norm": 5.502182483673096, |
|
"learning_rate": 1.1622508455557471e-05, |
|
"loss": 1.9898, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.12631427853733806, |
|
"grad_norm": 4.957609176635742, |
|
"learning_rate": 1.1478877725964109e-05, |
|
"loss": 1.9896, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.1266731259195464, |
|
"grad_norm": 5.988749980926514, |
|
"learning_rate": 1.1335905561956992e-05, |
|
"loss": 1.8328, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.12703197330175475, |
|
"grad_norm": 7.231197357177734, |
|
"learning_rate": 1.1193597840561793e-05, |
|
"loss": 1.8092, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.12739082068396312, |
|
"grad_norm": 5.365965366363525, |
|
"learning_rate": 1.1051960411491561e-05, |
|
"loss": 1.6869, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.12774966806617147, |
|
"grad_norm": 5.000406265258789, |
|
"learning_rate": 1.0910999096906248e-05, |
|
"loss": 1.7699, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.1281085154483798, |
|
"grad_norm": 5.213188648223877, |
|
"learning_rate": 1.0770719691173388e-05, |
|
"loss": 2.0603, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.12846736283058816, |
|
"grad_norm": 6.383294582366943, |
|
"learning_rate": 1.0631127960629924e-05, |
|
"loss": 1.9855, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.1288262102127965, |
|
"grad_norm": 5.159536838531494, |
|
"learning_rate": 1.0492229643345136e-05, |
|
"loss": 1.9706, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.12918505759500484, |
|
"grad_norm": 6.980235576629639, |
|
"learning_rate": 1.0354030448884829e-05, |
|
"loss": 2.0075, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.12918505759500484, |
|
"eval_loss": 1.9919238090515137, |
|
"eval_runtime": 56.2773, |
|
"eval_samples_per_second": 8.885, |
|
"eval_steps_per_second": 8.885, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.1295439049772132, |
|
"grad_norm": 5.887603759765625, |
|
"learning_rate": 1.02165360580766e-05, |
|
"loss": 1.9446, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.12990275235942153, |
|
"grad_norm": 5.93571662902832, |
|
"learning_rate": 1.0079752122776338e-05, |
|
"loss": 1.9175, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.13026159974162987, |
|
"grad_norm": 5.173825740814209, |
|
"learning_rate": 9.94368426563585e-06, |
|
"loss": 1.7861, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.13062044712383822, |
|
"grad_norm": 5.444214344024658, |
|
"learning_rate": 9.80833807987182e-06, |
|
"loss": 1.9604, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.1309792945060466, |
|
"grad_norm": 6.3942718505859375, |
|
"learning_rate": 9.673719129035826e-06, |
|
"loss": 1.8913, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.13133814188825493, |
|
"grad_norm": 6.097336769104004, |
|
"learning_rate": 9.53983294678566e-06, |
|
"loss": 1.9104, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.13169698927046328, |
|
"grad_norm": 6.532045364379883, |
|
"learning_rate": 9.406685036657904e-06, |
|
"loss": 2.192, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.13205583665267162, |
|
"grad_norm": 5.635133743286133, |
|
"learning_rate": 9.27428087184162e-06, |
|
"loss": 1.9662, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.13241468403487996, |
|
"grad_norm": 5.020458698272705, |
|
"learning_rate": 9.142625894953431e-06, |
|
"loss": 1.8653, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.1327735314170883, |
|
"grad_norm": 6.398831367492676, |
|
"learning_rate": 9.011725517813786e-06, |
|
"loss": 1.9715, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.13313237879929665, |
|
"grad_norm": 5.7541890144348145, |
|
"learning_rate": 8.881585121224496e-06, |
|
"loss": 1.7682, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.133491226181505, |
|
"grad_norm": 6.622452259063721, |
|
"learning_rate": 8.752210054747517e-06, |
|
"loss": 1.9944, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.13385007356371334, |
|
"grad_norm": 5.588860034942627, |
|
"learning_rate": 8.623605636485119e-06, |
|
"loss": 2.1929, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.1342089209459217, |
|
"grad_norm": 5.577978134155273, |
|
"learning_rate": 8.495777152861222e-06, |
|
"loss": 1.9228, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.13456776832813006, |
|
"grad_norm": 5.694477081298828, |
|
"learning_rate": 8.368729858404125e-06, |
|
"loss": 1.9947, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.13456776832813006, |
|
"eval_loss": 1.9859551191329956, |
|
"eval_runtime": 55.4111, |
|
"eval_samples_per_second": 9.023, |
|
"eval_steps_per_second": 9.023, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.1349266157103384, |
|
"grad_norm": 7.570437431335449, |
|
"learning_rate": 8.242468975530497e-06, |
|
"loss": 1.8791, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.13528546309254674, |
|
"grad_norm": 7.051590442657471, |
|
"learning_rate": 8.116999694330684e-06, |
|
"loss": 1.9133, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.1356443104747551, |
|
"grad_norm": 8.620963096618652, |
|
"learning_rate": 7.99232717235541e-06, |
|
"loss": 1.9702, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.13600315785696343, |
|
"grad_norm": 5.9947686195373535, |
|
"learning_rate": 7.86845653440376e-06, |
|
"loss": 1.9478, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.13636200523917177, |
|
"grad_norm": 7.256514072418213, |
|
"learning_rate": 7.745392872312495e-06, |
|
"loss": 1.9925, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.13672085262138012, |
|
"grad_norm": 6.057373523712158, |
|
"learning_rate": 7.623141244746736e-06, |
|
"loss": 1.9152, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.13707970000358846, |
|
"grad_norm": 6.098325729370117, |
|
"learning_rate": 7.5017066769920735e-06, |
|
"loss": 1.7821, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.13743854738579683, |
|
"grad_norm": 6.742500305175781, |
|
"learning_rate": 7.381094160747963e-06, |
|
"loss": 2.0775, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.13779739476800518, |
|
"grad_norm": 6.884401798248291, |
|
"learning_rate": 7.261308653922539e-06, |
|
"loss": 2.111, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.13815624215021352, |
|
"grad_norm": 5.880162239074707, |
|
"learning_rate": 7.1423550804288275e-06, |
|
"loss": 1.9414, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.13851508953242186, |
|
"grad_norm": 5.843265056610107, |
|
"learning_rate": 7.024238329982311e-06, |
|
"loss": 1.982, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.1388739369146302, |
|
"grad_norm": 6.20538854598999, |
|
"learning_rate": 6.906963257899975e-06, |
|
"loss": 2.0604, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.13923278429683855, |
|
"grad_norm": 5.832340240478516, |
|
"learning_rate": 6.7905346849007014e-06, |
|
"loss": 1.86, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.1395916316790469, |
|
"grad_norm": 5.763581275939941, |
|
"learning_rate": 6.674957396907109e-06, |
|
"loss": 1.8716, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.13995047906125524, |
|
"grad_norm": 5.205382823944092, |
|
"learning_rate": 6.560236144848803e-06, |
|
"loss": 1.9625, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.13995047906125524, |
|
"eval_loss": 1.9828507900238037, |
|
"eval_runtime": 54.4769, |
|
"eval_samples_per_second": 9.178, |
|
"eval_steps_per_second": 9.178, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.14030932644346358, |
|
"grad_norm": 6.159921169281006, |
|
"learning_rate": 6.4463756444671446e-06, |
|
"loss": 2.0608, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.14066817382567195, |
|
"grad_norm": 7.304522514343262, |
|
"learning_rate": 6.333380576121334e-06, |
|
"loss": 2.1806, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.1410270212078803, |
|
"grad_norm": 6.259190559387207, |
|
"learning_rate": 6.221255584596061e-06, |
|
"loss": 1.9558, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.14138586859008864, |
|
"grad_norm": 5.298434734344482, |
|
"learning_rate": 6.110005278910572e-06, |
|
"loss": 1.9614, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.141744715972297, |
|
"grad_norm": 5.2134528160095215, |
|
"learning_rate": 5.999634232129181e-06, |
|
"loss": 1.7626, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.14210356335450533, |
|
"grad_norm": 6.282635688781738, |
|
"learning_rate": 5.890146981173336e-06, |
|
"loss": 1.801, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.14246241073671367, |
|
"grad_norm": 5.433168888092041, |
|
"learning_rate": 5.781548026635087e-06, |
|
"loss": 2.0156, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.14282125811892202, |
|
"grad_norm": 7.175068378448486, |
|
"learning_rate": 5.673841832592114e-06, |
|
"loss": 2.1676, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.14318010550113036, |
|
"grad_norm": 5.729061603546143, |
|
"learning_rate": 5.56703282642418e-06, |
|
"loss": 2.0456, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.1435389528833387, |
|
"grad_norm": 5.6335883140563965, |
|
"learning_rate": 5.461125398631196e-06, |
|
"loss": 1.8919, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.14389780026554708, |
|
"grad_norm": 6.064164161682129, |
|
"learning_rate": 5.356123902652707e-06, |
|
"loss": 1.9008, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.14425664764775542, |
|
"grad_norm": 5.617476463317871, |
|
"learning_rate": 5.252032654688949e-06, |
|
"loss": 1.7857, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.14461549502996376, |
|
"grad_norm": 5.734108924865723, |
|
"learning_rate": 5.148855933523428e-06, |
|
"loss": 1.8976, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.1449743424121721, |
|
"grad_norm": 5.556881427764893, |
|
"learning_rate": 5.046597980347035e-06, |
|
"loss": 2.1003, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.14533318979438045, |
|
"grad_norm": 6.115296840667725, |
|
"learning_rate": 4.945262998583711e-06, |
|
"loss": 1.9994, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.14533318979438045, |
|
"eval_loss": 1.977493405342102, |
|
"eval_runtime": 53.8847, |
|
"eval_samples_per_second": 9.279, |
|
"eval_steps_per_second": 9.279, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.1456920371765888, |
|
"grad_norm": 5.492610454559326, |
|
"learning_rate": 4.844855153717654e-06, |
|
"loss": 1.785, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.14605088455879714, |
|
"grad_norm": 6.202512264251709, |
|
"learning_rate": 4.745378573122101e-06, |
|
"loss": 1.9527, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.14640973194100548, |
|
"grad_norm": 5.397292137145996, |
|
"learning_rate": 4.646837345889642e-06, |
|
"loss": 1.8806, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.14676857932321383, |
|
"grad_norm": 4.79435396194458, |
|
"learning_rate": 4.5492355226641775e-06, |
|
"loss": 1.6986, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.14712742670542217, |
|
"grad_norm": 5.705773830413818, |
|
"learning_rate": 4.452577115474384e-06, |
|
"loss": 1.9535, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.14748627408763054, |
|
"grad_norm": 6.980686187744141, |
|
"learning_rate": 4.3568660975687884e-06, |
|
"loss": 2.0289, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.14784512146983889, |
|
"grad_norm": 5.97441291809082, |
|
"learning_rate": 4.262106403252474e-06, |
|
"loss": 1.8012, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.14820396885204723, |
|
"grad_norm": 6.008128643035889, |
|
"learning_rate": 4.168301927725312e-06, |
|
"loss": 1.7863, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.14856281623425557, |
|
"grad_norm": 6.05222749710083, |
|
"learning_rate": 4.075456526921887e-06, |
|
"loss": 1.6858, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.14892166361646392, |
|
"grad_norm": 4.7681803703308105, |
|
"learning_rate": 3.983574017352983e-06, |
|
"loss": 1.8194, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.14928051099867226, |
|
"grad_norm": 6.2553887367248535, |
|
"learning_rate": 3.8926581759486824e-06, |
|
"loss": 1.9261, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.1496393583808806, |
|
"grad_norm": 6.245451927185059, |
|
"learning_rate": 3.8027127399031364e-06, |
|
"loss": 2.0298, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.14999820576308895, |
|
"grad_norm": 5.130878448486328, |
|
"learning_rate": 3.7137414065209284e-06, |
|
"loss": 2.0474, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.1503570531452973, |
|
"grad_norm": 5.6020684242248535, |
|
"learning_rate": 3.6257478330650916e-06, |
|
"loss": 1.975, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.15071590052750566, |
|
"grad_norm": 6.997092247009277, |
|
"learning_rate": 3.5387356366067913e-06, |
|
"loss": 1.8301, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.15071590052750566, |
|
"eval_loss": 1.9740864038467407, |
|
"eval_runtime": 54.8117, |
|
"eval_samples_per_second": 9.122, |
|
"eval_steps_per_second": 9.122, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.151074747909714, |
|
"grad_norm": 5.829411029815674, |
|
"learning_rate": 3.45270839387662e-06, |
|
"loss": 1.8474, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.15143359529192235, |
|
"grad_norm": 5.0621018409729, |
|
"learning_rate": 3.3676696411175727e-06, |
|
"loss": 1.8278, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.1517924426741307, |
|
"grad_norm": 5.297969818115234, |
|
"learning_rate": 3.283622873939705e-06, |
|
"loss": 1.9351, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.15215129005633904, |
|
"grad_norm": 7.4726738929748535, |
|
"learning_rate": 3.2005715471764303e-06, |
|
"loss": 2.0655, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.15251013743854738, |
|
"grad_norm": 7.288102149963379, |
|
"learning_rate": 3.118519074742497e-06, |
|
"loss": 2.1868, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.15286898482075573, |
|
"grad_norm": 6.620118618011475, |
|
"learning_rate": 3.037468829493679e-06, |
|
"loss": 2.0158, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.15322783220296407, |
|
"grad_norm": 5.841970443725586, |
|
"learning_rate": 2.9574241430880926e-06, |
|
"loss": 1.891, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.15358667958517241, |
|
"grad_norm": 6.563481330871582, |
|
"learning_rate": 2.878388305849292e-06, |
|
"loss": 2.0859, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.15394552696738079, |
|
"grad_norm": 6.011387348175049, |
|
"learning_rate": 2.8003645666309768e-06, |
|
"loss": 1.9052, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.15430437434958913, |
|
"grad_norm": 5.620529651641846, |
|
"learning_rate": 2.7233561326834765e-06, |
|
"loss": 1.8405, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.15466322173179747, |
|
"grad_norm": 7.352434158325195, |
|
"learning_rate": 2.647366169521881e-06, |
|
"loss": 1.9094, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.15502206911400582, |
|
"grad_norm": 6.008353233337402, |
|
"learning_rate": 2.5723978007959507e-06, |
|
"loss": 1.8437, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.15538091649621416, |
|
"grad_norm": 6.1527419090271, |
|
"learning_rate": 2.4984541081616895e-06, |
|
"loss": 1.9673, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.1557397638784225, |
|
"grad_norm": 5.206054210662842, |
|
"learning_rate": 2.4255381311546833e-06, |
|
"loss": 1.8418, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.15609861126063085, |
|
"grad_norm": 5.852407455444336, |
|
"learning_rate": 2.3536528670651595e-06, |
|
"loss": 2.0213, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.15609861126063085, |
|
"eval_loss": 1.9717787504196167, |
|
"eval_runtime": 55.4488, |
|
"eval_samples_per_second": 9.017, |
|
"eval_steps_per_second": 9.017, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.1564574586428392, |
|
"grad_norm": 6.1430439949035645, |
|
"learning_rate": 2.2828012708147603e-06, |
|
"loss": 1.7364, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.15681630602504754, |
|
"grad_norm": 7.5634989738464355, |
|
"learning_rate": 2.2129862548351094e-06, |
|
"loss": 1.8513, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.1571751534072559, |
|
"grad_norm": 4.393237590789795, |
|
"learning_rate": 2.1442106889480615e-06, |
|
"loss": 2.0098, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.15753400078946425, |
|
"grad_norm": 7.880530834197998, |
|
"learning_rate": 2.0764774002477615e-06, |
|
"loss": 1.878, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.1578928481716726, |
|
"grad_norm": 5.541848659515381, |
|
"learning_rate": 2.009789172984405e-06, |
|
"loss": 1.7038, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.15825169555388094, |
|
"grad_norm": 6.225683212280273, |
|
"learning_rate": 1.9441487484498223e-06, |
|
"loss": 1.8292, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.15861054293608928, |
|
"grad_norm": 5.8407464027404785, |
|
"learning_rate": 1.8795588248647634e-06, |
|
"loss": 2.0944, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.15896939031829763, |
|
"grad_norm": 5.6365580558776855, |
|
"learning_rate": 1.8160220572680145e-06, |
|
"loss": 1.8074, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.15932823770050597, |
|
"grad_norm": 5.789924621582031, |
|
"learning_rate": 1.753541057407227e-06, |
|
"loss": 1.9745, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.15968708508271431, |
|
"grad_norm": 6.211676120758057, |
|
"learning_rate": 1.692118393631588e-06, |
|
"loss": 1.7479, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.16004593246492266, |
|
"grad_norm": 5.150852680206299, |
|
"learning_rate": 1.6317565907862317e-06, |
|
"loss": 1.8298, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.160404779847131, |
|
"grad_norm": 7.136476039886475, |
|
"learning_rate": 1.5724581301084432e-06, |
|
"loss": 1.946, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.16076362722933937, |
|
"grad_norm": 5.5826311111450195, |
|
"learning_rate": 1.5142254491256988e-06, |
|
"loss": 1.9713, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.16112247461154772, |
|
"grad_norm": 7.025698661804199, |
|
"learning_rate": 1.4570609415554178e-06, |
|
"loss": 1.9835, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.16148132199375606, |
|
"grad_norm": 5.846466541290283, |
|
"learning_rate": 1.4009669572066124e-06, |
|
"loss": 2.0159, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.16148132199375606, |
|
"eval_loss": 1.9697834253311157, |
|
"eval_runtime": 53.8246, |
|
"eval_samples_per_second": 9.289, |
|
"eval_steps_per_second": 9.289, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.1618401693759644, |
|
"grad_norm": 5.875072002410889, |
|
"learning_rate": 1.345945801883278e-06, |
|
"loss": 2.0738, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.16219901675817275, |
|
"grad_norm": 7.465427398681641, |
|
"learning_rate": 1.2919997372896026e-06, |
|
"loss": 1.8473, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.1625578641403811, |
|
"grad_norm": 6.052853584289551, |
|
"learning_rate": 1.2391309809370159e-06, |
|
"loss": 1.983, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.16291671152258944, |
|
"grad_norm": 5.640964031219482, |
|
"learning_rate": 1.18734170605301e-06, |
|
"loss": 1.7979, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.16327555890479778, |
|
"grad_norm": 6.585112571716309, |
|
"learning_rate": 1.136634041491834e-06, |
|
"loss": 1.7018, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.16363440628700612, |
|
"grad_norm": 5.19002628326416, |
|
"learning_rate": 1.0870100716469694e-06, |
|
"loss": 1.9632, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.1639932536692145, |
|
"grad_norm": 5.477431774139404, |
|
"learning_rate": 1.0384718363654598e-06, |
|
"loss": 1.8951, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.16435210105142284, |
|
"grad_norm": 6.346002101898193, |
|
"learning_rate": 9.910213308640359e-07, |
|
"loss": 1.6237, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.16471094843363118, |
|
"grad_norm": 6.293008327484131, |
|
"learning_rate": 9.446605056471311e-07, |
|
"loss": 1.9506, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.16506979581583953, |
|
"grad_norm": 6.646719932556152, |
|
"learning_rate": 8.993912664266901e-07, |
|
"loss": 1.9005, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.16542864319804787, |
|
"grad_norm": 6.372305393218994, |
|
"learning_rate": 8.5521547404383e-07, |
|
"loss": 1.9383, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.1657874905802562, |
|
"grad_norm": 5.362992286682129, |
|
"learning_rate": 8.121349443923473e-07, |
|
"loss": 1.8118, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.16614633796246456, |
|
"grad_norm": 5.154087066650391, |
|
"learning_rate": 7.701514483440844e-07, |
|
"loss": 2.0736, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.1665051853446729, |
|
"grad_norm": 5.9843926429748535, |
|
"learning_rate": 7.292667116761223e-07, |
|
"loss": 2.1853, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.16686403272688125, |
|
"grad_norm": 6.360889911651611, |
|
"learning_rate": 6.894824149998505e-07, |
|
"loss": 1.9762, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.16686403272688125, |
|
"eval_loss": 1.9689069986343384, |
|
"eval_runtime": 55.2702, |
|
"eval_samples_per_second": 9.046, |
|
"eval_steps_per_second": 9.046, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.16722288010908962, |
|
"grad_norm": 5.460212707519531, |
|
"learning_rate": 6.508001936918873e-07, |
|
"loss": 1.8351, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.16758172749129796, |
|
"grad_norm": 4.702284812927246, |
|
"learning_rate": 6.132216378268379e-07, |
|
"loss": 1.8153, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.1679405748735063, |
|
"grad_norm": 5.906696796417236, |
|
"learning_rate": 5.767482921119461e-07, |
|
"loss": 1.9498, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.16829942225571465, |
|
"grad_norm": 6.808102607727051, |
|
"learning_rate": 5.413816558236007e-07, |
|
"loss": 2.1874, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.168658269637923, |
|
"grad_norm": 5.334954738616943, |
|
"learning_rate": 5.071231827457004e-07, |
|
"loss": 1.9131, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.16901711702013134, |
|
"grad_norm": 6.644302845001221, |
|
"learning_rate": 4.739742811098946e-07, |
|
"loss": 1.8804, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.16937596440233968, |
|
"grad_norm": 6.628371715545654, |
|
"learning_rate": 4.4193631353768414e-07, |
|
"loss": 2.2605, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.16973481178454802, |
|
"grad_norm": 7.336843490600586, |
|
"learning_rate": 4.1101059698443965e-07, |
|
"loss": 1.8735, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.17009365916675637, |
|
"grad_norm": 6.385683059692383, |
|
"learning_rate": 3.8119840268523914e-07, |
|
"loss": 1.7274, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.17045250654896474, |
|
"grad_norm": 6.481418132781982, |
|
"learning_rate": 3.525009561026202e-07, |
|
"loss": 2.012, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.17081135393117308, |
|
"grad_norm": 5.986774921417236, |
|
"learning_rate": 3.2491943687621873e-07, |
|
"loss": 2.0702, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.17117020131338143, |
|
"grad_norm": 6.33309268951416, |
|
"learning_rate": 2.984549787742552e-07, |
|
"loss": 1.8632, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.17152904869558977, |
|
"grad_norm": 5.965672969818115, |
|
"learning_rate": 2.731086696469501e-07, |
|
"loss": 1.7732, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.1718878960777981, |
|
"grad_norm": 6.534940242767334, |
|
"learning_rate": 2.4888155138179576e-07, |
|
"loss": 2.2091, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.17224674346000646, |
|
"grad_norm": 5.720672130584717, |
|
"learning_rate": 2.2577461986073356e-07, |
|
"loss": 1.9488, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.17224674346000646, |
|
"eval_loss": 1.968702793121338, |
|
"eval_runtime": 54.4379, |
|
"eval_samples_per_second": 9.185, |
|
"eval_steps_per_second": 9.185, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.1726055908422148, |
|
"grad_norm": 6.99819278717041, |
|
"learning_rate": 2.0378882491921159e-07, |
|
"loss": 1.8417, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.17296443822442314, |
|
"grad_norm": 5.92847204208374, |
|
"learning_rate": 1.8292507030715362e-07, |
|
"loss": 1.8105, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.1733232856066315, |
|
"grad_norm": 6.517411231994629, |
|
"learning_rate": 1.6318421365179055e-07, |
|
"loss": 1.8031, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.17368213298883986, |
|
"grad_norm": 5.3509345054626465, |
|
"learning_rate": 1.4456706642242134e-07, |
|
"loss": 1.9194, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.1740409803710482, |
|
"grad_norm": 5.721834182739258, |
|
"learning_rate": 1.2707439389704867e-07, |
|
"loss": 1.9414, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.17439982775325655, |
|
"grad_norm": 5.240856647491455, |
|
"learning_rate": 1.1070691513092563e-07, |
|
"loss": 2.1085, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.1747586751354649, |
|
"grad_norm": 5.592405796051025, |
|
"learning_rate": 9.546530292699863e-08, |
|
"loss": 1.9062, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.17511752251767324, |
|
"grad_norm": 5.133212089538574, |
|
"learning_rate": 8.135018380824921e-08, |
|
"loss": 1.9475, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.17547636989988158, |
|
"grad_norm": 5.703231334686279, |
|
"learning_rate": 6.836213799193497e-08, |
|
"loss": 2.0048, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.17583521728208992, |
|
"grad_norm": 6.332388401031494, |
|
"learning_rate": 5.6501699365750784e-08, |
|
"loss": 1.6435, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.17619406466429827, |
|
"grad_norm": 5.111960411071777, |
|
"learning_rate": 4.5769355465876964e-08, |
|
"loss": 1.8596, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.1765529120465066, |
|
"grad_norm": 6.382414817810059, |
|
"learning_rate": 3.616554745692946e-08, |
|
"loss": 1.9124, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.17691175942871495, |
|
"grad_norm": 7.501326084136963, |
|
"learning_rate": 2.7690670113848792e-08, |
|
"loss": 1.9002, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.17727060681092333, |
|
"grad_norm": 6.54906702041626, |
|
"learning_rate": 2.034507180563916e-08, |
|
"loss": 1.8475, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.17762945419313167, |
|
"grad_norm": 5.6887078285217285, |
|
"learning_rate": 1.4129054481082926e-08, |
|
"loss": 1.7603, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.17762945419313167, |
|
"eval_loss": 1.9684966802597046, |
|
"eval_runtime": 53.5323, |
|
"eval_samples_per_second": 9.34, |
|
"eval_steps_per_second": 9.34, |
|
"step": 4950 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.9553961729366426e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
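
Appended note (not part of the JSON state above): the preceding object is the trainer state written by a `transformers.Trainer` checkpoint. A minimal sketch of how one might read it back and summarize the run is shown below; the file name `trainer_state.json` is an assumption about where the state is saved (inside a checkpoint it is typically `<output_dir>/checkpoint-<step>/trainer_state.json`), and the summary printed is illustrative, not part of the trainer output.

import json

# Load the saved trainer state (path is an assumption, see note above).
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes two record shapes: optimizer-step logs carry "loss",
# evaluation logs carry "eval_loss". Split them apart by key.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss: {state['best_metric']} "
      f"(checkpoint: {state['best_model_checkpoint']})")
print(f"train log points: {len(train_logs)}, eval points: {len(eval_logs)}")

# The last few eval points show the curve flattening near the end of the run.
for e in eval_logs[-3:]:
    print(f"step {e['step']}: eval_loss={e['eval_loss']:.4f}")

Since `eval_steps` is 150 and `save_steps` is 150, every evaluation point in `log_history` corresponds to a saved checkpoint, which is why `best_model_checkpoint` (./output/checkpoint-4950) lines up with the lowest `eval_loss` entry at step 4950.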