|
{ |
|
"best_metric": 11.018465042114258, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-370", |
|
"epoch": 0.016722785925741793, |
|
"eval_steps": 5, |
|
"global_step": 370, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 4.519671871822106e-05, |
|
"grad_norm": 0.741265594959259, |
|
"learning_rate": 2e-05, |
|
"loss": 44.3865, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 4.519671871822106e-05, |
|
"eval_loss": 11.093368530273438, |
|
"eval_runtime": 175.5908, |
|
"eval_samples_per_second": 53.061, |
|
"eval_steps_per_second": 26.533, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 9.039343743644212e-05, |
|
"grad_norm": 0.6581929326057434, |
|
"learning_rate": 4e-05, |
|
"loss": 44.3813, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00013559015615466317, |
|
"grad_norm": 0.6729432344436646, |
|
"learning_rate": 6e-05, |
|
"loss": 44.3393, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00018078687487288423, |
|
"grad_norm": 0.6872175335884094, |
|
"learning_rate": 8e-05, |
|
"loss": 44.3795, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00022598359359110527, |
|
"grad_norm": 0.704067051410675, |
|
"learning_rate": 0.0001, |
|
"loss": 44.389, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00022598359359110527, |
|
"eval_loss": 11.093063354492188, |
|
"eval_runtime": 176.3334, |
|
"eval_samples_per_second": 52.837, |
|
"eval_steps_per_second": 26.422, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00027118031230932634, |
|
"grad_norm": 0.6682418584823608, |
|
"learning_rate": 0.00012, |
|
"loss": 44.3529, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0003163770310275474, |
|
"grad_norm": 0.6353705525398254, |
|
"learning_rate": 0.00014, |
|
"loss": 44.4355, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00036157374974576847, |
|
"grad_norm": 0.6866922974586487, |
|
"learning_rate": 0.00016, |
|
"loss": 44.4036, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00040677046846398953, |
|
"grad_norm": 0.7315343618392944, |
|
"learning_rate": 0.00018, |
|
"loss": 44.3756, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00045196718718221055, |
|
"grad_norm": 0.6867555379867554, |
|
"learning_rate": 0.0002, |
|
"loss": 44.3678, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00045196718718221055, |
|
"eval_loss": 11.091917037963867, |
|
"eval_runtime": 176.1396, |
|
"eval_samples_per_second": 52.896, |
|
"eval_steps_per_second": 26.451, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0004971639059004316, |
|
"grad_norm": 0.7067858576774597, |
|
"learning_rate": 0.0001999979446958366, |
|
"loss": 44.3933, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0005423606246186527, |
|
"grad_norm": 0.7694055438041687, |
|
"learning_rate": 0.00019999177886783194, |
|
"loss": 44.3476, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0005875573433368737, |
|
"grad_norm": 0.6980550289154053, |
|
"learning_rate": 0.00019998150276943902, |
|
"loss": 44.3621, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0006327540620550948, |
|
"grad_norm": 0.7399426698684692, |
|
"learning_rate": 0.000199967116823068, |
|
"loss": 44.3727, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0006779507807733159, |
|
"grad_norm": 0.6623771786689758, |
|
"learning_rate": 0.0001999486216200688, |
|
"loss": 44.3563, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0006779507807733159, |
|
"eval_loss": 11.089905738830566, |
|
"eval_runtime": 176.0326, |
|
"eval_samples_per_second": 52.928, |
|
"eval_steps_per_second": 26.467, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0007231474994915369, |
|
"grad_norm": 0.6647756695747375, |
|
"learning_rate": 0.00019992601792070679, |
|
"loss": 44.3454, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.000768344218209758, |
|
"grad_norm": 0.7416101694107056, |
|
"learning_rate": 0.00019989930665413147, |
|
"loss": 44.3249, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0008135409369279791, |
|
"grad_norm": 0.6351829171180725, |
|
"learning_rate": 0.00019986848891833845, |
|
"loss": 44.37, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0008587376556462001, |
|
"grad_norm": 0.6839431524276733, |
|
"learning_rate": 0.0001998335659801241, |
|
"loss": 44.3472, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0009039343743644211, |
|
"grad_norm": 0.6762228608131409, |
|
"learning_rate": 0.00019979453927503364, |
|
"loss": 44.3507, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0009039343743644211, |
|
"eval_loss": 11.087591171264648, |
|
"eval_runtime": 176.1534, |
|
"eval_samples_per_second": 52.891, |
|
"eval_steps_per_second": 26.449, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0009491310930826422, |
|
"grad_norm": 0.7993413209915161, |
|
"learning_rate": 0.00019975141040730207, |
|
"loss": 44.288, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0009943278118008632, |
|
"grad_norm": 0.6926490664482117, |
|
"learning_rate": 0.0001997041811497882, |
|
"loss": 44.3672, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0010395245305190844, |
|
"grad_norm": 0.7373084425926208, |
|
"learning_rate": 0.00019965285344390184, |
|
"loss": 44.3927, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0010847212492373054, |
|
"grad_norm": 0.6655643582344055, |
|
"learning_rate": 0.00019959742939952392, |
|
"loss": 44.3481, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0011299179679555265, |
|
"grad_norm": 0.7115928530693054, |
|
"learning_rate": 0.00019953791129491983, |
|
"loss": 44.3368, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0011299179679555265, |
|
"eval_loss": 11.085227012634277, |
|
"eval_runtime": 175.877, |
|
"eval_samples_per_second": 52.975, |
|
"eval_steps_per_second": 26.49, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0011751146866737475, |
|
"grad_norm": 0.7096830010414124, |
|
"learning_rate": 0.00019947430157664576, |
|
"loss": 44.3735, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0012203114053919684, |
|
"grad_norm": 0.6747312545776367, |
|
"learning_rate": 0.00019940660285944803, |
|
"loss": 44.3323, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0012655081241101896, |
|
"grad_norm": 0.7371957302093506, |
|
"learning_rate": 0.00019933481792615583, |
|
"loss": 44.2951, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0013107048428284106, |
|
"grad_norm": 0.7316697239875793, |
|
"learning_rate": 0.0001992589497275665, |
|
"loss": 44.3097, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0013559015615466317, |
|
"grad_norm": 0.6886783838272095, |
|
"learning_rate": 0.0001991790013823246, |
|
"loss": 44.3137, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0013559015615466317, |
|
"eval_loss": 11.082609176635742, |
|
"eval_runtime": 176.2695, |
|
"eval_samples_per_second": 52.857, |
|
"eval_steps_per_second": 26.431, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0014010982802648527, |
|
"grad_norm": 0.7027749419212341, |
|
"learning_rate": 0.00019909497617679348, |
|
"loss": 44.3391, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0014462949989830739, |
|
"grad_norm": 0.735598087310791, |
|
"learning_rate": 0.0001990068775649202, |
|
"loss": 44.3645, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0014914917177012948, |
|
"grad_norm": 0.7152600288391113, |
|
"learning_rate": 0.00019891470916809362, |
|
"loss": 44.3478, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.001536688436419516, |
|
"grad_norm": 0.6983291506767273, |
|
"learning_rate": 0.00019881847477499557, |
|
"loss": 44.3252, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.001581885155137737, |
|
"grad_norm": 0.6892045140266418, |
|
"learning_rate": 0.00019871817834144504, |
|
"loss": 44.2998, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.001581885155137737, |
|
"eval_loss": 11.079712867736816, |
|
"eval_runtime": 176.1378, |
|
"eval_samples_per_second": 52.896, |
|
"eval_steps_per_second": 26.451, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0016270818738559581, |
|
"grad_norm": 0.7166262865066528, |
|
"learning_rate": 0.0001986138239902355, |
|
"loss": 44.3485, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.001672278592574179, |
|
"grad_norm": 0.7545002102851868, |
|
"learning_rate": 0.0001985054160109657, |
|
"loss": 44.2613, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0017174753112924003, |
|
"grad_norm": 0.7944263219833374, |
|
"learning_rate": 0.00019839295885986296, |
|
"loss": 44.2665, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0017626720300106212, |
|
"grad_norm": 0.7216903567314148, |
|
"learning_rate": 0.0001982764571596004, |
|
"loss": 44.3546, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0018078687487288422, |
|
"grad_norm": 0.7492774128913879, |
|
"learning_rate": 0.00019815591569910654, |
|
"loss": 44.3223, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0018078687487288422, |
|
"eval_loss": 11.076553344726562, |
|
"eval_runtime": 176.1866, |
|
"eval_samples_per_second": 52.881, |
|
"eval_steps_per_second": 26.444, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0018530654674470634, |
|
"grad_norm": 0.8118460774421692, |
|
"learning_rate": 0.00019803133943336874, |
|
"loss": 44.3122, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0018982621861652843, |
|
"grad_norm": 0.7527559399604797, |
|
"learning_rate": 0.0001979027334832293, |
|
"loss": 44.3061, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0019434589048835055, |
|
"grad_norm": 0.7425262331962585, |
|
"learning_rate": 0.00019777010313517518, |
|
"loss": 44.2408, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0019886556236017264, |
|
"grad_norm": 0.753101646900177, |
|
"learning_rate": 0.00019763345384112043, |
|
"loss": 44.3362, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0020338523423199476, |
|
"grad_norm": 0.767737090587616, |
|
"learning_rate": 0.00019749279121818235, |
|
"loss": 44.2864, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0020338523423199476, |
|
"eval_loss": 11.072389602661133, |
|
"eval_runtime": 175.9667, |
|
"eval_samples_per_second": 52.948, |
|
"eval_steps_per_second": 26.477, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.002079049061038169, |
|
"grad_norm": 0.7275786995887756, |
|
"learning_rate": 0.00019734812104845047, |
|
"loss": 44.3542, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0021242457797563895, |
|
"grad_norm": 0.6908650994300842, |
|
"learning_rate": 0.00019719944927874881, |
|
"loss": 44.3377, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0021694424984746107, |
|
"grad_norm": 0.7260599136352539, |
|
"learning_rate": 0.0001970467820203915, |
|
"loss": 44.2621, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.002214639217192832, |
|
"grad_norm": 0.7138715982437134, |
|
"learning_rate": 0.00019689012554893154, |
|
"loss": 44.2338, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.002259835935911053, |
|
"grad_norm": 0.7867954969406128, |
|
"learning_rate": 0.00019672948630390294, |
|
"loss": 44.3044, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.002259835935911053, |
|
"eval_loss": 11.067892074584961, |
|
"eval_runtime": 176.5244, |
|
"eval_samples_per_second": 52.78, |
|
"eval_steps_per_second": 26.393, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.002305032654629274, |
|
"grad_norm": 0.7787512540817261, |
|
"learning_rate": 0.00019656487088855592, |
|
"loss": 44.2918, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.002350229373347495, |
|
"grad_norm": 0.7184544801712036, |
|
"learning_rate": 0.00019639628606958533, |
|
"loss": 44.2751, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.002395426092065716, |
|
"grad_norm": 0.7348573803901672, |
|
"learning_rate": 0.0001962237387768529, |
|
"loss": 44.246, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.002440622810783937, |
|
"grad_norm": 0.7713965773582458, |
|
"learning_rate": 0.00019604723610310194, |
|
"loss": 44.3292, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.002485819529502158, |
|
"grad_norm": 0.8040369749069214, |
|
"learning_rate": 0.00019586678530366606, |
|
"loss": 44.2155, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.002485819529502158, |
|
"eval_loss": 11.062650680541992, |
|
"eval_runtime": 176.278, |
|
"eval_samples_per_second": 52.854, |
|
"eval_steps_per_second": 26.43, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0025310162482203792, |
|
"grad_norm": 0.7459877133369446, |
|
"learning_rate": 0.00019568239379617088, |
|
"loss": 44.2188, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0025762129669386004, |
|
"grad_norm": 0.8008533716201782, |
|
"learning_rate": 0.00019549406916022905, |
|
"loss": 44.226, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.002621409685656821, |
|
"grad_norm": 0.7918010354042053, |
|
"learning_rate": 0.00019530181913712872, |
|
"loss": 44.287, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0026666064043750423, |
|
"grad_norm": 0.7287217974662781, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 44.2581, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0027118031230932635, |
|
"grad_norm": 0.7925474643707275, |
|
"learning_rate": 0.00019490557470106686, |
|
"loss": 44.2277, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0027118031230932635, |
|
"eval_loss": 11.05736255645752, |
|
"eval_runtime": 176.2465, |
|
"eval_samples_per_second": 52.863, |
|
"eval_steps_per_second": 26.435, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0027569998418114847, |
|
"grad_norm": 0.8553807735443115, |
|
"learning_rate": 0.00019470159657616215, |
|
"loss": 44.2439, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0028021965605297054, |
|
"grad_norm": 0.7586395740509033, |
|
"learning_rate": 0.00019449372563954293, |
|
"loss": 44.1943, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0028473932792479266, |
|
"grad_norm": 0.7628232836723328, |
|
"learning_rate": 0.0001942819704359693, |
|
"loss": 44.2594, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0028925899979661478, |
|
"grad_norm": 0.718551754951477, |
|
"learning_rate": 0.00019406633966986828, |
|
"loss": 44.2302, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.002937786716684369, |
|
"grad_norm": 0.7625423073768616, |
|
"learning_rate": 0.00019384684220497605, |
|
"loss": 44.1989, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.002937786716684369, |
|
"eval_loss": 11.051901817321777, |
|
"eval_runtime": 176.1759, |
|
"eval_samples_per_second": 52.885, |
|
"eval_steps_per_second": 26.445, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0029829834354025897, |
|
"grad_norm": 0.7891851663589478, |
|
"learning_rate": 0.00019362348706397373, |
|
"loss": 44.2199, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.003028180154120811, |
|
"grad_norm": 0.6770808100700378, |
|
"learning_rate": 0.00019339628342811632, |
|
"loss": 44.1689, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.003073376872839032, |
|
"grad_norm": 0.7498692870140076, |
|
"learning_rate": 0.0001931652406368554, |
|
"loss": 44.1741, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0031185735915572528, |
|
"grad_norm": 0.7661782503128052, |
|
"learning_rate": 0.0001929303681874552, |
|
"loss": 44.2123, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.003163770310275474, |
|
"grad_norm": 0.6438837647438049, |
|
"learning_rate": 0.0001926916757346022, |
|
"loss": 44.1718, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.003163770310275474, |
|
"eval_loss": 11.046669960021973, |
|
"eval_runtime": 176.3634, |
|
"eval_samples_per_second": 52.828, |
|
"eval_steps_per_second": 26.417, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.003208967028993695, |
|
"grad_norm": 0.7522275447845459, |
|
"learning_rate": 0.00019244917309000817, |
|
"loss": 44.2246, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0032541637477119163, |
|
"grad_norm": 0.7135974168777466, |
|
"learning_rate": 0.00019220287022200707, |
|
"loss": 44.2111, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.003299360466430137, |
|
"grad_norm": 0.7275662422180176, |
|
"learning_rate": 0.0001919527772551451, |
|
"loss": 44.1464, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.003344557185148358, |
|
"grad_norm": 0.6742229461669922, |
|
"learning_rate": 0.00019169890446976454, |
|
"loss": 44.2105, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0033897539038665794, |
|
"grad_norm": 0.6085646152496338, |
|
"learning_rate": 0.00019144126230158127, |
|
"loss": 44.0926, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0033897539038665794, |
|
"eval_loss": 11.042237281799316, |
|
"eval_runtime": 176.114, |
|
"eval_samples_per_second": 52.903, |
|
"eval_steps_per_second": 26.454, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0034349506225848005, |
|
"grad_norm": 0.7245734333992004, |
|
"learning_rate": 0.0001911798613412557, |
|
"loss": 44.2154, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0034801473413030213, |
|
"grad_norm": 0.7311281561851501, |
|
"learning_rate": 0.0001909147123339575, |
|
"loss": 44.1687, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0035253440600212425, |
|
"grad_norm": 0.6399495601654053, |
|
"learning_rate": 0.0001906458261789238, |
|
"loss": 44.1596, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.0035705407787394636, |
|
"grad_norm": 0.5650178790092468, |
|
"learning_rate": 0.00019037321392901136, |
|
"loss": 44.1466, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0036157374974576844, |
|
"grad_norm": 0.6039579510688782, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 44.1955, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0036157374974576844, |
|
"eval_loss": 11.038910865783691, |
|
"eval_runtime": 176.3853, |
|
"eval_samples_per_second": 52.822, |
|
"eval_steps_per_second": 26.414, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0036609342161759055, |
|
"grad_norm": 0.7481367588043213, |
|
"learning_rate": 0.0001898168561213419, |
|
"loss": 44.2182, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0037061309348941267, |
|
"grad_norm": 0.628414511680603, |
|
"learning_rate": 0.0001895331334332753, |
|
"loss": 44.1519, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.003751327653612348, |
|
"grad_norm": 0.658549964427948, |
|
"learning_rate": 0.0001892457303887706, |
|
"loss": 44.1364, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0037965243723305686, |
|
"grad_norm": 0.5245007276535034, |
|
"learning_rate": 0.0001889546588018412, |
|
"loss": 44.1079, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.00384172109104879, |
|
"grad_norm": 0.5555324554443359, |
|
"learning_rate": 0.00018865993063730004, |
|
"loss": 44.1445, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.00384172109104879, |
|
"eval_loss": 11.036417007446289, |
|
"eval_runtime": 176.131, |
|
"eval_samples_per_second": 52.898, |
|
"eval_steps_per_second": 26.452, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.003886917809767011, |
|
"grad_norm": 0.43622660636901855, |
|
"learning_rate": 0.00018836155801026753, |
|
"loss": 44.1515, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.003932114528485232, |
|
"grad_norm": 0.578544020652771, |
|
"learning_rate": 0.0001880595531856738, |
|
"loss": 44.0766, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.003977311247203453, |
|
"grad_norm": 0.598685085773468, |
|
"learning_rate": 0.00018775392857775432, |
|
"loss": 44.1756, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.004022507965921674, |
|
"grad_norm": 0.5733134150505066, |
|
"learning_rate": 0.00018744469674953956, |
|
"loss": 44.1756, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.004067704684639895, |
|
"grad_norm": 0.5177151560783386, |
|
"learning_rate": 0.00018713187041233896, |
|
"loss": 44.173, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.004067704684639895, |
|
"eval_loss": 11.034589767456055, |
|
"eval_runtime": 176.3402, |
|
"eval_samples_per_second": 52.835, |
|
"eval_steps_per_second": 26.421, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.004112901403358116, |
|
"grad_norm": 0.5208268761634827, |
|
"learning_rate": 0.00018681546242521786, |
|
"loss": 44.1346, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.004158098122076338, |
|
"grad_norm": 0.6029201149940491, |
|
"learning_rate": 0.00018649548579446936, |
|
"loss": 44.152, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.004203294840794558, |
|
"grad_norm": 0.468414843082428, |
|
"learning_rate": 0.0001861719536730795, |
|
"loss": 44.117, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.004248491559512779, |
|
"grad_norm": 0.3942670226097107, |
|
"learning_rate": 0.00018584487936018661, |
|
"loss": 44.137, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.004293688278231, |
|
"grad_norm": 0.49822431802749634, |
|
"learning_rate": 0.00018551427630053463, |
|
"loss": 44.119, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.004293688278231, |
|
"eval_loss": 11.03354549407959, |
|
"eval_runtime": 176.0642, |
|
"eval_samples_per_second": 52.918, |
|
"eval_steps_per_second": 26.462, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.004338884996949221, |
|
"grad_norm": 0.5527846813201904, |
|
"learning_rate": 0.00018518015808392045, |
|
"loss": 44.0893, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.004384081715667443, |
|
"grad_norm": 0.5725367665290833, |
|
"learning_rate": 0.00018484253844463526, |
|
"loss": 44.1162, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.004429278434385664, |
|
"grad_norm": 0.49278348684310913, |
|
"learning_rate": 0.00018450143126090015, |
|
"loss": 44.1031, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.004474475153103885, |
|
"grad_norm": 0.4361265301704407, |
|
"learning_rate": 0.00018415685055429533, |
|
"loss": 44.1386, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.004519671871822106, |
|
"grad_norm": 0.397714763879776, |
|
"learning_rate": 0.00018380881048918405, |
|
"loss": 44.1072, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004519671871822106, |
|
"eval_loss": 11.032732963562012, |
|
"eval_runtime": 176.1844, |
|
"eval_samples_per_second": 52.882, |
|
"eval_steps_per_second": 26.444, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004564868590540326, |
|
"grad_norm": 0.46195968985557556, |
|
"learning_rate": 0.00018345732537213027, |
|
"loss": 44.1243, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.004610065309258548, |
|
"grad_norm": 0.4918234348297119, |
|
"learning_rate": 0.00018310240965131041, |
|
"loss": 44.0833, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.004655262027976769, |
|
"grad_norm": 0.39288461208343506, |
|
"learning_rate": 0.00018274407791591966, |
|
"loss": 44.0844, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.00470045874669499, |
|
"grad_norm": 0.7819874882698059, |
|
"learning_rate": 0.00018238234489557215, |
|
"loss": 44.0727, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.004745655465413211, |
|
"grad_norm": 0.4996788203716278, |
|
"learning_rate": 0.0001820172254596956, |
|
"loss": 44.0926, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.004745655465413211, |
|
"eval_loss": 11.03187370300293, |
|
"eval_runtime": 176.1674, |
|
"eval_samples_per_second": 52.887, |
|
"eval_steps_per_second": 26.446, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.004790852184131432, |
|
"grad_norm": 0.4443046748638153, |
|
"learning_rate": 0.00018164873461691986, |
|
"loss": 44.1211, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.0048360489028496535, |
|
"grad_norm": 0.6192988753318787, |
|
"learning_rate": 0.00018127688751446027, |
|
"loss": 44.2023, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.004881245621567874, |
|
"grad_norm": 0.49968671798706055, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 44.1175, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.004926442340286095, |
|
"grad_norm": 0.5411902070045471, |
|
"learning_rate": 0.0001805231858085356, |
|
"loss": 44.1106, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.004971639059004316, |
|
"grad_norm": 0.7971486449241638, |
|
"learning_rate": 0.00018014136218679567, |
|
"loss": 44.1488, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.004971639059004316, |
|
"eval_loss": 11.030839920043945, |
|
"eval_runtime": 176.4251, |
|
"eval_samples_per_second": 52.81, |
|
"eval_steps_per_second": 26.408, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.005016835777722537, |
|
"grad_norm": 0.39622390270233154, |
|
"learning_rate": 0.00017975624426754848, |
|
"loss": 44.1091, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.0050620324964407585, |
|
"grad_norm": 0.4785301089286804, |
|
"learning_rate": 0.00017936784788148328, |
|
"loss": 44.1038, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.00510722921515898, |
|
"grad_norm": 0.5272740125656128, |
|
"learning_rate": 0.00017897618899405423, |
|
"loss": 44.1133, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.005152425933877201, |
|
"grad_norm": 0.6231501698493958, |
|
"learning_rate": 0.00017858128370482426, |
|
"loss": 44.1085, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.005197622652595422, |
|
"grad_norm": 0.5427981019020081, |
|
"learning_rate": 0.000178183148246803, |
|
"loss": 44.1395, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.005197622652595422, |
|
"eval_loss": 11.029810905456543, |
|
"eval_runtime": 176.1516, |
|
"eval_samples_per_second": 52.892, |
|
"eval_steps_per_second": 26.449, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.005242819371313642, |
|
"grad_norm": 0.4265317916870117, |
|
"learning_rate": 0.00017778179898577973, |
|
"loss": 44.1501, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.0052880160900318635, |
|
"grad_norm": 0.9469470381736755, |
|
"learning_rate": 0.00017737725241965069, |
|
"loss": 44.2129, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.005333212808750085, |
|
"grad_norm": 0.4538600742816925, |
|
"learning_rate": 0.00017696952517774062, |
|
"loss": 44.0941, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.005378409527468306, |
|
"grad_norm": 0.7306213974952698, |
|
"learning_rate": 0.00017655863402011947, |
|
"loss": 44.1601, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.005423606246186527, |
|
"grad_norm": 0.5303515195846558, |
|
"learning_rate": 0.00017614459583691346, |
|
"loss": 44.1485, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.005423606246186527, |
|
"eval_loss": 11.029101371765137, |
|
"eval_runtime": 176.3314, |
|
"eval_samples_per_second": 52.838, |
|
"eval_steps_per_second": 26.422, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.005468802964904748, |
|
"grad_norm": 0.43057698011398315, |
|
"learning_rate": 0.00017572742764761055, |
|
"loss": 44.1271, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.005513999683622969, |
|
"grad_norm": 0.5054545402526855, |
|
"learning_rate": 0.00017530714660036112, |
|
"loss": 44.1574, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.00555919640234119, |
|
"grad_norm": 0.47395941615104675, |
|
"learning_rate": 0.00017488376997127283, |
|
"loss": 44.0802, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.005604393121059411, |
|
"grad_norm": 0.5438507795333862, |
|
"learning_rate": 0.0001744573151637007, |
|
"loss": 44.0974, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.005649589839777632, |
|
"grad_norm": 0.5694723129272461, |
|
"learning_rate": 0.00017402779970753155, |
|
"loss": 44.1329, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.005649589839777632, |
|
"eval_loss": 11.028435707092285, |
|
"eval_runtime": 176.0545, |
|
"eval_samples_per_second": 52.921, |
|
"eval_steps_per_second": 26.463, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.005694786558495853, |
|
"grad_norm": 0.49188655614852905, |
|
"learning_rate": 0.0001735952412584635, |
|
"loss": 44.0859, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.005739983277214074, |
|
"grad_norm": 0.5955361127853394, |
|
"learning_rate": 0.00017315965759728014, |
|
"loss": 44.0938, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.0057851799959322955, |
|
"grad_norm": 0.4358704090118408, |
|
"learning_rate": 0.00017272106662911973, |
|
"loss": 44.1165, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.005830376714650517, |
|
"grad_norm": 0.4302980899810791, |
|
"learning_rate": 0.00017227948638273916, |
|
"loss": 44.1088, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.005875573433368738, |
|
"grad_norm": 0.5749801397323608, |
|
"learning_rate": 0.00017183493500977278, |
|
"loss": 44.1311, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.005875573433368738, |
|
"eval_loss": 11.027961730957031, |
|
"eval_runtime": 176.2218, |
|
"eval_samples_per_second": 52.871, |
|
"eval_steps_per_second": 26.438, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.005920770152086958, |
|
"grad_norm": 0.4459182620048523, |
|
"learning_rate": 0.0001713874307839863, |
|
"loss": 44.0874, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.005965966870805179, |
|
"grad_norm": 0.5632774233818054, |
|
"learning_rate": 0.0001709369921005258, |
|
"loss": 44.1085, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.0060111635895234005, |
|
"grad_norm": 0.5518532991409302, |
|
"learning_rate": 0.00017048363747516117, |
|
"loss": 44.0409, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.006056360308241622, |
|
"grad_norm": 0.5138490200042725, |
|
"learning_rate": 0.00017002738554352552, |
|
"loss": 44.1078, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.006101557026959843, |
|
"grad_norm": 0.44584622979164124, |
|
"learning_rate": 0.00016956825506034867, |
|
"loss": 44.1152, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.006101557026959843, |
|
"eval_loss": 11.027368545532227, |
|
"eval_runtime": 175.9823, |
|
"eval_samples_per_second": 52.943, |
|
"eval_steps_per_second": 26.474, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.006146753745678064, |
|
"grad_norm": 0.5159522294998169, |
|
"learning_rate": 0.00016910626489868649, |
|
"loss": 44.0946, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.006191950464396285, |
|
"grad_norm": 0.4725247323513031, |
|
"learning_rate": 0.00016864143404914504, |
|
"loss": 44.1131, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.0062371471831145055, |
|
"grad_norm": 0.5374069213867188, |
|
"learning_rate": 0.00016817378161909996, |
|
"loss": 44.1304, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.006282343901832727, |
|
"grad_norm": 0.44262439012527466, |
|
"learning_rate": 0.00016770332683191096, |
|
"loss": 44.065, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.006327540620550948, |
|
"grad_norm": 0.5221428871154785, |
|
"learning_rate": 0.0001672300890261317, |
|
"loss": 44.1053, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.006327540620550948, |
|
"eval_loss": 11.026728630065918, |
|
"eval_runtime": 176.1986, |
|
"eval_samples_per_second": 52.878, |
|
"eval_steps_per_second": 26.442, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.006372737339269169, |
|
"grad_norm": 0.47628021240234375, |
|
"learning_rate": 0.0001667540876547148, |
|
"loss": 44.1197, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.00641793405798739, |
|
"grad_norm": 0.4244273006916046, |
|
"learning_rate": 0.0001662753422842123, |
|
"loss": 44.0529, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.006463130776705611, |
|
"grad_norm": 0.4019363820552826, |
|
"learning_rate": 0.00016579387259397127, |
|
"loss": 44.107, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.0065083274954238325, |
|
"grad_norm": 0.41666439175605774, |
|
"learning_rate": 0.00016530969837532487, |
|
"loss": 44.1185, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.006553524214142053, |
|
"grad_norm": 0.52204829454422, |
|
"learning_rate": 0.00016482283953077887, |
|
"loss": 44.0868, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.006553524214142053, |
|
"eval_loss": 11.026100158691406, |
|
"eval_runtime": 175.9985, |
|
"eval_samples_per_second": 52.938, |
|
"eval_steps_per_second": 26.472, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.006598720932860274, |
|
"grad_norm": 0.4917082190513611, |
|
"learning_rate": 0.00016433331607319343, |
|
"loss": 44.0786, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.006643917651578495, |
|
"grad_norm": 0.6054917573928833, |
|
"learning_rate": 0.00016384114812496056, |
|
"loss": 44.0952, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.006689114370296716, |
|
"grad_norm": 0.46359196305274963, |
|
"learning_rate": 0.00016334635591717703, |
|
"loss": 44.1401, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.0067343110890149376, |
|
"grad_norm": 0.5335073471069336, |
|
"learning_rate": 0.00016284895978881236, |
|
"loss": 44.0664, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.006779507807733159, |
|
"grad_norm": 0.3754950761795044, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 44.1361, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.006779507807733159, |
|
"eval_loss": 11.025545120239258, |
|
"eval_runtime": 176.2544, |
|
"eval_samples_per_second": 52.861, |
|
"eval_steps_per_second": 26.433, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.00682470452645138, |
|
"grad_norm": 0.48478755354881287, |
|
"learning_rate": 0.00016184643766056317, |
|
"loss": 44.14, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.006869901245169601, |
|
"grad_norm": 0.4497169852256775, |
|
"learning_rate": 0.00016134135287043669, |
|
"loss": 44.0882, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.006915097963887821, |
|
"grad_norm": 0.5556149482727051, |
|
"learning_rate": 0.00016083374657755134, |
|
"loss": 44.148, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.0069602946826060426, |
|
"grad_norm": 0.4659099280834198, |
|
"learning_rate": 0.00016032363964761363, |
|
"loss": 44.0788, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.007005491401324264, |
|
"grad_norm": 0.5520086288452148, |
|
"learning_rate": 0.00015981105304912162, |
|
"loss": 44.1322, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.007005491401324264, |
|
"eval_loss": 11.025052070617676, |
|
"eval_runtime": 176.1047, |
|
"eval_samples_per_second": 52.906, |
|
"eval_steps_per_second": 26.456, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.007050688120042485, |
|
"grad_norm": 0.5233341455459595, |
|
"learning_rate": 0.00015929600785250257, |
|
"loss": 44.0942, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.007095884838760706, |
|
"grad_norm": 0.4378088712692261, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 44.0818, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.007141081557478927, |
|
"grad_norm": 0.46756836771965027, |
|
"learning_rate": 0.0001582586264510396, |
|
"loss": 44.1222, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.007186278276197148, |
|
"grad_norm": 0.5881497859954834, |
|
"learning_rate": 0.00015773633288888197, |
|
"loss": 44.0838, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.007231474994915369, |
|
"grad_norm": 0.4284621775150299, |
|
"learning_rate": 0.00015721166601221698, |
|
"loss": 44.1098, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.007231474994915369, |
|
"eval_loss": 11.024553298950195, |
|
"eval_runtime": 176.2152, |
|
"eval_samples_per_second": 52.873, |
|
"eval_steps_per_second": 26.439, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.00727667171363359, |
|
"grad_norm": 0.5078541040420532, |
|
"learning_rate": 0.000156684647388045, |
|
"loss": 44.0764, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.007321868432351811, |
|
"grad_norm": 0.46269139647483826, |
|
"learning_rate": 0.0001561552986800375, |
|
"loss": 44.0991, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.007367065151070032, |
|
"grad_norm": 0.5498519539833069, |
|
"learning_rate": 0.0001556236416476465, |
|
"loss": 44.1389, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.007412261869788253, |
|
"grad_norm": 0.8603391647338867, |
|
"learning_rate": 0.00015508969814521025, |
|
"loss": 44.1567, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.007457458588506475, |
|
"grad_norm": 0.6750001907348633, |
|
"learning_rate": 0.00015455349012105486, |
|
"loss": 44.1007, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.007457458588506475, |
|
"eval_loss": 11.024243354797363, |
|
"eval_runtime": 175.9806, |
|
"eval_samples_per_second": 52.943, |
|
"eval_steps_per_second": 26.475, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.007502655307224696, |
|
"grad_norm": 0.5474929809570312, |
|
"learning_rate": 0.00015401503961659204, |
|
"loss": 44.0842, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.007547852025942917, |
|
"grad_norm": 0.5558362603187561, |
|
"learning_rate": 0.00015347436876541297, |
|
"loss": 44.1025, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.007593048744661137, |
|
"grad_norm": 0.5435320138931274, |
|
"learning_rate": 0.00015293149979237876, |
|
"loss": 44.073, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.007638245463379358, |
|
"grad_norm": 0.41495761275291443, |
|
"learning_rate": 0.00015238645501270654, |
|
"loss": 44.0608, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.00768344218209758, |
|
"grad_norm": 0.4491158127784729, |
|
"learning_rate": 0.00015183925683105254, |
|
"loss": 44.0995, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.00768344218209758, |
|
"eval_loss": 11.023889541625977, |
|
"eval_runtime": 176.2494, |
|
"eval_samples_per_second": 52.863, |
|
"eval_steps_per_second": 26.434, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.007728638900815801, |
|
"grad_norm": 0.6309311389923096, |
|
"learning_rate": 0.00015128992774059063, |
|
"loss": 44.1244, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.007773835619534022, |
|
"grad_norm": 0.4494941830635071, |
|
"learning_rate": 0.00015073849032208822, |
|
"loss": 44.1336, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.007819032338252242, |
|
"grad_norm": 0.5996090173721313, |
|
"learning_rate": 0.00015018496724297778, |
|
"loss": 44.1116, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.007864229056970463, |
|
"grad_norm": 0.73329097032547, |
|
"learning_rate": 0.00014962938125642503, |
|
"loss": 44.1541, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.007909425775688685, |
|
"grad_norm": 0.5808178186416626, |
|
"learning_rate": 0.0001490717552003938, |
|
"loss": 44.114, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.007909425775688685, |
|
"eval_loss": 11.023494720458984, |
|
"eval_runtime": 175.9386, |
|
"eval_samples_per_second": 52.956, |
|
"eval_steps_per_second": 26.481, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.007954622494406906, |
|
"grad_norm": 0.46136102080345154, |
|
"learning_rate": 0.00014851211199670721, |
|
"loss": 44.0922, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.007999819213125127, |
|
"grad_norm": 0.4197680354118347, |
|
"learning_rate": 0.0001479504746501054, |
|
"loss": 44.0494, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.008045015931843348, |
|
"grad_norm": 0.4883246421813965, |
|
"learning_rate": 0.00014738686624729986, |
|
"loss": 44.0914, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.00809021265056157, |
|
"grad_norm": 0.4930349588394165, |
|
"learning_rate": 0.0001468213099560246, |
|
"loss": 44.0695, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.00813540936927979, |
|
"grad_norm": 0.5016703009605408, |
|
"learning_rate": 0.00014625382902408356, |
|
"loss": 44.0501, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00813540936927979, |
|
"eval_loss": 11.023147583007812, |
|
"eval_runtime": 176.3497, |
|
"eval_samples_per_second": 52.833, |
|
"eval_steps_per_second": 26.419, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.008180606087998012, |
|
"grad_norm": 0.5716975927352905, |
|
"learning_rate": 0.00014568444677839516, |
|
"loss": 44.1164, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.008225802806716233, |
|
"grad_norm": 0.6961561441421509, |
|
"learning_rate": 0.00014511318662403347, |
|
"loss": 44.1024, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.008270999525434454, |
|
"grad_norm": 0.5740232467651367, |
|
"learning_rate": 0.0001445400720432659, |
|
"loss": 44.1379, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.008316196244152675, |
|
"grad_norm": 0.5687277913093567, |
|
"learning_rate": 0.00014396512659458824, |
|
"loss": 44.1165, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.008361392962870896, |
|
"grad_norm": 0.6230690479278564, |
|
"learning_rate": 0.00014338837391175582, |
|
"loss": 44.118, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.008361392962870896, |
|
"eval_loss": 11.022916793823242, |
|
"eval_runtime": 176.0405, |
|
"eval_samples_per_second": 52.925, |
|
"eval_steps_per_second": 26.465, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.008406589681589116, |
|
"grad_norm": 0.48787158727645874, |
|
"learning_rate": 0.0001428098377028126, |
|
"loss": 44.0875, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.008451786400307337, |
|
"grad_norm": 0.44323569536209106, |
|
"learning_rate": 0.000142229541749116, |
|
"loss": 44.143, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.008496983119025558, |
|
"grad_norm": 0.47104522585868835, |
|
"learning_rate": 0.0001416475099043599, |
|
"loss": 44.0804, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.00854217983774378, |
|
"grad_norm": 0.549055814743042, |
|
"learning_rate": 0.0001410637660935938, |
|
"loss": 44.0923, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.008587376556462, |
|
"grad_norm": 0.4136901795864105, |
|
"learning_rate": 0.00014047833431223938, |
|
"loss": 44.0967, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.008587376556462, |
|
"eval_loss": 11.02279281616211, |
|
"eval_runtime": 176.1885, |
|
"eval_samples_per_second": 52.881, |
|
"eval_steps_per_second": 26.443, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.008632573275180222, |
|
"grad_norm": 0.5897504091262817, |
|
"learning_rate": 0.0001398912386251042, |
|
"loss": 44.0428, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.008677769993898443, |
|
"grad_norm": 0.4917847514152527, |
|
"learning_rate": 0.00013930250316539238, |
|
"loss": 44.0819, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.008722966712616664, |
|
"grad_norm": 0.4644782245159149, |
|
"learning_rate": 0.00013871215213371284, |
|
"loss": 44.0209, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.008768163431334885, |
|
"grad_norm": 0.6393492817878723, |
|
"learning_rate": 0.00013812020979708418, |
|
"loss": 44.101, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.008813360150053106, |
|
"grad_norm": 0.60307377576828, |
|
"learning_rate": 0.00013752670048793744, |
|
"loss": 44.1646, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.008813360150053106, |
|
"eval_loss": 11.022566795349121, |
|
"eval_runtime": 176.0184, |
|
"eval_samples_per_second": 52.932, |
|
"eval_steps_per_second": 26.469, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.008858556868771328, |
|
"grad_norm": 0.4305557608604431, |
|
"learning_rate": 0.00013693164860311565, |
|
"loss": 44.0883, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.008903753587489549, |
|
"grad_norm": 0.4658234119415283, |
|
"learning_rate": 0.00013633507860287116, |
|
"loss": 44.1006, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.00894895030620777, |
|
"grad_norm": 0.5248441100120544, |
|
"learning_rate": 0.0001357370150098601, |
|
"loss": 44.0716, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.008994147024925991, |
|
"grad_norm": 0.5177784562110901, |
|
"learning_rate": 0.0001351374824081343, |
|
"loss": 44.1013, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.009039343743644212, |
|
"grad_norm": 0.5134817361831665, |
|
"learning_rate": 0.00013453650544213076, |
|
"loss": 44.0501, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.009039343743644212, |
|
"eval_loss": 11.022467613220215, |
|
"eval_runtime": 176.1703, |
|
"eval_samples_per_second": 52.886, |
|
"eval_steps_per_second": 26.446, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.009084540462362432, |
|
"grad_norm": 0.6612194776535034, |
|
"learning_rate": 0.00013393410881565876, |
|
"loss": 44.1568, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.009129737181080653, |
|
"grad_norm": 0.5365848541259766, |
|
"learning_rate": 0.00013333031729088419, |
|
"loss": 44.0318, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.009174933899798874, |
|
"grad_norm": 0.43999558687210083, |
|
"learning_rate": 0.0001327251556873117, |
|
"loss": 44.0544, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.009220130618517095, |
|
"grad_norm": 0.5535528659820557, |
|
"learning_rate": 0.00013211864888076457, |
|
"loss": 44.0657, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.009265327337235316, |
|
"grad_norm": 0.5289484262466431, |
|
"learning_rate": 0.0001315108218023621, |
|
"loss": 44.0946, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.009265327337235316, |
|
"eval_loss": 11.022246360778809, |
|
"eval_runtime": 175.5631, |
|
"eval_samples_per_second": 53.069, |
|
"eval_steps_per_second": 26.537, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.009310524055953538, |
|
"grad_norm": 0.515040934085846, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 44.1026, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.009355720774671759, |
|
"grad_norm": 0.43807700276374817, |
|
"learning_rate": 0.00013029130682479722, |
|
"loss": 44.0529, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.00940091749338998, |
|
"grad_norm": 0.40383437275886536, |
|
"learning_rate": 0.00012967966905511906, |
|
"loss": 44.0854, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.009446114212108201, |
|
"grad_norm": 0.42450079321861267, |
|
"learning_rate": 0.00012906681127049338, |
|
"loss": 44.0488, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.009491310930826422, |
|
"grad_norm": 0.5043962597846985, |
|
"learning_rate": 0.00012845275866310324, |
|
"loss": 44.047, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.009491310930826422, |
|
"eval_loss": 11.02186393737793, |
|
"eval_runtime": 176.4502, |
|
"eval_samples_per_second": 52.802, |
|
"eval_steps_per_second": 26.404, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.009536507649544643, |
|
"grad_norm": 0.5239633321762085, |
|
"learning_rate": 0.00012783753647424635, |
|
"loss": 44.1326, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.009581704368262865, |
|
"grad_norm": 0.4532044231891632, |
|
"learning_rate": 0.00012722116999329712, |
|
"loss": 44.1039, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.009626901086981086, |
|
"grad_norm": 0.5784953832626343, |
|
"learning_rate": 0.00012660368455666752, |
|
"loss": 44.0902, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.009672097805699307, |
|
"grad_norm": 0.46399155259132385, |
|
"learning_rate": 0.0001259851055467653, |
|
"loss": 44.0665, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.009717294524417528, |
|
"grad_norm": 0.5353842973709106, |
|
"learning_rate": 0.00012536545839095074, |
|
"loss": 44.0339, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.009717294524417528, |
|
"eval_loss": 11.021649360656738, |
|
"eval_runtime": 176.1431, |
|
"eval_samples_per_second": 52.895, |
|
"eval_steps_per_second": 26.45, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.009762491243135748, |
|
"grad_norm": 0.4887973666191101, |
|
"learning_rate": 0.00012474476856049144, |
|
"loss": 44.074, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.009807687961853969, |
|
"grad_norm": 0.44021403789520264, |
|
"learning_rate": 0.00012412306156951526, |
|
"loss": 44.0695, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.00985288468057219, |
|
"grad_norm": 0.5092349052429199, |
|
"learning_rate": 0.00012350036297396154, |
|
"loss": 44.0596, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.009898081399290411, |
|
"grad_norm": 0.47505757212638855, |
|
"learning_rate": 0.00012287669837053055, |
|
"loss": 44.0435, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.009943278118008632, |
|
"grad_norm": 0.4098033308982849, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 44.1334, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.009943278118008632, |
|
"eval_loss": 11.021401405334473, |
|
"eval_runtime": 176.2917, |
|
"eval_samples_per_second": 52.85, |
|
"eval_steps_per_second": 26.428, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.009988474836726853, |
|
"grad_norm": 0.5452781915664673, |
|
"learning_rate": 0.00012162657372432836, |
|
"loss": 44.0602, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.010033671555445075, |
|
"grad_norm": 0.5344114303588867, |
|
"learning_rate": 0.00012100016506928493, |
|
"loss": 44.045, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.010078868274163296, |
|
"grad_norm": 0.4083841145038605, |
|
"learning_rate": 0.00012037289317970757, |
|
"loss": 44.0642, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.010124064992881517, |
|
"grad_norm": 0.4382067918777466, |
|
"learning_rate": 0.00011974478384028672, |
|
"loss": 44.0648, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.010169261711599738, |
|
"grad_norm": 0.42340517044067383, |
|
"learning_rate": 0.00011911586287013725, |
|
"loss": 44.1315, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.010169261711599738, |
|
"eval_loss": 11.021224975585938, |
|
"eval_runtime": 176.0622, |
|
"eval_samples_per_second": 52.919, |
|
"eval_steps_per_second": 26.462, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.01021445843031796, |
|
"grad_norm": 0.5047578811645508, |
|
"learning_rate": 0.00011848615612173688, |
|
"loss": 44.123, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.01025965514903618, |
|
"grad_norm": 0.5647579431533813, |
|
"learning_rate": 0.00011785568947986367, |
|
"loss": 44.0525, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.010304851867754402, |
|
"grad_norm": 0.48243632912635803, |
|
"learning_rate": 0.0001172244888605319, |
|
"loss": 44.1143, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.010350048586472623, |
|
"grad_norm": 0.5492759943008423, |
|
"learning_rate": 0.0001165925802099268, |
|
"loss": 44.0494, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.010395245305190844, |
|
"grad_norm": 0.5804261565208435, |
|
"learning_rate": 0.00011595998950333793, |
|
"loss": 44.0785, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.010395245305190844, |
|
"eval_loss": 11.021036148071289, |
|
"eval_runtime": 176.2523, |
|
"eval_samples_per_second": 52.862, |
|
"eval_steps_per_second": 26.434, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.010440442023909063, |
|
"grad_norm": 0.4731612503528595, |
|
"learning_rate": 0.00011532674274409159, |
|
"loss": 44.1151, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.010485638742627285, |
|
"grad_norm": 0.47020676732063293, |
|
"learning_rate": 0.00011469286596248181, |
|
"loss": 44.0772, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.010530835461345506, |
|
"grad_norm": 0.4738229215145111, |
|
"learning_rate": 0.00011405838521470029, |
|
"loss": 44.1274, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.010576032180063727, |
|
"grad_norm": 0.5980152487754822, |
|
"learning_rate": 0.00011342332658176555, |
|
"loss": 44.0543, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.010621228898781948, |
|
"grad_norm": 0.45920702815055847, |
|
"learning_rate": 0.00011278771616845061, |
|
"loss": 44.0846, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.010621228898781948, |
|
"eval_loss": 11.02093505859375, |
|
"eval_runtime": 176.025, |
|
"eval_samples_per_second": 52.93, |
|
"eval_steps_per_second": 26.468, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.01066642561750017, |
|
"grad_norm": 0.48931440711021423, |
|
"learning_rate": 0.00011215158010221005, |
|
"loss": 44.0991, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.01071162233621839, |
|
"grad_norm": 0.4345873296260834, |
|
"learning_rate": 0.00011151494453210596, |
|
"loss": 44.0491, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.010756819054936612, |
|
"grad_norm": 0.43655380606651306, |
|
"learning_rate": 0.00011087783562773311, |
|
"loss": 44.0903, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.010802015773654833, |
|
"grad_norm": 0.616533637046814, |
|
"learning_rate": 0.00011024027957814314, |
|
"loss": 44.1318, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.010847212492373054, |
|
"grad_norm": 0.45536908507347107, |
|
"learning_rate": 0.00010960230259076818, |
|
"loss": 44.0812, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.010847212492373054, |
|
"eval_loss": 11.020767211914062, |
|
"eval_runtime": 176.3636, |
|
"eval_samples_per_second": 52.828, |
|
"eval_steps_per_second": 26.417, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.010892409211091275, |
|
"grad_norm": 0.47256338596343994, |
|
"learning_rate": 0.00010896393089034336, |
|
"loss": 44.0513, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.010937605929809496, |
|
"grad_norm": 0.42103204131126404, |
|
"learning_rate": 0.00010832519071782894, |
|
"loss": 44.0399, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.010982802648527717, |
|
"grad_norm": 0.49555832147598267, |
|
"learning_rate": 0.00010768610832933168, |
|
"loss": 44.1504, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.011027999367245939, |
|
"grad_norm": 0.42800289392471313, |
|
"learning_rate": 0.0001070467099950254, |
|
"loss": 44.0886, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.01107319608596416, |
|
"grad_norm": 0.6031785607337952, |
|
"learning_rate": 0.0001064070219980713, |
|
"loss": 44.0548, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.01107319608596416, |
|
"eval_loss": 11.020543098449707, |
|
"eval_runtime": 176.1913, |
|
"eval_samples_per_second": 52.88, |
|
"eval_steps_per_second": 26.443, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.01111839280468238, |
|
"grad_norm": 0.4927026629447937, |
|
"learning_rate": 0.00010576707063353746, |
|
"loss": 44.0813, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.0111635895234006, |
|
"grad_norm": 0.6148269772529602, |
|
"learning_rate": 0.00010512688220731792, |
|
"loss": 44.0928, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.011208786242118822, |
|
"grad_norm": 0.4395325779914856, |
|
"learning_rate": 0.00010448648303505151, |
|
"loss": 44.047, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.011253982960837043, |
|
"grad_norm": 0.4433494806289673, |
|
"learning_rate": 0.00010384589944103984, |
|
"loss": 44.1, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.011299179679555264, |
|
"grad_norm": 0.6447661519050598, |
|
"learning_rate": 0.00010320515775716555, |
|
"loss": 44.0861, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.011299179679555264, |
|
"eval_loss": 11.020323753356934, |
|
"eval_runtime": 176.3276, |
|
"eval_samples_per_second": 52.839, |
|
"eval_steps_per_second": 26.422, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.011344376398273485, |
|
"grad_norm": 0.5418515801429749, |
|
"learning_rate": 0.00010256428432180956, |
|
"loss": 44.0602, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.011389573116991706, |
|
"grad_norm": 0.45757991075515747, |
|
"learning_rate": 0.00010192330547876871, |
|
"loss": 44.0788, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.011434769835709927, |
|
"grad_norm": 0.5210107564926147, |
|
"learning_rate": 0.00010128224757617274, |
|
"loss": 44.0517, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.011479966554428149, |
|
"grad_norm": 0.39198753237724304, |
|
"learning_rate": 0.00010064113696540111, |
|
"loss": 44.0776, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.01152516327314637, |
|
"grad_norm": 0.4305363893508911, |
|
"learning_rate": 0.0001, |
|
"loss": 44.1121, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.01152516327314637, |
|
"eval_loss": 11.02021312713623, |
|
"eval_runtime": 176.1601, |
|
"eval_samples_per_second": 52.889, |
|
"eval_steps_per_second": 26.448, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.011570359991864591, |
|
"grad_norm": 0.4909750521183014, |
|
"learning_rate": 9.93588630345989e-05, |
|
"loss": 44.0858, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.011615556710582812, |
|
"grad_norm": 0.4016626477241516, |
|
"learning_rate": 9.871775242382727e-05, |
|
"loss": 44.0732, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.011660753429301033, |
|
"grad_norm": 0.5827097296714783, |
|
"learning_rate": 9.80766945212313e-05, |
|
"loss": 44.0957, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.011705950148019255, |
|
"grad_norm": 0.48728469014167786, |
|
"learning_rate": 9.743571567819046e-05, |
|
"loss": 44.0648, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.011751146866737476, |
|
"grad_norm": 0.455342173576355, |
|
"learning_rate": 9.679484224283449e-05, |
|
"loss": 44.0327, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.011751146866737476, |
|
"eval_loss": 11.020062446594238, |
|
"eval_runtime": 176.2853, |
|
"eval_samples_per_second": 52.852, |
|
"eval_steps_per_second": 26.429, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.011796343585455695, |
|
"grad_norm": 0.50531005859375, |
|
"learning_rate": 9.615410055896015e-05, |
|
"loss": 44.0094, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.011841540304173916, |
|
"grad_norm": 0.6205224990844727, |
|
"learning_rate": 9.551351696494854e-05, |
|
"loss": 44.1, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.011886737022892137, |
|
"grad_norm": 0.5274375081062317, |
|
"learning_rate": 9.48731177926821e-05, |
|
"loss": 44.1223, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.011931933741610359, |
|
"grad_norm": 0.5149595141410828, |
|
"learning_rate": 9.423292936646257e-05, |
|
"loss": 44.1192, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.01197713046032858, |
|
"grad_norm": 0.5359209179878235, |
|
"learning_rate": 9.359297800192872e-05, |
|
"loss": 44.1155, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.01197713046032858, |
|
"eval_loss": 11.019892692565918, |
|
"eval_runtime": 176.1866, |
|
"eval_samples_per_second": 52.881, |
|
"eval_steps_per_second": 26.444, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.012022327179046801, |
|
"grad_norm": 0.5752252340316772, |
|
"learning_rate": 9.29532900049746e-05, |
|
"loss": 44.0821, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.012067523897765022, |
|
"grad_norm": 0.5125178098678589, |
|
"learning_rate": 9.231389167066837e-05, |
|
"loss": 44.061, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.012112720616483243, |
|
"grad_norm": 0.5295204520225525, |
|
"learning_rate": 9.167480928217108e-05, |
|
"loss": 43.9889, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.012157917335201465, |
|
"grad_norm": 0.40016570687294006, |
|
"learning_rate": 9.103606910965666e-05, |
|
"loss": 44.0684, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.012203114053919686, |
|
"grad_norm": 0.42660149931907654, |
|
"learning_rate": 9.039769740923183e-05, |
|
"loss": 44.0547, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.012203114053919686, |
|
"eval_loss": 11.01980209350586, |
|
"eval_runtime": 176.1599, |
|
"eval_samples_per_second": 52.889, |
|
"eval_steps_per_second": 26.448, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.012248310772637907, |
|
"grad_norm": 0.636551022529602, |
|
"learning_rate": 8.975972042185687e-05, |
|
"loss": 44.1385, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.012293507491356128, |
|
"grad_norm": 0.5031408071517944, |
|
"learning_rate": 8.912216437226693e-05, |
|
"loss": 44.1121, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.01233870421007435, |
|
"grad_norm": 0.49243634939193726, |
|
"learning_rate": 8.848505546789408e-05, |
|
"loss": 44.0864, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.01238390092879257, |
|
"grad_norm": 0.47308340668678284, |
|
"learning_rate": 8.784841989778996e-05, |
|
"loss": 44.0391, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.012429097647510792, |
|
"grad_norm": 0.43966105580329895, |
|
"learning_rate": 8.721228383154939e-05, |
|
"loss": 44.0969, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.012429097647510792, |
|
"eval_loss": 11.019760131835938, |
|
"eval_runtime": 176.1857, |
|
"eval_samples_per_second": 52.882, |
|
"eval_steps_per_second": 26.444, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.012474294366229011, |
|
"grad_norm": 0.4853382706642151, |
|
"learning_rate": 8.657667341823448e-05, |
|
"loss": 44.079, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.012519491084947232, |
|
"grad_norm": 0.453819215297699, |
|
"learning_rate": 8.594161478529974e-05, |
|
"loss": 44.0371, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.012564687803665453, |
|
"grad_norm": 0.4855421483516693, |
|
"learning_rate": 8.530713403751821e-05, |
|
"loss": 44.0514, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.012609884522383675, |
|
"grad_norm": 0.49890294671058655, |
|
"learning_rate": 8.46732572559084e-05, |
|
"loss": 44.0561, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.012655081241101896, |
|
"grad_norm": 0.406686007976532, |
|
"learning_rate": 8.404001049666211e-05, |
|
"loss": 44.0746, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.012655081241101896, |
|
"eval_loss": 11.01966381072998, |
|
"eval_runtime": 176.4032, |
|
"eval_samples_per_second": 52.817, |
|
"eval_steps_per_second": 26.411, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.012700277959820117, |
|
"grad_norm": 0.584389865398407, |
|
"learning_rate": 8.340741979007325e-05, |
|
"loss": 44.0014, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.012745474678538338, |
|
"grad_norm": 0.5981946587562561, |
|
"learning_rate": 8.277551113946812e-05, |
|
"loss": 44.1037, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.01279067139725656, |
|
"grad_norm": 0.48125511407852173, |
|
"learning_rate": 8.214431052013634e-05, |
|
"loss": 44.1114, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.01283586811597478, |
|
"grad_norm": 0.4403318762779236, |
|
"learning_rate": 8.151384387826313e-05, |
|
"loss": 44.0742, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.012881064834693002, |
|
"grad_norm": 0.5336763262748718, |
|
"learning_rate": 8.08841371298628e-05, |
|
"loss": 44.0535, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.012881064834693002, |
|
"eval_loss": 11.01951789855957, |
|
"eval_runtime": 176.2803, |
|
"eval_samples_per_second": 52.853, |
|
"eval_steps_per_second": 26.429, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.012926261553411223, |
|
"grad_norm": 0.4550967216491699, |
|
"learning_rate": 8.02552161597133e-05, |
|
"loss": 44.0825, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.012971458272129444, |
|
"grad_norm": 0.5073683261871338, |
|
"learning_rate": 7.962710682029245e-05, |
|
"loss": 44.0045, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.013016654990847665, |
|
"grad_norm": 0.424605131149292, |
|
"learning_rate": 7.899983493071507e-05, |
|
"loss": 44.0451, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.013061851709565886, |
|
"grad_norm": 0.48650291562080383, |
|
"learning_rate": 7.837342627567165e-05, |
|
"loss": 44.0424, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.013107048428284106, |
|
"grad_norm": 0.5977911949157715, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 44.1303, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.013107048428284106, |
|
"eval_loss": 11.019427299499512, |
|
"eval_runtime": 176.4378, |
|
"eval_samples_per_second": 52.806, |
|
"eval_steps_per_second": 26.406, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.013152245147002327, |
|
"grad_norm": 0.5895593166351318, |
|
"learning_rate": 7.712330162946948e-05, |
|
"loss": 44.0645, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.013197441865720548, |
|
"grad_norm": 0.4745809733867645, |
|
"learning_rate": 7.649963702603849e-05, |
|
"loss": 44.0755, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.01324263858443877, |
|
"grad_norm": 0.5061216950416565, |
|
"learning_rate": 7.587693843048475e-05, |
|
"loss": 44.0751, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.01328783530315699, |
|
"grad_norm": 0.42560261487960815, |
|
"learning_rate": 7.525523143950859e-05, |
|
"loss": 44.0495, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.013333032021875212, |
|
"grad_norm": 0.44290590286254883, |
|
"learning_rate": 7.463454160904928e-05, |
|
"loss": 44.1142, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.013333032021875212, |
|
"eval_loss": 11.019330978393555, |
|
"eval_runtime": 175.7063, |
|
"eval_samples_per_second": 53.026, |
|
"eval_steps_per_second": 26.516, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.013378228740593433, |
|
"grad_norm": 0.6524297595024109, |
|
"learning_rate": 7.401489445323473e-05, |
|
"loss": 44.0737, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.013423425459311654, |
|
"grad_norm": 0.49754655361175537, |
|
"learning_rate": 7.339631544333249e-05, |
|
"loss": 44.0838, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.013468622178029875, |
|
"grad_norm": 0.4138273596763611, |
|
"learning_rate": 7.27788300067029e-05, |
|
"loss": 44.0653, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.013513818896748096, |
|
"grad_norm": 0.5399671792984009, |
|
"learning_rate": 7.21624635257537e-05, |
|
"loss": 44.0646, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.013559015615466317, |
|
"grad_norm": 0.41923409700393677, |
|
"learning_rate": 7.154724133689677e-05, |
|
"loss": 44.0685, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.013559015615466317, |
|
"eval_loss": 11.019237518310547, |
|
"eval_runtime": 176.4288, |
|
"eval_samples_per_second": 52.809, |
|
"eval_steps_per_second": 26.407, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.013604212334184539, |
|
"grad_norm": 0.49278682470321655, |
|
"learning_rate": 7.093318872950665e-05, |
|
"loss": 44.0319, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.01364940905290276, |
|
"grad_norm": 0.5009450316429138, |
|
"learning_rate": 7.032033094488095e-05, |
|
"loss": 44.0988, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.013694605771620981, |
|
"grad_norm": 0.4270615577697754, |
|
"learning_rate": 6.97086931752028e-05, |
|
"loss": 44.1025, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.013739802490339202, |
|
"grad_norm": 0.49744102358818054, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 44.0652, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.013784999209057422, |
|
"grad_norm": 0.48600587248802185, |
|
"learning_rate": 6.848917819763793e-05, |
|
"loss": 44.1292, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.013784999209057422, |
|
"eval_loss": 11.01909351348877, |
|
"eval_runtime": 176.1259, |
|
"eval_samples_per_second": 52.9, |
|
"eval_steps_per_second": 26.453, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.013830195927775643, |
|
"grad_norm": 0.4116569459438324, |
|
"learning_rate": 6.788135111923545e-05, |
|
"loss": 44.0897, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.013875392646493864, |
|
"grad_norm": 0.4364916682243347, |
|
"learning_rate": 6.72748443126883e-05, |
|
"loss": 44.1195, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.013920589365212085, |
|
"grad_norm": 0.5589216351509094, |
|
"learning_rate": 6.666968270911584e-05, |
|
"loss": 44.0911, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.013965786083930306, |
|
"grad_norm": 0.5414496064186096, |
|
"learning_rate": 6.606589118434126e-05, |
|
"loss": 44.1532, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.014010982802648527, |
|
"grad_norm": 0.4488687515258789, |
|
"learning_rate": 6.546349455786926e-05, |
|
"loss": 44.0637, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.014010982802648527, |
|
"eval_loss": 11.018967628479004, |
|
"eval_runtime": 176.4018, |
|
"eval_samples_per_second": 52.817, |
|
"eval_steps_per_second": 26.411, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.014056179521366749, |
|
"grad_norm": 0.5137606859207153, |
|
"learning_rate": 6.486251759186572e-05, |
|
"loss": 44.1158, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.01410137624008497, |
|
"grad_norm": 0.5155542492866516, |
|
"learning_rate": 6.426298499013994e-05, |
|
"loss": 44.1199, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.014146572958803191, |
|
"grad_norm": 0.37395790219306946, |
|
"learning_rate": 6.366492139712886e-05, |
|
"loss": 44.0457, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.014191769677521412, |
|
"grad_norm": 0.6116747260093689, |
|
"learning_rate": 6.306835139688438e-05, |
|
"loss": 44.1012, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.014236966396239633, |
|
"grad_norm": 0.5333120822906494, |
|
"learning_rate": 6.24732995120626e-05, |
|
"loss": 44.1035, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.014236966396239633, |
|
"eval_loss": 11.018932342529297, |
|
"eval_runtime": 176.1972, |
|
"eval_samples_per_second": 52.878, |
|
"eval_steps_per_second": 26.442, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.014282163114957854, |
|
"grad_norm": 0.43927499651908875, |
|
"learning_rate": 6.187979020291583e-05, |
|
"loss": 44.0191, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.014327359833676076, |
|
"grad_norm": 0.4511764347553253, |
|
"learning_rate": 6.12878478662872e-05, |
|
"loss": 44.036, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.014372556552394297, |
|
"grad_norm": 0.4678284823894501, |
|
"learning_rate": 6.069749683460765e-05, |
|
"loss": 44.1023, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.014417753271112518, |
|
"grad_norm": 0.4449803829193115, |
|
"learning_rate": 6.010876137489584e-05, |
|
"loss": 44.0835, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.014462949989830737, |
|
"grad_norm": 0.42860502004623413, |
|
"learning_rate": 5.952166568776062e-05, |
|
"loss": 44.0725, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.014462949989830737, |
|
"eval_loss": 11.018913269042969, |
|
"eval_runtime": 176.3627, |
|
"eval_samples_per_second": 52.829, |
|
"eval_steps_per_second": 26.417, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.014508146708548959, |
|
"grad_norm": 0.47462332248687744, |
|
"learning_rate": 5.893623390640621e-05, |
|
"loss": 44.0712, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.01455334342726718, |
|
"grad_norm": 0.3999902307987213, |
|
"learning_rate": 5.835249009564012e-05, |
|
"loss": 44.0985, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.014598540145985401, |
|
"grad_norm": 0.5390244126319885, |
|
"learning_rate": 5.777045825088404e-05, |
|
"loss": 44.0947, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.014643736864703622, |
|
"grad_norm": 0.5316472053527832, |
|
"learning_rate": 5.7190162297187475e-05, |
|
"loss": 44.0887, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.014688933583421843, |
|
"grad_norm": 0.43537721037864685, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 44.1142, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.014688933583421843, |
|
"eval_loss": 11.018884658813477, |
|
"eval_runtime": 176.0785, |
|
"eval_samples_per_second": 52.914, |
|
"eval_steps_per_second": 26.46, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.014734130302140065, |
|
"grad_norm": 0.42780250310897827, |
|
"learning_rate": 5.60348734054118e-05, |
|
"loss": 44.0567, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.014779327020858286, |
|
"grad_norm": 0.418026864528656, |
|
"learning_rate": 5.545992795673408e-05, |
|
"loss": 44.0578, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.014824523739576507, |
|
"grad_norm": 0.507036030292511, |
|
"learning_rate": 5.488681337596653e-05, |
|
"loss": 44.0708, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.014869720458294728, |
|
"grad_norm": 0.4779205322265625, |
|
"learning_rate": 5.431555322160483e-05, |
|
"loss": 44.0879, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.01491491717701295, |
|
"grad_norm": 0.48253196477890015, |
|
"learning_rate": 5.37461709759165e-05, |
|
"loss": 44.005, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.01491491717701295, |
|
"eval_loss": 11.018866539001465, |
|
"eval_runtime": 176.4141, |
|
"eval_samples_per_second": 52.813, |
|
"eval_steps_per_second": 26.409, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.01496011389573117, |
|
"grad_norm": 0.503404438495636, |
|
"learning_rate": 5.317869004397544e-05, |
|
"loss": 44.0551, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.015005310614449392, |
|
"grad_norm": 0.5667140483856201, |
|
"learning_rate": 5.261313375270014e-05, |
|
"loss": 44.1005, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.015050507333167613, |
|
"grad_norm": 0.4343127906322479, |
|
"learning_rate": 5.2049525349894625e-05, |
|
"loss": 44.0367, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.015095704051885834, |
|
"grad_norm": 0.4030550420284271, |
|
"learning_rate": 5.148788800329278e-05, |
|
"loss": 44.0094, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.015140900770604053, |
|
"grad_norm": 0.7541276812553406, |
|
"learning_rate": 5.092824479960625e-05, |
|
"loss": 44.0686, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.015140900770604053, |
|
"eval_loss": 11.018802642822266, |
|
"eval_runtime": 176.1322, |
|
"eval_samples_per_second": 52.898, |
|
"eval_steps_per_second": 26.452, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.015186097489322275, |
|
"grad_norm": 0.4742172360420227, |
|
"learning_rate": 5.0370618743575026e-05, |
|
"loss": 44.0855, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.015231294208040496, |
|
"grad_norm": 0.4134741723537445, |
|
"learning_rate": 4.981503275702227e-05, |
|
"loss": 44.0928, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.015276490926758717, |
|
"grad_norm": 0.6316869258880615, |
|
"learning_rate": 4.92615096779118e-05, |
|
"loss": 44.0649, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.015321687645476938, |
|
"grad_norm": 0.4112119674682617, |
|
"learning_rate": 4.87100722594094e-05, |
|
"loss": 44.0769, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.01536688436419516, |
|
"grad_norm": 0.4423971474170685, |
|
"learning_rate": 4.8160743168947496e-05, |
|
"loss": 44.059, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.01536688436419516, |
|
"eval_loss": 11.018733024597168, |
|
"eval_runtime": 176.2897, |
|
"eval_samples_per_second": 52.85, |
|
"eval_steps_per_second": 26.428, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.01541208108291338, |
|
"grad_norm": 0.46714112162590027, |
|
"learning_rate": 4.7613544987293446e-05, |
|
"loss": 44.007, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.015457277801631602, |
|
"grad_norm": 0.449789434671402, |
|
"learning_rate": 4.706850020762126e-05, |
|
"loss": 44.0599, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.015502474520349823, |
|
"grad_norm": 0.5278406739234924, |
|
"learning_rate": 4.6525631234587034e-05, |
|
"loss": 44.0606, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.015547671239068044, |
|
"grad_norm": 0.5856757760047913, |
|
"learning_rate": 4.5984960383408005e-05, |
|
"loss": 44.0862, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.015592867957786265, |
|
"grad_norm": 0.48914504051208496, |
|
"learning_rate": 4.544650987894514e-05, |
|
"loss": 44.0642, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.015592867957786265, |
|
"eval_loss": 11.018689155578613, |
|
"eval_runtime": 176.1852, |
|
"eval_samples_per_second": 52.882, |
|
"eval_steps_per_second": 26.444, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.015638064676504485, |
|
"grad_norm": 0.5346770882606506, |
|
"learning_rate": 4.491030185478976e-05, |
|
"loss": 44.122, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.015683261395222706, |
|
"grad_norm": 0.4303387701511383, |
|
"learning_rate": 4.437635835235353e-05, |
|
"loss": 44.0754, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.015728458113940927, |
|
"grad_norm": 0.3995809555053711, |
|
"learning_rate": 4.384470131996252e-05, |
|
"loss": 44.1039, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.015773654832659148, |
|
"grad_norm": 0.44882121682167053, |
|
"learning_rate": 4.331535261195504e-05, |
|
"loss": 44.1023, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.01581885155137737, |
|
"grad_norm": 0.4910334646701813, |
|
"learning_rate": 4.278833398778306e-05, |
|
"loss": 44.0906, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01581885155137737, |
|
"eval_loss": 11.018669128417969, |
|
"eval_runtime": 176.2273, |
|
"eval_samples_per_second": 52.869, |
|
"eval_steps_per_second": 26.437, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01586404827009559, |
|
"grad_norm": 0.4974361062049866, |
|
"learning_rate": 4.2263667111118074e-05, |
|
"loss": 44.0836, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.01590924498881381, |
|
"grad_norm": 0.4839700162410736, |
|
"learning_rate": 4.174137354896039e-05, |
|
"loss": 44.0984, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.015954441707532033, |
|
"grad_norm": 0.4186987578868866, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 44.0672, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.015999638426250254, |
|
"grad_norm": 0.5234962701797485, |
|
"learning_rate": 4.0703992147497425e-05, |
|
"loss": 44.0376, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.016044835144968475, |
|
"grad_norm": 0.47532570362091064, |
|
"learning_rate": 4.0188946950878404e-05, |
|
"loss": 44.0386, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.016044835144968475, |
|
"eval_loss": 11.018640518188477, |
|
"eval_runtime": 176.1029, |
|
"eval_samples_per_second": 52.907, |
|
"eval_steps_per_second": 26.456, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.016090031863686696, |
|
"grad_norm": 0.397630900144577, |
|
"learning_rate": 3.9676360352386356e-05, |
|
"loss": 44.1375, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.016135228582404917, |
|
"grad_norm": 0.530908465385437, |
|
"learning_rate": 3.9166253422448686e-05, |
|
"loss": 44.1015, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.01618042530112314, |
|
"grad_norm": 0.41138243675231934, |
|
"learning_rate": 3.8658647129563364e-05, |
|
"loss": 44.0516, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.01622562201984136, |
|
"grad_norm": 0.5258074402809143, |
|
"learning_rate": 3.8153562339436855e-05, |
|
"loss": 44.1157, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.01627081873855958, |
|
"grad_norm": 0.3948734402656555, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 44.0478, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.01627081873855958, |
|
"eval_loss": 11.018574714660645, |
|
"eval_runtime": 176.3307, |
|
"eval_samples_per_second": 52.838, |
|
"eval_steps_per_second": 26.422, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.016316015457277802, |
|
"grad_norm": 0.47562116384506226, |
|
"learning_rate": 3.7151040211187635e-05, |
|
"loss": 44.0571, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.016361212175996023, |
|
"grad_norm": 0.439248651266098, |
|
"learning_rate": 3.665364408282305e-05, |
|
"loss": 44.0292, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.016406408894714244, |
|
"grad_norm": 0.5355764031410217, |
|
"learning_rate": 3.615885187503946e-05, |
|
"loss": 44.1601, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.016451605613432466, |
|
"grad_norm": 0.5143962502479553, |
|
"learning_rate": 3.566668392680662e-05, |
|
"loss": 44.0829, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.016496802332150687, |
|
"grad_norm": 0.5054187178611755, |
|
"learning_rate": 3.517716046922118e-05, |
|
"loss": 44.144, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.016496802332150687, |
|
"eval_loss": 11.018515586853027, |
|
"eval_runtime": 176.1346, |
|
"eval_samples_per_second": 52.897, |
|
"eval_steps_per_second": 26.451, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.016541999050868908, |
|
"grad_norm": 0.44439616799354553, |
|
"learning_rate": 3.469030162467513e-05, |
|
"loss": 44.0321, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.01658719576958713, |
|
"grad_norm": 0.5372561812400818, |
|
"learning_rate": 3.4206127406028745e-05, |
|
"loss": 44.0923, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.01663239248830535, |
|
"grad_norm": 0.48407748341560364, |
|
"learning_rate": 3.372465771578771e-05, |
|
"loss": 44.1126, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.01667758920702357, |
|
"grad_norm": 0.4682793915271759, |
|
"learning_rate": 3.32459123452852e-05, |
|
"loss": 44.0227, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.016722785925741793, |
|
"grad_norm": 0.4110027551651001, |
|
"learning_rate": 3.276991097386831e-05, |
|
"loss": 44.0354, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.016722785925741793, |
|
"eval_loss": 11.018465042114258, |
|
"eval_runtime": 176.3082, |
|
"eval_samples_per_second": 52.845, |
|
"eval_steps_per_second": 26.425, |
|
"step": 370 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 1, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3887490662400.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|