|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 20205, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014847809948032665, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 9.950507300173225e-05, |
|
"loss": 0.3437, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02969561989606533, |
|
"grad_norm": 0.10986328125, |
|
"learning_rate": 9.90101460034645e-05, |
|
"loss": 0.305, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.044543429844097995, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 9.851521900519674e-05, |
|
"loss": 0.2885, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05939123979213066, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 9.802029200692899e-05, |
|
"loss": 0.2781, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07423904974016332, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 9.752536500866123e-05, |
|
"loss": 0.2746, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08908685968819599, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 9.703043801039347e-05, |
|
"loss": 0.2658, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.10393466963622866, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 9.653551101212572e-05, |
|
"loss": 0.2709, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.11878247958426132, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 9.604058401385795e-05, |
|
"loss": 0.2634, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.133630289532294, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 9.55456570155902e-05, |
|
"loss": 0.2688, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.14847809948032664, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 9.505073001732245e-05, |
|
"loss": 0.261, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1633259094283593, |
|
"grad_norm": 0.12158203125, |
|
"learning_rate": 9.455580301905469e-05, |
|
"loss": 0.2537, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.17817371937639198, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 9.406087602078693e-05, |
|
"loss": 0.2592, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.19302152932442465, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 9.356594902251918e-05, |
|
"loss": 0.2529, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.20786933927245732, |
|
"grad_norm": 0.1064453125, |
|
"learning_rate": 9.307102202425143e-05, |
|
"loss": 0.2489, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.22271714922049, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 9.257609502598367e-05, |
|
"loss": 0.2539, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.23756495916852263, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 9.208116802771592e-05, |
|
"loss": 0.2449, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2524127691165553, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 9.158624102944816e-05, |
|
"loss": 0.2457, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.267260579064588, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 9.109131403118041e-05, |
|
"loss": 0.2424, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.28210838901262064, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 9.059638703291266e-05, |
|
"loss": 0.2485, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.2969561989606533, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 9.01014600346449e-05, |
|
"loss": 0.2534, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.311804008908686, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 8.960653303637714e-05, |
|
"loss": 0.238, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.3266518188567186, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 8.911160603810938e-05, |
|
"loss": 0.2409, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.3414996288047513, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 8.861667903984163e-05, |
|
"loss": 0.2484, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.35634743875278396, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 8.812175204157386e-05, |
|
"loss": 0.2401, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.3711952487008166, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 8.762682504330611e-05, |
|
"loss": 0.2372, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3860430586488493, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 8.713189804503836e-05, |
|
"loss": 0.2346, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.40089086859688194, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 8.663697104677061e-05, |
|
"loss": 0.2307, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.41573867854491464, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 8.614204404850284e-05, |
|
"loss": 0.2276, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.4305864884929473, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 8.56471170502351e-05, |
|
"loss": 0.2386, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.44543429844098, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 8.515219005196734e-05, |
|
"loss": 0.2313, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4602821083890126, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 8.465726305369959e-05, |
|
"loss": 0.245, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.47512991833704527, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 8.416233605543183e-05, |
|
"loss": 0.2278, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.48997772828507796, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.366740905716407e-05, |
|
"loss": 0.2346, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.5048255382331106, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 8.317248205889632e-05, |
|
"loss": 0.2364, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.5196733481811433, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 8.267755506062856e-05, |
|
"loss": 0.2238, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.534521158129176, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 8.21826280623608e-05, |
|
"loss": 0.2396, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5493689680772086, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 8.168770106409304e-05, |
|
"loss": 0.2254, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.5642167780252413, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 8.119277406582529e-05, |
|
"loss": 0.2259, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.579064587973274, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 8.069784706755754e-05, |
|
"loss": 0.2273, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.5939123979213066, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 8.020292006928979e-05, |
|
"loss": 0.2274, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6087602078693393, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 7.970799307102202e-05, |
|
"loss": 0.2322, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.623608017817372, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 7.921306607275427e-05, |
|
"loss": 0.2274, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.6384558277654045, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 7.871813907448652e-05, |
|
"loss": 0.2274, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.6533036377134372, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 7.822321207621877e-05, |
|
"loss": 0.2362, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.6681514476614699, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 7.7728285077951e-05, |
|
"loss": 0.2367, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6829992576095026, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 7.723335807968325e-05, |
|
"loss": 0.2287, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.6978470675575352, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 7.67384310814155e-05, |
|
"loss": 0.233, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.7126948775055679, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 7.624350408314775e-05, |
|
"loss": 0.2243, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.7275426874536006, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 7.574857708487998e-05, |
|
"loss": 0.2311, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.7423904974016332, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 7.525365008661222e-05, |
|
"loss": 0.229, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7572383073496659, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 7.475872308834447e-05, |
|
"loss": 0.2275, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.7720861172976986, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 7.426379609007672e-05, |
|
"loss": 0.2235, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.7869339272457313, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 7.376886909180895e-05, |
|
"loss": 0.2234, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.8017817371937639, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 7.32739420935412e-05, |
|
"loss": 0.2236, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.8166295471417966, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 7.277901509527345e-05, |
|
"loss": 0.2265, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.8314773570898293, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 7.22840880970057e-05, |
|
"loss": 0.2121, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.8463251670378619, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 7.178916109873795e-05, |
|
"loss": 0.2225, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.8611729769858946, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 7.129423410047018e-05, |
|
"loss": 0.2196, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.8760207869339273, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 7.079930710220243e-05, |
|
"loss": 0.2235, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.89086859688196, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 7.030438010393468e-05, |
|
"loss": 0.2256, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9057164068299925, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 6.980945310566693e-05, |
|
"loss": 0.2217, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.9205642167780252, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 6.931452610739916e-05, |
|
"loss": 0.223, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.9354120267260579, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 6.881959910913141e-05, |
|
"loss": 0.2165, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.9502598366740905, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 6.832467211086365e-05, |
|
"loss": 0.2222, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.9651076466221232, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 6.78297451125959e-05, |
|
"loss": 0.2243, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9799554565701559, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 6.733481811432813e-05, |
|
"loss": 0.2261, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.9948032665181886, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 6.683989111606038e-05, |
|
"loss": 0.2195, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.0096510764662212, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 6.634496411779263e-05, |
|
"loss": 0.2005, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.024498886414254, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 6.585003711952488e-05, |
|
"loss": 0.1892, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.0393466963622866, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 6.535511012125711e-05, |
|
"loss": 0.1842, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.0541945063103193, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 6.486018312298936e-05, |
|
"loss": 0.1897, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.069042316258352, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 6.436525612472161e-05, |
|
"loss": 0.1831, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.0838901262063845, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 6.387032912645386e-05, |
|
"loss": 0.1891, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.0987379361544172, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 6.33754021281861e-05, |
|
"loss": 0.1884, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.1135857461024499, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 6.288047512991834e-05, |
|
"loss": 0.1904, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.1284335560504826, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 6.238554813165059e-05, |
|
"loss": 0.1899, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.1432813659985153, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 6.189062113338282e-05, |
|
"loss": 0.1884, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.158129175946548, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 6.139569413511507e-05, |
|
"loss": 0.1905, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.1729769858945804, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 6.0900767136847315e-05, |
|
"loss": 0.183, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.1878247958426131, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 6.0405840138579564e-05, |
|
"loss": 0.1841, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.2026726057906458, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 5.9910913140311805e-05, |
|
"loss": 0.1873, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.2175204157386785, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 5.9415986142044054e-05, |
|
"loss": 0.1817, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.2323682256867112, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 5.892105914377629e-05, |
|
"loss": 0.1824, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.247216035634744, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 5.842613214550854e-05, |
|
"loss": 0.19, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.2620638455827766, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 5.7931205147240786e-05, |
|
"loss": 0.1859, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.2769116555308093, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 5.7436278148973035e-05, |
|
"loss": 0.1877, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.2917594654788418, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 5.694135115070527e-05, |
|
"loss": 0.1813, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.3066072754268745, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 5.644642415243752e-05, |
|
"loss": 0.186, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.3214550853749072, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 5.595149715416976e-05, |
|
"loss": 0.1861, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.3363028953229399, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 5.545657015590201e-05, |
|
"loss": 0.1858, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.3511507052709726, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 5.4961643157634244e-05, |
|
"loss": 0.1827, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.365998515219005, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 5.446671615936649e-05, |
|
"loss": 0.1778, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.3808463251670378, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 5.397178916109874e-05, |
|
"loss": 0.1891, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.3956941351150705, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 5.347686216283099e-05, |
|
"loss": 0.1842, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.4105419450631032, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 5.298193516456323e-05, |
|
"loss": 0.187, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.4253897550111359, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 5.248700816629547e-05, |
|
"loss": 0.1897, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.4402375649591685, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 5.1992081168027716e-05, |
|
"loss": 0.1873, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.4550853749072012, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 5.1497154169759965e-05, |
|
"loss": 0.188, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.469933184855234, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 5.100222717149221e-05, |
|
"loss": 0.1824, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.4847809948032666, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 5.050730017322445e-05, |
|
"loss": 0.1827, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.4996288047512991, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 5.00123731749567e-05, |
|
"loss": 0.185, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 1.5144766146993318, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 4.951744617668894e-05, |
|
"loss": 0.1813, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.5293244246473645, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 4.902251917842118e-05, |
|
"loss": 0.1796, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 1.5441722345953972, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.852759218015343e-05, |
|
"loss": 0.1887, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 1.5590200445434297, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 4.803266518188567e-05, |
|
"loss": 0.1845, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.5738678544914624, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.753773818361792e-05, |
|
"loss": 0.1784, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.588715664439495, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 4.704281118535016e-05, |
|
"loss": 0.1842, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 1.6035634743875278, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 4.654788418708241e-05, |
|
"loss": 0.188, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.6184112843355605, |
|
"grad_norm": 0.25, |
|
"learning_rate": 4.605295718881465e-05, |
|
"loss": 0.1844, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 1.6332590942835932, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 4.5558030190546894e-05, |
|
"loss": 0.1802, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.6481069042316259, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 4.506310319227914e-05, |
|
"loss": 0.1881, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.6629547141796586, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 4.4568176194011384e-05, |
|
"loss": 0.185, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.6778025241276913, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.407324919574363e-05, |
|
"loss": 0.1818, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.692650334075724, |
|
"grad_norm": 0.25, |
|
"learning_rate": 4.3578322197475875e-05, |
|
"loss": 0.182, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.7074981440237567, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 4.3083395199208124e-05, |
|
"loss": 0.184, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.7223459539717891, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 4.2588468200940365e-05, |
|
"loss": 0.1923, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 1.7371937639198218, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 4.209354120267261e-05, |
|
"loss": 0.1812, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 1.7520415738678545, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 4.159861420440485e-05, |
|
"loss": 0.1756, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 1.766889383815887, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 4.11036872061371e-05, |
|
"loss": 0.1878, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 1.7817371937639197, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 4.060876020786934e-05, |
|
"loss": 0.18, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.7965850037119524, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.011383320960159e-05, |
|
"loss": 0.1802, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 1.811432813659985, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 3.961890621133383e-05, |
|
"loss": 0.1802, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.8262806236080178, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 3.912397921306608e-05, |
|
"loss": 0.1842, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.8411284335560505, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 3.8629052214798314e-05, |
|
"loss": 0.1855, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.8559762435040832, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 3.813412521653056e-05, |
|
"loss": 0.1846, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.8708240534521159, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 3.7639198218262804e-05, |
|
"loss": 0.1829, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.8856718634001486, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 3.714427121999505e-05, |
|
"loss": 0.1845, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.9005196733481813, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 3.66493442217273e-05, |
|
"loss": 0.1849, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.9153674832962138, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 3.6154417223459543e-05, |
|
"loss": 0.1849, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.9302152932442465, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 3.5659490225191785e-05, |
|
"loss": 0.1782, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.9450631031922792, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 3.516456322692403e-05, |
|
"loss": 0.1871, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.9599109131403119, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 3.4669636228656276e-05, |
|
"loss": 0.1775, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.9747587230883443, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 3.417470923038852e-05, |
|
"loss": 0.1815, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.989606533036377, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 3.3679782232120766e-05, |
|
"loss": 0.1831, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 2.0044543429844097, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 3.318485523385301e-05, |
|
"loss": 0.1752, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.0193021529324424, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 3.268992823558526e-05, |
|
"loss": 0.172, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 2.034149962880475, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 3.21950012373175e-05, |
|
"loss": 0.1695, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 2.048997772828508, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.170007423904974e-05, |
|
"loss": 0.169, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 2.0638455827765405, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 3.120514724078198e-05, |
|
"loss": 0.1688, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 2.078693392724573, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 3.071022024251423e-05, |
|
"loss": 0.1696, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.093541202672606, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.0215293244246473e-05, |
|
"loss": 0.1683, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 2.1083890126206386, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 2.972036624597872e-05, |
|
"loss": 0.1647, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 2.1232368225686713, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.9225439247710963e-05, |
|
"loss": 0.1698, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 2.138084632516704, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 2.873051224944321e-05, |
|
"loss": 0.1648, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 2.1529324424647363, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 2.823558525117545e-05, |
|
"loss": 0.1676, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.167780252412769, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.77406582529077e-05, |
|
"loss": 0.1676, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 2.1826280623608016, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 2.7245731254639944e-05, |
|
"loss": 0.168, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 2.1974758723088343, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 2.6750804256372186e-05, |
|
"loss": 0.1662, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 2.212323682256867, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 2.6255877258104435e-05, |
|
"loss": 0.163, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 2.2271714922048997, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 2.5760950259836673e-05, |
|
"loss": 0.1658, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.2420193021529324, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 2.5266023261568922e-05, |
|
"loss": 0.1645, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 2.256867112100965, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.4771096263301164e-05, |
|
"loss": 0.1563, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 2.271714922048998, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.427616926503341e-05, |
|
"loss": 0.1677, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 2.2865627319970305, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 2.378124226676565e-05, |
|
"loss": 0.1635, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 2.3014105419450632, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 2.3286315268497896e-05, |
|
"loss": 0.1693, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.316258351893096, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 2.2791388270230145e-05, |
|
"loss": 0.1682, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 2.3311061618411286, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 2.2296461271962387e-05, |
|
"loss": 0.1637, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 2.345953971789161, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 2.1801534273694632e-05, |
|
"loss": 0.1702, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 2.3608017817371936, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.1306607275426877e-05, |
|
"loss": 0.1648, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 2.3756495916852263, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 2.0811680277159122e-05, |
|
"loss": 0.1586, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.390497401633259, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 2.0316753278891364e-05, |
|
"loss": 0.1628, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 2.4053452115812917, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 1.982182628062361e-05, |
|
"loss": 0.1621, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 2.4201930215293244, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.9326899282355855e-05, |
|
"loss": 0.1628, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 2.435040831477357, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.8831972284088097e-05, |
|
"loss": 0.1642, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 2.4498886414253898, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.8337045285820342e-05, |
|
"loss": 0.1659, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.4647364513734225, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 1.7842118287552587e-05, |
|
"loss": 0.1683, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 2.479584261321455, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 1.7347191289284832e-05, |
|
"loss": 0.1655, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 2.494432071269488, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 1.6852264291017074e-05, |
|
"loss": 0.1655, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 2.5092798812175205, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 1.635733729274932e-05, |
|
"loss": 0.164, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 2.5241276911655532, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 1.5862410294481565e-05, |
|
"loss": 0.1745, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.538975501113586, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 1.5367483296213807e-05, |
|
"loss": 0.169, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 2.5538233110616186, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.4872556297946052e-05, |
|
"loss": 0.1695, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 2.5686711210096513, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.4377629299678297e-05, |
|
"loss": 0.1657, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 2.5835189309576836, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.3882702301410544e-05, |
|
"loss": 0.1615, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 2.5983667409057163, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 1.3387775303142788e-05, |
|
"loss": 0.1675, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.613214550853749, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.2892848304875033e-05, |
|
"loss": 0.1646, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 2.6280623608017817, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 1.2397921306607275e-05, |
|
"loss": 0.1631, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 2.6429101707498144, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.190299430833952e-05, |
|
"loss": 0.1621, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 2.657757980697847, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.1408067310071765e-05, |
|
"loss": 0.1682, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 2.6726057906458798, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.091314031180401e-05, |
|
"loss": 0.1664, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.6874536005939125, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 1.0418213313536254e-05, |
|
"loss": 0.1686, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 2.702301410541945, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 9.9232863152685e-06, |
|
"loss": 0.1653, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 2.717149220489978, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 9.428359317000743e-06, |
|
"loss": 0.1704, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 2.73199703043801, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 8.933432318732986e-06, |
|
"loss": 0.1645, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 2.746844840386043, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 8.438505320465232e-06, |
|
"loss": 0.1647, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.7616926503340755, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.943578322197475e-06, |
|
"loss": 0.1721, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 2.776540460282108, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 7.44865132392972e-06, |
|
"loss": 0.1652, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 2.791388270230141, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 6.953724325661966e-06, |
|
"loss": 0.1634, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 2.8062360801781736, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 6.45879732739421e-06, |
|
"loss": 0.1668, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 2.8210838901262063, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 5.9638703291264544e-06, |
|
"loss": 0.1642, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.835931700074239, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 5.468943330858699e-06, |
|
"loss": 0.1594, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 2.8507795100222717, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 4.974016332590943e-06, |
|
"loss": 0.1648, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 2.8656273199703044, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 4.479089334323188e-06, |
|
"loss": 0.1637, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 2.880475129918337, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 3.984162336055432e-06, |
|
"loss": 0.1728, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 2.89532293986637, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 3.489235337787677e-06, |
|
"loss": 0.1646, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.9101707498144025, |
|
"grad_norm": 0.25, |
|
"learning_rate": 2.9943083395199213e-06, |
|
"loss": 0.1692, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 2.925018559762435, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.4993813412521652e-06, |
|
"loss": 0.1708, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 2.939866369710468, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 2.00445434298441e-06, |
|
"loss": 0.1689, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 2.9547141796585006, |
|
"grad_norm": 0.25, |
|
"learning_rate": 1.5095273447166542e-06, |
|
"loss": 0.1622, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 2.9695619896065333, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.0146003464488989e-06, |
|
"loss": 0.1649, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.984409799554566, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 5.196733481811434e-07, |
|
"loss": 0.163, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 2.9992576095025982, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 2.4746349913387775e-08, |
|
"loss": 0.1617, |
|
"step": 20200 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 20205, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1772177365591194e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|