{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 20205, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014847809948032665, "grad_norm": 0.14453125, "learning_rate": 9.950507300173225e-05, "loss": 0.3437, "step": 100 }, { "epoch": 0.02969561989606533, "grad_norm": 0.10986328125, "learning_rate": 9.90101460034645e-05, "loss": 0.305, "step": 200 }, { "epoch": 0.044543429844097995, "grad_norm": 0.1630859375, "learning_rate": 9.851521900519674e-05, "loss": 0.2885, "step": 300 }, { "epoch": 0.05939123979213066, "grad_norm": 0.162109375, "learning_rate": 9.802029200692899e-05, "loss": 0.2781, "step": 400 }, { "epoch": 0.07423904974016332, "grad_norm": 0.193359375, "learning_rate": 9.752536500866123e-05, "loss": 0.2746, "step": 500 }, { "epoch": 0.08908685968819599, "grad_norm": 0.1357421875, "learning_rate": 9.703043801039347e-05, "loss": 0.2658, "step": 600 }, { "epoch": 0.10393466963622866, "grad_norm": 0.1689453125, "learning_rate": 9.653551101212572e-05, "loss": 0.2709, "step": 700 }, { "epoch": 0.11878247958426132, "grad_norm": 0.17578125, "learning_rate": 9.604058401385795e-05, "loss": 0.2634, "step": 800 }, { "epoch": 0.133630289532294, "grad_norm": 0.15234375, "learning_rate": 9.55456570155902e-05, "loss": 0.2688, "step": 900 }, { "epoch": 0.14847809948032664, "grad_norm": 0.146484375, "learning_rate": 9.505073001732245e-05, "loss": 0.261, "step": 1000 }, { "epoch": 0.1633259094283593, "grad_norm": 0.12158203125, "learning_rate": 9.455580301905469e-05, "loss": 0.2537, "step": 1100 }, { "epoch": 0.17817371937639198, "grad_norm": 0.1845703125, "learning_rate": 9.406087602078693e-05, "loss": 0.2592, "step": 1200 }, { "epoch": 0.19302152932442465, "grad_norm": 0.234375, "learning_rate": 9.356594902251918e-05, "loss": 0.2529, "step": 1300 }, { "epoch": 0.20786933927245732, "grad_norm": 0.1064453125, "learning_rate": 9.307102202425143e-05, "loss": 0.2489, "step": 1400 }, { "epoch": 0.22271714922049, "grad_norm": 0.166015625, "learning_rate": 9.257609502598367e-05, "loss": 0.2539, "step": 1500 }, { "epoch": 0.23756495916852263, "grad_norm": 0.1669921875, "learning_rate": 9.208116802771592e-05, "loss": 0.2449, "step": 1600 }, { "epoch": 0.2524127691165553, "grad_norm": 0.2109375, "learning_rate": 9.158624102944816e-05, "loss": 0.2457, "step": 1700 }, { "epoch": 0.267260579064588, "grad_norm": 0.1904296875, "learning_rate": 9.109131403118041e-05, "loss": 0.2424, "step": 1800 }, { "epoch": 0.28210838901262064, "grad_norm": 0.23828125, "learning_rate": 9.059638703291266e-05, "loss": 0.2485, "step": 1900 }, { "epoch": 0.2969561989606533, "grad_norm": 0.203125, "learning_rate": 9.01014600346449e-05, "loss": 0.2534, "step": 2000 }, { "epoch": 0.311804008908686, "grad_norm": 0.17578125, "learning_rate": 8.960653303637714e-05, "loss": 0.238, "step": 2100 }, { "epoch": 0.3266518188567186, "grad_norm": 0.158203125, "learning_rate": 8.911160603810938e-05, "loss": 0.2409, "step": 2200 }, { "epoch": 0.3414996288047513, "grad_norm": 0.177734375, "learning_rate": 8.861667903984163e-05, "loss": 0.2484, "step": 2300 }, { "epoch": 0.35634743875278396, "grad_norm": 0.1689453125, "learning_rate": 8.812175204157386e-05, "loss": 0.2401, "step": 2400 }, { "epoch": 0.3711952487008166, "grad_norm": 0.15625, "learning_rate": 8.762682504330611e-05, "loss": 0.2372, "step": 2500 }, { "epoch": 0.3860430586488493, "grad_norm": 0.193359375, "learning_rate": 8.713189804503836e-05, "loss": 0.2346, "step": 2600 }, { "epoch": 0.40089086859688194, "grad_norm": 0.19140625, "learning_rate": 8.663697104677061e-05, "loss": 0.2307, "step": 2700 }, { "epoch": 0.41573867854491464, "grad_norm": 0.15234375, "learning_rate": 8.614204404850284e-05, "loss": 0.2276, "step": 2800 }, { "epoch": 0.4305864884929473, "grad_norm": 0.1513671875, "learning_rate": 8.56471170502351e-05, "loss": 0.2386, "step": 2900 }, { "epoch": 0.44543429844098, "grad_norm": 0.189453125, "learning_rate": 8.515219005196734e-05, "loss": 0.2313, "step": 3000 }, { "epoch": 0.4602821083890126, "grad_norm": 0.208984375, "learning_rate": 8.465726305369959e-05, "loss": 0.245, "step": 3100 }, { "epoch": 0.47512991833704527, "grad_norm": 0.1943359375, "learning_rate": 8.416233605543183e-05, "loss": 0.2278, "step": 3200 }, { "epoch": 0.48997772828507796, "grad_norm": 0.21484375, "learning_rate": 8.366740905716407e-05, "loss": 0.2346, "step": 3300 }, { "epoch": 0.5048255382331106, "grad_norm": 0.173828125, "learning_rate": 8.317248205889632e-05, "loss": 0.2364, "step": 3400 }, { "epoch": 0.5196733481811433, "grad_norm": 0.1611328125, "learning_rate": 8.267755506062856e-05, "loss": 0.2238, "step": 3500 }, { "epoch": 0.534521158129176, "grad_norm": 0.154296875, "learning_rate": 8.21826280623608e-05, "loss": 0.2396, "step": 3600 }, { "epoch": 0.5493689680772086, "grad_norm": 0.2255859375, "learning_rate": 8.168770106409304e-05, "loss": 0.2254, "step": 3700 }, { "epoch": 0.5642167780252413, "grad_norm": 0.18359375, "learning_rate": 8.119277406582529e-05, "loss": 0.2259, "step": 3800 }, { "epoch": 0.579064587973274, "grad_norm": 0.158203125, "learning_rate": 8.069784706755754e-05, "loss": 0.2273, "step": 3900 }, { "epoch": 0.5939123979213066, "grad_norm": 0.1669921875, "learning_rate": 8.020292006928979e-05, "loss": 0.2274, "step": 4000 }, { "epoch": 0.6087602078693393, "grad_norm": 0.166015625, "learning_rate": 7.970799307102202e-05, "loss": 0.2322, "step": 4100 }, { "epoch": 0.623608017817372, "grad_norm": 0.2138671875, "learning_rate": 7.921306607275427e-05, "loss": 0.2274, "step": 4200 }, { "epoch": 0.6384558277654045, "grad_norm": 0.1416015625, "learning_rate": 7.871813907448652e-05, "loss": 0.2274, "step": 4300 }, { "epoch": 0.6533036377134372, "grad_norm": 0.2060546875, "learning_rate": 7.822321207621877e-05, "loss": 0.2362, "step": 4400 }, { "epoch": 0.6681514476614699, "grad_norm": 0.1572265625, "learning_rate": 7.7728285077951e-05, "loss": 0.2367, "step": 4500 }, { "epoch": 0.6829992576095026, "grad_norm": 0.1650390625, "learning_rate": 7.723335807968325e-05, "loss": 0.2287, "step": 4600 }, { "epoch": 0.6978470675575352, "grad_norm": 0.1259765625, "learning_rate": 7.67384310814155e-05, "loss": 0.233, "step": 4700 }, { "epoch": 0.7126948775055679, "grad_norm": 0.1455078125, "learning_rate": 7.624350408314775e-05, "loss": 0.2243, "step": 4800 }, { "epoch": 0.7275426874536006, "grad_norm": 0.14453125, "learning_rate": 7.574857708487998e-05, "loss": 0.2311, "step": 4900 }, { "epoch": 0.7423904974016332, "grad_norm": 0.251953125, "learning_rate": 7.525365008661222e-05, "loss": 0.229, "step": 5000 }, { "epoch": 0.7572383073496659, "grad_norm": 0.234375, "learning_rate": 7.475872308834447e-05, "loss": 0.2275, "step": 5100 }, { "epoch": 0.7720861172976986, "grad_norm": 0.28125, "learning_rate": 7.426379609007672e-05, "loss": 0.2235, "step": 5200 }, { "epoch": 0.7869339272457313, "grad_norm": 0.2431640625, "learning_rate": 7.376886909180895e-05, "loss": 0.2234, "step": 5300 }, { "epoch": 0.8017817371937639, "grad_norm": 0.2119140625, "learning_rate": 7.32739420935412e-05, "loss": 0.2236, "step": 5400 }, { "epoch": 0.8166295471417966, "grad_norm": 0.212890625, "learning_rate": 7.277901509527345e-05, "loss": 0.2265, "step": 5500 }, { "epoch": 0.8314773570898293, "grad_norm": 0.236328125, "learning_rate": 7.22840880970057e-05, "loss": 0.2121, "step": 5600 }, { "epoch": 0.8463251670378619, "grad_norm": 0.1953125, "learning_rate": 7.178916109873795e-05, "loss": 0.2225, "step": 5700 }, { "epoch": 0.8611729769858946, "grad_norm": 0.2119140625, "learning_rate": 7.129423410047018e-05, "loss": 0.2196, "step": 5800 }, { "epoch": 0.8760207869339273, "grad_norm": 0.171875, "learning_rate": 7.079930710220243e-05, "loss": 0.2235, "step": 5900 }, { "epoch": 0.89086859688196, "grad_norm": 0.2333984375, "learning_rate": 7.030438010393468e-05, "loss": 0.2256, "step": 6000 }, { "epoch": 0.9057164068299925, "grad_norm": 0.279296875, "learning_rate": 6.980945310566693e-05, "loss": 0.2217, "step": 6100 }, { "epoch": 0.9205642167780252, "grad_norm": 0.16015625, "learning_rate": 6.931452610739916e-05, "loss": 0.223, "step": 6200 }, { "epoch": 0.9354120267260579, "grad_norm": 0.287109375, "learning_rate": 6.881959910913141e-05, "loss": 0.2165, "step": 6300 }, { "epoch": 0.9502598366740905, "grad_norm": 0.255859375, "learning_rate": 6.832467211086365e-05, "loss": 0.2222, "step": 6400 }, { "epoch": 0.9651076466221232, "grad_norm": 0.2080078125, "learning_rate": 6.78297451125959e-05, "loss": 0.2243, "step": 6500 }, { "epoch": 0.9799554565701559, "grad_norm": 0.291015625, "learning_rate": 6.733481811432813e-05, "loss": 0.2261, "step": 6600 }, { "epoch": 0.9948032665181886, "grad_norm": 0.26171875, "learning_rate": 6.683989111606038e-05, "loss": 0.2195, "step": 6700 }, { "epoch": 1.0096510764662212, "grad_norm": 0.2138671875, "learning_rate": 6.634496411779263e-05, "loss": 0.2005, "step": 6800 }, { "epoch": 1.024498886414254, "grad_norm": 0.21875, "learning_rate": 6.585003711952488e-05, "loss": 0.1892, "step": 6900 }, { "epoch": 1.0393466963622866, "grad_norm": 0.21484375, "learning_rate": 6.535511012125711e-05, "loss": 0.1842, "step": 7000 }, { "epoch": 1.0541945063103193, "grad_norm": 0.1982421875, "learning_rate": 6.486018312298936e-05, "loss": 0.1897, "step": 7100 }, { "epoch": 1.069042316258352, "grad_norm": 0.1884765625, "learning_rate": 6.436525612472161e-05, "loss": 0.1831, "step": 7200 }, { "epoch": 1.0838901262063845, "grad_norm": 0.2578125, "learning_rate": 6.387032912645386e-05, "loss": 0.1891, "step": 7300 }, { "epoch": 1.0987379361544172, "grad_norm": 0.17578125, "learning_rate": 6.33754021281861e-05, "loss": 0.1884, "step": 7400 }, { "epoch": 1.1135857461024499, "grad_norm": 0.2451171875, "learning_rate": 6.288047512991834e-05, "loss": 0.1904, "step": 7500 }, { "epoch": 1.1284335560504826, "grad_norm": 0.267578125, "learning_rate": 6.238554813165059e-05, "loss": 0.1899, "step": 7600 }, { "epoch": 1.1432813659985153, "grad_norm": 0.30859375, "learning_rate": 6.189062113338282e-05, "loss": 0.1884, "step": 7700 }, { "epoch": 1.158129175946548, "grad_norm": 0.255859375, "learning_rate": 6.139569413511507e-05, "loss": 0.1905, "step": 7800 }, { "epoch": 1.1729769858945804, "grad_norm": 0.2373046875, "learning_rate": 6.0900767136847315e-05, "loss": 0.183, "step": 7900 }, { "epoch": 1.1878247958426131, "grad_norm": 0.2333984375, "learning_rate": 6.0405840138579564e-05, "loss": 0.1841, "step": 8000 }, { "epoch": 1.2026726057906458, "grad_norm": 0.2490234375, "learning_rate": 5.9910913140311805e-05, "loss": 0.1873, "step": 8100 }, { "epoch": 1.2175204157386785, "grad_norm": 0.267578125, "learning_rate": 5.9415986142044054e-05, "loss": 0.1817, "step": 8200 }, { "epoch": 1.2323682256867112, "grad_norm": 0.28515625, "learning_rate": 5.892105914377629e-05, "loss": 0.1824, "step": 8300 }, { "epoch": 1.247216035634744, "grad_norm": 0.26953125, "learning_rate": 5.842613214550854e-05, "loss": 0.19, "step": 8400 }, { "epoch": 1.2620638455827766, "grad_norm": 0.236328125, "learning_rate": 5.7931205147240786e-05, "loss": 0.1859, "step": 8500 }, { "epoch": 1.2769116555308093, "grad_norm": 0.255859375, "learning_rate": 5.7436278148973035e-05, "loss": 0.1877, "step": 8600 }, { "epoch": 1.2917594654788418, "grad_norm": 0.1826171875, "learning_rate": 5.694135115070527e-05, "loss": 0.1813, "step": 8700 }, { "epoch": 1.3066072754268745, "grad_norm": 0.365234375, "learning_rate": 5.644642415243752e-05, "loss": 0.186, "step": 8800 }, { "epoch": 1.3214550853749072, "grad_norm": 0.2041015625, "learning_rate": 5.595149715416976e-05, "loss": 0.1861, "step": 8900 }, { "epoch": 1.3363028953229399, "grad_norm": 0.248046875, "learning_rate": 5.545657015590201e-05, "loss": 0.1858, "step": 9000 }, { "epoch": 1.3511507052709726, "grad_norm": 0.2021484375, "learning_rate": 5.4961643157634244e-05, "loss": 0.1827, "step": 9100 }, { "epoch": 1.365998515219005, "grad_norm": 0.2080078125, "learning_rate": 5.446671615936649e-05, "loss": 0.1778, "step": 9200 }, { "epoch": 1.3808463251670378, "grad_norm": 0.26171875, "learning_rate": 5.397178916109874e-05, "loss": 0.1891, "step": 9300 }, { "epoch": 1.3956941351150705, "grad_norm": 0.294921875, "learning_rate": 5.347686216283099e-05, "loss": 0.1842, "step": 9400 }, { "epoch": 1.4105419450631032, "grad_norm": 0.2392578125, "learning_rate": 5.298193516456323e-05, "loss": 0.187, "step": 9500 }, { "epoch": 1.4253897550111359, "grad_norm": 0.2216796875, "learning_rate": 5.248700816629547e-05, "loss": 0.1897, "step": 9600 }, { "epoch": 1.4402375649591685, "grad_norm": 0.322265625, "learning_rate": 5.1992081168027716e-05, "loss": 0.1873, "step": 9700 }, { "epoch": 1.4550853749072012, "grad_norm": 0.291015625, "learning_rate": 5.1497154169759965e-05, "loss": 0.188, "step": 9800 }, { "epoch": 1.469933184855234, "grad_norm": 0.2119140625, "learning_rate": 5.100222717149221e-05, "loss": 0.1824, "step": 9900 }, { "epoch": 1.4847809948032666, "grad_norm": 0.185546875, "learning_rate": 5.050730017322445e-05, "loss": 0.1827, "step": 10000 }, { "epoch": 1.4996288047512991, "grad_norm": 0.33203125, "learning_rate": 5.00123731749567e-05, "loss": 0.185, "step": 10100 }, { "epoch": 1.5144766146993318, "grad_norm": 0.1884765625, "learning_rate": 4.951744617668894e-05, "loss": 0.1813, "step": 10200 }, { "epoch": 1.5293244246473645, "grad_norm": 0.2294921875, "learning_rate": 4.902251917842118e-05, "loss": 0.1796, "step": 10300 }, { "epoch": 1.5441722345953972, "grad_norm": 0.296875, "learning_rate": 4.852759218015343e-05, "loss": 0.1887, "step": 10400 }, { "epoch": 1.5590200445434297, "grad_norm": 0.166015625, "learning_rate": 4.803266518188567e-05, "loss": 0.1845, "step": 10500 }, { "epoch": 1.5738678544914624, "grad_norm": 0.326171875, "learning_rate": 4.753773818361792e-05, "loss": 0.1784, "step": 10600 }, { "epoch": 1.588715664439495, "grad_norm": 0.1982421875, "learning_rate": 4.704281118535016e-05, "loss": 0.1842, "step": 10700 }, { "epoch": 1.6035634743875278, "grad_norm": 0.314453125, "learning_rate": 4.654788418708241e-05, "loss": 0.188, "step": 10800 }, { "epoch": 1.6184112843355605, "grad_norm": 0.25, "learning_rate": 4.605295718881465e-05, "loss": 0.1844, "step": 10900 }, { "epoch": 1.6332590942835932, "grad_norm": 0.380859375, "learning_rate": 4.5558030190546894e-05, "loss": 0.1802, "step": 11000 }, { "epoch": 1.6481069042316259, "grad_norm": 0.2314453125, "learning_rate": 4.506310319227914e-05, "loss": 0.1881, "step": 11100 }, { "epoch": 1.6629547141796586, "grad_norm": 0.37109375, "learning_rate": 4.4568176194011384e-05, "loss": 0.185, "step": 11200 }, { "epoch": 1.6778025241276913, "grad_norm": 0.283203125, "learning_rate": 4.407324919574363e-05, "loss": 0.1818, "step": 11300 }, { "epoch": 1.692650334075724, "grad_norm": 0.25, "learning_rate": 4.3578322197475875e-05, "loss": 0.182, "step": 11400 }, { "epoch": 1.7074981440237567, "grad_norm": 0.25390625, "learning_rate": 4.3083395199208124e-05, "loss": 0.184, "step": 11500 }, { "epoch": 1.7223459539717891, "grad_norm": 0.28515625, "learning_rate": 4.2588468200940365e-05, "loss": 0.1923, "step": 11600 }, { "epoch": 1.7371937639198218, "grad_norm": 0.2236328125, "learning_rate": 4.209354120267261e-05, "loss": 0.1812, "step": 11700 }, { "epoch": 1.7520415738678545, "grad_norm": 0.1865234375, "learning_rate": 4.159861420440485e-05, "loss": 0.1756, "step": 11800 }, { "epoch": 1.766889383815887, "grad_norm": 0.31640625, "learning_rate": 4.11036872061371e-05, "loss": 0.1878, "step": 11900 }, { "epoch": 1.7817371937639197, "grad_norm": 0.32421875, "learning_rate": 4.060876020786934e-05, "loss": 0.18, "step": 12000 }, { "epoch": 1.7965850037119524, "grad_norm": 0.265625, "learning_rate": 4.011383320960159e-05, "loss": 0.1802, "step": 12100 }, { "epoch": 1.811432813659985, "grad_norm": 0.234375, "learning_rate": 3.961890621133383e-05, "loss": 0.1802, "step": 12200 }, { "epoch": 1.8262806236080178, "grad_norm": 0.31640625, "learning_rate": 3.912397921306608e-05, "loss": 0.1842, "step": 12300 }, { "epoch": 1.8411284335560505, "grad_norm": 0.2109375, "learning_rate": 3.8629052214798314e-05, "loss": 0.1855, "step": 12400 }, { "epoch": 1.8559762435040832, "grad_norm": 0.271484375, "learning_rate": 3.813412521653056e-05, "loss": 0.1846, "step": 12500 }, { "epoch": 1.8708240534521159, "grad_norm": 0.275390625, "learning_rate": 3.7639198218262804e-05, "loss": 0.1829, "step": 12600 }, { "epoch": 1.8856718634001486, "grad_norm": 0.2734375, "learning_rate": 3.714427121999505e-05, "loss": 0.1845, "step": 12700 }, { "epoch": 1.9005196733481813, "grad_norm": 0.28125, "learning_rate": 3.66493442217273e-05, "loss": 0.1849, "step": 12800 }, { "epoch": 1.9153674832962138, "grad_norm": 0.2353515625, "learning_rate": 3.6154417223459543e-05, "loss": 0.1849, "step": 12900 }, { "epoch": 1.9302152932442465, "grad_norm": 0.33203125, "learning_rate": 3.5659490225191785e-05, "loss": 0.1782, "step": 13000 }, { "epoch": 1.9450631031922792, "grad_norm": 0.29296875, "learning_rate": 3.516456322692403e-05, "loss": 0.1871, "step": 13100 }, { "epoch": 1.9599109131403119, "grad_norm": 0.23828125, "learning_rate": 3.4669636228656276e-05, "loss": 0.1775, "step": 13200 }, { "epoch": 1.9747587230883443, "grad_norm": 0.30078125, "learning_rate": 3.417470923038852e-05, "loss": 0.1815, "step": 13300 }, { "epoch": 1.989606533036377, "grad_norm": 0.30078125, "learning_rate": 3.3679782232120766e-05, "loss": 0.1831, "step": 13400 }, { "epoch": 2.0044543429844097, "grad_norm": 0.2373046875, "learning_rate": 3.318485523385301e-05, "loss": 0.1752, "step": 13500 }, { "epoch": 2.0193021529324424, "grad_norm": 0.298828125, "learning_rate": 3.268992823558526e-05, "loss": 0.172, "step": 13600 }, { "epoch": 2.034149962880475, "grad_norm": 0.263671875, "learning_rate": 3.21950012373175e-05, "loss": 0.1695, "step": 13700 }, { "epoch": 2.048997772828508, "grad_norm": 0.255859375, "learning_rate": 3.170007423904974e-05, "loss": 0.169, "step": 13800 }, { "epoch": 2.0638455827765405, "grad_norm": 0.2333984375, "learning_rate": 3.120514724078198e-05, "loss": 0.1688, "step": 13900 }, { "epoch": 2.078693392724573, "grad_norm": 0.224609375, "learning_rate": 3.071022024251423e-05, "loss": 0.1696, "step": 14000 }, { "epoch": 2.093541202672606, "grad_norm": 0.34375, "learning_rate": 3.0215293244246473e-05, "loss": 0.1683, "step": 14100 }, { "epoch": 2.1083890126206386, "grad_norm": 0.365234375, "learning_rate": 2.972036624597872e-05, "loss": 0.1647, "step": 14200 }, { "epoch": 2.1232368225686713, "grad_norm": 0.310546875, "learning_rate": 2.9225439247710963e-05, "loss": 0.1698, "step": 14300 }, { "epoch": 2.138084632516704, "grad_norm": 0.20703125, "learning_rate": 2.873051224944321e-05, "loss": 0.1648, "step": 14400 }, { "epoch": 2.1529324424647363, "grad_norm": 0.224609375, "learning_rate": 2.823558525117545e-05, "loss": 0.1676, "step": 14500 }, { "epoch": 2.167780252412769, "grad_norm": 0.283203125, "learning_rate": 2.77406582529077e-05, "loss": 0.1676, "step": 14600 }, { "epoch": 2.1826280623608016, "grad_norm": 0.240234375, "learning_rate": 2.7245731254639944e-05, "loss": 0.168, "step": 14700 }, { "epoch": 2.1974758723088343, "grad_norm": 0.2734375, "learning_rate": 2.6750804256372186e-05, "loss": 0.1662, "step": 14800 }, { "epoch": 2.212323682256867, "grad_norm": 0.2119140625, "learning_rate": 2.6255877258104435e-05, "loss": 0.163, "step": 14900 }, { "epoch": 2.2271714922048997, "grad_norm": 0.32421875, "learning_rate": 2.5760950259836673e-05, "loss": 0.1658, "step": 15000 }, { "epoch": 2.2420193021529324, "grad_norm": 0.2734375, "learning_rate": 2.5266023261568922e-05, "loss": 0.1645, "step": 15100 }, { "epoch": 2.256867112100965, "grad_norm": 0.291015625, "learning_rate": 2.4771096263301164e-05, "loss": 0.1563, "step": 15200 }, { "epoch": 2.271714922048998, "grad_norm": 0.34375, "learning_rate": 2.427616926503341e-05, "loss": 0.1677, "step": 15300 }, { "epoch": 2.2865627319970305, "grad_norm": 0.265625, "learning_rate": 2.378124226676565e-05, "loss": 0.1635, "step": 15400 }, { "epoch": 2.3014105419450632, "grad_norm": 0.32421875, "learning_rate": 2.3286315268497896e-05, "loss": 0.1693, "step": 15500 }, { "epoch": 2.316258351893096, "grad_norm": 0.2470703125, "learning_rate": 2.2791388270230145e-05, "loss": 0.1682, "step": 15600 }, { "epoch": 2.3311061618411286, "grad_norm": 0.271484375, "learning_rate": 2.2296461271962387e-05, "loss": 0.1637, "step": 15700 }, { "epoch": 2.345953971789161, "grad_norm": 0.337890625, "learning_rate": 2.1801534273694632e-05, "loss": 0.1702, "step": 15800 }, { "epoch": 2.3608017817371936, "grad_norm": 0.3125, "learning_rate": 2.1306607275426877e-05, "loss": 0.1648, "step": 15900 }, { "epoch": 2.3756495916852263, "grad_norm": 0.275390625, "learning_rate": 2.0811680277159122e-05, "loss": 0.1586, "step": 16000 }, { "epoch": 2.390497401633259, "grad_norm": 0.263671875, "learning_rate": 2.0316753278891364e-05, "loss": 0.1628, "step": 16100 }, { "epoch": 2.4053452115812917, "grad_norm": 0.28125, "learning_rate": 1.982182628062361e-05, "loss": 0.1621, "step": 16200 }, { "epoch": 2.4201930215293244, "grad_norm": 0.296875, "learning_rate": 1.9326899282355855e-05, "loss": 0.1628, "step": 16300 }, { "epoch": 2.435040831477357, "grad_norm": 0.2412109375, "learning_rate": 1.8831972284088097e-05, "loss": 0.1642, "step": 16400 }, { "epoch": 2.4498886414253898, "grad_norm": 0.314453125, "learning_rate": 1.8337045285820342e-05, "loss": 0.1659, "step": 16500 }, { "epoch": 2.4647364513734225, "grad_norm": 0.2734375, "learning_rate": 1.7842118287552587e-05, "loss": 0.1683, "step": 16600 }, { "epoch": 2.479584261321455, "grad_norm": 0.322265625, "learning_rate": 1.7347191289284832e-05, "loss": 0.1655, "step": 16700 }, { "epoch": 2.494432071269488, "grad_norm": 0.2734375, "learning_rate": 1.6852264291017074e-05, "loss": 0.1655, "step": 16800 }, { "epoch": 2.5092798812175205, "grad_norm": 0.3046875, "learning_rate": 1.635733729274932e-05, "loss": 0.164, "step": 16900 }, { "epoch": 2.5241276911655532, "grad_norm": 0.28125, "learning_rate": 1.5862410294481565e-05, "loss": 0.1745, "step": 17000 }, { "epoch": 2.538975501113586, "grad_norm": 0.2490234375, "learning_rate": 1.5367483296213807e-05, "loss": 0.169, "step": 17100 }, { "epoch": 2.5538233110616186, "grad_norm": 0.314453125, "learning_rate": 1.4872556297946052e-05, "loss": 0.1695, "step": 17200 }, { "epoch": 2.5686711210096513, "grad_norm": 0.306640625, "learning_rate": 1.4377629299678297e-05, "loss": 0.1657, "step": 17300 }, { "epoch": 2.5835189309576836, "grad_norm": 0.302734375, "learning_rate": 1.3882702301410544e-05, "loss": 0.1615, "step": 17400 }, { "epoch": 2.5983667409057163, "grad_norm": 0.263671875, "learning_rate": 1.3387775303142788e-05, "loss": 0.1675, "step": 17500 }, { "epoch": 2.613214550853749, "grad_norm": 0.29296875, "learning_rate": 1.2892848304875033e-05, "loss": 0.1646, "step": 17600 }, { "epoch": 2.6280623608017817, "grad_norm": 0.197265625, "learning_rate": 1.2397921306607275e-05, "loss": 0.1631, "step": 17700 }, { "epoch": 2.6429101707498144, "grad_norm": 0.2890625, "learning_rate": 1.190299430833952e-05, "loss": 0.1621, "step": 17800 }, { "epoch": 2.657757980697847, "grad_norm": 0.29296875, "learning_rate": 1.1408067310071765e-05, "loss": 0.1682, "step": 17900 }, { "epoch": 2.6726057906458798, "grad_norm": 0.29296875, "learning_rate": 1.091314031180401e-05, "loss": 0.1664, "step": 18000 }, { "epoch": 2.6874536005939125, "grad_norm": 0.2431640625, "learning_rate": 1.0418213313536254e-05, "loss": 0.1686, "step": 18100 }, { "epoch": 2.702301410541945, "grad_norm": 0.427734375, "learning_rate": 9.9232863152685e-06, "loss": 0.1653, "step": 18200 }, { "epoch": 2.717149220489978, "grad_norm": 0.283203125, "learning_rate": 9.428359317000743e-06, "loss": 0.1704, "step": 18300 }, { "epoch": 2.73199703043801, "grad_norm": 0.2314453125, "learning_rate": 8.933432318732986e-06, "loss": 0.1645, "step": 18400 }, { "epoch": 2.746844840386043, "grad_norm": 0.2041015625, "learning_rate": 8.438505320465232e-06, "loss": 0.1647, "step": 18500 }, { "epoch": 2.7616926503340755, "grad_norm": 0.333984375, "learning_rate": 7.943578322197475e-06, "loss": 0.1721, "step": 18600 }, { "epoch": 2.776540460282108, "grad_norm": 0.330078125, "learning_rate": 7.44865132392972e-06, "loss": 0.1652, "step": 18700 }, { "epoch": 2.791388270230141, "grad_norm": 0.333984375, "learning_rate": 6.953724325661966e-06, "loss": 0.1634, "step": 18800 }, { "epoch": 2.8062360801781736, "grad_norm": 0.283203125, "learning_rate": 6.45879732739421e-06, "loss": 0.1668, "step": 18900 }, { "epoch": 2.8210838901262063, "grad_norm": 0.248046875, "learning_rate": 5.9638703291264544e-06, "loss": 0.1642, "step": 19000 }, { "epoch": 2.835931700074239, "grad_norm": 0.2373046875, "learning_rate": 5.468943330858699e-06, "loss": 0.1594, "step": 19100 }, { "epoch": 2.8507795100222717, "grad_norm": 0.224609375, "learning_rate": 4.974016332590943e-06, "loss": 0.1648, "step": 19200 }, { "epoch": 2.8656273199703044, "grad_norm": 0.2353515625, "learning_rate": 4.479089334323188e-06, "loss": 0.1637, "step": 19300 }, { "epoch": 2.880475129918337, "grad_norm": 0.2294921875, "learning_rate": 3.984162336055432e-06, "loss": 0.1728, "step": 19400 }, { "epoch": 2.89532293986637, "grad_norm": 0.25390625, "learning_rate": 3.489235337787677e-06, "loss": 0.1646, "step": 19500 }, { "epoch": 2.9101707498144025, "grad_norm": 0.25, "learning_rate": 2.9943083395199213e-06, "loss": 0.1692, "step": 19600 }, { "epoch": 2.925018559762435, "grad_norm": 0.328125, "learning_rate": 2.4993813412521652e-06, "loss": 0.1708, "step": 19700 }, { "epoch": 2.939866369710468, "grad_norm": 0.265625, "learning_rate": 2.00445434298441e-06, "loss": 0.1689, "step": 19800 }, { "epoch": 2.9547141796585006, "grad_norm": 0.25, "learning_rate": 1.5095273447166542e-06, "loss": 0.1622, "step": 19900 }, { "epoch": 2.9695619896065333, "grad_norm": 0.298828125, "learning_rate": 1.0146003464488989e-06, "loss": 0.1649, "step": 20000 }, { "epoch": 2.984409799554566, "grad_norm": 0.291015625, "learning_rate": 5.196733481811434e-07, "loss": 0.163, "step": 20100 }, { "epoch": 2.9992576095025982, "grad_norm": 0.359375, "learning_rate": 2.4746349913387775e-08, "loss": 0.1617, "step": 20200 } ], "logging_steps": 100, "max_steps": 20205, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1772177365591194e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }