{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9746835443037973, "eval_steps": 45, "global_step": 354, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005625879043600563, "grad_norm": 1.739181637763977, "learning_rate": 1.0000000000000002e-06, "loss": 2.4549, "step": 1 }, { "epoch": 0.005625879043600563, "eval_loss": 2.5061044692993164, "eval_runtime": 30.2114, "eval_samples_per_second": 1.688, "eval_steps_per_second": 1.688, "step": 1 }, { "epoch": 0.011251758087201125, "grad_norm": 1.461582899093628, "learning_rate": 2.0000000000000003e-06, "loss": 2.4808, "step": 2 }, { "epoch": 0.016877637130801686, "grad_norm": 1.5786077976226807, "learning_rate": 3e-06, "loss": 2.54, "step": 3 }, { "epoch": 0.02250351617440225, "grad_norm": 1.605600118637085, "learning_rate": 4.000000000000001e-06, "loss": 2.6763, "step": 4 }, { "epoch": 0.02812939521800281, "grad_norm": 1.4658780097961426, "learning_rate": 5e-06, "loss": 2.4768, "step": 5 }, { "epoch": 0.03375527426160337, "grad_norm": 1.4920974969863892, "learning_rate": 6e-06, "loss": 2.383, "step": 6 }, { "epoch": 0.03938115330520394, "grad_norm": 1.2311428785324097, "learning_rate": 7e-06, "loss": 2.1965, "step": 7 }, { "epoch": 0.0450070323488045, "grad_norm": 1.442131519317627, "learning_rate": 8.000000000000001e-06, "loss": 2.3795, "step": 8 }, { "epoch": 0.05063291139240506, "grad_norm": 1.2747117280960083, "learning_rate": 9e-06, "loss": 2.5673, "step": 9 }, { "epoch": 0.05625879043600562, "grad_norm": 1.810668706893921, "learning_rate": 1e-05, "loss": 2.4721, "step": 10 }, { "epoch": 0.06188466947960619, "grad_norm": 1.694833755493164, "learning_rate": 9.999801918709974e-06, "loss": 2.4181, "step": 11 }, { "epoch": 0.06751054852320675, "grad_norm": 1.3476848602294922, "learning_rate": 9.999207691360399e-06, "loss": 2.5278, "step": 12 }, { "epoch": 0.07313642756680731, "grad_norm": 1.3295379877090454, "learning_rate": 9.99821736751141e-06, "loss": 2.3155, "step": 13 }, { "epoch": 0.07876230661040788, "grad_norm": 1.337768793106079, "learning_rate": 9.996831029758638e-06, "loss": 2.3802, "step": 14 }, { "epoch": 0.08438818565400844, "grad_norm": 1.4814488887786865, "learning_rate": 9.995048793726324e-06, "loss": 2.4514, "step": 15 }, { "epoch": 0.090014064697609, "grad_norm": 1.4958126544952393, "learning_rate": 9.992870808057673e-06, "loss": 2.2855, "step": 16 }, { "epoch": 0.09563994374120956, "grad_norm": 1.3594876527786255, "learning_rate": 9.99029725440246e-06, "loss": 2.485, "step": 17 }, { "epoch": 0.10126582278481013, "grad_norm": 1.2920221090316772, "learning_rate": 9.987328347401871e-06, "loss": 2.5267, "step": 18 }, { "epoch": 0.10689170182841069, "grad_norm": 1.289971947669983, "learning_rate": 9.983964334670618e-06, "loss": 2.2582, "step": 19 }, { "epoch": 0.11251758087201125, "grad_norm": 1.4366180896759033, "learning_rate": 9.980205496776269e-06, "loss": 2.318, "step": 20 }, { "epoch": 0.11814345991561181, "grad_norm": 1.4515198469161987, "learning_rate": 9.976052147215859e-06, "loss": 2.3388, "step": 21 }, { "epoch": 0.12376933895921238, "grad_norm": 1.3737822771072388, "learning_rate": 9.971504632389744e-06, "loss": 2.1809, "step": 22 }, { "epoch": 0.12939521800281295, "grad_norm": 1.692108392715454, "learning_rate": 9.966563331572703e-06, "loss": 2.5646, "step": 23 }, { "epoch": 0.1350210970464135, "grad_norm": 1.2992991209030151, "learning_rate": 9.96122865688231e-06, "loss": 2.2552, "step": 24 }, { "epoch": 0.14064697609001406, "grad_norm": 1.30594801902771, "learning_rate": 9.955501053244563e-06, "loss": 2.2872, "step": 25 }, { "epoch": 0.14627285513361463, "grad_norm": 1.5014636516571045, "learning_rate": 9.949380998356774e-06, "loss": 2.4255, "step": 26 }, { "epoch": 0.1518987341772152, "grad_norm": 1.2353171110153198, "learning_rate": 9.942869002647731e-06, "loss": 2.285, "step": 27 }, { "epoch": 0.15752461322081576, "grad_norm": 1.2573610544204712, "learning_rate": 9.935965609235122e-06, "loss": 2.2413, "step": 28 }, { "epoch": 0.1631504922644163, "grad_norm": 1.2308180332183838, "learning_rate": 9.92867139388024e-06, "loss": 2.254, "step": 29 }, { "epoch": 0.16877637130801687, "grad_norm": 1.3512063026428223, "learning_rate": 9.920986964939964e-06, "loss": 2.6116, "step": 30 }, { "epoch": 0.17440225035161744, "grad_norm": 1.4129055738449097, "learning_rate": 9.912912963316021e-06, "loss": 2.4745, "step": 31 }, { "epoch": 0.180028129395218, "grad_norm": 1.2212988138198853, "learning_rate": 9.904450062401528e-06, "loss": 2.3624, "step": 32 }, { "epoch": 0.18565400843881857, "grad_norm": 1.6333080530166626, "learning_rate": 9.895598968024834e-06, "loss": 2.3999, "step": 33 }, { "epoch": 0.19127988748241911, "grad_norm": 1.437427043914795, "learning_rate": 9.886360418390655e-06, "loss": 2.3292, "step": 34 }, { "epoch": 0.19690576652601968, "grad_norm": 1.333884596824646, "learning_rate": 9.876735184018495e-06, "loss": 2.2703, "step": 35 }, { "epoch": 0.20253164556962025, "grad_norm": 1.452011227607727, "learning_rate": 9.866724067678392e-06, "loss": 2.444, "step": 36 }, { "epoch": 0.20815752461322082, "grad_norm": 1.3540449142456055, "learning_rate": 9.856327904323965e-06, "loss": 2.2662, "step": 37 }, { "epoch": 0.21378340365682139, "grad_norm": 1.3395826816558838, "learning_rate": 9.84554756102277e-06, "loss": 2.4792, "step": 38 }, { "epoch": 0.21940928270042195, "grad_norm": 1.335483193397522, "learning_rate": 9.83438393688399e-06, "loss": 2.5329, "step": 39 }, { "epoch": 0.2250351617440225, "grad_norm": 1.4518929719924927, "learning_rate": 9.822837962983443e-06, "loss": 2.3844, "step": 40 }, { "epoch": 0.23066104078762306, "grad_norm": 1.4345314502716064, "learning_rate": 9.810910602285933e-06, "loss": 2.5071, "step": 41 }, { "epoch": 0.23628691983122363, "grad_norm": 1.2864302396774292, "learning_rate": 9.798602849564929e-06, "loss": 2.5834, "step": 42 }, { "epoch": 0.2419127988748242, "grad_norm": 1.319409728050232, "learning_rate": 9.785915731319605e-06, "loss": 2.282, "step": 43 }, { "epoch": 0.24753867791842477, "grad_norm": 1.2264432907104492, "learning_rate": 9.772850305689224e-06, "loss": 2.3332, "step": 44 }, { "epoch": 0.25316455696202533, "grad_norm": 1.2530275583267212, "learning_rate": 9.759407662364885e-06, "loss": 2.3516, "step": 45 }, { "epoch": 0.25316455696202533, "eval_loss": 2.431987762451172, "eval_runtime": 30.3221, "eval_samples_per_second": 1.682, "eval_steps_per_second": 1.682, "step": 45 }, { "epoch": 0.2587904360056259, "grad_norm": 1.2621686458587646, "learning_rate": 9.745588922498646e-06, "loss": 2.319, "step": 46 }, { "epoch": 0.26441631504922647, "grad_norm": 1.444759726524353, "learning_rate": 9.731395238610006e-06, "loss": 2.4669, "step": 47 }, { "epoch": 0.270042194092827, "grad_norm": 1.4075920581817627, "learning_rate": 9.716827794489795e-06, "loss": 2.4549, "step": 48 }, { "epoch": 0.27566807313642755, "grad_norm": 1.2430981397628784, "learning_rate": 9.701887805101434e-06, "loss": 2.2979, "step": 49 }, { "epoch": 0.2812939521800281, "grad_norm": 1.3289717435836792, "learning_rate": 9.686576516479604e-06, "loss": 2.3718, "step": 50 }, { "epoch": 0.2869198312236287, "grad_norm": 1.5157921314239502, "learning_rate": 9.670895205626327e-06, "loss": 2.3173, "step": 51 }, { "epoch": 0.29254571026722925, "grad_norm": 1.2412256002426147, "learning_rate": 9.65484518040446e-06, "loss": 2.5968, "step": 52 }, { "epoch": 0.2981715893108298, "grad_norm": 1.218125581741333, "learning_rate": 9.638427779428613e-06, "loss": 2.3775, "step": 53 }, { "epoch": 0.3037974683544304, "grad_norm": 1.5379339456558228, "learning_rate": 9.621644371953507e-06, "loss": 2.4156, "step": 54 }, { "epoch": 0.30942334739803096, "grad_norm": 1.3565013408660889, "learning_rate": 9.604496357759778e-06, "loss": 2.3882, "step": 55 }, { "epoch": 0.3150492264416315, "grad_norm": 1.5721348524093628, "learning_rate": 9.586985167037224e-06, "loss": 2.3433, "step": 56 }, { "epoch": 0.3206751054852321, "grad_norm": 1.2288527488708496, "learning_rate": 9.569112260265527e-06, "loss": 2.2329, "step": 57 }, { "epoch": 0.3263009845288326, "grad_norm": 1.2658888101577759, "learning_rate": 9.550879128092447e-06, "loss": 2.5814, "step": 58 }, { "epoch": 0.3319268635724332, "grad_norm": 1.073593020439148, "learning_rate": 9.532287291209498e-06, "loss": 2.2243, "step": 59 }, { "epoch": 0.33755274261603374, "grad_norm": 1.267996907234192, "learning_rate": 9.513338300225116e-06, "loss": 2.5315, "step": 60 }, { "epoch": 0.3431786216596343, "grad_norm": 1.3250463008880615, "learning_rate": 9.49403373553533e-06, "loss": 2.3651, "step": 61 }, { "epoch": 0.3488045007032349, "grad_norm": 1.6996604204177856, "learning_rate": 9.474375207191965e-06, "loss": 2.5536, "step": 62 }, { "epoch": 0.35443037974683544, "grad_norm": 1.1338461637496948, "learning_rate": 9.454364354768351e-06, "loss": 2.2356, "step": 63 }, { "epoch": 0.360056258790436, "grad_norm": 1.3313428163528442, "learning_rate": 9.434002847222574e-06, "loss": 2.2594, "step": 64 }, { "epoch": 0.3656821378340366, "grad_norm": 1.2771188020706177, "learning_rate": 9.41329238275829e-06, "loss": 2.5237, "step": 65 }, { "epoch": 0.37130801687763715, "grad_norm": 1.355175495147705, "learning_rate": 9.392234688683088e-06, "loss": 2.3274, "step": 66 }, { "epoch": 0.3769338959212377, "grad_norm": 1.2518092393875122, "learning_rate": 9.37083152126442e-06, "loss": 2.4285, "step": 67 }, { "epoch": 0.38255977496483823, "grad_norm": 1.3508447408676147, "learning_rate": 9.349084665583136e-06, "loss": 2.1645, "step": 68 }, { "epoch": 0.3881856540084388, "grad_norm": 1.5284922122955322, "learning_rate": 9.326995935384594e-06, "loss": 2.6101, "step": 69 }, { "epoch": 0.39381153305203936, "grad_norm": 1.6654582023620605, "learning_rate": 9.304567172927397e-06, "loss": 2.3393, "step": 70 }, { "epoch": 0.39943741209563993, "grad_norm": 1.2337652444839478, "learning_rate": 9.281800248829728e-06, "loss": 2.4461, "step": 71 }, { "epoch": 0.4050632911392405, "grad_norm": 1.419947624206543, "learning_rate": 9.25869706191336e-06, "loss": 2.4146, "step": 72 }, { "epoch": 0.41068917018284107, "grad_norm": 1.1560102701187134, "learning_rate": 9.235259539045263e-06, "loss": 2.2357, "step": 73 }, { "epoch": 0.41631504922644164, "grad_norm": 1.3465362787246704, "learning_rate": 9.21148963497692e-06, "loss": 2.3877, "step": 74 }, { "epoch": 0.4219409282700422, "grad_norm": 1.37497079372406, "learning_rate": 9.187389332181285e-06, "loss": 2.4105, "step": 75 }, { "epoch": 0.42756680731364277, "grad_norm": 1.1877851486206055, "learning_rate": 9.162960640687436e-06, "loss": 2.3803, "step": 76 }, { "epoch": 0.43319268635724334, "grad_norm": 1.1579440832138062, "learning_rate": 9.138205597912943e-06, "loss": 2.2938, "step": 77 }, { "epoch": 0.4388185654008439, "grad_norm": 1.3359930515289307, "learning_rate": 9.113126268493937e-06, "loss": 2.5006, "step": 78 }, { "epoch": 0.4444444444444444, "grad_norm": 1.3176641464233398, "learning_rate": 9.08772474411291e-06, "loss": 2.3957, "step": 79 }, { "epoch": 0.450070323488045, "grad_norm": 1.2082509994506836, "learning_rate": 9.062003143324267e-06, "loss": 2.273, "step": 80 }, { "epoch": 0.45569620253164556, "grad_norm": 1.3498975038528442, "learning_rate": 9.035963611377641e-06, "loss": 2.4793, "step": 81 }, { "epoch": 0.4613220815752461, "grad_norm": 1.2176086902618408, "learning_rate": 9.009608320038959e-06, "loss": 2.2616, "step": 82 }, { "epoch": 0.4669479606188467, "grad_norm": 1.2488033771514893, "learning_rate": 8.982939467409314e-06, "loss": 2.3263, "step": 83 }, { "epoch": 0.47257383966244726, "grad_norm": 1.106594443321228, "learning_rate": 8.955959277741654e-06, "loss": 2.2856, "step": 84 }, { "epoch": 0.4781997187060478, "grad_norm": 1.2389321327209473, "learning_rate": 8.928670001255248e-06, "loss": 2.4291, "step": 85 }, { "epoch": 0.4838255977496484, "grad_norm": 1.1330113410949707, "learning_rate": 8.901073913948028e-06, "loss": 2.13, "step": 86 }, { "epoch": 0.48945147679324896, "grad_norm": 1.1081808805465698, "learning_rate": 8.873173317406764e-06, "loss": 2.2318, "step": 87 }, { "epoch": 0.49507735583684953, "grad_norm": 1.2427239418029785, "learning_rate": 8.844970538615099e-06, "loss": 2.6752, "step": 88 }, { "epoch": 0.5007032348804501, "grad_norm": 1.2715911865234375, "learning_rate": 8.816467929759476e-06, "loss": 2.5999, "step": 89 }, { "epoch": 0.5063291139240507, "grad_norm": 1.282126545906067, "learning_rate": 8.787667868032964e-06, "loss": 2.6222, "step": 90 }, { "epoch": 0.5063291139240507, "eval_loss": 2.413635492324829, "eval_runtime": 30.2824, "eval_samples_per_second": 1.684, "eval_steps_per_second": 1.684, "step": 90 }, { "epoch": 0.5119549929676512, "grad_norm": 1.25644850730896, "learning_rate": 8.758572755436986e-06, "loss": 2.5696, "step": 91 }, { "epoch": 0.5175808720112518, "grad_norm": 1.3811650276184082, "learning_rate": 8.729185018580984e-06, "loss": 2.2826, "step": 92 }, { "epoch": 0.5232067510548524, "grad_norm": 1.872058629989624, "learning_rate": 8.69950710848005e-06, "loss": 2.4419, "step": 93 }, { "epoch": 0.5288326300984529, "grad_norm": 1.2003915309906006, "learning_rate": 8.669541500350481e-06, "loss": 2.2919, "step": 94 }, { "epoch": 0.5344585091420534, "grad_norm": 1.352100133895874, "learning_rate": 8.63929069340336e-06, "loss": 2.4084, "step": 95 }, { "epoch": 0.540084388185654, "grad_norm": 1.4791978597640991, "learning_rate": 8.608757210636101e-06, "loss": 2.2582, "step": 96 }, { "epoch": 0.5457102672292545, "grad_norm": 1.5740419626235962, "learning_rate": 8.577943598622037e-06, "loss": 2.2898, "step": 97 }, { "epoch": 0.5513361462728551, "grad_norm": 1.1978390216827393, "learning_rate": 8.546852427298013e-06, "loss": 2.2277, "step": 98 }, { "epoch": 0.5569620253164557, "grad_norm": 1.4278184175491333, "learning_rate": 8.515486289750061e-06, "loss": 2.3385, "step": 99 }, { "epoch": 0.5625879043600562, "grad_norm": 1.2246702909469604, "learning_rate": 8.483847801997126e-06, "loss": 2.3997, "step": 100 }, { "epoch": 0.5682137834036568, "grad_norm": 1.261964201927185, "learning_rate": 8.451939602772877e-06, "loss": 2.4312, "step": 101 }, { "epoch": 0.5738396624472574, "grad_norm": 1.4236760139465332, "learning_rate": 8.419764353305638e-06, "loss": 2.3398, "step": 102 }, { "epoch": 0.5794655414908579, "grad_norm": 1.232182502746582, "learning_rate": 8.387324737096427e-06, "loss": 2.5426, "step": 103 }, { "epoch": 0.5850914205344585, "grad_norm": 1.2140839099884033, "learning_rate": 8.35462345969515e-06, "loss": 2.6031, "step": 104 }, { "epoch": 0.5907172995780591, "grad_norm": 1.2126487493515015, "learning_rate": 8.321663248474949e-06, "loss": 2.2923, "step": 105 }, { "epoch": 0.5963431786216596, "grad_norm": 1.3279041051864624, "learning_rate": 8.288446852404735e-06, "loss": 2.2648, "step": 106 }, { "epoch": 0.6019690576652602, "grad_norm": 1.9618940353393555, "learning_rate": 8.254977041819909e-06, "loss": 2.3423, "step": 107 }, { "epoch": 0.6075949367088608, "grad_norm": 1.2524311542510986, "learning_rate": 8.221256608191316e-06, "loss": 2.6281, "step": 108 }, { "epoch": 0.6132208157524613, "grad_norm": 1.4134135246276855, "learning_rate": 8.18728836389243e-06, "loss": 2.3007, "step": 109 }, { "epoch": 0.6188466947960619, "grad_norm": 1.42375910282135, "learning_rate": 8.153075141964785e-06, "loss": 2.3253, "step": 110 }, { "epoch": 0.6244725738396625, "grad_norm": 1.2591787576675415, "learning_rate": 8.118619795881702e-06, "loss": 2.658, "step": 111 }, { "epoch": 0.630098452883263, "grad_norm": 1.2173055410385132, "learning_rate": 8.083925199310299e-06, "loss": 2.339, "step": 112 }, { "epoch": 0.6357243319268636, "grad_norm": 1.8275774717330933, "learning_rate": 8.048994245871813e-06, "loss": 2.415, "step": 113 }, { "epoch": 0.6413502109704642, "grad_norm": 1.2721606492996216, "learning_rate": 8.013829848900278e-06, "loss": 2.273, "step": 114 }, { "epoch": 0.6469760900140648, "grad_norm": 1.4262254238128662, "learning_rate": 7.978434941199526e-06, "loss": 2.5605, "step": 115 }, { "epoch": 0.6526019690576652, "grad_norm": 1.2906497716903687, "learning_rate": 7.942812474798602e-06, "loss": 2.3066, "step": 116 }, { "epoch": 0.6582278481012658, "grad_norm": 1.2999948263168335, "learning_rate": 7.90696542070555e-06, "loss": 2.2782, "step": 117 }, { "epoch": 0.6638537271448663, "grad_norm": 1.5385446548461914, "learning_rate": 7.87089676865962e-06, "loss": 2.3383, "step": 118 }, { "epoch": 0.6694796061884669, "grad_norm": 1.2246472835540771, "learning_rate": 7.834609526881914e-06, "loss": 2.5499, "step": 119 }, { "epoch": 0.6751054852320675, "grad_norm": 1.6999132633209229, "learning_rate": 7.798106721824504e-06, "loss": 2.3576, "step": 120 }, { "epoch": 0.680731364275668, "grad_norm": 1.2035925388336182, "learning_rate": 7.761391397918005e-06, "loss": 2.33, "step": 121 }, { "epoch": 0.6863572433192686, "grad_norm": 1.3971892595291138, "learning_rate": 7.72446661731767e-06, "loss": 2.3338, "step": 122 }, { "epoch": 0.6919831223628692, "grad_norm": 1.4358816146850586, "learning_rate": 7.687335459647993e-06, "loss": 2.5577, "step": 123 }, { "epoch": 0.6976090014064698, "grad_norm": 1.352947473526001, "learning_rate": 7.650001021745866e-06, "loss": 2.2954, "step": 124 }, { "epoch": 0.7032348804500703, "grad_norm": 1.4151978492736816, "learning_rate": 7.612466417402282e-06, "loss": 2.6017, "step": 125 }, { "epoch": 0.7088607594936709, "grad_norm": 1.1894267797470093, "learning_rate": 7.574734777102657e-06, "loss": 2.2623, "step": 126 }, { "epoch": 0.7144866385372715, "grad_norm": 1.1567312479019165, "learning_rate": 7.536809247765718e-06, "loss": 2.2232, "step": 127 }, { "epoch": 0.720112517580872, "grad_norm": 1.3824794292449951, "learning_rate": 7.498692992481056e-06, "loss": 2.2699, "step": 128 }, { "epoch": 0.7257383966244726, "grad_norm": 1.203261375427246, "learning_rate": 7.4603891902453115e-06, "loss": 2.3539, "step": 129 }, { "epoch": 0.7313642756680732, "grad_norm": 1.6249403953552246, "learning_rate": 7.421901035697033e-06, "loss": 2.5196, "step": 130 }, { "epoch": 0.7369901547116737, "grad_norm": 1.3398568630218506, "learning_rate": 7.383231738850246e-06, "loss": 2.341, "step": 131 }, { "epoch": 0.7426160337552743, "grad_norm": 1.148158311843872, "learning_rate": 7.34438452482672e-06, "loss": 2.2581, "step": 132 }, { "epoch": 0.7482419127988749, "grad_norm": 1.1582486629486084, "learning_rate": 7.305362633586984e-06, "loss": 2.3726, "step": 133 }, { "epoch": 0.7538677918424754, "grad_norm": 1.2645725011825562, "learning_rate": 7.266169319660123e-06, "loss": 2.2198, "step": 134 }, { "epoch": 0.759493670886076, "grad_norm": 1.177494764328003, "learning_rate": 7.226807851872312e-06, "loss": 2.319, "step": 135 }, { "epoch": 0.759493670886076, "eval_loss": 2.4054980278015137, "eval_runtime": 30.3027, "eval_samples_per_second": 1.683, "eval_steps_per_second": 1.683, "step": 135 }, { "epoch": 0.7651195499296765, "grad_norm": 1.6996376514434814, "learning_rate": 7.187281513074214e-06, "loss": 2.2793, "step": 136 }, { "epoch": 0.770745428973277, "grad_norm": 1.2821097373962402, "learning_rate": 7.147593599867166e-06, "loss": 2.2482, "step": 137 }, { "epoch": 0.7763713080168776, "grad_norm": 1.4424350261688232, "learning_rate": 7.107747422328241e-06, "loss": 2.3816, "step": 138 }, { "epoch": 0.7819971870604782, "grad_norm": 1.3601067066192627, "learning_rate": 7.067746303734178e-06, "loss": 2.5607, "step": 139 }, { "epoch": 0.7876230661040787, "grad_norm": 1.5715044736862183, "learning_rate": 7.0275935802842036e-06, "loss": 2.2028, "step": 140 }, { "epoch": 0.7932489451476793, "grad_norm": 1.5017560720443726, "learning_rate": 6.9872926008217976e-06, "loss": 2.5636, "step": 141 }, { "epoch": 0.7988748241912799, "grad_norm": 1.2483998537063599, "learning_rate": 6.9468467265553805e-06, "loss": 2.218, "step": 142 }, { "epoch": 0.8045007032348804, "grad_norm": 1.2412904500961304, "learning_rate": 6.906259330777986e-06, "loss": 2.318, "step": 143 }, { "epoch": 0.810126582278481, "grad_norm": 1.4541116952896118, "learning_rate": 6.865533798585915e-06, "loss": 2.2498, "step": 144 }, { "epoch": 0.8157524613220816, "grad_norm": 1.3769111633300781, "learning_rate": 6.824673526596411e-06, "loss": 2.5446, "step": 145 }, { "epoch": 0.8213783403656821, "grad_norm": 1.3049566745758057, "learning_rate": 6.7836819226643705e-06, "loss": 2.5029, "step": 146 }, { "epoch": 0.8270042194092827, "grad_norm": 1.5045892000198364, "learning_rate": 6.7425624055981284e-06, "loss": 2.5418, "step": 147 }, { "epoch": 0.8326300984528833, "grad_norm": 1.375936508178711, "learning_rate": 6.701318404874308e-06, "loss": 2.4155, "step": 148 }, { "epoch": 0.8382559774964838, "grad_norm": 1.4569542407989502, "learning_rate": 6.659953360351803e-06, "loss": 2.483, "step": 149 }, { "epoch": 0.8438818565400844, "grad_norm": 1.4985612630844116, "learning_rate": 6.61847072198488e-06, "loss": 2.3627, "step": 150 }, { "epoch": 0.849507735583685, "grad_norm": 1.1635637283325195, "learning_rate": 6.576873949535439e-06, "loss": 2.4863, "step": 151 }, { "epoch": 0.8551336146272855, "grad_norm": 1.309057354927063, "learning_rate": 6.535166512284473e-06, "loss": 2.4227, "step": 152 }, { "epoch": 0.8607594936708861, "grad_norm": 1.1509865522384644, "learning_rate": 6.493351888742706e-06, "loss": 2.2121, "step": 153 }, { "epoch": 0.8663853727144867, "grad_norm": 1.3486562967300415, "learning_rate": 6.4514335663604834e-06, "loss": 2.3682, "step": 154 }, { "epoch": 0.8720112517580872, "grad_norm": 1.177566409111023, "learning_rate": 6.409415041236912e-06, "loss": 2.439, "step": 155 }, { "epoch": 0.8776371308016878, "grad_norm": 1.3212536573410034, "learning_rate": 6.367299817828271e-06, "loss": 2.3195, "step": 156 }, { "epoch": 0.8832630098452883, "grad_norm": 1.3443726301193237, "learning_rate": 6.325091408655728e-06, "loss": 2.3453, "step": 157 }, { "epoch": 0.8888888888888888, "grad_norm": 1.2387938499450684, "learning_rate": 6.282793334012397e-06, "loss": 2.3215, "step": 158 }, { "epoch": 0.8945147679324894, "grad_norm": 1.1146687269210815, "learning_rate": 6.240409121669726e-06, "loss": 2.5072, "step": 159 }, { "epoch": 0.90014064697609, "grad_norm": 1.4473936557769775, "learning_rate": 6.1979423065832766e-06, "loss": 2.3696, "step": 160 }, { "epoch": 0.9057665260196905, "grad_norm": 1.419673204421997, "learning_rate": 6.155396430597896e-06, "loss": 2.4739, "step": 161 }, { "epoch": 0.9113924050632911, "grad_norm": 1.3161817789077759, "learning_rate": 6.112775042152324e-06, "loss": 2.2546, "step": 162 }, { "epoch": 0.9170182841068917, "grad_norm": 1.2259889841079712, "learning_rate": 6.070081695983236e-06, "loss": 2.3529, "step": 163 }, { "epoch": 0.9226441631504922, "grad_norm": 1.190901756286621, "learning_rate": 6.0273199528287695e-06, "loss": 2.3558, "step": 164 }, { "epoch": 0.9282700421940928, "grad_norm": 1.6890244483947754, "learning_rate": 5.984493379131559e-06, "loss": 2.5214, "step": 165 }, { "epoch": 0.9338959212376934, "grad_norm": 1.6165000200271606, "learning_rate": 5.9416055467412745e-06, "loss": 2.286, "step": 166 }, { "epoch": 0.939521800281294, "grad_norm": 1.2835633754730225, "learning_rate": 5.898660032616721e-06, "loss": 2.2492, "step": 167 }, { "epoch": 0.9451476793248945, "grad_norm": 1.2182981967926025, "learning_rate": 5.855660418527513e-06, "loss": 2.2491, "step": 168 }, { "epoch": 0.9507735583684951, "grad_norm": 1.4453896284103394, "learning_rate": 5.812610290755352e-06, "loss": 2.3408, "step": 169 }, { "epoch": 0.9563994374120957, "grad_norm": 1.1564884185791016, "learning_rate": 5.769513239794905e-06, "loss": 2.5319, "step": 170 }, { "epoch": 0.9620253164556962, "grad_norm": 1.4496843814849854, "learning_rate": 5.7263728600543636e-06, "loss": 2.4306, "step": 171 }, { "epoch": 0.9676511954992968, "grad_norm": 1.0973615646362305, "learning_rate": 5.683192749555652e-06, "loss": 2.2153, "step": 172 }, { "epoch": 0.9732770745428974, "grad_norm": 1.3325732946395874, "learning_rate": 5.639976509634346e-06, "loss": 2.4422, "step": 173 }, { "epoch": 0.9789029535864979, "grad_norm": 1.1608248949050903, "learning_rate": 5.596727744639311e-06, "loss": 2.1957, "step": 174 }, { "epoch": 0.9845288326300985, "grad_norm": 1.2780177593231201, "learning_rate": 5.5534500616320885e-06, "loss": 2.3345, "step": 175 }, { "epoch": 0.9901547116736991, "grad_norm": 1.395959734916687, "learning_rate": 5.510147070086057e-06, "loss": 2.4131, "step": 176 }, { "epoch": 0.9957805907172996, "grad_norm": 1.3863272666931152, "learning_rate": 5.466822381585402e-06, "loss": 2.3061, "step": 177 }, { "epoch": 1.0014064697609002, "grad_norm": 1.2258063554763794, "learning_rate": 5.4234796095238804e-06, "loss": 2.3713, "step": 178 }, { "epoch": 1.0070323488045008, "grad_norm": 1.1133767366409302, "learning_rate": 5.380122368803476e-06, "loss": 2.1674, "step": 179 }, { "epoch": 1.0126582278481013, "grad_norm": 2.211198091506958, "learning_rate": 5.3367542755328935e-06, "loss": 2.4607, "step": 180 }, { "epoch": 1.0126582278481013, "eval_loss": 2.400386095046997, "eval_runtime": 30.3125, "eval_samples_per_second": 1.682, "eval_steps_per_second": 1.682, "step": 180 }, { "epoch": 1.0014064697609002, "grad_norm": 1.428281307220459, "learning_rate": 5.293378946725968e-06, "loss": 2.6363, "step": 181 }, { "epoch": 1.0070323488045008, "grad_norm": 1.3397562503814697, "learning_rate": 5.2500000000000006e-06, "loss": 2.2154, "step": 182 }, { "epoch": 1.0126582278481013, "grad_norm": 1.2449477910995483, "learning_rate": 5.206621053274032e-06, "loss": 2.2516, "step": 183 }, { "epoch": 1.018284106891702, "grad_norm": 1.219135046005249, "learning_rate": 5.1632457244671076e-06, "loss": 2.2162, "step": 184 }, { "epoch": 1.0239099859353025, "grad_norm": 1.5311412811279297, "learning_rate": 5.119877631196525e-06, "loss": 2.1986, "step": 185 }, { "epoch": 1.029535864978903, "grad_norm": 1.0927857160568237, "learning_rate": 5.076520390476121e-06, "loss": 2.299, "step": 186 }, { "epoch": 1.0351617440225036, "grad_norm": 1.161383867263794, "learning_rate": 5.0331776184146e-06, "loss": 2.4993, "step": 187 }, { "epoch": 1.0407876230661042, "grad_norm": 1.392291784286499, "learning_rate": 4.989852929913943e-06, "loss": 2.3965, "step": 188 }, { "epoch": 1.0464135021097047, "grad_norm": 1.0993740558624268, "learning_rate": 4.946549938367912e-06, "loss": 2.2925, "step": 189 }, { "epoch": 1.0520393811533053, "grad_norm": 1.3068045377731323, "learning_rate": 4.9032722553606895e-06, "loss": 2.279, "step": 190 }, { "epoch": 1.0576652601969059, "grad_norm": 1.117245078086853, "learning_rate": 4.860023490365654e-06, "loss": 2.0698, "step": 191 }, { "epoch": 1.0632911392405062, "grad_norm": 1.3679758310317993, "learning_rate": 4.8168072504443484e-06, "loss": 2.1846, "step": 192 }, { "epoch": 1.0689170182841068, "grad_norm": 1.3210700750350952, "learning_rate": 4.773627139945638e-06, "loss": 2.315, "step": 193 }, { "epoch": 1.0745428973277074, "grad_norm": 1.082641363143921, "learning_rate": 4.730486760205098e-06, "loss": 2.3924, "step": 194 }, { "epoch": 1.080168776371308, "grad_norm": 1.1435967683792114, "learning_rate": 4.687389709244651e-06, "loss": 2.421, "step": 195 }, { "epoch": 1.0857946554149085, "grad_norm": 1.8713021278381348, "learning_rate": 4.644339581472489e-06, "loss": 2.1892, "step": 196 }, { "epoch": 1.091420534458509, "grad_norm": 1.304328203201294, "learning_rate": 4.601339967383282e-06, "loss": 2.1397, "step": 197 }, { "epoch": 1.0970464135021096, "grad_norm": 1.1662545204162598, "learning_rate": 4.558394453258728e-06, "loss": 2.2044, "step": 198 }, { "epoch": 1.1026722925457102, "grad_norm": 1.6519335508346558, "learning_rate": 4.515506620868443e-06, "loss": 2.2881, "step": 199 }, { "epoch": 1.1082981715893108, "grad_norm": 1.1646329164505005, "learning_rate": 4.4726800471712325e-06, "loss": 2.4505, "step": 200 }, { "epoch": 1.1139240506329113, "grad_norm": 1.3433741331100464, "learning_rate": 4.429918304016766e-06, "loss": 2.1556, "step": 201 }, { "epoch": 1.119549929676512, "grad_norm": 1.093310832977295, "learning_rate": 4.3872249578476774e-06, "loss": 2.2014, "step": 202 }, { "epoch": 1.1251758087201125, "grad_norm": 1.1493537425994873, "learning_rate": 4.344603569402106e-06, "loss": 2.3267, "step": 203 }, { "epoch": 1.130801687763713, "grad_norm": 1.2506024837493896, "learning_rate": 4.302057693416725e-06, "loss": 2.2444, "step": 204 }, { "epoch": 1.1364275668073136, "grad_norm": 1.3935209512710571, "learning_rate": 4.259590878330276e-06, "loss": 2.2121, "step": 205 }, { "epoch": 1.1420534458509142, "grad_norm": 1.213395595550537, "learning_rate": 4.217206665987605e-06, "loss": 2.2528, "step": 206 }, { "epoch": 1.1476793248945147, "grad_norm": 1.9364022016525269, "learning_rate": 4.174908591344273e-06, "loss": 2.4659, "step": 207 }, { "epoch": 1.1533052039381153, "grad_norm": 1.194150447845459, "learning_rate": 4.132700182171731e-06, "loss": 2.2238, "step": 208 }, { "epoch": 1.1589310829817159, "grad_norm": 1.4336832761764526, "learning_rate": 4.090584958763088e-06, "loss": 2.2914, "step": 209 }, { "epoch": 1.1645569620253164, "grad_norm": 1.6567353010177612, "learning_rate": 4.048566433639516e-06, "loss": 2.4391, "step": 210 }, { "epoch": 1.170182841068917, "grad_norm": 1.426830768585205, "learning_rate": 4.006648111257294e-06, "loss": 2.4198, "step": 211 }, { "epoch": 1.1758087201125176, "grad_norm": 1.2131551504135132, "learning_rate": 3.964833487715527e-06, "loss": 2.3363, "step": 212 }, { "epoch": 1.1814345991561181, "grad_norm": 1.1698105335235596, "learning_rate": 3.923126050464561e-06, "loss": 2.4659, "step": 213 }, { "epoch": 1.1870604781997187, "grad_norm": 1.346468210220337, "learning_rate": 3.881529278015122e-06, "loss": 2.5802, "step": 214 }, { "epoch": 1.1926863572433193, "grad_norm": 1.1469833850860596, "learning_rate": 3.840046639648199e-06, "loss": 2.3794, "step": 215 }, { "epoch": 1.1983122362869199, "grad_norm": 1.3175195455551147, "learning_rate": 3.7986815951256937e-06, "loss": 2.2429, "step": 216 }, { "epoch": 1.2039381153305204, "grad_norm": 1.2770299911499023, "learning_rate": 3.7574375944018744e-06, "loss": 2.3475, "step": 217 }, { "epoch": 1.209563994374121, "grad_norm": 1.2195074558258057, "learning_rate": 3.716318077335632e-06, "loss": 2.2418, "step": 218 }, { "epoch": 1.2151898734177216, "grad_norm": 1.4205323457717896, "learning_rate": 3.675326473403591e-06, "loss": 2.3453, "step": 219 }, { "epoch": 1.2208157524613221, "grad_norm": 1.3720946311950684, "learning_rate": 3.6344662014140862e-06, "loss": 2.318, "step": 220 }, { "epoch": 1.2264416315049227, "grad_norm": 1.162539005279541, "learning_rate": 3.593740669222015e-06, "loss": 2.2763, "step": 221 }, { "epoch": 1.2320675105485233, "grad_norm": 1.1718677282333374, "learning_rate": 3.5531532734446194e-06, "loss": 2.1948, "step": 222 }, { "epoch": 1.2376933895921238, "grad_norm": 1.231491208076477, "learning_rate": 3.512707399178204e-06, "loss": 2.1702, "step": 223 }, { "epoch": 1.2433192686357244, "grad_norm": 1.3110443353652954, "learning_rate": 3.4724064197157976e-06, "loss": 2.4983, "step": 224 }, { "epoch": 1.248945147679325, "grad_norm": 1.2871124744415283, "learning_rate": 3.432253696265824e-06, "loss": 2.2115, "step": 225 }, { "epoch": 1.248945147679325, "eval_loss": 2.3990979194641113, "eval_runtime": 30.3251, "eval_samples_per_second": 1.682, "eval_steps_per_second": 1.682, "step": 225 }, { "epoch": 1.2545710267229255, "grad_norm": 1.3364418745040894, "learning_rate": 3.3922525776717597e-06, "loss": 2.3069, "step": 226 }, { "epoch": 1.260196905766526, "grad_norm": 1.1549724340438843, "learning_rate": 3.3524064001328345e-06, "loss": 2.3003, "step": 227 }, { "epoch": 1.2658227848101267, "grad_norm": 1.5768709182739258, "learning_rate": 3.312718486925787e-06, "loss": 2.1072, "step": 228 }, { "epoch": 1.271448663853727, "grad_norm": 1.3013666868209839, "learning_rate": 3.2731921481276887e-06, "loss": 2.1262, "step": 229 }, { "epoch": 1.2770745428973278, "grad_norm": 1.2704813480377197, "learning_rate": 3.233830680339879e-06, "loss": 2.2043, "step": 230 }, { "epoch": 1.2827004219409281, "grad_norm": 1.818085789680481, "learning_rate": 3.1946373664130155e-06, "loss": 2.2851, "step": 231 }, { "epoch": 1.288326300984529, "grad_norm": 1.4256744384765625, "learning_rate": 3.1556154751732816e-06, "loss": 2.2682, "step": 232 }, { "epoch": 1.2939521800281293, "grad_norm": 1.168641209602356, "learning_rate": 3.1167682611497536e-06, "loss": 2.2535, "step": 233 }, { "epoch": 1.29957805907173, "grad_norm": 1.7689348459243774, "learning_rate": 3.078098964302967e-06, "loss": 2.5086, "step": 234 }, { "epoch": 1.3052039381153304, "grad_norm": 1.1472971439361572, "learning_rate": 3.039610809754689e-06, "loss": 2.2806, "step": 235 }, { "epoch": 1.3108298171589312, "grad_norm": 1.1633094549179077, "learning_rate": 3.001307007518944e-06, "loss": 2.1489, "step": 236 }, { "epoch": 1.3164556962025316, "grad_norm": 1.3734430074691772, "learning_rate": 2.963190752234284e-06, "loss": 2.435, "step": 237 }, { "epoch": 1.3220815752461323, "grad_norm": 1.4113901853561401, "learning_rate": 2.925265222897345e-06, "loss": 2.3259, "step": 238 }, { "epoch": 1.3277074542897327, "grad_norm": 1.2623318433761597, "learning_rate": 2.8875335825977185e-06, "loss": 2.3495, "step": 239 }, { "epoch": 1.3333333333333333, "grad_norm": 1.1913394927978516, "learning_rate": 2.849998978254136e-06, "loss": 2.245, "step": 240 }, { "epoch": 1.3389592123769338, "grad_norm": 1.3264411687850952, "learning_rate": 2.812664540352008e-06, "loss": 2.3225, "step": 241 }, { "epoch": 1.3445850914205344, "grad_norm": 1.2762576341629028, "learning_rate": 2.775533382682332e-06, "loss": 2.3699, "step": 242 }, { "epoch": 1.350210970464135, "grad_norm": 1.4252859354019165, "learning_rate": 2.738608602081996e-06, "loss": 2.2251, "step": 243 }, { "epoch": 1.3558368495077355, "grad_norm": 1.181598424911499, "learning_rate": 2.701893278175499e-06, "loss": 2.3656, "step": 244 }, { "epoch": 1.361462728551336, "grad_norm": 1.2117236852645874, "learning_rate": 2.665390473118088e-06, "loss": 2.5056, "step": 245 }, { "epoch": 1.3670886075949367, "grad_norm": 1.2578994035720825, "learning_rate": 2.629103231340382e-06, "loss": 2.3728, "step": 246 }, { "epoch": 1.3727144866385372, "grad_norm": 1.5048015117645264, "learning_rate": 2.5930345792944513e-06, "loss": 2.3655, "step": 247 }, { "epoch": 1.3783403656821378, "grad_norm": 1.5193151235580444, "learning_rate": 2.5571875252013984e-06, "loss": 2.4273, "step": 248 }, { "epoch": 1.3839662447257384, "grad_norm": 1.2505041360855103, "learning_rate": 2.521565058800475e-06, "loss": 2.2828, "step": 249 }, { "epoch": 1.389592123769339, "grad_norm": 1.3817038536071777, "learning_rate": 2.486170151099725e-06, "loss": 2.2924, "step": 250 }, { "epoch": 1.3952180028129395, "grad_norm": 1.3338009119033813, "learning_rate": 2.4510057541281872e-06, "loss": 2.2852, "step": 251 }, { "epoch": 1.40084388185654, "grad_norm": 1.202316403388977, "learning_rate": 2.4160748006897018e-06, "loss": 2.4643, "step": 252 }, { "epoch": 1.4064697609001406, "grad_norm": 1.4298673868179321, "learning_rate": 2.3813802041182987e-06, "loss": 2.4521, "step": 253 }, { "epoch": 1.4120956399437412, "grad_norm": 1.219159722328186, "learning_rate": 2.346924858035216e-06, "loss": 2.155, "step": 254 }, { "epoch": 1.4177215189873418, "grad_norm": 1.4144155979156494, "learning_rate": 2.3127116361075712e-06, "loss": 2.1897, "step": 255 }, { "epoch": 1.4233473980309423, "grad_norm": 1.171831488609314, "learning_rate": 2.278743391808684e-06, "loss": 2.3303, "step": 256 }, { "epoch": 1.428973277074543, "grad_norm": 1.1133025884628296, "learning_rate": 2.2450229581800925e-06, "loss": 2.3888, "step": 257 }, { "epoch": 1.4345991561181435, "grad_norm": 1.4286714792251587, "learning_rate": 2.2115531475952678e-06, "loss": 2.1884, "step": 258 }, { "epoch": 1.440225035161744, "grad_norm": 1.2007641792297363, "learning_rate": 2.178336751525052e-06, "loss": 2.2624, "step": 259 }, { "epoch": 1.4458509142053446, "grad_norm": 1.3826512098312378, "learning_rate": 2.1453765403048525e-06, "loss": 2.1844, "step": 260 }, { "epoch": 1.4514767932489452, "grad_norm": 1.157139778137207, "learning_rate": 2.1126752629035753e-06, "loss": 2.1608, "step": 261 }, { "epoch": 1.4571026722925458, "grad_norm": 1.1881312131881714, "learning_rate": 2.080235646694363e-06, "loss": 2.1249, "step": 262 }, { "epoch": 1.4627285513361463, "grad_norm": 1.1453356742858887, "learning_rate": 2.0480603972271227e-06, "loss": 2.191, "step": 263 }, { "epoch": 1.4683544303797469, "grad_norm": 1.3118908405303955, "learning_rate": 2.016152198002876e-06, "loss": 2.3229, "step": 264 }, { "epoch": 1.4739803094233475, "grad_norm": 1.2369331121444702, "learning_rate": 1.98451371024994e-06, "loss": 2.1827, "step": 265 }, { "epoch": 1.479606188466948, "grad_norm": 1.3042728900909424, "learning_rate": 1.953147572701989e-06, "loss": 2.1371, "step": 266 }, { "epoch": 1.4852320675105486, "grad_norm": 1.2125662565231323, "learning_rate": 1.922056401377966e-06, "loss": 2.2515, "step": 267 }, { "epoch": 1.4908579465541492, "grad_norm": 1.2672710418701172, "learning_rate": 1.8912427893638996e-06, "loss": 2.1017, "step": 268 }, { "epoch": 1.4964838255977497, "grad_norm": 1.3755918741226196, "learning_rate": 1.8607093065966408e-06, "loss": 2.1161, "step": 269 }, { "epoch": 1.50210970464135, "grad_norm": 1.258540153503418, "learning_rate": 1.8304584996495205e-06, "loss": 2.0267, "step": 270 }, { "epoch": 1.50210970464135, "eval_loss": 2.398313283920288, "eval_runtime": 30.2825, "eval_samples_per_second": 1.684, "eval_steps_per_second": 1.684, "step": 270 }, { "epoch": 1.5077355836849509, "grad_norm": 1.3976160287857056, "learning_rate": 1.8004928915199515e-06, "loss": 2.3219, "step": 271 }, { "epoch": 1.5133614627285512, "grad_norm": 1.1738497018814087, "learning_rate": 1.7708149814190156e-06, "loss": 2.1721, "step": 272 }, { "epoch": 1.518987341772152, "grad_norm": 1.1697238683700562, "learning_rate": 1.7414272445630166e-06, "loss": 2.0663, "step": 273 }, { "epoch": 1.5246132208157523, "grad_norm": 1.1484073400497437, "learning_rate": 1.712332131967036e-06, "loss": 2.1742, "step": 274 }, { "epoch": 1.5302390998593531, "grad_norm": 1.393418312072754, "learning_rate": 1.6835320702405238e-06, "loss": 2.6228, "step": 275 }, { "epoch": 1.5358649789029535, "grad_norm": 1.3217144012451172, "learning_rate": 1.6550294613849016e-06, "loss": 2.2263, "step": 276 }, { "epoch": 1.5414908579465543, "grad_norm": 1.2675628662109375, "learning_rate": 1.6268266825932378e-06, "loss": 2.2772, "step": 277 }, { "epoch": 1.5471167369901546, "grad_norm": 1.3925647735595703, "learning_rate": 1.5989260860519723e-06, "loss": 2.2878, "step": 278 }, { "epoch": 1.5527426160337554, "grad_norm": 1.2969094514846802, "learning_rate": 1.5713299987447534e-06, "loss": 2.2664, "step": 279 }, { "epoch": 1.5583684950773558, "grad_norm": 1.312171220779419, "learning_rate": 1.5440407222583475e-06, "loss": 2.2109, "step": 280 }, { "epoch": 1.5639943741209565, "grad_norm": 1.2671453952789307, "learning_rate": 1.5170605325906863e-06, "loss": 2.2593, "step": 281 }, { "epoch": 1.5696202531645569, "grad_norm": 1.4005205631256104, "learning_rate": 1.4903916799610435e-06, "loss": 2.2503, "step": 282 }, { "epoch": 1.5752461322081577, "grad_norm": 1.098641037940979, "learning_rate": 1.46403638862236e-06, "loss": 2.1495, "step": 283 }, { "epoch": 1.580872011251758, "grad_norm": 1.4645534753799438, "learning_rate": 1.437996856675735e-06, "loss": 2.3019, "step": 284 }, { "epoch": 1.5864978902953588, "grad_norm": 1.2095060348510742, "learning_rate": 1.4122752558870933e-06, "loss": 2.2375, "step": 285 }, { "epoch": 1.5921237693389592, "grad_norm": 1.5466718673706055, "learning_rate": 1.3868737315060646e-06, "loss": 2.584, "step": 286 }, { "epoch": 1.5977496483825597, "grad_norm": 1.4787497520446777, "learning_rate": 1.3617944020870577e-06, "loss": 2.482, "step": 287 }, { "epoch": 1.6033755274261603, "grad_norm": 1.3512495756149292, "learning_rate": 1.3370393593125647e-06, "loss": 2.3235, "step": 288 }, { "epoch": 1.6090014064697609, "grad_norm": 1.1563678979873657, "learning_rate": 1.3126106678187156e-06, "loss": 2.2995, "step": 289 }, { "epoch": 1.6146272855133614, "grad_norm": 1.30403470993042, "learning_rate": 1.2885103650230806e-06, "loss": 2.3431, "step": 290 }, { "epoch": 1.620253164556962, "grad_norm": 1.24360990524292, "learning_rate": 1.2647404609547384e-06, "loss": 2.0579, "step": 291 }, { "epoch": 1.6258790436005626, "grad_norm": 1.4975757598876953, "learning_rate": 1.241302938086642e-06, "loss": 2.0818, "step": 292 }, { "epoch": 1.6315049226441631, "grad_norm": 1.355246901512146, "learning_rate": 1.2181997511702728e-06, "loss": 2.271, "step": 293 }, { "epoch": 1.6371308016877637, "grad_norm": 1.3052653074264526, "learning_rate": 1.1954328270726045e-06, "loss": 2.4885, "step": 294 }, { "epoch": 1.6427566807313643, "grad_norm": 1.5899144411087036, "learning_rate": 1.1730040646154045e-06, "loss": 2.3587, "step": 295 }, { "epoch": 1.6483825597749648, "grad_norm": 1.2158286571502686, "learning_rate": 1.150915334416865e-06, "loss": 2.197, "step": 296 }, { "epoch": 1.6540084388185654, "grad_norm": 1.7922595739364624, "learning_rate": 1.129168478735581e-06, "loss": 2.3705, "step": 297 }, { "epoch": 1.659634317862166, "grad_norm": 1.1858259439468384, "learning_rate": 1.1077653113169134e-06, "loss": 2.5342, "step": 298 }, { "epoch": 1.6652601969057665, "grad_norm": 1.1772092580795288, "learning_rate": 1.0867076172417105e-06, "loss": 2.3663, "step": 299 }, { "epoch": 1.6708860759493671, "grad_norm": 1.3147233724594116, "learning_rate": 1.0659971527774277e-06, "loss": 2.387, "step": 300 }, { "epoch": 1.6765119549929677, "grad_norm": 1.320970058441162, "learning_rate": 1.0456356452316515e-06, "loss": 2.3621, "step": 301 }, { "epoch": 1.6821378340365682, "grad_norm": 1.1753120422363281, "learning_rate": 1.0256247928080357e-06, "loss": 2.1657, "step": 302 }, { "epoch": 1.6877637130801688, "grad_norm": 1.3142890930175781, "learning_rate": 1.0059662644646723e-06, "loss": 2.3147, "step": 303 }, { "epoch": 1.6933895921237694, "grad_norm": 1.2477645874023438, "learning_rate": 9.86661699774887e-07, "loss": 2.331, "step": 304 }, { "epoch": 1.69901547116737, "grad_norm": 1.2975739240646362, "learning_rate": 9.677127087905032e-07, "loss": 2.2859, "step": 305 }, { "epoch": 1.7046413502109705, "grad_norm": 1.5711100101470947, "learning_rate": 9.491208719075537e-07, "loss": 2.1521, "step": 306 }, { "epoch": 1.7102672292545709, "grad_norm": 1.3837226629257202, "learning_rate": 9.308877397344751e-07, "loss": 2.3636, "step": 307 }, { "epoch": 1.7158931082981717, "grad_norm": 1.3110496997833252, "learning_rate": 9.130148329627774e-07, "loss": 2.1745, "step": 308 }, { "epoch": 1.721518987341772, "grad_norm": 1.1928850412368774, "learning_rate": 8.955036422402223e-07, "loss": 2.1995, "step": 309 }, { "epoch": 1.7271448663853728, "grad_norm": 1.4467804431915283, "learning_rate": 8.783556280464933e-07, "loss": 2.1655, "step": 310 }, { "epoch": 1.7327707454289731, "grad_norm": 1.1782878637313843, "learning_rate": 8.615722205713881e-07, "loss": 2.3282, "step": 311 }, { "epoch": 1.738396624472574, "grad_norm": 1.3934561014175415, "learning_rate": 8.451548195955409e-07, "loss": 2.3772, "step": 312 }, { "epoch": 1.7440225035161743, "grad_norm": 1.359525442123413, "learning_rate": 8.291047943736744e-07, "loss": 2.3182, "step": 313 }, { "epoch": 1.749648382559775, "grad_norm": 1.169758677482605, "learning_rate": 8.134234835203974e-07, "loss": 2.3455, "step": 314 }, { "epoch": 1.7552742616033754, "grad_norm": 1.1883350610733032, "learning_rate": 7.981121948985665e-07, "loss": 2.2055, "step": 315 }, { "epoch": 1.7552742616033754, "eval_loss": 2.396662473678589, "eval_runtime": 30.3177, "eval_samples_per_second": 1.682, "eval_steps_per_second": 1.682, "step": 315 }, { "epoch": 1.7609001406469762, "grad_norm": 1.4302018880844116, "learning_rate": 7.831722055102056e-07, "loss": 2.1707, "step": 316 }, { "epoch": 1.7665260196905765, "grad_norm": 1.388421893119812, "learning_rate": 7.686047613899948e-07, "loss": 2.1807, "step": 317 }, { "epoch": 1.7721518987341773, "grad_norm": 1.3130079507827759, "learning_rate": 7.544110775013554e-07, "loss": 2.2276, "step": 318 }, { "epoch": 1.7777777777777777, "grad_norm": 1.3371933698654175, "learning_rate": 7.405923376351153e-07, "loss": 2.3195, "step": 319 }, { "epoch": 1.7834036568213785, "grad_norm": 1.354970097541809, "learning_rate": 7.27149694310777e-07, "loss": 2.3254, "step": 320 }, { "epoch": 1.7890295358649788, "grad_norm": 1.139561414718628, "learning_rate": 7.140842686803959e-07, "loss": 2.1193, "step": 321 }, { "epoch": 1.7946554149085796, "grad_norm": 1.2641339302062988, "learning_rate": 7.013971504350722e-07, "loss": 2.2094, "step": 322 }, { "epoch": 1.80028129395218, "grad_norm": 1.221411943435669, "learning_rate": 6.890893977140682e-07, "loss": 2.1865, "step": 323 }, { "epoch": 1.8059071729957807, "grad_norm": 1.2744096517562866, "learning_rate": 6.771620370165577e-07, "loss": 2.3021, "step": 324 }, { "epoch": 1.811533052039381, "grad_norm": 1.2490217685699463, "learning_rate": 6.656160631160105e-07, "loss": 2.2437, "step": 325 }, { "epoch": 1.8171589310829819, "grad_norm": 1.1591171026229858, "learning_rate": 6.544524389772303e-07, "loss": 2.2542, "step": 326 }, { "epoch": 1.8227848101265822, "grad_norm": 1.3077995777130127, "learning_rate": 6.436720956760359e-07, "loss": 2.3848, "step": 327 }, { "epoch": 1.8284106891701828, "grad_norm": 1.0817915201187134, "learning_rate": 6.332759323216081e-07, "loss": 2.1434, "step": 328 }, { "epoch": 1.8340365682137834, "grad_norm": 1.2274130582809448, "learning_rate": 6.232648159815062e-07, "loss": 2.2062, "step": 329 }, { "epoch": 1.839662447257384, "grad_norm": 1.1238057613372803, "learning_rate": 6.136395816093466e-07, "loss": 2.168, "step": 330 }, { "epoch": 1.8452883263009845, "grad_norm": 1.325332760810852, "learning_rate": 6.044010319751662e-07, "loss": 2.2529, "step": 331 }, { "epoch": 1.850914205344585, "grad_norm": 1.4097862243652344, "learning_rate": 5.95549937598473e-07, "loss": 2.3565, "step": 332 }, { "epoch": 1.8565400843881856, "grad_norm": 1.4523735046386719, "learning_rate": 5.870870366839798e-07, "loss": 2.1707, "step": 333 }, { "epoch": 1.8621659634317862, "grad_norm": 1.3728147745132446, "learning_rate": 5.790130350600362e-07, "loss": 2.2998, "step": 334 }, { "epoch": 1.8677918424753868, "grad_norm": 1.1367465257644653, "learning_rate": 5.713286061197607e-07, "loss": 2.2088, "step": 335 }, { "epoch": 1.8734177215189873, "grad_norm": 1.291197419166565, "learning_rate": 5.640343907648791e-07, "loss": 2.4345, "step": 336 }, { "epoch": 1.879043600562588, "grad_norm": 1.4310858249664307, "learning_rate": 5.571309973522697e-07, "loss": 2.6208, "step": 337 }, { "epoch": 1.8846694796061885, "grad_norm": 1.2096583843231201, "learning_rate": 5.506190016432264e-07, "loss": 2.4734, "step": 338 }, { "epoch": 1.890295358649789, "grad_norm": 1.17626953125, "learning_rate": 5.444989467554386e-07, "loss": 2.5059, "step": 339 }, { "epoch": 1.8959212376933896, "grad_norm": 1.463349461555481, "learning_rate": 5.387713431176918e-07, "loss": 2.2987, "step": 340 }, { "epoch": 1.9015471167369902, "grad_norm": 1.414905309677124, "learning_rate": 5.334366684272987e-07, "loss": 2.3687, "step": 341 }, { "epoch": 1.9071729957805907, "grad_norm": 1.2944713830947876, "learning_rate": 5.28495367610257e-07, "loss": 2.2465, "step": 342 }, { "epoch": 1.9127988748241913, "grad_norm": 1.156260371208191, "learning_rate": 5.239478527841415e-07, "loss": 2.2922, "step": 343 }, { "epoch": 1.9184247538677919, "grad_norm": 1.5482733249664307, "learning_rate": 5.197945032237327e-07, "loss": 2.2515, "step": 344 }, { "epoch": 1.9240506329113924, "grad_norm": 1.29518461227417, "learning_rate": 5.160356653293837e-07, "loss": 2.5098, "step": 345 }, { "epoch": 1.929676511954993, "grad_norm": 1.286757230758667, "learning_rate": 5.126716525981297e-07, "loss": 2.3004, "step": 346 }, { "epoch": 1.9353023909985936, "grad_norm": 1.1425246000289917, "learning_rate": 5.097027455975421e-07, "loss": 2.2362, "step": 347 }, { "epoch": 1.9409282700421941, "grad_norm": 1.1543301343917847, "learning_rate": 5.071291919423276e-07, "loss": 2.3147, "step": 348 }, { "epoch": 1.9465541490857947, "grad_norm": 1.2874441146850586, "learning_rate": 5.049512062736767e-07, "loss": 2.2537, "step": 349 }, { "epoch": 1.952180028129395, "grad_norm": 1.476819396018982, "learning_rate": 5.03168970241363e-07, "loss": 2.1098, "step": 350 }, { "epoch": 1.9578059071729959, "grad_norm": 1.2125787734985352, "learning_rate": 5.017826324885912e-07, "loss": 2.3497, "step": 351 }, { "epoch": 1.9634317862165962, "grad_norm": 1.2305964231491089, "learning_rate": 5.007923086396018e-07, "loss": 2.3115, "step": 352 }, { "epoch": 1.969057665260197, "grad_norm": 1.3071091175079346, "learning_rate": 5.001980812900265e-07, "loss": 2.1634, "step": 353 }, { "epoch": 1.9746835443037973, "grad_norm": 1.4950101375579834, "learning_rate": 5.000000000000001e-07, "loss": 2.3289, "step": 354 } ], "logging_steps": 1, "max_steps": 354, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 177, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8528841455357133e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }