{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9746835443037973,
  "eval_steps": 45,
  "global_step": 354,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005625879043600563,
      "grad_norm": 1.739181637763977,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 2.4549,
      "step": 1
    },
    {
      "epoch": 0.005625879043600563,
      "eval_loss": 2.5061044692993164,
      "eval_runtime": 30.2114,
      "eval_samples_per_second": 1.688,
      "eval_steps_per_second": 1.688,
      "step": 1
    },
    {
      "epoch": 0.011251758087201125,
      "grad_norm": 1.461582899093628,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.4808,
      "step": 2
    },
    {
      "epoch": 0.016877637130801686,
      "grad_norm": 1.5786077976226807,
      "learning_rate": 3e-06,
      "loss": 2.54,
      "step": 3
    },
    {
      "epoch": 0.02250351617440225,
      "grad_norm": 1.605600118637085,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.6763,
      "step": 4
    },
    {
      "epoch": 0.02812939521800281,
      "grad_norm": 1.4658780097961426,
      "learning_rate": 5e-06,
      "loss": 2.4768,
      "step": 5
    },
    {
      "epoch": 0.03375527426160337,
      "grad_norm": 1.4920974969863892,
      "learning_rate": 6e-06,
      "loss": 2.383,
      "step": 6
    },
    {
      "epoch": 0.03938115330520394,
      "grad_norm": 1.2311428785324097,
      "learning_rate": 7e-06,
      "loss": 2.1965,
      "step": 7
    },
    {
      "epoch": 0.0450070323488045,
      "grad_norm": 1.442131519317627,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.3795,
      "step": 8
    },
    {
      "epoch": 0.05063291139240506,
      "grad_norm": 1.2747117280960083,
      "learning_rate": 9e-06,
      "loss": 2.5673,
      "step": 9
    },
    {
      "epoch": 0.05625879043600562,
      "grad_norm": 1.810668706893921,
      "learning_rate": 1e-05,
      "loss": 2.4721,
      "step": 10
    },
    {
      "epoch": 0.06188466947960619,
      "grad_norm": 1.694833755493164,
      "learning_rate": 9.999801918709974e-06,
      "loss": 2.4181,
      "step": 11
    },
    {
      "epoch": 0.06751054852320675,
      "grad_norm": 1.3476848602294922,
      "learning_rate": 9.999207691360399e-06,
      "loss": 2.5278,
      "step": 12
    },
    {
      "epoch": 0.07313642756680731,
      "grad_norm": 1.3295379877090454,
      "learning_rate": 9.99821736751141e-06,
      "loss": 2.3155,
      "step": 13
    },
    {
      "epoch": 0.07876230661040788,
      "grad_norm": 1.337768793106079,
      "learning_rate": 9.996831029758638e-06,
      "loss": 2.3802,
      "step": 14
    },
    {
      "epoch": 0.08438818565400844,
      "grad_norm": 1.4814488887786865,
      "learning_rate": 9.995048793726324e-06,
      "loss": 2.4514,
      "step": 15
    },
    {
      "epoch": 0.090014064697609,
      "grad_norm": 1.4958126544952393,
      "learning_rate": 9.992870808057673e-06,
      "loss": 2.2855,
      "step": 16
    },
    {
      "epoch": 0.09563994374120956,
      "grad_norm": 1.3594876527786255,
      "learning_rate": 9.99029725440246e-06,
      "loss": 2.485,
      "step": 17
    },
    {
      "epoch": 0.10126582278481013,
      "grad_norm": 1.2920221090316772,
      "learning_rate": 9.987328347401871e-06,
      "loss": 2.5267,
      "step": 18
    },
    {
      "epoch": 0.10689170182841069,
      "grad_norm": 1.289971947669983,
      "learning_rate": 9.983964334670618e-06,
      "loss": 2.2582,
      "step": 19
    },
    {
      "epoch": 0.11251758087201125,
      "grad_norm": 1.4366180896759033,
      "learning_rate": 9.980205496776269e-06,
      "loss": 2.318,
      "step": 20
    },
    {
      "epoch": 0.11814345991561181,
      "grad_norm": 1.4515198469161987,
      "learning_rate": 9.976052147215859e-06,
      "loss": 2.3388,
      "step": 21
    },
    {
      "epoch": 0.12376933895921238,
      "grad_norm": 1.3737822771072388,
      "learning_rate": 9.971504632389744e-06,
      "loss": 2.1809,
      "step": 22
    },
    {
      "epoch": 0.12939521800281295,
      "grad_norm": 1.692108392715454,
      "learning_rate": 9.966563331572703e-06,
      "loss": 2.5646,
      "step": 23
    },
    {
      "epoch": 0.1350210970464135,
      "grad_norm": 1.2992991209030151,
      "learning_rate": 9.96122865688231e-06,
      "loss": 2.2552,
      "step": 24
    },
    {
      "epoch": 0.14064697609001406,
      "grad_norm": 1.30594801902771,
      "learning_rate": 9.955501053244563e-06,
      "loss": 2.2872,
      "step": 25
    },
    {
      "epoch": 0.14627285513361463,
      "grad_norm": 1.5014636516571045,
      "learning_rate": 9.949380998356774e-06,
      "loss": 2.4255,
      "step": 26
    },
    {
      "epoch": 0.1518987341772152,
      "grad_norm": 1.2353171110153198,
      "learning_rate": 9.942869002647731e-06,
      "loss": 2.285,
      "step": 27
    },
    {
      "epoch": 0.15752461322081576,
      "grad_norm": 1.2573610544204712,
      "learning_rate": 9.935965609235122e-06,
      "loss": 2.2413,
      "step": 28
    },
    {
      "epoch": 0.1631504922644163,
      "grad_norm": 1.2308180332183838,
      "learning_rate": 9.92867139388024e-06,
      "loss": 2.254,
      "step": 29
    },
    {
      "epoch": 0.16877637130801687,
      "grad_norm": 1.3512063026428223,
      "learning_rate": 9.920986964939964e-06,
      "loss": 2.6116,
      "step": 30
    },
    {
      "epoch": 0.17440225035161744,
      "grad_norm": 1.4129055738449097,
      "learning_rate": 9.912912963316021e-06,
      "loss": 2.4745,
      "step": 31
    },
    {
      "epoch": 0.180028129395218,
      "grad_norm": 1.2212988138198853,
      "learning_rate": 9.904450062401528e-06,
      "loss": 2.3624,
      "step": 32
    },
    {
      "epoch": 0.18565400843881857,
      "grad_norm": 1.6333080530166626,
      "learning_rate": 9.895598968024834e-06,
      "loss": 2.3999,
      "step": 33
    },
    {
      "epoch": 0.19127988748241911,
      "grad_norm": 1.437427043914795,
      "learning_rate": 9.886360418390655e-06,
      "loss": 2.3292,
      "step": 34
    },
    {
      "epoch": 0.19690576652601968,
      "grad_norm": 1.333884596824646,
      "learning_rate": 9.876735184018495e-06,
      "loss": 2.2703,
      "step": 35
    },
    {
      "epoch": 0.20253164556962025,
      "grad_norm": 1.452011227607727,
      "learning_rate": 9.866724067678392e-06,
      "loss": 2.444,
      "step": 36
    },
    {
      "epoch": 0.20815752461322082,
      "grad_norm": 1.3540449142456055,
      "learning_rate": 9.856327904323965e-06,
      "loss": 2.2662,
      "step": 37
    },
    {
      "epoch": 0.21378340365682139,
      "grad_norm": 1.3395826816558838,
      "learning_rate": 9.84554756102277e-06,
      "loss": 2.4792,
      "step": 38
    },
    {
      "epoch": 0.21940928270042195,
      "grad_norm": 1.335483193397522,
      "learning_rate": 9.83438393688399e-06,
      "loss": 2.5329,
      "step": 39
    },
    {
      "epoch": 0.2250351617440225,
      "grad_norm": 1.4518929719924927,
      "learning_rate": 9.822837962983443e-06,
      "loss": 2.3844,
      "step": 40
    },
    {
      "epoch": 0.23066104078762306,
      "grad_norm": 1.4345314502716064,
      "learning_rate": 9.810910602285933e-06,
      "loss": 2.5071,
      "step": 41
    },
    {
      "epoch": 0.23628691983122363,
      "grad_norm": 1.2864302396774292,
      "learning_rate": 9.798602849564929e-06,
      "loss": 2.5834,
      "step": 42
    },
    {
      "epoch": 0.2419127988748242,
      "grad_norm": 1.319409728050232,
      "learning_rate": 9.785915731319605e-06,
      "loss": 2.282,
      "step": 43
    },
    {
      "epoch": 0.24753867791842477,
      "grad_norm": 1.2264432907104492,
      "learning_rate": 9.772850305689224e-06,
      "loss": 2.3332,
      "step": 44
    },
    {
      "epoch": 0.25316455696202533,
      "grad_norm": 1.2530275583267212,
      "learning_rate": 9.759407662364885e-06,
      "loss": 2.3516,
      "step": 45
    },
    {
      "epoch": 0.25316455696202533,
      "eval_loss": 2.431987762451172,
      "eval_runtime": 30.3221,
      "eval_samples_per_second": 1.682,
      "eval_steps_per_second": 1.682,
      "step": 45
    },
    {
      "epoch": 0.2587904360056259,
      "grad_norm": 1.2621686458587646,
      "learning_rate": 9.745588922498646e-06,
      "loss": 2.319,
      "step": 46
    },
    {
      "epoch": 0.26441631504922647,
      "grad_norm": 1.444759726524353,
      "learning_rate": 9.731395238610006e-06,
      "loss": 2.4669,
      "step": 47
    },
    {
      "epoch": 0.270042194092827,
      "grad_norm": 1.4075920581817627,
      "learning_rate": 9.716827794489795e-06,
      "loss": 2.4549,
      "step": 48
    },
    {
      "epoch": 0.27566807313642755,
      "grad_norm": 1.2430981397628784,
      "learning_rate": 9.701887805101434e-06,
      "loss": 2.2979,
      "step": 49
    },
    {
      "epoch": 0.2812939521800281,
      "grad_norm": 1.3289717435836792,
      "learning_rate": 9.686576516479604e-06,
      "loss": 2.3718,
      "step": 50
    },
    {
      "epoch": 0.2869198312236287,
      "grad_norm": 1.5157921314239502,
      "learning_rate": 9.670895205626327e-06,
      "loss": 2.3173,
      "step": 51
    },
    {
      "epoch": 0.29254571026722925,
      "grad_norm": 1.2412256002426147,
      "learning_rate": 9.65484518040446e-06,
      "loss": 2.5968,
      "step": 52
    },
    {
      "epoch": 0.2981715893108298,
      "grad_norm": 1.218125581741333,
      "learning_rate": 9.638427779428613e-06,
      "loss": 2.3775,
      "step": 53
    },
    {
      "epoch": 0.3037974683544304,
      "grad_norm": 1.5379339456558228,
      "learning_rate": 9.621644371953507e-06,
      "loss": 2.4156,
      "step": 54
    },
    {
      "epoch": 0.30942334739803096,
      "grad_norm": 1.3565013408660889,
      "learning_rate": 9.604496357759778e-06,
      "loss": 2.3882,
      "step": 55
    },
    {
      "epoch": 0.3150492264416315,
      "grad_norm": 1.5721348524093628,
      "learning_rate": 9.586985167037224e-06,
      "loss": 2.3433,
      "step": 56
    },
    {
      "epoch": 0.3206751054852321,
      "grad_norm": 1.2288527488708496,
      "learning_rate": 9.569112260265527e-06,
      "loss": 2.2329,
      "step": 57
    },
    {
      "epoch": 0.3263009845288326,
      "grad_norm": 1.2658888101577759,
      "learning_rate": 9.550879128092447e-06,
      "loss": 2.5814,
      "step": 58
    },
    {
      "epoch": 0.3319268635724332,
      "grad_norm": 1.073593020439148,
      "learning_rate": 9.532287291209498e-06,
      "loss": 2.2243,
      "step": 59
    },
    {
      "epoch": 0.33755274261603374,
      "grad_norm": 1.267996907234192,
      "learning_rate": 9.513338300225116e-06,
      "loss": 2.5315,
      "step": 60
    },
    {
      "epoch": 0.3431786216596343,
      "grad_norm": 1.3250463008880615,
      "learning_rate": 9.49403373553533e-06,
      "loss": 2.3651,
      "step": 61
    },
    {
      "epoch": 0.3488045007032349,
      "grad_norm": 1.6996604204177856,
      "learning_rate": 9.474375207191965e-06,
      "loss": 2.5536,
      "step": 62
    },
    {
      "epoch": 0.35443037974683544,
      "grad_norm": 1.1338461637496948,
      "learning_rate": 9.454364354768351e-06,
      "loss": 2.2356,
      "step": 63
    },
    {
      "epoch": 0.360056258790436,
      "grad_norm": 1.3313428163528442,
      "learning_rate": 9.434002847222574e-06,
      "loss": 2.2594,
      "step": 64
    },
    {
      "epoch": 0.3656821378340366,
      "grad_norm": 1.2771188020706177,
      "learning_rate": 9.41329238275829e-06,
      "loss": 2.5237,
      "step": 65
    },
    {
      "epoch": 0.37130801687763715,
      "grad_norm": 1.355175495147705,
      "learning_rate": 9.392234688683088e-06,
      "loss": 2.3274,
      "step": 66
    },
    {
      "epoch": 0.3769338959212377,
      "grad_norm": 1.2518092393875122,
      "learning_rate": 9.37083152126442e-06,
      "loss": 2.4285,
      "step": 67
    },
    {
      "epoch": 0.38255977496483823,
      "grad_norm": 1.3508447408676147,
      "learning_rate": 9.349084665583136e-06,
      "loss": 2.1645,
      "step": 68
    },
    {
      "epoch": 0.3881856540084388,
      "grad_norm": 1.5284922122955322,
      "learning_rate": 9.326995935384594e-06,
      "loss": 2.6101,
      "step": 69
    },
    {
      "epoch": 0.39381153305203936,
      "grad_norm": 1.6654582023620605,
      "learning_rate": 9.304567172927397e-06,
      "loss": 2.3393,
      "step": 70
    },
    {
      "epoch": 0.39943741209563993,
      "grad_norm": 1.2337652444839478,
      "learning_rate": 9.281800248829728e-06,
      "loss": 2.4461,
      "step": 71
    },
    {
      "epoch": 0.4050632911392405,
      "grad_norm": 1.419947624206543,
      "learning_rate": 9.25869706191336e-06,
      "loss": 2.4146,
      "step": 72
    },
    {
      "epoch": 0.41068917018284107,
      "grad_norm": 1.1560102701187134,
      "learning_rate": 9.235259539045263e-06,
      "loss": 2.2357,
      "step": 73
    },
    {
      "epoch": 0.41631504922644164,
      "grad_norm": 1.3465362787246704,
      "learning_rate": 9.21148963497692e-06,
      "loss": 2.3877,
      "step": 74
    },
    {
      "epoch": 0.4219409282700422,
      "grad_norm": 1.37497079372406,
      "learning_rate": 9.187389332181285e-06,
      "loss": 2.4105,
      "step": 75
    },
    {
      "epoch": 0.42756680731364277,
      "grad_norm": 1.1877851486206055,
      "learning_rate": 9.162960640687436e-06,
      "loss": 2.3803,
      "step": 76
    },
    {
      "epoch": 0.43319268635724334,
      "grad_norm": 1.1579440832138062,
      "learning_rate": 9.138205597912943e-06,
      "loss": 2.2938,
      "step": 77
    },
    {
      "epoch": 0.4388185654008439,
      "grad_norm": 1.3359930515289307,
      "learning_rate": 9.113126268493937e-06,
      "loss": 2.5006,
      "step": 78
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 1.3176641464233398,
      "learning_rate": 9.08772474411291e-06,
      "loss": 2.3957,
      "step": 79
    },
    {
      "epoch": 0.450070323488045,
      "grad_norm": 1.2082509994506836,
      "learning_rate": 9.062003143324267e-06,
      "loss": 2.273,
      "step": 80
    },
    {
      "epoch": 0.45569620253164556,
      "grad_norm": 1.3498975038528442,
      "learning_rate": 9.035963611377641e-06,
      "loss": 2.4793,
      "step": 81
    },
    {
      "epoch": 0.4613220815752461,
      "grad_norm": 1.2176086902618408,
      "learning_rate": 9.009608320038959e-06,
      "loss": 2.2616,
      "step": 82
    },
    {
      "epoch": 0.4669479606188467,
      "grad_norm": 1.2488033771514893,
      "learning_rate": 8.982939467409314e-06,
      "loss": 2.3263,
      "step": 83
    },
    {
      "epoch": 0.47257383966244726,
      "grad_norm": 1.106594443321228,
      "learning_rate": 8.955959277741654e-06,
      "loss": 2.2856,
      "step": 84
    },
    {
      "epoch": 0.4781997187060478,
      "grad_norm": 1.2389321327209473,
      "learning_rate": 8.928670001255248e-06,
      "loss": 2.4291,
      "step": 85
    },
    {
      "epoch": 0.4838255977496484,
      "grad_norm": 1.1330113410949707,
      "learning_rate": 8.901073913948028e-06,
      "loss": 2.13,
      "step": 86
    },
    {
      "epoch": 0.48945147679324896,
      "grad_norm": 1.1081808805465698,
      "learning_rate": 8.873173317406764e-06,
      "loss": 2.2318,
      "step": 87
    },
    {
      "epoch": 0.49507735583684953,
      "grad_norm": 1.2427239418029785,
      "learning_rate": 8.844970538615099e-06,
      "loss": 2.6752,
      "step": 88
    },
    {
      "epoch": 0.5007032348804501,
      "grad_norm": 1.2715911865234375,
      "learning_rate": 8.816467929759476e-06,
      "loss": 2.5999,
      "step": 89
    },
    {
      "epoch": 0.5063291139240507,
      "grad_norm": 1.282126545906067,
      "learning_rate": 8.787667868032964e-06,
      "loss": 2.6222,
      "step": 90
    },
    {
      "epoch": 0.5063291139240507,
      "eval_loss": 2.413635492324829,
      "eval_runtime": 30.2824,
      "eval_samples_per_second": 1.684,
      "eval_steps_per_second": 1.684,
      "step": 90
    },
    {
      "epoch": 0.5119549929676512,
      "grad_norm": 1.25644850730896,
      "learning_rate": 8.758572755436986e-06,
      "loss": 2.5696,
      "step": 91
    },
    {
      "epoch": 0.5175808720112518,
      "grad_norm": 1.3811650276184082,
      "learning_rate": 8.729185018580984e-06,
      "loss": 2.2826,
      "step": 92
    },
    {
      "epoch": 0.5232067510548524,
      "grad_norm": 1.872058629989624,
      "learning_rate": 8.69950710848005e-06,
      "loss": 2.4419,
      "step": 93
    },
    {
      "epoch": 0.5288326300984529,
      "grad_norm": 1.2003915309906006,
      "learning_rate": 8.669541500350481e-06,
      "loss": 2.2919,
      "step": 94
    },
    {
      "epoch": 0.5344585091420534,
      "grad_norm": 1.352100133895874,
      "learning_rate": 8.63929069340336e-06,
      "loss": 2.4084,
      "step": 95
    },
    {
      "epoch": 0.540084388185654,
      "grad_norm": 1.4791978597640991,
      "learning_rate": 8.608757210636101e-06,
      "loss": 2.2582,
      "step": 96
    },
    {
      "epoch": 0.5457102672292545,
      "grad_norm": 1.5740419626235962,
      "learning_rate": 8.577943598622037e-06,
      "loss": 2.2898,
      "step": 97
    },
    {
      "epoch": 0.5513361462728551,
      "grad_norm": 1.1978390216827393,
      "learning_rate": 8.546852427298013e-06,
      "loss": 2.2277,
      "step": 98
    },
    {
      "epoch": 0.5569620253164557,
      "grad_norm": 1.4278184175491333,
      "learning_rate": 8.515486289750061e-06,
      "loss": 2.3385,
      "step": 99
    },
    {
      "epoch": 0.5625879043600562,
      "grad_norm": 1.2246702909469604,
      "learning_rate": 8.483847801997126e-06,
      "loss": 2.3997,
      "step": 100
    },
    {
      "epoch": 0.5682137834036568,
      "grad_norm": 1.261964201927185,
      "learning_rate": 8.451939602772877e-06,
      "loss": 2.4312,
      "step": 101
    },
    {
      "epoch": 0.5738396624472574,
      "grad_norm": 1.4236760139465332,
      "learning_rate": 8.419764353305638e-06,
      "loss": 2.3398,
      "step": 102
    },
    {
      "epoch": 0.5794655414908579,
      "grad_norm": 1.232182502746582,
      "learning_rate": 8.387324737096427e-06,
      "loss": 2.5426,
      "step": 103
    },
    {
      "epoch": 0.5850914205344585,
      "grad_norm": 1.2140839099884033,
      "learning_rate": 8.35462345969515e-06,
      "loss": 2.6031,
      "step": 104
    },
    {
      "epoch": 0.5907172995780591,
      "grad_norm": 1.2126487493515015,
      "learning_rate": 8.321663248474949e-06,
      "loss": 2.2923,
      "step": 105
    },
    {
      "epoch": 0.5963431786216596,
      "grad_norm": 1.3279041051864624,
      "learning_rate": 8.288446852404735e-06,
      "loss": 2.2648,
      "step": 106
    },
    {
      "epoch": 0.6019690576652602,
      "grad_norm": 1.9618940353393555,
      "learning_rate": 8.254977041819909e-06,
      "loss": 2.3423,
      "step": 107
    },
    {
      "epoch": 0.6075949367088608,
      "grad_norm": 1.2524311542510986,
      "learning_rate": 8.221256608191316e-06,
      "loss": 2.6281,
      "step": 108
    },
    {
      "epoch": 0.6132208157524613,
      "grad_norm": 1.4134135246276855,
      "learning_rate": 8.18728836389243e-06,
      "loss": 2.3007,
      "step": 109
    },
    {
      "epoch": 0.6188466947960619,
      "grad_norm": 1.42375910282135,
      "learning_rate": 8.153075141964785e-06,
      "loss": 2.3253,
      "step": 110
    },
    {
      "epoch": 0.6244725738396625,
      "grad_norm": 1.2591787576675415,
      "learning_rate": 8.118619795881702e-06,
      "loss": 2.658,
      "step": 111
    },
    {
      "epoch": 0.630098452883263,
      "grad_norm": 1.2173055410385132,
      "learning_rate": 8.083925199310299e-06,
      "loss": 2.339,
      "step": 112
    },
    {
      "epoch": 0.6357243319268636,
      "grad_norm": 1.8275774717330933,
      "learning_rate": 8.048994245871813e-06,
      "loss": 2.415,
      "step": 113
    },
    {
      "epoch": 0.6413502109704642,
      "grad_norm": 1.2721606492996216,
      "learning_rate": 8.013829848900278e-06,
      "loss": 2.273,
      "step": 114
    },
    {
      "epoch": 0.6469760900140648,
      "grad_norm": 1.4262254238128662,
      "learning_rate": 7.978434941199526e-06,
      "loss": 2.5605,
      "step": 115
    },
    {
      "epoch": 0.6526019690576652,
      "grad_norm": 1.2906497716903687,
      "learning_rate": 7.942812474798602e-06,
      "loss": 2.3066,
      "step": 116
    },
    {
      "epoch": 0.6582278481012658,
      "grad_norm": 1.2999948263168335,
      "learning_rate": 7.90696542070555e-06,
      "loss": 2.2782,
      "step": 117
    },
    {
      "epoch": 0.6638537271448663,
      "grad_norm": 1.5385446548461914,
      "learning_rate": 7.87089676865962e-06,
      "loss": 2.3383,
      "step": 118
    },
    {
      "epoch": 0.6694796061884669,
      "grad_norm": 1.2246472835540771,
      "learning_rate": 7.834609526881914e-06,
      "loss": 2.5499,
      "step": 119
    },
    {
      "epoch": 0.6751054852320675,
      "grad_norm": 1.6999132633209229,
      "learning_rate": 7.798106721824504e-06,
      "loss": 2.3576,
      "step": 120
    },
    {
      "epoch": 0.680731364275668,
      "grad_norm": 1.2035925388336182,
      "learning_rate": 7.761391397918005e-06,
      "loss": 2.33,
      "step": 121
    },
    {
      "epoch": 0.6863572433192686,
      "grad_norm": 1.3971892595291138,
      "learning_rate": 7.72446661731767e-06,
      "loss": 2.3338,
      "step": 122
    },
    {
      "epoch": 0.6919831223628692,
      "grad_norm": 1.4358816146850586,
      "learning_rate": 7.687335459647993e-06,
      "loss": 2.5577,
      "step": 123
    },
    {
      "epoch": 0.6976090014064698,
      "grad_norm": 1.352947473526001,
      "learning_rate": 7.650001021745866e-06,
      "loss": 2.2954,
      "step": 124
    },
    {
      "epoch": 0.7032348804500703,
      "grad_norm": 1.4151978492736816,
      "learning_rate": 7.612466417402282e-06,
      "loss": 2.6017,
      "step": 125
    },
    {
      "epoch": 0.7088607594936709,
      "grad_norm": 1.1894267797470093,
      "learning_rate": 7.574734777102657e-06,
      "loss": 2.2623,
      "step": 126
    },
    {
      "epoch": 0.7144866385372715,
      "grad_norm": 1.1567312479019165,
      "learning_rate": 7.536809247765718e-06,
      "loss": 2.2232,
      "step": 127
    },
    {
      "epoch": 0.720112517580872,
      "grad_norm": 1.3824794292449951,
      "learning_rate": 7.498692992481056e-06,
      "loss": 2.2699,
      "step": 128
    },
    {
      "epoch": 0.7257383966244726,
      "grad_norm": 1.203261375427246,
      "learning_rate": 7.4603891902453115e-06,
      "loss": 2.3539,
      "step": 129
    },
    {
      "epoch": 0.7313642756680732,
      "grad_norm": 1.6249403953552246,
      "learning_rate": 7.421901035697033e-06,
      "loss": 2.5196,
      "step": 130
    },
    {
      "epoch": 0.7369901547116737,
      "grad_norm": 1.3398568630218506,
      "learning_rate": 7.383231738850246e-06,
      "loss": 2.341,
      "step": 131
    },
    {
      "epoch": 0.7426160337552743,
      "grad_norm": 1.148158311843872,
      "learning_rate": 7.34438452482672e-06,
      "loss": 2.2581,
      "step": 132
    },
    {
      "epoch": 0.7482419127988749,
      "grad_norm": 1.1582486629486084,
      "learning_rate": 7.305362633586984e-06,
      "loss": 2.3726,
      "step": 133
    },
    {
      "epoch": 0.7538677918424754,
      "grad_norm": 1.2645725011825562,
      "learning_rate": 7.266169319660123e-06,
      "loss": 2.2198,
      "step": 134
    },
    {
      "epoch": 0.759493670886076,
      "grad_norm": 1.177494764328003,
      "learning_rate": 7.226807851872312e-06,
      "loss": 2.319,
      "step": 135
    },
    {
      "epoch": 0.759493670886076,
      "eval_loss": 2.4054980278015137,
      "eval_runtime": 30.3027,
      "eval_samples_per_second": 1.683,
      "eval_steps_per_second": 1.683,
      "step": 135
    },
    {
      "epoch": 0.7651195499296765,
      "grad_norm": 1.6996376514434814,
      "learning_rate": 7.187281513074214e-06,
      "loss": 2.2793,
      "step": 136
    },
    {
      "epoch": 0.770745428973277,
      "grad_norm": 1.2821097373962402,
      "learning_rate": 7.147593599867166e-06,
      "loss": 2.2482,
      "step": 137
    },
    {
      "epoch": 0.7763713080168776,
      "grad_norm": 1.4424350261688232,
      "learning_rate": 7.107747422328241e-06,
      "loss": 2.3816,
      "step": 138
    },
    {
      "epoch": 0.7819971870604782,
      "grad_norm": 1.3601067066192627,
      "learning_rate": 7.067746303734178e-06,
      "loss": 2.5607,
      "step": 139
    },
    {
      "epoch": 0.7876230661040787,
      "grad_norm": 1.5715044736862183,
      "learning_rate": 7.0275935802842036e-06,
      "loss": 2.2028,
      "step": 140
    },
    {
      "epoch": 0.7932489451476793,
      "grad_norm": 1.5017560720443726,
      "learning_rate": 6.9872926008217976e-06,
      "loss": 2.5636,
      "step": 141
    },
    {
      "epoch": 0.7988748241912799,
      "grad_norm": 1.2483998537063599,
      "learning_rate": 6.9468467265553805e-06,
      "loss": 2.218,
      "step": 142
    },
    {
      "epoch": 0.8045007032348804,
      "grad_norm": 1.2412904500961304,
      "learning_rate": 6.906259330777986e-06,
      "loss": 2.318,
      "step": 143
    },
    {
      "epoch": 0.810126582278481,
      "grad_norm": 1.4541116952896118,
      "learning_rate": 6.865533798585915e-06,
      "loss": 2.2498,
      "step": 144
    },
    {
      "epoch": 0.8157524613220816,
      "grad_norm": 1.3769111633300781,
      "learning_rate": 6.824673526596411e-06,
      "loss": 2.5446,
      "step": 145
    },
    {
      "epoch": 0.8213783403656821,
      "grad_norm": 1.3049566745758057,
      "learning_rate": 6.7836819226643705e-06,
      "loss": 2.5029,
      "step": 146
    },
    {
      "epoch": 0.8270042194092827,
      "grad_norm": 1.5045892000198364,
      "learning_rate": 6.7425624055981284e-06,
      "loss": 2.5418,
      "step": 147
    },
    {
      "epoch": 0.8326300984528833,
      "grad_norm": 1.375936508178711,
      "learning_rate": 6.701318404874308e-06,
      "loss": 2.4155,
      "step": 148
    },
    {
      "epoch": 0.8382559774964838,
      "grad_norm": 1.4569542407989502,
      "learning_rate": 6.659953360351803e-06,
      "loss": 2.483,
      "step": 149
    },
    {
      "epoch": 0.8438818565400844,
      "grad_norm": 1.4985612630844116,
      "learning_rate": 6.61847072198488e-06,
      "loss": 2.3627,
      "step": 150
    },
    {
      "epoch": 0.849507735583685,
      "grad_norm": 1.1635637283325195,
      "learning_rate": 6.576873949535439e-06,
      "loss": 2.4863,
      "step": 151
    },
    {
      "epoch": 0.8551336146272855,
      "grad_norm": 1.309057354927063,
      "learning_rate": 6.535166512284473e-06,
      "loss": 2.4227,
      "step": 152
    },
    {
      "epoch": 0.8607594936708861,
      "grad_norm": 1.1509865522384644,
      "learning_rate": 6.493351888742706e-06,
      "loss": 2.2121,
      "step": 153
    },
    {
      "epoch": 0.8663853727144867,
      "grad_norm": 1.3486562967300415,
      "learning_rate": 6.4514335663604834e-06,
      "loss": 2.3682,
      "step": 154
    },
    {
      "epoch": 0.8720112517580872,
      "grad_norm": 1.177566409111023,
      "learning_rate": 6.409415041236912e-06,
      "loss": 2.439,
      "step": 155
    },
    {
      "epoch": 0.8776371308016878,
      "grad_norm": 1.3212536573410034,
      "learning_rate": 6.367299817828271e-06,
      "loss": 2.3195,
      "step": 156
    },
    {
      "epoch": 0.8832630098452883,
      "grad_norm": 1.3443726301193237,
      "learning_rate": 6.325091408655728e-06,
      "loss": 2.3453,
      "step": 157
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.2387938499450684,
      "learning_rate": 6.282793334012397e-06,
      "loss": 2.3215,
      "step": 158
    },
    {
      "epoch": 0.8945147679324894,
      "grad_norm": 1.1146687269210815,
      "learning_rate": 6.240409121669726e-06,
      "loss": 2.5072,
      "step": 159
    },
    {
      "epoch": 0.90014064697609,
      "grad_norm": 1.4473936557769775,
      "learning_rate": 6.1979423065832766e-06,
      "loss": 2.3696,
      "step": 160
    },
    {
      "epoch": 0.9057665260196905,
      "grad_norm": 1.419673204421997,
      "learning_rate": 6.155396430597896e-06,
      "loss": 2.4739,
      "step": 161
    },
    {
      "epoch": 0.9113924050632911,
      "grad_norm": 1.3161817789077759,
      "learning_rate": 6.112775042152324e-06,
      "loss": 2.2546,
      "step": 162
    },
    {
      "epoch": 0.9170182841068917,
      "grad_norm": 1.2259889841079712,
      "learning_rate": 6.070081695983236e-06,
      "loss": 2.3529,
      "step": 163
    },
    {
      "epoch": 0.9226441631504922,
      "grad_norm": 1.190901756286621,
      "learning_rate": 6.0273199528287695e-06,
      "loss": 2.3558,
      "step": 164
    },
    {
      "epoch": 0.9282700421940928,
      "grad_norm": 1.6890244483947754,
      "learning_rate": 5.984493379131559e-06,
      "loss": 2.5214,
      "step": 165
    },
    {
      "epoch": 0.9338959212376934,
      "grad_norm": 1.6165000200271606,
      "learning_rate": 5.9416055467412745e-06,
      "loss": 2.286,
      "step": 166
    },
    {
      "epoch": 0.939521800281294,
      "grad_norm": 1.2835633754730225,
      "learning_rate": 5.898660032616721e-06,
      "loss": 2.2492,
      "step": 167
    },
    {
      "epoch": 0.9451476793248945,
      "grad_norm": 1.2182981967926025,
      "learning_rate": 5.855660418527513e-06,
      "loss": 2.2491,
      "step": 168
    },
    {
      "epoch": 0.9507735583684951,
      "grad_norm": 1.4453896284103394,
      "learning_rate": 5.812610290755352e-06,
      "loss": 2.3408,
      "step": 169
    },
    {
      "epoch": 0.9563994374120957,
      "grad_norm": 1.1564884185791016,
      "learning_rate": 5.769513239794905e-06,
      "loss": 2.5319,
      "step": 170
    },
    {
      "epoch": 0.9620253164556962,
      "grad_norm": 1.4496843814849854,
      "learning_rate": 5.7263728600543636e-06,
      "loss": 2.4306,
      "step": 171
    },
    {
      "epoch": 0.9676511954992968,
      "grad_norm": 1.0973615646362305,
      "learning_rate": 5.683192749555652e-06,
      "loss": 2.2153,
      "step": 172
    },
    {
      "epoch": 0.9732770745428974,
      "grad_norm": 1.3325732946395874,
      "learning_rate": 5.639976509634346e-06,
      "loss": 2.4422,
      "step": 173
    },
    {
      "epoch": 0.9789029535864979,
      "grad_norm": 1.1608248949050903,
      "learning_rate": 5.596727744639311e-06,
      "loss": 2.1957,
      "step": 174
    },
    {
      "epoch": 0.9845288326300985,
      "grad_norm": 1.2780177593231201,
      "learning_rate": 5.5534500616320885e-06,
      "loss": 2.3345,
      "step": 175
    },
    {
      "epoch": 0.9901547116736991,
      "grad_norm": 1.395959734916687,
      "learning_rate": 5.510147070086057e-06,
      "loss": 2.4131,
      "step": 176
    },
    {
      "epoch": 0.9957805907172996,
      "grad_norm": 1.3863272666931152,
      "learning_rate": 5.466822381585402e-06,
      "loss": 2.3061,
      "step": 177
    },
    {
      "epoch": 1.0014064697609002,
      "grad_norm": 1.2258063554763794,
      "learning_rate": 5.4234796095238804e-06,
      "loss": 2.3713,
      "step": 178
    },
    {
      "epoch": 1.0070323488045008,
      "grad_norm": 1.1133767366409302,
      "learning_rate": 5.380122368803476e-06,
      "loss": 2.1674,
      "step": 179
    },
    {
      "epoch": 1.0126582278481013,
      "grad_norm": 2.211198091506958,
      "learning_rate": 5.3367542755328935e-06,
      "loss": 2.4607,
      "step": 180
    },
    {
      "epoch": 1.0126582278481013,
      "eval_loss": 2.400386095046997,
      "eval_runtime": 30.3125,
      "eval_samples_per_second": 1.682,
      "eval_steps_per_second": 1.682,
      "step": 180
    },
    {
      "epoch": 1.0014064697609002,
      "grad_norm": 1.428281307220459,
      "learning_rate": 5.293378946725968e-06,
      "loss": 2.6363,
      "step": 181
    },
    {
      "epoch": 1.0070323488045008,
      "grad_norm": 1.3397562503814697,
      "learning_rate": 5.2500000000000006e-06,
      "loss": 2.2154,
      "step": 182
    },
    {
      "epoch": 1.0126582278481013,
      "grad_norm": 1.2449477910995483,
      "learning_rate": 5.206621053274032e-06,
      "loss": 2.2516,
      "step": 183
    },
    {
      "epoch": 1.018284106891702,
      "grad_norm": 1.219135046005249,
      "learning_rate": 5.1632457244671076e-06,
      "loss": 2.2162,
      "step": 184
    },
    {
      "epoch": 1.0239099859353025,
      "grad_norm": 1.5311412811279297,
      "learning_rate": 5.119877631196525e-06,
      "loss": 2.1986,
      "step": 185
    },
    {
      "epoch": 1.029535864978903,
      "grad_norm": 1.0927857160568237,
      "learning_rate": 5.076520390476121e-06,
      "loss": 2.299,
      "step": 186
    },
    {
      "epoch": 1.0351617440225036,
      "grad_norm": 1.161383867263794,
      "learning_rate": 5.0331776184146e-06,
      "loss": 2.4993,
      "step": 187
    },
    {
      "epoch": 1.0407876230661042,
      "grad_norm": 1.392291784286499,
      "learning_rate": 4.989852929913943e-06,
      "loss": 2.3965,
      "step": 188
    },
    {
      "epoch": 1.0464135021097047,
      "grad_norm": 1.0993740558624268,
      "learning_rate": 4.946549938367912e-06,
      "loss": 2.2925,
      "step": 189
    },
    {
      "epoch": 1.0520393811533053,
      "grad_norm": 1.3068045377731323,
      "learning_rate": 4.9032722553606895e-06,
      "loss": 2.279,
      "step": 190
    },
    {
      "epoch": 1.0576652601969059,
      "grad_norm": 1.117245078086853,
      "learning_rate": 4.860023490365654e-06,
      "loss": 2.0698,
      "step": 191
    },
    {
      "epoch": 1.0632911392405062,
      "grad_norm": 1.3679758310317993,
      "learning_rate": 4.8168072504443484e-06,
      "loss": 2.1846,
      "step": 192
    },
    {
      "epoch": 1.0689170182841068,
      "grad_norm": 1.3210700750350952,
      "learning_rate": 4.773627139945638e-06,
      "loss": 2.315,
      "step": 193
    },
    {
      "epoch": 1.0745428973277074,
      "grad_norm": 1.082641363143921,
      "learning_rate": 4.730486760205098e-06,
      "loss": 2.3924,
      "step": 194
    },
    {
      "epoch": 1.080168776371308,
      "grad_norm": 1.1435967683792114,
      "learning_rate": 4.687389709244651e-06,
      "loss": 2.421,
      "step": 195
    },
    {
      "epoch": 1.0857946554149085,
      "grad_norm": 1.8713021278381348,
      "learning_rate": 4.644339581472489e-06,
      "loss": 2.1892,
      "step": 196
    },
    {
      "epoch": 1.091420534458509,
      "grad_norm": 1.304328203201294,
      "learning_rate": 4.601339967383282e-06,
      "loss": 2.1397,
      "step": 197
    },
    {
      "epoch": 1.0970464135021096,
      "grad_norm": 1.1662545204162598,
      "learning_rate": 4.558394453258728e-06,
      "loss": 2.2044,
      "step": 198
    },
    {
      "epoch": 1.1026722925457102,
      "grad_norm": 1.6519335508346558,
      "learning_rate": 4.515506620868443e-06,
      "loss": 2.2881,
      "step": 199
    },
    {
      "epoch": 1.1082981715893108,
      "grad_norm": 1.1646329164505005,
      "learning_rate": 4.4726800471712325e-06,
      "loss": 2.4505,
      "step": 200
    },
    {
      "epoch": 1.1139240506329113,
      "grad_norm": 1.3433741331100464,
      "learning_rate": 4.429918304016766e-06,
      "loss": 2.1556,
      "step": 201
    },
    {
      "epoch": 1.119549929676512,
      "grad_norm": 1.093310832977295,
      "learning_rate": 4.3872249578476774e-06,
      "loss": 2.2014,
      "step": 202
    },
    {
      "epoch": 1.1251758087201125,
      "grad_norm": 1.1493537425994873,
      "learning_rate": 4.344603569402106e-06,
      "loss": 2.3267,
      "step": 203
    },
    {
      "epoch": 1.130801687763713,
      "grad_norm": 1.2506024837493896,
      "learning_rate": 4.302057693416725e-06,
      "loss": 2.2444,
      "step": 204
    },
    {
      "epoch": 1.1364275668073136,
      "grad_norm": 1.3935209512710571,
      "learning_rate": 4.259590878330276e-06,
      "loss": 2.2121,
      "step": 205
    },
    {
      "epoch": 1.1420534458509142,
      "grad_norm": 1.213395595550537,
      "learning_rate": 4.217206665987605e-06,
      "loss": 2.2528,
      "step": 206
    },
    {
      "epoch": 1.1476793248945147,
      "grad_norm": 1.9364022016525269,
      "learning_rate": 4.174908591344273e-06,
      "loss": 2.4659,
      "step": 207
    },
    {
      "epoch": 1.1533052039381153,
      "grad_norm": 1.194150447845459,
      "learning_rate": 4.132700182171731e-06,
      "loss": 2.2238,
      "step": 208
    },
    {
      "epoch": 1.1589310829817159,
      "grad_norm": 1.4336832761764526,
      "learning_rate": 4.090584958763088e-06,
      "loss": 2.2914,
      "step": 209
    },
    {
      "epoch": 1.1645569620253164,
      "grad_norm": 1.6567353010177612,
      "learning_rate": 4.048566433639516e-06,
      "loss": 2.4391,
      "step": 210
    },
    {
      "epoch": 1.170182841068917,
      "grad_norm": 1.426830768585205,
      "learning_rate": 4.006648111257294e-06,
      "loss": 2.4198,
      "step": 211
    },
    {
      "epoch": 1.1758087201125176,
      "grad_norm": 1.2131551504135132,
      "learning_rate": 3.964833487715527e-06,
      "loss": 2.3363,
      "step": 212
    },
    {
      "epoch": 1.1814345991561181,
      "grad_norm": 1.1698105335235596,
      "learning_rate": 3.923126050464561e-06,
      "loss": 2.4659,
      "step": 213
    },
    {
      "epoch": 1.1870604781997187,
      "grad_norm": 1.346468210220337,
      "learning_rate": 3.881529278015122e-06,
      "loss": 2.5802,
      "step": 214
    },
    {
      "epoch": 1.1926863572433193,
      "grad_norm": 1.1469833850860596,
      "learning_rate": 3.840046639648199e-06,
      "loss": 2.3794,
      "step": 215
    },
    {
      "epoch": 1.1983122362869199,
      "grad_norm": 1.3175195455551147,
      "learning_rate": 3.7986815951256937e-06,
      "loss": 2.2429,
      "step": 216
    },
    {
      "epoch": 1.2039381153305204,
      "grad_norm": 1.2770299911499023,
      "learning_rate": 3.7574375944018744e-06,
      "loss": 2.3475,
      "step": 217
    },
    {
      "epoch": 1.209563994374121,
      "grad_norm": 1.2195074558258057,
      "learning_rate": 3.716318077335632e-06,
      "loss": 2.2418,
      "step": 218
    },
    {
      "epoch": 1.2151898734177216,
      "grad_norm": 1.4205323457717896,
      "learning_rate": 3.675326473403591e-06,
      "loss": 2.3453,
      "step": 219
    },
    {
      "epoch": 1.2208157524613221,
      "grad_norm": 1.3720946311950684,
      "learning_rate": 3.6344662014140862e-06,
      "loss": 2.318,
      "step": 220
    },
    {
      "epoch": 1.2264416315049227,
      "grad_norm": 1.162539005279541,
      "learning_rate": 3.593740669222015e-06,
      "loss": 2.2763,
      "step": 221
    },
    {
      "epoch": 1.2320675105485233,
      "grad_norm": 1.1718677282333374,
      "learning_rate": 3.5531532734446194e-06,
      "loss": 2.1948,
      "step": 222
    },
    {
      "epoch": 1.2376933895921238,
      "grad_norm": 1.231491208076477,
      "learning_rate": 3.512707399178204e-06,
      "loss": 2.1702,
      "step": 223
    },
    {
      "epoch": 1.2433192686357244,
      "grad_norm": 1.3110443353652954,
      "learning_rate": 3.4724064197157976e-06,
      "loss": 2.4983,
      "step": 224
    },
    {
      "epoch": 1.248945147679325,
      "grad_norm": 1.2871124744415283,
      "learning_rate": 3.432253696265824e-06,
      "loss": 2.2115,
      "step": 225
    },
    {
      "epoch": 1.248945147679325,
      "eval_loss": 2.3990979194641113,
      "eval_runtime": 30.3251,
      "eval_samples_per_second": 1.682,
      "eval_steps_per_second": 1.682,
      "step": 225
    },
    {
      "epoch": 1.2545710267229255,
      "grad_norm": 1.3364418745040894,
      "learning_rate": 3.3922525776717597e-06,
      "loss": 2.3069,
      "step": 226
    },
    {
      "epoch": 1.260196905766526,
      "grad_norm": 1.1549724340438843,
      "learning_rate": 3.3524064001328345e-06,
      "loss": 2.3003,
      "step": 227
    },
    {
      "epoch": 1.2658227848101267,
      "grad_norm": 1.5768709182739258,
      "learning_rate": 3.312718486925787e-06,
      "loss": 2.1072,
      "step": 228
    },
    {
      "epoch": 1.271448663853727,
      "grad_norm": 1.3013666868209839,
      "learning_rate": 3.2731921481276887e-06,
      "loss": 2.1262,
      "step": 229
    },
    {
      "epoch": 1.2770745428973278,
      "grad_norm": 1.2704813480377197,
      "learning_rate": 3.233830680339879e-06,
      "loss": 2.2043,
      "step": 230
    },
    {
      "epoch": 1.2827004219409281,
      "grad_norm": 1.818085789680481,
      "learning_rate": 3.1946373664130155e-06,
      "loss": 2.2851,
      "step": 231
    },
    {
      "epoch": 1.288326300984529,
      "grad_norm": 1.4256744384765625,
      "learning_rate": 3.1556154751732816e-06,
      "loss": 2.2682,
      "step": 232
    },
    {
      "epoch": 1.2939521800281293,
      "grad_norm": 1.168641209602356,
      "learning_rate": 3.1167682611497536e-06,
      "loss": 2.2535,
      "step": 233
    },
    {
      "epoch": 1.29957805907173,
      "grad_norm": 1.7689348459243774,
      "learning_rate": 3.078098964302967e-06,
      "loss": 2.5086,
      "step": 234
    },
    {
      "epoch": 1.3052039381153304,
      "grad_norm": 1.1472971439361572,
      "learning_rate": 3.039610809754689e-06,
      "loss": 2.2806,
      "step": 235
    },
    {
      "epoch": 1.3108298171589312,
      "grad_norm": 1.1633094549179077,
      "learning_rate": 3.001307007518944e-06,
      "loss": 2.1489,
      "step": 236
    },
    {
      "epoch": 1.3164556962025316,
      "grad_norm": 1.3734430074691772,
      "learning_rate": 2.963190752234284e-06,
      "loss": 2.435,
      "step": 237
    },
    {
      "epoch": 1.3220815752461323,
      "grad_norm": 1.4113901853561401,
      "learning_rate": 2.925265222897345e-06,
      "loss": 2.3259,
      "step": 238
    },
    {
      "epoch": 1.3277074542897327,
      "grad_norm": 1.2623318433761597,
      "learning_rate": 2.8875335825977185e-06,
      "loss": 2.3495,
      "step": 239
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 1.1913394927978516,
      "learning_rate": 2.849998978254136e-06,
      "loss": 2.245,
      "step": 240
    },
    {
      "epoch": 1.3389592123769338,
      "grad_norm": 1.3264411687850952,
      "learning_rate": 2.812664540352008e-06,
      "loss": 2.3225,
      "step": 241
    },
    {
      "epoch": 1.3445850914205344,
      "grad_norm": 1.2762576341629028,
      "learning_rate": 2.775533382682332e-06,
      "loss": 2.3699,
      "step": 242
    },
    {
      "epoch": 1.350210970464135,
      "grad_norm": 1.4252859354019165,
      "learning_rate": 2.738608602081996e-06,
      "loss": 2.2251,
      "step": 243
    },
    {
      "epoch": 1.3558368495077355,
      "grad_norm": 1.181598424911499,
      "learning_rate": 2.701893278175499e-06,
      "loss": 2.3656,
      "step": 244
    },
    {
      "epoch": 1.361462728551336,
      "grad_norm": 1.2117236852645874,
      "learning_rate": 2.665390473118088e-06,
      "loss": 2.5056,
      "step": 245
    },
    {
      "epoch": 1.3670886075949367,
      "grad_norm": 1.2578994035720825,
      "learning_rate": 2.629103231340382e-06,
      "loss": 2.3728,
      "step": 246
    },
    {
      "epoch": 1.3727144866385372,
      "grad_norm": 1.5048015117645264,
      "learning_rate": 2.5930345792944513e-06,
      "loss": 2.3655,
      "step": 247
    },
    {
      "epoch": 1.3783403656821378,
      "grad_norm": 1.5193151235580444,
      "learning_rate": 2.5571875252013984e-06,
      "loss": 2.4273,
      "step": 248
    },
    {
      "epoch": 1.3839662447257384,
      "grad_norm": 1.2505041360855103,
      "learning_rate": 2.521565058800475e-06,
      "loss": 2.2828,
      "step": 249
    },
    {
      "epoch": 1.389592123769339,
      "grad_norm": 1.3817038536071777,
      "learning_rate": 2.486170151099725e-06,
      "loss": 2.2924,
      "step": 250
    },
    {
      "epoch": 1.3952180028129395,
      "grad_norm": 1.3338009119033813,
      "learning_rate": 2.4510057541281872e-06,
      "loss": 2.2852,
      "step": 251
    },
    {
      "epoch": 1.40084388185654,
      "grad_norm": 1.202316403388977,
      "learning_rate": 2.4160748006897018e-06,
      "loss": 2.4643,
      "step": 252
    },
    {
      "epoch": 1.4064697609001406,
      "grad_norm": 1.4298673868179321,
      "learning_rate": 2.3813802041182987e-06,
      "loss": 2.4521,
      "step": 253
    },
    {
      "epoch": 1.4120956399437412,
      "grad_norm": 1.219159722328186,
      "learning_rate": 2.346924858035216e-06,
      "loss": 2.155,
      "step": 254
    },
    {
      "epoch": 1.4177215189873418,
      "grad_norm": 1.4144155979156494,
      "learning_rate": 2.3127116361075712e-06,
      "loss": 2.1897,
      "step": 255
    },
    {
      "epoch": 1.4233473980309423,
      "grad_norm": 1.171831488609314,
      "learning_rate": 2.278743391808684e-06,
      "loss": 2.3303,
      "step": 256
    },
    {
      "epoch": 1.428973277074543,
      "grad_norm": 1.1133025884628296,
      "learning_rate": 2.2450229581800925e-06,
      "loss": 2.3888,
      "step": 257
    },
    {
      "epoch": 1.4345991561181435,
      "grad_norm": 1.4286714792251587,
      "learning_rate": 2.2115531475952678e-06,
      "loss": 2.1884,
      "step": 258
    },
    {
      "epoch": 1.440225035161744,
      "grad_norm": 1.2007641792297363,
      "learning_rate": 2.178336751525052e-06,
      "loss": 2.2624,
      "step": 259
    },
    {
      "epoch": 1.4458509142053446,
      "grad_norm": 1.3826512098312378,
      "learning_rate": 2.1453765403048525e-06,
      "loss": 2.1844,
      "step": 260
    },
    {
      "epoch": 1.4514767932489452,
      "grad_norm": 1.157139778137207,
      "learning_rate": 2.1126752629035753e-06,
      "loss": 2.1608,
      "step": 261
    },
    {
      "epoch": 1.4571026722925458,
      "grad_norm": 1.1881312131881714,
      "learning_rate": 2.080235646694363e-06,
      "loss": 2.1249,
      "step": 262
    },
    {
      "epoch": 1.4627285513361463,
      "grad_norm": 1.1453356742858887,
      "learning_rate": 2.0480603972271227e-06,
      "loss": 2.191,
      "step": 263
    },
    {
      "epoch": 1.4683544303797469,
      "grad_norm": 1.3118908405303955,
      "learning_rate": 2.016152198002876e-06,
      "loss": 2.3229,
      "step": 264
    },
    {
      "epoch": 1.4739803094233475,
      "grad_norm": 1.2369331121444702,
      "learning_rate": 1.98451371024994e-06,
      "loss": 2.1827,
      "step": 265
    },
    {
      "epoch": 1.479606188466948,
      "grad_norm": 1.3042728900909424,
      "learning_rate": 1.953147572701989e-06,
      "loss": 2.1371,
      "step": 266
    },
    {
      "epoch": 1.4852320675105486,
      "grad_norm": 1.2125662565231323,
      "learning_rate": 1.922056401377966e-06,
      "loss": 2.2515,
      "step": 267
    },
    {
      "epoch": 1.4908579465541492,
      "grad_norm": 1.2672710418701172,
      "learning_rate": 1.8912427893638996e-06,
      "loss": 2.1017,
      "step": 268
    },
    {
      "epoch": 1.4964838255977497,
      "grad_norm": 1.3755918741226196,
      "learning_rate": 1.8607093065966408e-06,
      "loss": 2.1161,
      "step": 269
    },
    {
      "epoch": 1.50210970464135,
      "grad_norm": 1.258540153503418,
      "learning_rate": 1.8304584996495205e-06,
      "loss": 2.0267,
      "step": 270
    },
    {
      "epoch": 1.50210970464135,
      "eval_loss": 2.398313283920288,
      "eval_runtime": 30.2825,
      "eval_samples_per_second": 1.684,
      "eval_steps_per_second": 1.684,
      "step": 270
    },
    {
      "epoch": 1.5077355836849509,
      "grad_norm": 1.3976160287857056,
      "learning_rate": 1.8004928915199515e-06,
      "loss": 2.3219,
      "step": 271
    },
    {
      "epoch": 1.5133614627285512,
      "grad_norm": 1.1738497018814087,
      "learning_rate": 1.7708149814190156e-06,
      "loss": 2.1721,
      "step": 272
    },
    {
      "epoch": 1.518987341772152,
      "grad_norm": 1.1697238683700562,
      "learning_rate": 1.7414272445630166e-06,
      "loss": 2.0663,
      "step": 273
    },
    {
      "epoch": 1.5246132208157523,
      "grad_norm": 1.1484073400497437,
      "learning_rate": 1.712332131967036e-06,
      "loss": 2.1742,
      "step": 274
    },
    {
      "epoch": 1.5302390998593531,
      "grad_norm": 1.393418312072754,
      "learning_rate": 1.6835320702405238e-06,
      "loss": 2.6228,
      "step": 275
    },
    {
      "epoch": 1.5358649789029535,
      "grad_norm": 1.3217144012451172,
      "learning_rate": 1.6550294613849016e-06,
      "loss": 2.2263,
      "step": 276
    },
    {
      "epoch": 1.5414908579465543,
      "grad_norm": 1.2675628662109375,
      "learning_rate": 1.6268266825932378e-06,
      "loss": 2.2772,
      "step": 277
    },
    {
      "epoch": 1.5471167369901546,
      "grad_norm": 1.3925647735595703,
      "learning_rate": 1.5989260860519723e-06,
      "loss": 2.2878,
      "step": 278
    },
    {
      "epoch": 1.5527426160337554,
      "grad_norm": 1.2969094514846802,
      "learning_rate": 1.5713299987447534e-06,
      "loss": 2.2664,
      "step": 279
    },
    {
      "epoch": 1.5583684950773558,
      "grad_norm": 1.312171220779419,
      "learning_rate": 1.5440407222583475e-06,
      "loss": 2.2109,
      "step": 280
    },
    {
      "epoch": 1.5639943741209565,
      "grad_norm": 1.2671453952789307,
      "learning_rate": 1.5170605325906863e-06,
      "loss": 2.2593,
      "step": 281
    },
    {
      "epoch": 1.5696202531645569,
      "grad_norm": 1.4005205631256104,
      "learning_rate": 1.4903916799610435e-06,
      "loss": 2.2503,
      "step": 282
    },
    {
      "epoch": 1.5752461322081577,
      "grad_norm": 1.098641037940979,
      "learning_rate": 1.46403638862236e-06,
      "loss": 2.1495,
      "step": 283
    },
    {
      "epoch": 1.580872011251758,
      "grad_norm": 1.4645534753799438,
      "learning_rate": 1.437996856675735e-06,
      "loss": 2.3019,
      "step": 284
    },
    {
      "epoch": 1.5864978902953588,
      "grad_norm": 1.2095060348510742,
      "learning_rate": 1.4122752558870933e-06,
      "loss": 2.2375,
      "step": 285
    },
    {
      "epoch": 1.5921237693389592,
      "grad_norm": 1.5466718673706055,
      "learning_rate": 1.3868737315060646e-06,
      "loss": 2.584,
      "step": 286
    },
    {
      "epoch": 1.5977496483825597,
      "grad_norm": 1.4787497520446777,
      "learning_rate": 1.3617944020870577e-06,
      "loss": 2.482,
      "step": 287
    },
    {
      "epoch": 1.6033755274261603,
      "grad_norm": 1.3512495756149292,
      "learning_rate": 1.3370393593125647e-06,
      "loss": 2.3235,
      "step": 288
    },
    {
      "epoch": 1.6090014064697609,
      "grad_norm": 1.1563678979873657,
      "learning_rate": 1.3126106678187156e-06,
      "loss": 2.2995,
      "step": 289
    },
    {
      "epoch": 1.6146272855133614,
      "grad_norm": 1.30403470993042,
      "learning_rate": 1.2885103650230806e-06,
      "loss": 2.3431,
      "step": 290
    },
    {
      "epoch": 1.620253164556962,
      "grad_norm": 1.24360990524292,
      "learning_rate": 1.2647404609547384e-06,
      "loss": 2.0579,
      "step": 291
    },
    {
      "epoch": 1.6258790436005626,
      "grad_norm": 1.4975757598876953,
      "learning_rate": 1.241302938086642e-06,
      "loss": 2.0818,
      "step": 292
    },
    {
      "epoch": 1.6315049226441631,
      "grad_norm": 1.355246901512146,
      "learning_rate": 1.2181997511702728e-06,
      "loss": 2.271,
      "step": 293
    },
    {
      "epoch": 1.6371308016877637,
      "grad_norm": 1.3052653074264526,
      "learning_rate": 1.1954328270726045e-06,
      "loss": 2.4885,
      "step": 294
    },
    {
      "epoch": 1.6427566807313643,
      "grad_norm": 1.5899144411087036,
      "learning_rate": 1.1730040646154045e-06,
      "loss": 2.3587,
      "step": 295
    },
    {
      "epoch": 1.6483825597749648,
      "grad_norm": 1.2158286571502686,
      "learning_rate": 1.150915334416865e-06,
      "loss": 2.197,
      "step": 296
    },
    {
      "epoch": 1.6540084388185654,
      "grad_norm": 1.7922595739364624,
      "learning_rate": 1.129168478735581e-06,
      "loss": 2.3705,
      "step": 297
    },
    {
      "epoch": 1.659634317862166,
      "grad_norm": 1.1858259439468384,
      "learning_rate": 1.1077653113169134e-06,
      "loss": 2.5342,
      "step": 298
    },
    {
      "epoch": 1.6652601969057665,
      "grad_norm": 1.1772092580795288,
      "learning_rate": 1.0867076172417105e-06,
      "loss": 2.3663,
      "step": 299
    },
    {
      "epoch": 1.6708860759493671,
      "grad_norm": 1.3147233724594116,
      "learning_rate": 1.0659971527774277e-06,
      "loss": 2.387,
      "step": 300
    },
    {
      "epoch": 1.6765119549929677,
      "grad_norm": 1.320970058441162,
      "learning_rate": 1.0456356452316515e-06,
      "loss": 2.3621,
      "step": 301
    },
    {
      "epoch": 1.6821378340365682,
      "grad_norm": 1.1753120422363281,
      "learning_rate": 1.0256247928080357e-06,
      "loss": 2.1657,
      "step": 302
    },
    {
      "epoch": 1.6877637130801688,
      "grad_norm": 1.3142890930175781,
      "learning_rate": 1.0059662644646723e-06,
      "loss": 2.3147,
      "step": 303
    },
    {
      "epoch": 1.6933895921237694,
      "grad_norm": 1.2477645874023438,
      "learning_rate": 9.86661699774887e-07,
      "loss": 2.331,
      "step": 304
    },
    {
      "epoch": 1.69901547116737,
      "grad_norm": 1.2975739240646362,
      "learning_rate": 9.677127087905032e-07,
      "loss": 2.2859,
      "step": 305
    },
    {
      "epoch": 1.7046413502109705,
      "grad_norm": 1.5711100101470947,
      "learning_rate": 9.491208719075537e-07,
      "loss": 2.1521,
      "step": 306
    },
    {
      "epoch": 1.7102672292545709,
      "grad_norm": 1.3837226629257202,
      "learning_rate": 9.308877397344751e-07,
      "loss": 2.3636,
      "step": 307
    },
    {
      "epoch": 1.7158931082981717,
      "grad_norm": 1.3110496997833252,
      "learning_rate": 9.130148329627774e-07,
      "loss": 2.1745,
      "step": 308
    },
    {
      "epoch": 1.721518987341772,
      "grad_norm": 1.1928850412368774,
      "learning_rate": 8.955036422402223e-07,
      "loss": 2.1995,
      "step": 309
    },
    {
      "epoch": 1.7271448663853728,
      "grad_norm": 1.4467804431915283,
      "learning_rate": 8.783556280464933e-07,
      "loss": 2.1655,
      "step": 310
    },
    {
      "epoch": 1.7327707454289731,
      "grad_norm": 1.1782878637313843,
      "learning_rate": 8.615722205713881e-07,
      "loss": 2.3282,
      "step": 311
    },
    {
      "epoch": 1.738396624472574,
      "grad_norm": 1.3934561014175415,
      "learning_rate": 8.451548195955409e-07,
      "loss": 2.3772,
      "step": 312
    },
    {
      "epoch": 1.7440225035161743,
      "grad_norm": 1.359525442123413,
      "learning_rate": 8.291047943736744e-07,
      "loss": 2.3182,
      "step": 313
    },
    {
      "epoch": 1.749648382559775,
      "grad_norm": 1.169758677482605,
      "learning_rate": 8.134234835203974e-07,
      "loss": 2.3455,
      "step": 314
    },
    {
      "epoch": 1.7552742616033754,
      "grad_norm": 1.1883350610733032,
      "learning_rate": 7.981121948985665e-07,
      "loss": 2.2055,
      "step": 315
    },
    {
      "epoch": 1.7552742616033754,
      "eval_loss": 2.396662473678589,
      "eval_runtime": 30.3177,
      "eval_samples_per_second": 1.682,
      "eval_steps_per_second": 1.682,
      "step": 315
    },
    {
      "epoch": 1.7609001406469762,
      "grad_norm": 1.4302018880844116,
      "learning_rate": 7.831722055102056e-07,
      "loss": 2.1707,
      "step": 316
    },
    {
      "epoch": 1.7665260196905765,
      "grad_norm": 1.388421893119812,
      "learning_rate": 7.686047613899948e-07,
      "loss": 2.1807,
      "step": 317
    },
    {
      "epoch": 1.7721518987341773,
      "grad_norm": 1.3130079507827759,
      "learning_rate": 7.544110775013554e-07,
      "loss": 2.2276,
      "step": 318
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 1.3371933698654175,
      "learning_rate": 7.405923376351153e-07,
      "loss": 2.3195,
      "step": 319
    },
    {
      "epoch": 1.7834036568213785,
      "grad_norm": 1.354970097541809,
      "learning_rate": 7.27149694310777e-07,
      "loss": 2.3254,
      "step": 320
    },
    {
      "epoch": 1.7890295358649788,
      "grad_norm": 1.139561414718628,
      "learning_rate": 7.140842686803959e-07,
      "loss": 2.1193,
      "step": 321
    },
    {
      "epoch": 1.7946554149085796,
      "grad_norm": 1.2641339302062988,
      "learning_rate": 7.013971504350722e-07,
      "loss": 2.2094,
      "step": 322
    },
    {
      "epoch": 1.80028129395218,
      "grad_norm": 1.221411943435669,
      "learning_rate": 6.890893977140682e-07,
      "loss": 2.1865,
      "step": 323
    },
    {
      "epoch": 1.8059071729957807,
      "grad_norm": 1.2744096517562866,
      "learning_rate": 6.771620370165577e-07,
      "loss": 2.3021,
      "step": 324
    },
    {
      "epoch": 1.811533052039381,
      "grad_norm": 1.2490217685699463,
      "learning_rate": 6.656160631160105e-07,
      "loss": 2.2437,
      "step": 325
    },
    {
      "epoch": 1.8171589310829819,
      "grad_norm": 1.1591171026229858,
      "learning_rate": 6.544524389772303e-07,
      "loss": 2.2542,
      "step": 326
    },
    {
      "epoch": 1.8227848101265822,
      "grad_norm": 1.3077995777130127,
      "learning_rate": 6.436720956760359e-07,
      "loss": 2.3848,
      "step": 327
    },
    {
      "epoch": 1.8284106891701828,
      "grad_norm": 1.0817915201187134,
      "learning_rate": 6.332759323216081e-07,
      "loss": 2.1434,
      "step": 328
    },
    {
      "epoch": 1.8340365682137834,
      "grad_norm": 1.2274130582809448,
      "learning_rate": 6.232648159815062e-07,
      "loss": 2.2062,
      "step": 329
    },
    {
      "epoch": 1.839662447257384,
      "grad_norm": 1.1238057613372803,
      "learning_rate": 6.136395816093466e-07,
      "loss": 2.168,
      "step": 330
    },
    {
      "epoch": 1.8452883263009845,
      "grad_norm": 1.325332760810852,
      "learning_rate": 6.044010319751662e-07,
      "loss": 2.2529,
      "step": 331
    },
    {
      "epoch": 1.850914205344585,
      "grad_norm": 1.4097862243652344,
      "learning_rate": 5.95549937598473e-07,
      "loss": 2.3565,
      "step": 332
    },
    {
      "epoch": 1.8565400843881856,
      "grad_norm": 1.4523735046386719,
      "learning_rate": 5.870870366839798e-07,
      "loss": 2.1707,
      "step": 333
    },
    {
      "epoch": 1.8621659634317862,
      "grad_norm": 1.3728147745132446,
      "learning_rate": 5.790130350600362e-07,
      "loss": 2.2998,
      "step": 334
    },
    {
      "epoch": 1.8677918424753868,
      "grad_norm": 1.1367465257644653,
      "learning_rate": 5.713286061197607e-07,
      "loss": 2.2088,
      "step": 335
    },
    {
      "epoch": 1.8734177215189873,
      "grad_norm": 1.291197419166565,
      "learning_rate": 5.640343907648791e-07,
      "loss": 2.4345,
      "step": 336
    },
    {
      "epoch": 1.879043600562588,
      "grad_norm": 1.4310858249664307,
      "learning_rate": 5.571309973522697e-07,
      "loss": 2.6208,
      "step": 337
    },
    {
      "epoch": 1.8846694796061885,
      "grad_norm": 1.2096583843231201,
      "learning_rate": 5.506190016432264e-07,
      "loss": 2.4734,
      "step": 338
    },
    {
      "epoch": 1.890295358649789,
      "grad_norm": 1.17626953125,
      "learning_rate": 5.444989467554386e-07,
      "loss": 2.5059,
      "step": 339
    },
    {
      "epoch": 1.8959212376933896,
      "grad_norm": 1.463349461555481,
      "learning_rate": 5.387713431176918e-07,
      "loss": 2.2987,
      "step": 340
    },
    {
      "epoch": 1.9015471167369902,
      "grad_norm": 1.414905309677124,
      "learning_rate": 5.334366684272987e-07,
      "loss": 2.3687,
      "step": 341
    },
    {
      "epoch": 1.9071729957805907,
      "grad_norm": 1.2944713830947876,
      "learning_rate": 5.28495367610257e-07,
      "loss": 2.2465,
      "step": 342
    },
    {
      "epoch": 1.9127988748241913,
      "grad_norm": 1.156260371208191,
      "learning_rate": 5.239478527841415e-07,
      "loss": 2.2922,
      "step": 343
    },
    {
      "epoch": 1.9184247538677919,
      "grad_norm": 1.5482733249664307,
      "learning_rate": 5.197945032237327e-07,
      "loss": 2.2515,
      "step": 344
    },
    {
      "epoch": 1.9240506329113924,
      "grad_norm": 1.29518461227417,
      "learning_rate": 5.160356653293837e-07,
      "loss": 2.5098,
      "step": 345
    },
    {
      "epoch": 1.929676511954993,
      "grad_norm": 1.286757230758667,
      "learning_rate": 5.126716525981297e-07,
      "loss": 2.3004,
      "step": 346
    },
    {
      "epoch": 1.9353023909985936,
      "grad_norm": 1.1425246000289917,
      "learning_rate": 5.097027455975421e-07,
      "loss": 2.2362,
      "step": 347
    },
    {
      "epoch": 1.9409282700421941,
      "grad_norm": 1.1543301343917847,
      "learning_rate": 5.071291919423276e-07,
      "loss": 2.3147,
      "step": 348
    },
    {
      "epoch": 1.9465541490857947,
      "grad_norm": 1.2874441146850586,
      "learning_rate": 5.049512062736767e-07,
      "loss": 2.2537,
      "step": 349
    },
    {
      "epoch": 1.952180028129395,
      "grad_norm": 1.476819396018982,
      "learning_rate": 5.03168970241363e-07,
      "loss": 2.1098,
      "step": 350
    },
    {
      "epoch": 1.9578059071729959,
      "grad_norm": 1.2125787734985352,
      "learning_rate": 5.017826324885912e-07,
      "loss": 2.3497,
      "step": 351
    },
    {
      "epoch": 1.9634317862165962,
      "grad_norm": 1.2305964231491089,
      "learning_rate": 5.007923086396018e-07,
      "loss": 2.3115,
      "step": 352
    },
    {
      "epoch": 1.969057665260197,
      "grad_norm": 1.3071091175079346,
      "learning_rate": 5.001980812900265e-07,
      "loss": 2.1634,
      "step": 353
    },
    {
      "epoch": 1.9746835443037973,
      "grad_norm": 1.4950101375579834,
      "learning_rate": 5.000000000000001e-07,
      "loss": 2.3289,
      "step": 354
    }
  ],
  "logging_steps": 1,
  "max_steps": 354,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 177,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.8528841455357133e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}