{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9746835443037973,
"eval_steps": 45,
"global_step": 354,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005625879043600563,
"grad_norm": 1.739181637763977,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.4549,
"step": 1
},
{
"epoch": 0.005625879043600563,
"eval_loss": 2.5061044692993164,
"eval_runtime": 30.2114,
"eval_samples_per_second": 1.688,
"eval_steps_per_second": 1.688,
"step": 1
},
{
"epoch": 0.011251758087201125,
"grad_norm": 1.461582899093628,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.4808,
"step": 2
},
{
"epoch": 0.016877637130801686,
"grad_norm": 1.5786077976226807,
"learning_rate": 3e-06,
"loss": 2.54,
"step": 3
},
{
"epoch": 0.02250351617440225,
"grad_norm": 1.605600118637085,
"learning_rate": 4.000000000000001e-06,
"loss": 2.6763,
"step": 4
},
{
"epoch": 0.02812939521800281,
"grad_norm": 1.4658780097961426,
"learning_rate": 5e-06,
"loss": 2.4768,
"step": 5
},
{
"epoch": 0.03375527426160337,
"grad_norm": 1.4920974969863892,
"learning_rate": 6e-06,
"loss": 2.383,
"step": 6
},
{
"epoch": 0.03938115330520394,
"grad_norm": 1.2311428785324097,
"learning_rate": 7e-06,
"loss": 2.1965,
"step": 7
},
{
"epoch": 0.0450070323488045,
"grad_norm": 1.442131519317627,
"learning_rate": 8.000000000000001e-06,
"loss": 2.3795,
"step": 8
},
{
"epoch": 0.05063291139240506,
"grad_norm": 1.2747117280960083,
"learning_rate": 9e-06,
"loss": 2.5673,
"step": 9
},
{
"epoch": 0.05625879043600562,
"grad_norm": 1.810668706893921,
"learning_rate": 1e-05,
"loss": 2.4721,
"step": 10
},
{
"epoch": 0.06188466947960619,
"grad_norm": 1.694833755493164,
"learning_rate": 9.999801918709974e-06,
"loss": 2.4181,
"step": 11
},
{
"epoch": 0.06751054852320675,
"grad_norm": 1.3476848602294922,
"learning_rate": 9.999207691360399e-06,
"loss": 2.5278,
"step": 12
},
{
"epoch": 0.07313642756680731,
"grad_norm": 1.3295379877090454,
"learning_rate": 9.99821736751141e-06,
"loss": 2.3155,
"step": 13
},
{
"epoch": 0.07876230661040788,
"grad_norm": 1.337768793106079,
"learning_rate": 9.996831029758638e-06,
"loss": 2.3802,
"step": 14
},
{
"epoch": 0.08438818565400844,
"grad_norm": 1.4814488887786865,
"learning_rate": 9.995048793726324e-06,
"loss": 2.4514,
"step": 15
},
{
"epoch": 0.090014064697609,
"grad_norm": 1.4958126544952393,
"learning_rate": 9.992870808057673e-06,
"loss": 2.2855,
"step": 16
},
{
"epoch": 0.09563994374120956,
"grad_norm": 1.3594876527786255,
"learning_rate": 9.99029725440246e-06,
"loss": 2.485,
"step": 17
},
{
"epoch": 0.10126582278481013,
"grad_norm": 1.2920221090316772,
"learning_rate": 9.987328347401871e-06,
"loss": 2.5267,
"step": 18
},
{
"epoch": 0.10689170182841069,
"grad_norm": 1.289971947669983,
"learning_rate": 9.983964334670618e-06,
"loss": 2.2582,
"step": 19
},
{
"epoch": 0.11251758087201125,
"grad_norm": 1.4366180896759033,
"learning_rate": 9.980205496776269e-06,
"loss": 2.318,
"step": 20
},
{
"epoch": 0.11814345991561181,
"grad_norm": 1.4515198469161987,
"learning_rate": 9.976052147215859e-06,
"loss": 2.3388,
"step": 21
},
{
"epoch": 0.12376933895921238,
"grad_norm": 1.3737822771072388,
"learning_rate": 9.971504632389744e-06,
"loss": 2.1809,
"step": 22
},
{
"epoch": 0.12939521800281295,
"grad_norm": 1.692108392715454,
"learning_rate": 9.966563331572703e-06,
"loss": 2.5646,
"step": 23
},
{
"epoch": 0.1350210970464135,
"grad_norm": 1.2992991209030151,
"learning_rate": 9.96122865688231e-06,
"loss": 2.2552,
"step": 24
},
{
"epoch": 0.14064697609001406,
"grad_norm": 1.30594801902771,
"learning_rate": 9.955501053244563e-06,
"loss": 2.2872,
"step": 25
},
{
"epoch": 0.14627285513361463,
"grad_norm": 1.5014636516571045,
"learning_rate": 9.949380998356774e-06,
"loss": 2.4255,
"step": 26
},
{
"epoch": 0.1518987341772152,
"grad_norm": 1.2353171110153198,
"learning_rate": 9.942869002647731e-06,
"loss": 2.285,
"step": 27
},
{
"epoch": 0.15752461322081576,
"grad_norm": 1.2573610544204712,
"learning_rate": 9.935965609235122e-06,
"loss": 2.2413,
"step": 28
},
{
"epoch": 0.1631504922644163,
"grad_norm": 1.2308180332183838,
"learning_rate": 9.92867139388024e-06,
"loss": 2.254,
"step": 29
},
{
"epoch": 0.16877637130801687,
"grad_norm": 1.3512063026428223,
"learning_rate": 9.920986964939964e-06,
"loss": 2.6116,
"step": 30
},
{
"epoch": 0.17440225035161744,
"grad_norm": 1.4129055738449097,
"learning_rate": 9.912912963316021e-06,
"loss": 2.4745,
"step": 31
},
{
"epoch": 0.180028129395218,
"grad_norm": 1.2212988138198853,
"learning_rate": 9.904450062401528e-06,
"loss": 2.3624,
"step": 32
},
{
"epoch": 0.18565400843881857,
"grad_norm": 1.6333080530166626,
"learning_rate": 9.895598968024834e-06,
"loss": 2.3999,
"step": 33
},
{
"epoch": 0.19127988748241911,
"grad_norm": 1.437427043914795,
"learning_rate": 9.886360418390655e-06,
"loss": 2.3292,
"step": 34
},
{
"epoch": 0.19690576652601968,
"grad_norm": 1.333884596824646,
"learning_rate": 9.876735184018495e-06,
"loss": 2.2703,
"step": 35
},
{
"epoch": 0.20253164556962025,
"grad_norm": 1.452011227607727,
"learning_rate": 9.866724067678392e-06,
"loss": 2.444,
"step": 36
},
{
"epoch": 0.20815752461322082,
"grad_norm": 1.3540449142456055,
"learning_rate": 9.856327904323965e-06,
"loss": 2.2662,
"step": 37
},
{
"epoch": 0.21378340365682139,
"grad_norm": 1.3395826816558838,
"learning_rate": 9.84554756102277e-06,
"loss": 2.4792,
"step": 38
},
{
"epoch": 0.21940928270042195,
"grad_norm": 1.335483193397522,
"learning_rate": 9.83438393688399e-06,
"loss": 2.5329,
"step": 39
},
{
"epoch": 0.2250351617440225,
"grad_norm": 1.4518929719924927,
"learning_rate": 9.822837962983443e-06,
"loss": 2.3844,
"step": 40
},
{
"epoch": 0.23066104078762306,
"grad_norm": 1.4345314502716064,
"learning_rate": 9.810910602285933e-06,
"loss": 2.5071,
"step": 41
},
{
"epoch": 0.23628691983122363,
"grad_norm": 1.2864302396774292,
"learning_rate": 9.798602849564929e-06,
"loss": 2.5834,
"step": 42
},
{
"epoch": 0.2419127988748242,
"grad_norm": 1.319409728050232,
"learning_rate": 9.785915731319605e-06,
"loss": 2.282,
"step": 43
},
{
"epoch": 0.24753867791842477,
"grad_norm": 1.2264432907104492,
"learning_rate": 9.772850305689224e-06,
"loss": 2.3332,
"step": 44
},
{
"epoch": 0.25316455696202533,
"grad_norm": 1.2530275583267212,
"learning_rate": 9.759407662364885e-06,
"loss": 2.3516,
"step": 45
},
{
"epoch": 0.25316455696202533,
"eval_loss": 2.431987762451172,
"eval_runtime": 30.3221,
"eval_samples_per_second": 1.682,
"eval_steps_per_second": 1.682,
"step": 45
},
{
"epoch": 0.2587904360056259,
"grad_norm": 1.2621686458587646,
"learning_rate": 9.745588922498646e-06,
"loss": 2.319,
"step": 46
},
{
"epoch": 0.26441631504922647,
"grad_norm": 1.444759726524353,
"learning_rate": 9.731395238610006e-06,
"loss": 2.4669,
"step": 47
},
{
"epoch": 0.270042194092827,
"grad_norm": 1.4075920581817627,
"learning_rate": 9.716827794489795e-06,
"loss": 2.4549,
"step": 48
},
{
"epoch": 0.27566807313642755,
"grad_norm": 1.2430981397628784,
"learning_rate": 9.701887805101434e-06,
"loss": 2.2979,
"step": 49
},
{
"epoch": 0.2812939521800281,
"grad_norm": 1.3289717435836792,
"learning_rate": 9.686576516479604e-06,
"loss": 2.3718,
"step": 50
},
{
"epoch": 0.2869198312236287,
"grad_norm": 1.5157921314239502,
"learning_rate": 9.670895205626327e-06,
"loss": 2.3173,
"step": 51
},
{
"epoch": 0.29254571026722925,
"grad_norm": 1.2412256002426147,
"learning_rate": 9.65484518040446e-06,
"loss": 2.5968,
"step": 52
},
{
"epoch": 0.2981715893108298,
"grad_norm": 1.218125581741333,
"learning_rate": 9.638427779428613e-06,
"loss": 2.3775,
"step": 53
},
{
"epoch": 0.3037974683544304,
"grad_norm": 1.5379339456558228,
"learning_rate": 9.621644371953507e-06,
"loss": 2.4156,
"step": 54
},
{
"epoch": 0.30942334739803096,
"grad_norm": 1.3565013408660889,
"learning_rate": 9.604496357759778e-06,
"loss": 2.3882,
"step": 55
},
{
"epoch": 0.3150492264416315,
"grad_norm": 1.5721348524093628,
"learning_rate": 9.586985167037224e-06,
"loss": 2.3433,
"step": 56
},
{
"epoch": 0.3206751054852321,
"grad_norm": 1.2288527488708496,
"learning_rate": 9.569112260265527e-06,
"loss": 2.2329,
"step": 57
},
{
"epoch": 0.3263009845288326,
"grad_norm": 1.2658888101577759,
"learning_rate": 9.550879128092447e-06,
"loss": 2.5814,
"step": 58
},
{
"epoch": 0.3319268635724332,
"grad_norm": 1.073593020439148,
"learning_rate": 9.532287291209498e-06,
"loss": 2.2243,
"step": 59
},
{
"epoch": 0.33755274261603374,
"grad_norm": 1.267996907234192,
"learning_rate": 9.513338300225116e-06,
"loss": 2.5315,
"step": 60
},
{
"epoch": 0.3431786216596343,
"grad_norm": 1.3250463008880615,
"learning_rate": 9.49403373553533e-06,
"loss": 2.3651,
"step": 61
},
{
"epoch": 0.3488045007032349,
"grad_norm": 1.6996604204177856,
"learning_rate": 9.474375207191965e-06,
"loss": 2.5536,
"step": 62
},
{
"epoch": 0.35443037974683544,
"grad_norm": 1.1338461637496948,
"learning_rate": 9.454364354768351e-06,
"loss": 2.2356,
"step": 63
},
{
"epoch": 0.360056258790436,
"grad_norm": 1.3313428163528442,
"learning_rate": 9.434002847222574e-06,
"loss": 2.2594,
"step": 64
},
{
"epoch": 0.3656821378340366,
"grad_norm": 1.2771188020706177,
"learning_rate": 9.41329238275829e-06,
"loss": 2.5237,
"step": 65
},
{
"epoch": 0.37130801687763715,
"grad_norm": 1.355175495147705,
"learning_rate": 9.392234688683088e-06,
"loss": 2.3274,
"step": 66
},
{
"epoch": 0.3769338959212377,
"grad_norm": 1.2518092393875122,
"learning_rate": 9.37083152126442e-06,
"loss": 2.4285,
"step": 67
},
{
"epoch": 0.38255977496483823,
"grad_norm": 1.3508447408676147,
"learning_rate": 9.349084665583136e-06,
"loss": 2.1645,
"step": 68
},
{
"epoch": 0.3881856540084388,
"grad_norm": 1.5284922122955322,
"learning_rate": 9.326995935384594e-06,
"loss": 2.6101,
"step": 69
},
{
"epoch": 0.39381153305203936,
"grad_norm": 1.6654582023620605,
"learning_rate": 9.304567172927397e-06,
"loss": 2.3393,
"step": 70
},
{
"epoch": 0.39943741209563993,
"grad_norm": 1.2337652444839478,
"learning_rate": 9.281800248829728e-06,
"loss": 2.4461,
"step": 71
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.419947624206543,
"learning_rate": 9.25869706191336e-06,
"loss": 2.4146,
"step": 72
},
{
"epoch": 0.41068917018284107,
"grad_norm": 1.1560102701187134,
"learning_rate": 9.235259539045263e-06,
"loss": 2.2357,
"step": 73
},
{
"epoch": 0.41631504922644164,
"grad_norm": 1.3465362787246704,
"learning_rate": 9.21148963497692e-06,
"loss": 2.3877,
"step": 74
},
{
"epoch": 0.4219409282700422,
"grad_norm": 1.37497079372406,
"learning_rate": 9.187389332181285e-06,
"loss": 2.4105,
"step": 75
},
{
"epoch": 0.42756680731364277,
"grad_norm": 1.1877851486206055,
"learning_rate": 9.162960640687436e-06,
"loss": 2.3803,
"step": 76
},
{
"epoch": 0.43319268635724334,
"grad_norm": 1.1579440832138062,
"learning_rate": 9.138205597912943e-06,
"loss": 2.2938,
"step": 77
},
{
"epoch": 0.4388185654008439,
"grad_norm": 1.3359930515289307,
"learning_rate": 9.113126268493937e-06,
"loss": 2.5006,
"step": 78
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.3176641464233398,
"learning_rate": 9.08772474411291e-06,
"loss": 2.3957,
"step": 79
},
{
"epoch": 0.450070323488045,
"grad_norm": 1.2082509994506836,
"learning_rate": 9.062003143324267e-06,
"loss": 2.273,
"step": 80
},
{
"epoch": 0.45569620253164556,
"grad_norm": 1.3498975038528442,
"learning_rate": 9.035963611377641e-06,
"loss": 2.4793,
"step": 81
},
{
"epoch": 0.4613220815752461,
"grad_norm": 1.2176086902618408,
"learning_rate": 9.009608320038959e-06,
"loss": 2.2616,
"step": 82
},
{
"epoch": 0.4669479606188467,
"grad_norm": 1.2488033771514893,
"learning_rate": 8.982939467409314e-06,
"loss": 2.3263,
"step": 83
},
{
"epoch": 0.47257383966244726,
"grad_norm": 1.106594443321228,
"learning_rate": 8.955959277741654e-06,
"loss": 2.2856,
"step": 84
},
{
"epoch": 0.4781997187060478,
"grad_norm": 1.2389321327209473,
"learning_rate": 8.928670001255248e-06,
"loss": 2.4291,
"step": 85
},
{
"epoch": 0.4838255977496484,
"grad_norm": 1.1330113410949707,
"learning_rate": 8.901073913948028e-06,
"loss": 2.13,
"step": 86
},
{
"epoch": 0.48945147679324896,
"grad_norm": 1.1081808805465698,
"learning_rate": 8.873173317406764e-06,
"loss": 2.2318,
"step": 87
},
{
"epoch": 0.49507735583684953,
"grad_norm": 1.2427239418029785,
"learning_rate": 8.844970538615099e-06,
"loss": 2.6752,
"step": 88
},
{
"epoch": 0.5007032348804501,
"grad_norm": 1.2715911865234375,
"learning_rate": 8.816467929759476e-06,
"loss": 2.5999,
"step": 89
},
{
"epoch": 0.5063291139240507,
"grad_norm": 1.282126545906067,
"learning_rate": 8.787667868032964e-06,
"loss": 2.6222,
"step": 90
},
{
"epoch": 0.5063291139240507,
"eval_loss": 2.413635492324829,
"eval_runtime": 30.2824,
"eval_samples_per_second": 1.684,
"eval_steps_per_second": 1.684,
"step": 90
},
{
"epoch": 0.5119549929676512,
"grad_norm": 1.25644850730896,
"learning_rate": 8.758572755436986e-06,
"loss": 2.5696,
"step": 91
},
{
"epoch": 0.5175808720112518,
"grad_norm": 1.3811650276184082,
"learning_rate": 8.729185018580984e-06,
"loss": 2.2826,
"step": 92
},
{
"epoch": 0.5232067510548524,
"grad_norm": 1.872058629989624,
"learning_rate": 8.69950710848005e-06,
"loss": 2.4419,
"step": 93
},
{
"epoch": 0.5288326300984529,
"grad_norm": 1.2003915309906006,
"learning_rate": 8.669541500350481e-06,
"loss": 2.2919,
"step": 94
},
{
"epoch": 0.5344585091420534,
"grad_norm": 1.352100133895874,
"learning_rate": 8.63929069340336e-06,
"loss": 2.4084,
"step": 95
},
{
"epoch": 0.540084388185654,
"grad_norm": 1.4791978597640991,
"learning_rate": 8.608757210636101e-06,
"loss": 2.2582,
"step": 96
},
{
"epoch": 0.5457102672292545,
"grad_norm": 1.5740419626235962,
"learning_rate": 8.577943598622037e-06,
"loss": 2.2898,
"step": 97
},
{
"epoch": 0.5513361462728551,
"grad_norm": 1.1978390216827393,
"learning_rate": 8.546852427298013e-06,
"loss": 2.2277,
"step": 98
},
{
"epoch": 0.5569620253164557,
"grad_norm": 1.4278184175491333,
"learning_rate": 8.515486289750061e-06,
"loss": 2.3385,
"step": 99
},
{
"epoch": 0.5625879043600562,
"grad_norm": 1.2246702909469604,
"learning_rate": 8.483847801997126e-06,
"loss": 2.3997,
"step": 100
},
{
"epoch": 0.5682137834036568,
"grad_norm": 1.261964201927185,
"learning_rate": 8.451939602772877e-06,
"loss": 2.4312,
"step": 101
},
{
"epoch": 0.5738396624472574,
"grad_norm": 1.4236760139465332,
"learning_rate": 8.419764353305638e-06,
"loss": 2.3398,
"step": 102
},
{
"epoch": 0.5794655414908579,
"grad_norm": 1.232182502746582,
"learning_rate": 8.387324737096427e-06,
"loss": 2.5426,
"step": 103
},
{
"epoch": 0.5850914205344585,
"grad_norm": 1.2140839099884033,
"learning_rate": 8.35462345969515e-06,
"loss": 2.6031,
"step": 104
},
{
"epoch": 0.5907172995780591,
"grad_norm": 1.2126487493515015,
"learning_rate": 8.321663248474949e-06,
"loss": 2.2923,
"step": 105
},
{
"epoch": 0.5963431786216596,
"grad_norm": 1.3279041051864624,
"learning_rate": 8.288446852404735e-06,
"loss": 2.2648,
"step": 106
},
{
"epoch": 0.6019690576652602,
"grad_norm": 1.9618940353393555,
"learning_rate": 8.254977041819909e-06,
"loss": 2.3423,
"step": 107
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.2524311542510986,
"learning_rate": 8.221256608191316e-06,
"loss": 2.6281,
"step": 108
},
{
"epoch": 0.6132208157524613,
"grad_norm": 1.4134135246276855,
"learning_rate": 8.18728836389243e-06,
"loss": 2.3007,
"step": 109
},
{
"epoch": 0.6188466947960619,
"grad_norm": 1.42375910282135,
"learning_rate": 8.153075141964785e-06,
"loss": 2.3253,
"step": 110
},
{
"epoch": 0.6244725738396625,
"grad_norm": 1.2591787576675415,
"learning_rate": 8.118619795881702e-06,
"loss": 2.658,
"step": 111
},
{
"epoch": 0.630098452883263,
"grad_norm": 1.2173055410385132,
"learning_rate": 8.083925199310299e-06,
"loss": 2.339,
"step": 112
},
{
"epoch": 0.6357243319268636,
"grad_norm": 1.8275774717330933,
"learning_rate": 8.048994245871813e-06,
"loss": 2.415,
"step": 113
},
{
"epoch": 0.6413502109704642,
"grad_norm": 1.2721606492996216,
"learning_rate": 8.013829848900278e-06,
"loss": 2.273,
"step": 114
},
{
"epoch": 0.6469760900140648,
"grad_norm": 1.4262254238128662,
"learning_rate": 7.978434941199526e-06,
"loss": 2.5605,
"step": 115
},
{
"epoch": 0.6526019690576652,
"grad_norm": 1.2906497716903687,
"learning_rate": 7.942812474798602e-06,
"loss": 2.3066,
"step": 116
},
{
"epoch": 0.6582278481012658,
"grad_norm": 1.2999948263168335,
"learning_rate": 7.90696542070555e-06,
"loss": 2.2782,
"step": 117
},
{
"epoch": 0.6638537271448663,
"grad_norm": 1.5385446548461914,
"learning_rate": 7.87089676865962e-06,
"loss": 2.3383,
"step": 118
},
{
"epoch": 0.6694796061884669,
"grad_norm": 1.2246472835540771,
"learning_rate": 7.834609526881914e-06,
"loss": 2.5499,
"step": 119
},
{
"epoch": 0.6751054852320675,
"grad_norm": 1.6999132633209229,
"learning_rate": 7.798106721824504e-06,
"loss": 2.3576,
"step": 120
},
{
"epoch": 0.680731364275668,
"grad_norm": 1.2035925388336182,
"learning_rate": 7.761391397918005e-06,
"loss": 2.33,
"step": 121
},
{
"epoch": 0.6863572433192686,
"grad_norm": 1.3971892595291138,
"learning_rate": 7.72446661731767e-06,
"loss": 2.3338,
"step": 122
},
{
"epoch": 0.6919831223628692,
"grad_norm": 1.4358816146850586,
"learning_rate": 7.687335459647993e-06,
"loss": 2.5577,
"step": 123
},
{
"epoch": 0.6976090014064698,
"grad_norm": 1.352947473526001,
"learning_rate": 7.650001021745866e-06,
"loss": 2.2954,
"step": 124
},
{
"epoch": 0.7032348804500703,
"grad_norm": 1.4151978492736816,
"learning_rate": 7.612466417402282e-06,
"loss": 2.6017,
"step": 125
},
{
"epoch": 0.7088607594936709,
"grad_norm": 1.1894267797470093,
"learning_rate": 7.574734777102657e-06,
"loss": 2.2623,
"step": 126
},
{
"epoch": 0.7144866385372715,
"grad_norm": 1.1567312479019165,
"learning_rate": 7.536809247765718e-06,
"loss": 2.2232,
"step": 127
},
{
"epoch": 0.720112517580872,
"grad_norm": 1.3824794292449951,
"learning_rate": 7.498692992481056e-06,
"loss": 2.2699,
"step": 128
},
{
"epoch": 0.7257383966244726,
"grad_norm": 1.203261375427246,
"learning_rate": 7.4603891902453115e-06,
"loss": 2.3539,
"step": 129
},
{
"epoch": 0.7313642756680732,
"grad_norm": 1.6249403953552246,
"learning_rate": 7.421901035697033e-06,
"loss": 2.5196,
"step": 130
},
{
"epoch": 0.7369901547116737,
"grad_norm": 1.3398568630218506,
"learning_rate": 7.383231738850246e-06,
"loss": 2.341,
"step": 131
},
{
"epoch": 0.7426160337552743,
"grad_norm": 1.148158311843872,
"learning_rate": 7.34438452482672e-06,
"loss": 2.2581,
"step": 132
},
{
"epoch": 0.7482419127988749,
"grad_norm": 1.1582486629486084,
"learning_rate": 7.305362633586984e-06,
"loss": 2.3726,
"step": 133
},
{
"epoch": 0.7538677918424754,
"grad_norm": 1.2645725011825562,
"learning_rate": 7.266169319660123e-06,
"loss": 2.2198,
"step": 134
},
{
"epoch": 0.759493670886076,
"grad_norm": 1.177494764328003,
"learning_rate": 7.226807851872312e-06,
"loss": 2.319,
"step": 135
},
{
"epoch": 0.759493670886076,
"eval_loss": 2.4054980278015137,
"eval_runtime": 30.3027,
"eval_samples_per_second": 1.683,
"eval_steps_per_second": 1.683,
"step": 135
},
{
"epoch": 0.7651195499296765,
"grad_norm": 1.6996376514434814,
"learning_rate": 7.187281513074214e-06,
"loss": 2.2793,
"step": 136
},
{
"epoch": 0.770745428973277,
"grad_norm": 1.2821097373962402,
"learning_rate": 7.147593599867166e-06,
"loss": 2.2482,
"step": 137
},
{
"epoch": 0.7763713080168776,
"grad_norm": 1.4424350261688232,
"learning_rate": 7.107747422328241e-06,
"loss": 2.3816,
"step": 138
},
{
"epoch": 0.7819971870604782,
"grad_norm": 1.3601067066192627,
"learning_rate": 7.067746303734178e-06,
"loss": 2.5607,
"step": 139
},
{
"epoch": 0.7876230661040787,
"grad_norm": 1.5715044736862183,
"learning_rate": 7.0275935802842036e-06,
"loss": 2.2028,
"step": 140
},
{
"epoch": 0.7932489451476793,
"grad_norm": 1.5017560720443726,
"learning_rate": 6.9872926008217976e-06,
"loss": 2.5636,
"step": 141
},
{
"epoch": 0.7988748241912799,
"grad_norm": 1.2483998537063599,
"learning_rate": 6.9468467265553805e-06,
"loss": 2.218,
"step": 142
},
{
"epoch": 0.8045007032348804,
"grad_norm": 1.2412904500961304,
"learning_rate": 6.906259330777986e-06,
"loss": 2.318,
"step": 143
},
{
"epoch": 0.810126582278481,
"grad_norm": 1.4541116952896118,
"learning_rate": 6.865533798585915e-06,
"loss": 2.2498,
"step": 144
},
{
"epoch": 0.8157524613220816,
"grad_norm": 1.3769111633300781,
"learning_rate": 6.824673526596411e-06,
"loss": 2.5446,
"step": 145
},
{
"epoch": 0.8213783403656821,
"grad_norm": 1.3049566745758057,
"learning_rate": 6.7836819226643705e-06,
"loss": 2.5029,
"step": 146
},
{
"epoch": 0.8270042194092827,
"grad_norm": 1.5045892000198364,
"learning_rate": 6.7425624055981284e-06,
"loss": 2.5418,
"step": 147
},
{
"epoch": 0.8326300984528833,
"grad_norm": 1.375936508178711,
"learning_rate": 6.701318404874308e-06,
"loss": 2.4155,
"step": 148
},
{
"epoch": 0.8382559774964838,
"grad_norm": 1.4569542407989502,
"learning_rate": 6.659953360351803e-06,
"loss": 2.483,
"step": 149
},
{
"epoch": 0.8438818565400844,
"grad_norm": 1.4985612630844116,
"learning_rate": 6.61847072198488e-06,
"loss": 2.3627,
"step": 150
},
{
"epoch": 0.849507735583685,
"grad_norm": 1.1635637283325195,
"learning_rate": 6.576873949535439e-06,
"loss": 2.4863,
"step": 151
},
{
"epoch": 0.8551336146272855,
"grad_norm": 1.309057354927063,
"learning_rate": 6.535166512284473e-06,
"loss": 2.4227,
"step": 152
},
{
"epoch": 0.8607594936708861,
"grad_norm": 1.1509865522384644,
"learning_rate": 6.493351888742706e-06,
"loss": 2.2121,
"step": 153
},
{
"epoch": 0.8663853727144867,
"grad_norm": 1.3486562967300415,
"learning_rate": 6.4514335663604834e-06,
"loss": 2.3682,
"step": 154
},
{
"epoch": 0.8720112517580872,
"grad_norm": 1.177566409111023,
"learning_rate": 6.409415041236912e-06,
"loss": 2.439,
"step": 155
},
{
"epoch": 0.8776371308016878,
"grad_norm": 1.3212536573410034,
"learning_rate": 6.367299817828271e-06,
"loss": 2.3195,
"step": 156
},
{
"epoch": 0.8832630098452883,
"grad_norm": 1.3443726301193237,
"learning_rate": 6.325091408655728e-06,
"loss": 2.3453,
"step": 157
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.2387938499450684,
"learning_rate": 6.282793334012397e-06,
"loss": 2.3215,
"step": 158
},
{
"epoch": 0.8945147679324894,
"grad_norm": 1.1146687269210815,
"learning_rate": 6.240409121669726e-06,
"loss": 2.5072,
"step": 159
},
{
"epoch": 0.90014064697609,
"grad_norm": 1.4473936557769775,
"learning_rate": 6.1979423065832766e-06,
"loss": 2.3696,
"step": 160
},
{
"epoch": 0.9057665260196905,
"grad_norm": 1.419673204421997,
"learning_rate": 6.155396430597896e-06,
"loss": 2.4739,
"step": 161
},
{
"epoch": 0.9113924050632911,
"grad_norm": 1.3161817789077759,
"learning_rate": 6.112775042152324e-06,
"loss": 2.2546,
"step": 162
},
{
"epoch": 0.9170182841068917,
"grad_norm": 1.2259889841079712,
"learning_rate": 6.070081695983236e-06,
"loss": 2.3529,
"step": 163
},
{
"epoch": 0.9226441631504922,
"grad_norm": 1.190901756286621,
"learning_rate": 6.0273199528287695e-06,
"loss": 2.3558,
"step": 164
},
{
"epoch": 0.9282700421940928,
"grad_norm": 1.6890244483947754,
"learning_rate": 5.984493379131559e-06,
"loss": 2.5214,
"step": 165
},
{
"epoch": 0.9338959212376934,
"grad_norm": 1.6165000200271606,
"learning_rate": 5.9416055467412745e-06,
"loss": 2.286,
"step": 166
},
{
"epoch": 0.939521800281294,
"grad_norm": 1.2835633754730225,
"learning_rate": 5.898660032616721e-06,
"loss": 2.2492,
"step": 167
},
{
"epoch": 0.9451476793248945,
"grad_norm": 1.2182981967926025,
"learning_rate": 5.855660418527513e-06,
"loss": 2.2491,
"step": 168
},
{
"epoch": 0.9507735583684951,
"grad_norm": 1.4453896284103394,
"learning_rate": 5.812610290755352e-06,
"loss": 2.3408,
"step": 169
},
{
"epoch": 0.9563994374120957,
"grad_norm": 1.1564884185791016,
"learning_rate": 5.769513239794905e-06,
"loss": 2.5319,
"step": 170
},
{
"epoch": 0.9620253164556962,
"grad_norm": 1.4496843814849854,
"learning_rate": 5.7263728600543636e-06,
"loss": 2.4306,
"step": 171
},
{
"epoch": 0.9676511954992968,
"grad_norm": 1.0973615646362305,
"learning_rate": 5.683192749555652e-06,
"loss": 2.2153,
"step": 172
},
{
"epoch": 0.9732770745428974,
"grad_norm": 1.3325732946395874,
"learning_rate": 5.639976509634346e-06,
"loss": 2.4422,
"step": 173
},
{
"epoch": 0.9789029535864979,
"grad_norm": 1.1608248949050903,
"learning_rate": 5.596727744639311e-06,
"loss": 2.1957,
"step": 174
},
{
"epoch": 0.9845288326300985,
"grad_norm": 1.2780177593231201,
"learning_rate": 5.5534500616320885e-06,
"loss": 2.3345,
"step": 175
},
{
"epoch": 0.9901547116736991,
"grad_norm": 1.395959734916687,
"learning_rate": 5.510147070086057e-06,
"loss": 2.4131,
"step": 176
},
{
"epoch": 0.9957805907172996,
"grad_norm": 1.3863272666931152,
"learning_rate": 5.466822381585402e-06,
"loss": 2.3061,
"step": 177
},
{
"epoch": 1.0014064697609002,
"grad_norm": 1.2258063554763794,
"learning_rate": 5.4234796095238804e-06,
"loss": 2.3713,
"step": 178
},
{
"epoch": 1.0070323488045008,
"grad_norm": 1.1133767366409302,
"learning_rate": 5.380122368803476e-06,
"loss": 2.1674,
"step": 179
},
{
"epoch": 1.0126582278481013,
"grad_norm": 2.211198091506958,
"learning_rate": 5.3367542755328935e-06,
"loss": 2.4607,
"step": 180
},
{
"epoch": 1.0126582278481013,
"eval_loss": 2.400386095046997,
"eval_runtime": 30.3125,
"eval_samples_per_second": 1.682,
"eval_steps_per_second": 1.682,
"step": 180
},
{
"epoch": 1.0014064697609002,
"grad_norm": 1.428281307220459,
"learning_rate": 5.293378946725968e-06,
"loss": 2.6363,
"step": 181
},
{
"epoch": 1.0070323488045008,
"grad_norm": 1.3397562503814697,
"learning_rate": 5.2500000000000006e-06,
"loss": 2.2154,
"step": 182
},
{
"epoch": 1.0126582278481013,
"grad_norm": 1.2449477910995483,
"learning_rate": 5.206621053274032e-06,
"loss": 2.2516,
"step": 183
},
{
"epoch": 1.018284106891702,
"grad_norm": 1.219135046005249,
"learning_rate": 5.1632457244671076e-06,
"loss": 2.2162,
"step": 184
},
{
"epoch": 1.0239099859353025,
"grad_norm": 1.5311412811279297,
"learning_rate": 5.119877631196525e-06,
"loss": 2.1986,
"step": 185
},
{
"epoch": 1.029535864978903,
"grad_norm": 1.0927857160568237,
"learning_rate": 5.076520390476121e-06,
"loss": 2.299,
"step": 186
},
{
"epoch": 1.0351617440225036,
"grad_norm": 1.161383867263794,
"learning_rate": 5.0331776184146e-06,
"loss": 2.4993,
"step": 187
},
{
"epoch": 1.0407876230661042,
"grad_norm": 1.392291784286499,
"learning_rate": 4.989852929913943e-06,
"loss": 2.3965,
"step": 188
},
{
"epoch": 1.0464135021097047,
"grad_norm": 1.0993740558624268,
"learning_rate": 4.946549938367912e-06,
"loss": 2.2925,
"step": 189
},
{
"epoch": 1.0520393811533053,
"grad_norm": 1.3068045377731323,
"learning_rate": 4.9032722553606895e-06,
"loss": 2.279,
"step": 190
},
{
"epoch": 1.0576652601969059,
"grad_norm": 1.117245078086853,
"learning_rate": 4.860023490365654e-06,
"loss": 2.0698,
"step": 191
},
{
"epoch": 1.0632911392405062,
"grad_norm": 1.3679758310317993,
"learning_rate": 4.8168072504443484e-06,
"loss": 2.1846,
"step": 192
},
{
"epoch": 1.0689170182841068,
"grad_norm": 1.3210700750350952,
"learning_rate": 4.773627139945638e-06,
"loss": 2.315,
"step": 193
},
{
"epoch": 1.0745428973277074,
"grad_norm": 1.082641363143921,
"learning_rate": 4.730486760205098e-06,
"loss": 2.3924,
"step": 194
},
{
"epoch": 1.080168776371308,
"grad_norm": 1.1435967683792114,
"learning_rate": 4.687389709244651e-06,
"loss": 2.421,
"step": 195
},
{
"epoch": 1.0857946554149085,
"grad_norm": 1.8713021278381348,
"learning_rate": 4.644339581472489e-06,
"loss": 2.1892,
"step": 196
},
{
"epoch": 1.091420534458509,
"grad_norm": 1.304328203201294,
"learning_rate": 4.601339967383282e-06,
"loss": 2.1397,
"step": 197
},
{
"epoch": 1.0970464135021096,
"grad_norm": 1.1662545204162598,
"learning_rate": 4.558394453258728e-06,
"loss": 2.2044,
"step": 198
},
{
"epoch": 1.1026722925457102,
"grad_norm": 1.6519335508346558,
"learning_rate": 4.515506620868443e-06,
"loss": 2.2881,
"step": 199
},
{
"epoch": 1.1082981715893108,
"grad_norm": 1.1646329164505005,
"learning_rate": 4.4726800471712325e-06,
"loss": 2.4505,
"step": 200
},
{
"epoch": 1.1139240506329113,
"grad_norm": 1.3433741331100464,
"learning_rate": 4.429918304016766e-06,
"loss": 2.1556,
"step": 201
},
{
"epoch": 1.119549929676512,
"grad_norm": 1.093310832977295,
"learning_rate": 4.3872249578476774e-06,
"loss": 2.2014,
"step": 202
},
{
"epoch": 1.1251758087201125,
"grad_norm": 1.1493537425994873,
"learning_rate": 4.344603569402106e-06,
"loss": 2.3267,
"step": 203
},
{
"epoch": 1.130801687763713,
"grad_norm": 1.2506024837493896,
"learning_rate": 4.302057693416725e-06,
"loss": 2.2444,
"step": 204
},
{
"epoch": 1.1364275668073136,
"grad_norm": 1.3935209512710571,
"learning_rate": 4.259590878330276e-06,
"loss": 2.2121,
"step": 205
},
{
"epoch": 1.1420534458509142,
"grad_norm": 1.213395595550537,
"learning_rate": 4.217206665987605e-06,
"loss": 2.2528,
"step": 206
},
{
"epoch": 1.1476793248945147,
"grad_norm": 1.9364022016525269,
"learning_rate": 4.174908591344273e-06,
"loss": 2.4659,
"step": 207
},
{
"epoch": 1.1533052039381153,
"grad_norm": 1.194150447845459,
"learning_rate": 4.132700182171731e-06,
"loss": 2.2238,
"step": 208
},
{
"epoch": 1.1589310829817159,
"grad_norm": 1.4336832761764526,
"learning_rate": 4.090584958763088e-06,
"loss": 2.2914,
"step": 209
},
{
"epoch": 1.1645569620253164,
"grad_norm": 1.6567353010177612,
"learning_rate": 4.048566433639516e-06,
"loss": 2.4391,
"step": 210
},
{
"epoch": 1.170182841068917,
"grad_norm": 1.426830768585205,
"learning_rate": 4.006648111257294e-06,
"loss": 2.4198,
"step": 211
},
{
"epoch": 1.1758087201125176,
"grad_norm": 1.2131551504135132,
"learning_rate": 3.964833487715527e-06,
"loss": 2.3363,
"step": 212
},
{
"epoch": 1.1814345991561181,
"grad_norm": 1.1698105335235596,
"learning_rate": 3.923126050464561e-06,
"loss": 2.4659,
"step": 213
},
{
"epoch": 1.1870604781997187,
"grad_norm": 1.346468210220337,
"learning_rate": 3.881529278015122e-06,
"loss": 2.5802,
"step": 214
},
{
"epoch": 1.1926863572433193,
"grad_norm": 1.1469833850860596,
"learning_rate": 3.840046639648199e-06,
"loss": 2.3794,
"step": 215
},
{
"epoch": 1.1983122362869199,
"grad_norm": 1.3175195455551147,
"learning_rate": 3.7986815951256937e-06,
"loss": 2.2429,
"step": 216
},
{
"epoch": 1.2039381153305204,
"grad_norm": 1.2770299911499023,
"learning_rate": 3.7574375944018744e-06,
"loss": 2.3475,
"step": 217
},
{
"epoch": 1.209563994374121,
"grad_norm": 1.2195074558258057,
"learning_rate": 3.716318077335632e-06,
"loss": 2.2418,
"step": 218
},
{
"epoch": 1.2151898734177216,
"grad_norm": 1.4205323457717896,
"learning_rate": 3.675326473403591e-06,
"loss": 2.3453,
"step": 219
},
{
"epoch": 1.2208157524613221,
"grad_norm": 1.3720946311950684,
"learning_rate": 3.6344662014140862e-06,
"loss": 2.318,
"step": 220
},
{
"epoch": 1.2264416315049227,
"grad_norm": 1.162539005279541,
"learning_rate": 3.593740669222015e-06,
"loss": 2.2763,
"step": 221
},
{
"epoch": 1.2320675105485233,
"grad_norm": 1.1718677282333374,
"learning_rate": 3.5531532734446194e-06,
"loss": 2.1948,
"step": 222
},
{
"epoch": 1.2376933895921238,
"grad_norm": 1.231491208076477,
"learning_rate": 3.512707399178204e-06,
"loss": 2.1702,
"step": 223
},
{
"epoch": 1.2433192686357244,
"grad_norm": 1.3110443353652954,
"learning_rate": 3.4724064197157976e-06,
"loss": 2.4983,
"step": 224
},
{
"epoch": 1.248945147679325,
"grad_norm": 1.2871124744415283,
"learning_rate": 3.432253696265824e-06,
"loss": 2.2115,
"step": 225
},
{
"epoch": 1.248945147679325,
"eval_loss": 2.3990979194641113,
"eval_runtime": 30.3251,
"eval_samples_per_second": 1.682,
"eval_steps_per_second": 1.682,
"step": 225
},
{
"epoch": 1.2545710267229255,
"grad_norm": 1.3364418745040894,
"learning_rate": 3.3922525776717597e-06,
"loss": 2.3069,
"step": 226
},
{
"epoch": 1.260196905766526,
"grad_norm": 1.1549724340438843,
"learning_rate": 3.3524064001328345e-06,
"loss": 2.3003,
"step": 227
},
{
"epoch": 1.2658227848101267,
"grad_norm": 1.5768709182739258,
"learning_rate": 3.312718486925787e-06,
"loss": 2.1072,
"step": 228
},
{
"epoch": 1.271448663853727,
"grad_norm": 1.3013666868209839,
"learning_rate": 3.2731921481276887e-06,
"loss": 2.1262,
"step": 229
},
{
"epoch": 1.2770745428973278,
"grad_norm": 1.2704813480377197,
"learning_rate": 3.233830680339879e-06,
"loss": 2.2043,
"step": 230
},
{
"epoch": 1.2827004219409281,
"grad_norm": 1.818085789680481,
"learning_rate": 3.1946373664130155e-06,
"loss": 2.2851,
"step": 231
},
{
"epoch": 1.288326300984529,
"grad_norm": 1.4256744384765625,
"learning_rate": 3.1556154751732816e-06,
"loss": 2.2682,
"step": 232
},
{
"epoch": 1.2939521800281293,
"grad_norm": 1.168641209602356,
"learning_rate": 3.1167682611497536e-06,
"loss": 2.2535,
"step": 233
},
{
"epoch": 1.29957805907173,
"grad_norm": 1.7689348459243774,
"learning_rate": 3.078098964302967e-06,
"loss": 2.5086,
"step": 234
},
{
"epoch": 1.3052039381153304,
"grad_norm": 1.1472971439361572,
"learning_rate": 3.039610809754689e-06,
"loss": 2.2806,
"step": 235
},
{
"epoch": 1.3108298171589312,
"grad_norm": 1.1633094549179077,
"learning_rate": 3.001307007518944e-06,
"loss": 2.1489,
"step": 236
},
{
"epoch": 1.3164556962025316,
"grad_norm": 1.3734430074691772,
"learning_rate": 2.963190752234284e-06,
"loss": 2.435,
"step": 237
},
{
"epoch": 1.3220815752461323,
"grad_norm": 1.4113901853561401,
"learning_rate": 2.925265222897345e-06,
"loss": 2.3259,
"step": 238
},
{
"epoch": 1.3277074542897327,
"grad_norm": 1.2623318433761597,
"learning_rate": 2.8875335825977185e-06,
"loss": 2.3495,
"step": 239
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.1913394927978516,
"learning_rate": 2.849998978254136e-06,
"loss": 2.245,
"step": 240
},
{
"epoch": 1.3389592123769338,
"grad_norm": 1.3264411687850952,
"learning_rate": 2.812664540352008e-06,
"loss": 2.3225,
"step": 241
},
{
"epoch": 1.3445850914205344,
"grad_norm": 1.2762576341629028,
"learning_rate": 2.775533382682332e-06,
"loss": 2.3699,
"step": 242
},
{
"epoch": 1.350210970464135,
"grad_norm": 1.4252859354019165,
"learning_rate": 2.738608602081996e-06,
"loss": 2.2251,
"step": 243
},
{
"epoch": 1.3558368495077355,
"grad_norm": 1.181598424911499,
"learning_rate": 2.701893278175499e-06,
"loss": 2.3656,
"step": 244
},
{
"epoch": 1.361462728551336,
"grad_norm": 1.2117236852645874,
"learning_rate": 2.665390473118088e-06,
"loss": 2.5056,
"step": 245
},
{
"epoch": 1.3670886075949367,
"grad_norm": 1.2578994035720825,
"learning_rate": 2.629103231340382e-06,
"loss": 2.3728,
"step": 246
},
{
"epoch": 1.3727144866385372,
"grad_norm": 1.5048015117645264,
"learning_rate": 2.5930345792944513e-06,
"loss": 2.3655,
"step": 247
},
{
"epoch": 1.3783403656821378,
"grad_norm": 1.5193151235580444,
"learning_rate": 2.5571875252013984e-06,
"loss": 2.4273,
"step": 248
},
{
"epoch": 1.3839662447257384,
"grad_norm": 1.2505041360855103,
"learning_rate": 2.521565058800475e-06,
"loss": 2.2828,
"step": 249
},
{
"epoch": 1.389592123769339,
"grad_norm": 1.3817038536071777,
"learning_rate": 2.486170151099725e-06,
"loss": 2.2924,
"step": 250
},
{
"epoch": 1.3952180028129395,
"grad_norm": 1.3338009119033813,
"learning_rate": 2.4510057541281872e-06,
"loss": 2.2852,
"step": 251
},
{
"epoch": 1.40084388185654,
"grad_norm": 1.202316403388977,
"learning_rate": 2.4160748006897018e-06,
"loss": 2.4643,
"step": 252
},
{
"epoch": 1.4064697609001406,
"grad_norm": 1.4298673868179321,
"learning_rate": 2.3813802041182987e-06,
"loss": 2.4521,
"step": 253
},
{
"epoch": 1.4120956399437412,
"grad_norm": 1.219159722328186,
"learning_rate": 2.346924858035216e-06,
"loss": 2.155,
"step": 254
},
{
"epoch": 1.4177215189873418,
"grad_norm": 1.4144155979156494,
"learning_rate": 2.3127116361075712e-06,
"loss": 2.1897,
"step": 255
},
{
"epoch": 1.4233473980309423,
"grad_norm": 1.171831488609314,
"learning_rate": 2.278743391808684e-06,
"loss": 2.3303,
"step": 256
},
{
"epoch": 1.428973277074543,
"grad_norm": 1.1133025884628296,
"learning_rate": 2.2450229581800925e-06,
"loss": 2.3888,
"step": 257
},
{
"epoch": 1.4345991561181435,
"grad_norm": 1.4286714792251587,
"learning_rate": 2.2115531475952678e-06,
"loss": 2.1884,
"step": 258
},
{
"epoch": 1.440225035161744,
"grad_norm": 1.2007641792297363,
"learning_rate": 2.178336751525052e-06,
"loss": 2.2624,
"step": 259
},
{
"epoch": 1.4458509142053446,
"grad_norm": 1.3826512098312378,
"learning_rate": 2.1453765403048525e-06,
"loss": 2.1844,
"step": 260
},
{
"epoch": 1.4514767932489452,
"grad_norm": 1.157139778137207,
"learning_rate": 2.1126752629035753e-06,
"loss": 2.1608,
"step": 261
},
{
"epoch": 1.4571026722925458,
"grad_norm": 1.1881312131881714,
"learning_rate": 2.080235646694363e-06,
"loss": 2.1249,
"step": 262
},
{
"epoch": 1.4627285513361463,
"grad_norm": 1.1453356742858887,
"learning_rate": 2.0480603972271227e-06,
"loss": 2.191,
"step": 263
},
{
"epoch": 1.4683544303797469,
"grad_norm": 1.3118908405303955,
"learning_rate": 2.016152198002876e-06,
"loss": 2.3229,
"step": 264
},
{
"epoch": 1.4739803094233475,
"grad_norm": 1.2369331121444702,
"learning_rate": 1.98451371024994e-06,
"loss": 2.1827,
"step": 265
},
{
"epoch": 1.479606188466948,
"grad_norm": 1.3042728900909424,
"learning_rate": 1.953147572701989e-06,
"loss": 2.1371,
"step": 266
},
{
"epoch": 1.4852320675105486,
"grad_norm": 1.2125662565231323,
"learning_rate": 1.922056401377966e-06,
"loss": 2.2515,
"step": 267
},
{
"epoch": 1.4908579465541492,
"grad_norm": 1.2672710418701172,
"learning_rate": 1.8912427893638996e-06,
"loss": 2.1017,
"step": 268
},
{
"epoch": 1.4964838255977497,
"grad_norm": 1.3755918741226196,
"learning_rate": 1.8607093065966408e-06,
"loss": 2.1161,
"step": 269
},
{
"epoch": 1.50210970464135,
"grad_norm": 1.258540153503418,
"learning_rate": 1.8304584996495205e-06,
"loss": 2.0267,
"step": 270
},
{
"epoch": 1.50210970464135,
"eval_loss": 2.398313283920288,
"eval_runtime": 30.2825,
"eval_samples_per_second": 1.684,
"eval_steps_per_second": 1.684,
"step": 270
},
{
"epoch": 1.5077355836849509,
"grad_norm": 1.3976160287857056,
"learning_rate": 1.8004928915199515e-06,
"loss": 2.3219,
"step": 271
},
{
"epoch": 1.5133614627285512,
"grad_norm": 1.1738497018814087,
"learning_rate": 1.7708149814190156e-06,
"loss": 2.1721,
"step": 272
},
{
"epoch": 1.518987341772152,
"grad_norm": 1.1697238683700562,
"learning_rate": 1.7414272445630166e-06,
"loss": 2.0663,
"step": 273
},
{
"epoch": 1.5246132208157523,
"grad_norm": 1.1484073400497437,
"learning_rate": 1.712332131967036e-06,
"loss": 2.1742,
"step": 274
},
{
"epoch": 1.5302390998593531,
"grad_norm": 1.393418312072754,
"learning_rate": 1.6835320702405238e-06,
"loss": 2.6228,
"step": 275
},
{
"epoch": 1.5358649789029535,
"grad_norm": 1.3217144012451172,
"learning_rate": 1.6550294613849016e-06,
"loss": 2.2263,
"step": 276
},
{
"epoch": 1.5414908579465543,
"grad_norm": 1.2675628662109375,
"learning_rate": 1.6268266825932378e-06,
"loss": 2.2772,
"step": 277
},
{
"epoch": 1.5471167369901546,
"grad_norm": 1.3925647735595703,
"learning_rate": 1.5989260860519723e-06,
"loss": 2.2878,
"step": 278
},
{
"epoch": 1.5527426160337554,
"grad_norm": 1.2969094514846802,
"learning_rate": 1.5713299987447534e-06,
"loss": 2.2664,
"step": 279
},
{
"epoch": 1.5583684950773558,
"grad_norm": 1.312171220779419,
"learning_rate": 1.5440407222583475e-06,
"loss": 2.2109,
"step": 280
},
{
"epoch": 1.5639943741209565,
"grad_norm": 1.2671453952789307,
"learning_rate": 1.5170605325906863e-06,
"loss": 2.2593,
"step": 281
},
{
"epoch": 1.5696202531645569,
"grad_norm": 1.4005205631256104,
"learning_rate": 1.4903916799610435e-06,
"loss": 2.2503,
"step": 282
},
{
"epoch": 1.5752461322081577,
"grad_norm": 1.098641037940979,
"learning_rate": 1.46403638862236e-06,
"loss": 2.1495,
"step": 283
},
{
"epoch": 1.580872011251758,
"grad_norm": 1.4645534753799438,
"learning_rate": 1.437996856675735e-06,
"loss": 2.3019,
"step": 284
},
{
"epoch": 1.5864978902953588,
"grad_norm": 1.2095060348510742,
"learning_rate": 1.4122752558870933e-06,
"loss": 2.2375,
"step": 285
},
{
"epoch": 1.5921237693389592,
"grad_norm": 1.5466718673706055,
"learning_rate": 1.3868737315060646e-06,
"loss": 2.584,
"step": 286
},
{
"epoch": 1.5977496483825597,
"grad_norm": 1.4787497520446777,
"learning_rate": 1.3617944020870577e-06,
"loss": 2.482,
"step": 287
},
{
"epoch": 1.6033755274261603,
"grad_norm": 1.3512495756149292,
"learning_rate": 1.3370393593125647e-06,
"loss": 2.3235,
"step": 288
},
{
"epoch": 1.6090014064697609,
"grad_norm": 1.1563678979873657,
"learning_rate": 1.3126106678187156e-06,
"loss": 2.2995,
"step": 289
},
{
"epoch": 1.6146272855133614,
"grad_norm": 1.30403470993042,
"learning_rate": 1.2885103650230806e-06,
"loss": 2.3431,
"step": 290
},
{
"epoch": 1.620253164556962,
"grad_norm": 1.24360990524292,
"learning_rate": 1.2647404609547384e-06,
"loss": 2.0579,
"step": 291
},
{
"epoch": 1.6258790436005626,
"grad_norm": 1.4975757598876953,
"learning_rate": 1.241302938086642e-06,
"loss": 2.0818,
"step": 292
},
{
"epoch": 1.6315049226441631,
"grad_norm": 1.355246901512146,
"learning_rate": 1.2181997511702728e-06,
"loss": 2.271,
"step": 293
},
{
"epoch": 1.6371308016877637,
"grad_norm": 1.3052653074264526,
"learning_rate": 1.1954328270726045e-06,
"loss": 2.4885,
"step": 294
},
{
"epoch": 1.6427566807313643,
"grad_norm": 1.5899144411087036,
"learning_rate": 1.1730040646154045e-06,
"loss": 2.3587,
"step": 295
},
{
"epoch": 1.6483825597749648,
"grad_norm": 1.2158286571502686,
"learning_rate": 1.150915334416865e-06,
"loss": 2.197,
"step": 296
},
{
"epoch": 1.6540084388185654,
"grad_norm": 1.7922595739364624,
"learning_rate": 1.129168478735581e-06,
"loss": 2.3705,
"step": 297
},
{
"epoch": 1.659634317862166,
"grad_norm": 1.1858259439468384,
"learning_rate": 1.1077653113169134e-06,
"loss": 2.5342,
"step": 298
},
{
"epoch": 1.6652601969057665,
"grad_norm": 1.1772092580795288,
"learning_rate": 1.0867076172417105e-06,
"loss": 2.3663,
"step": 299
},
{
"epoch": 1.6708860759493671,
"grad_norm": 1.3147233724594116,
"learning_rate": 1.0659971527774277e-06,
"loss": 2.387,
"step": 300
},
{
"epoch": 1.6765119549929677,
"grad_norm": 1.320970058441162,
"learning_rate": 1.0456356452316515e-06,
"loss": 2.3621,
"step": 301
},
{
"epoch": 1.6821378340365682,
"grad_norm": 1.1753120422363281,
"learning_rate": 1.0256247928080357e-06,
"loss": 2.1657,
"step": 302
},
{
"epoch": 1.6877637130801688,
"grad_norm": 1.3142890930175781,
"learning_rate": 1.0059662644646723e-06,
"loss": 2.3147,
"step": 303
},
{
"epoch": 1.6933895921237694,
"grad_norm": 1.2477645874023438,
"learning_rate": 9.86661699774887e-07,
"loss": 2.331,
"step": 304
},
{
"epoch": 1.69901547116737,
"grad_norm": 1.2975739240646362,
"learning_rate": 9.677127087905032e-07,
"loss": 2.2859,
"step": 305
},
{
"epoch": 1.7046413502109705,
"grad_norm": 1.5711100101470947,
"learning_rate": 9.491208719075537e-07,
"loss": 2.1521,
"step": 306
},
{
"epoch": 1.7102672292545709,
"grad_norm": 1.3837226629257202,
"learning_rate": 9.308877397344751e-07,
"loss": 2.3636,
"step": 307
},
{
"epoch": 1.7158931082981717,
"grad_norm": 1.3110496997833252,
"learning_rate": 9.130148329627774e-07,
"loss": 2.1745,
"step": 308
},
{
"epoch": 1.721518987341772,
"grad_norm": 1.1928850412368774,
"learning_rate": 8.955036422402223e-07,
"loss": 2.1995,
"step": 309
},
{
"epoch": 1.7271448663853728,
"grad_norm": 1.4467804431915283,
"learning_rate": 8.783556280464933e-07,
"loss": 2.1655,
"step": 310
},
{
"epoch": 1.7327707454289731,
"grad_norm": 1.1782878637313843,
"learning_rate": 8.615722205713881e-07,
"loss": 2.3282,
"step": 311
},
{
"epoch": 1.738396624472574,
"grad_norm": 1.3934561014175415,
"learning_rate": 8.451548195955409e-07,
"loss": 2.3772,
"step": 312
},
{
"epoch": 1.7440225035161743,
"grad_norm": 1.359525442123413,
"learning_rate": 8.291047943736744e-07,
"loss": 2.3182,
"step": 313
},
{
"epoch": 1.749648382559775,
"grad_norm": 1.169758677482605,
"learning_rate": 8.134234835203974e-07,
"loss": 2.3455,
"step": 314
},
{
"epoch": 1.7552742616033754,
"grad_norm": 1.1883350610733032,
"learning_rate": 7.981121948985665e-07,
"loss": 2.2055,
"step": 315
},
{
"epoch": 1.7552742616033754,
"eval_loss": 2.396662473678589,
"eval_runtime": 30.3177,
"eval_samples_per_second": 1.682,
"eval_steps_per_second": 1.682,
"step": 315
},
{
"epoch": 1.7609001406469762,
"grad_norm": 1.4302018880844116,
"learning_rate": 7.831722055102056e-07,
"loss": 2.1707,
"step": 316
},
{
"epoch": 1.7665260196905765,
"grad_norm": 1.388421893119812,
"learning_rate": 7.686047613899948e-07,
"loss": 2.1807,
"step": 317
},
{
"epoch": 1.7721518987341773,
"grad_norm": 1.3130079507827759,
"learning_rate": 7.544110775013554e-07,
"loss": 2.2276,
"step": 318
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.3371933698654175,
"learning_rate": 7.405923376351153e-07,
"loss": 2.3195,
"step": 319
},
{
"epoch": 1.7834036568213785,
"grad_norm": 1.354970097541809,
"learning_rate": 7.27149694310777e-07,
"loss": 2.3254,
"step": 320
},
{
"epoch": 1.7890295358649788,
"grad_norm": 1.139561414718628,
"learning_rate": 7.140842686803959e-07,
"loss": 2.1193,
"step": 321
},
{
"epoch": 1.7946554149085796,
"grad_norm": 1.2641339302062988,
"learning_rate": 7.013971504350722e-07,
"loss": 2.2094,
"step": 322
},
{
"epoch": 1.80028129395218,
"grad_norm": 1.221411943435669,
"learning_rate": 6.890893977140682e-07,
"loss": 2.1865,
"step": 323
},
{
"epoch": 1.8059071729957807,
"grad_norm": 1.2744096517562866,
"learning_rate": 6.771620370165577e-07,
"loss": 2.3021,
"step": 324
},
{
"epoch": 1.811533052039381,
"grad_norm": 1.2490217685699463,
"learning_rate": 6.656160631160105e-07,
"loss": 2.2437,
"step": 325
},
{
"epoch": 1.8171589310829819,
"grad_norm": 1.1591171026229858,
"learning_rate": 6.544524389772303e-07,
"loss": 2.2542,
"step": 326
},
{
"epoch": 1.8227848101265822,
"grad_norm": 1.3077995777130127,
"learning_rate": 6.436720956760359e-07,
"loss": 2.3848,
"step": 327
},
{
"epoch": 1.8284106891701828,
"grad_norm": 1.0817915201187134,
"learning_rate": 6.332759323216081e-07,
"loss": 2.1434,
"step": 328
},
{
"epoch": 1.8340365682137834,
"grad_norm": 1.2274130582809448,
"learning_rate": 6.232648159815062e-07,
"loss": 2.2062,
"step": 329
},
{
"epoch": 1.839662447257384,
"grad_norm": 1.1238057613372803,
"learning_rate": 6.136395816093466e-07,
"loss": 2.168,
"step": 330
},
{
"epoch": 1.8452883263009845,
"grad_norm": 1.325332760810852,
"learning_rate": 6.044010319751662e-07,
"loss": 2.2529,
"step": 331
},
{
"epoch": 1.850914205344585,
"grad_norm": 1.4097862243652344,
"learning_rate": 5.95549937598473e-07,
"loss": 2.3565,
"step": 332
},
{
"epoch": 1.8565400843881856,
"grad_norm": 1.4523735046386719,
"learning_rate": 5.870870366839798e-07,
"loss": 2.1707,
"step": 333
},
{
"epoch": 1.8621659634317862,
"grad_norm": 1.3728147745132446,
"learning_rate": 5.790130350600362e-07,
"loss": 2.2998,
"step": 334
},
{
"epoch": 1.8677918424753868,
"grad_norm": 1.1367465257644653,
"learning_rate": 5.713286061197607e-07,
"loss": 2.2088,
"step": 335
},
{
"epoch": 1.8734177215189873,
"grad_norm": 1.291197419166565,
"learning_rate": 5.640343907648791e-07,
"loss": 2.4345,
"step": 336
},
{
"epoch": 1.879043600562588,
"grad_norm": 1.4310858249664307,
"learning_rate": 5.571309973522697e-07,
"loss": 2.6208,
"step": 337
},
{
"epoch": 1.8846694796061885,
"grad_norm": 1.2096583843231201,
"learning_rate": 5.506190016432264e-07,
"loss": 2.4734,
"step": 338
},
{
"epoch": 1.890295358649789,
"grad_norm": 1.17626953125,
"learning_rate": 5.444989467554386e-07,
"loss": 2.5059,
"step": 339
},
{
"epoch": 1.8959212376933896,
"grad_norm": 1.463349461555481,
"learning_rate": 5.387713431176918e-07,
"loss": 2.2987,
"step": 340
},
{
"epoch": 1.9015471167369902,
"grad_norm": 1.414905309677124,
"learning_rate": 5.334366684272987e-07,
"loss": 2.3687,
"step": 341
},
{
"epoch": 1.9071729957805907,
"grad_norm": 1.2944713830947876,
"learning_rate": 5.28495367610257e-07,
"loss": 2.2465,
"step": 342
},
{
"epoch": 1.9127988748241913,
"grad_norm": 1.156260371208191,
"learning_rate": 5.239478527841415e-07,
"loss": 2.2922,
"step": 343
},
{
"epoch": 1.9184247538677919,
"grad_norm": 1.5482733249664307,
"learning_rate": 5.197945032237327e-07,
"loss": 2.2515,
"step": 344
},
{
"epoch": 1.9240506329113924,
"grad_norm": 1.29518461227417,
"learning_rate": 5.160356653293837e-07,
"loss": 2.5098,
"step": 345
},
{
"epoch": 1.929676511954993,
"grad_norm": 1.286757230758667,
"learning_rate": 5.126716525981297e-07,
"loss": 2.3004,
"step": 346
},
{
"epoch": 1.9353023909985936,
"grad_norm": 1.1425246000289917,
"learning_rate": 5.097027455975421e-07,
"loss": 2.2362,
"step": 347
},
{
"epoch": 1.9409282700421941,
"grad_norm": 1.1543301343917847,
"learning_rate": 5.071291919423276e-07,
"loss": 2.3147,
"step": 348
},
{
"epoch": 1.9465541490857947,
"grad_norm": 1.2874441146850586,
"learning_rate": 5.049512062736767e-07,
"loss": 2.2537,
"step": 349
},
{
"epoch": 1.952180028129395,
"grad_norm": 1.476819396018982,
"learning_rate": 5.03168970241363e-07,
"loss": 2.1098,
"step": 350
},
{
"epoch": 1.9578059071729959,
"grad_norm": 1.2125787734985352,
"learning_rate": 5.017826324885912e-07,
"loss": 2.3497,
"step": 351
},
{
"epoch": 1.9634317862165962,
"grad_norm": 1.2305964231491089,
"learning_rate": 5.007923086396018e-07,
"loss": 2.3115,
"step": 352
},
{
"epoch": 1.969057665260197,
"grad_norm": 1.3071091175079346,
"learning_rate": 5.001980812900265e-07,
"loss": 2.1634,
"step": 353
},
{
"epoch": 1.9746835443037973,
"grad_norm": 1.4950101375579834,
"learning_rate": 5.000000000000001e-07,
"loss": 2.3289,
"step": 354
}
],
"logging_steps": 1,
"max_steps": 354,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 177,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8528841455357133e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}