{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9975062344139651,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033250207813798837,
"grad_norm": 1.1512730121612549,
"learning_rate": 2e-05,
"loss": 2.3647,
"step": 1
},
{
"epoch": 0.006650041562759767,
"grad_norm": 1.1141036748886108,
"learning_rate": 4e-05,
"loss": 2.2794,
"step": 2
},
{
"epoch": 0.00997506234413965,
"grad_norm": 0.9610893726348877,
"learning_rate": 6e-05,
"loss": 2.2596,
"step": 3
},
{
"epoch": 0.013300083125519535,
"grad_norm": 1.1339858770370483,
"learning_rate": 8e-05,
"loss": 2.3787,
"step": 4
},
{
"epoch": 0.01662510390689942,
"grad_norm": 0.8878076672554016,
"learning_rate": 0.0001,
"loss": 2.3961,
"step": 5
},
{
"epoch": 0.0199501246882793,
"grad_norm": 0.829910397529602,
"learning_rate": 0.00012,
"loss": 2.1948,
"step": 6
},
{
"epoch": 0.023275145469659187,
"grad_norm": 0.9420105814933777,
"learning_rate": 0.00014,
"loss": 2.4329,
"step": 7
},
{
"epoch": 0.02660016625103907,
"grad_norm": 0.8519226908683777,
"learning_rate": 0.00016,
"loss": 2.3078,
"step": 8
},
{
"epoch": 0.029925187032418952,
"grad_norm": 0.7587653994560242,
"learning_rate": 0.00018,
"loss": 1.9353,
"step": 9
},
{
"epoch": 0.03325020781379884,
"grad_norm": 0.9927352666854858,
"learning_rate": 0.0002,
"loss": 1.9429,
"step": 10
},
{
"epoch": 0.03657522859517872,
"grad_norm": 0.9643892049789429,
"learning_rate": 0.00019999413227831132,
"loss": 2.0925,
"step": 11
},
{
"epoch": 0.0399002493765586,
"grad_norm": 0.941749632358551,
"learning_rate": 0.00019997652980184843,
"loss": 1.8099,
"step": 12
},
{
"epoch": 0.043225270157938485,
"grad_norm": 0.5177962779998779,
"learning_rate": 0.00019994719463633997,
"loss": 1.6693,
"step": 13
},
{
"epoch": 0.046550290939318374,
"grad_norm": 0.56168133020401,
"learning_rate": 0.0001999061302243977,
"loss": 1.9593,
"step": 14
},
{
"epoch": 0.04987531172069826,
"grad_norm": 0.5392152070999146,
"learning_rate": 0.00019985334138511237,
"loss": 1.6836,
"step": 15
},
{
"epoch": 0.05320033250207814,
"grad_norm": 0.5796711444854736,
"learning_rate": 0.00019978883431348845,
"loss": 1.7744,
"step": 16
},
{
"epoch": 0.05652535328345802,
"grad_norm": 0.5629785060882568,
"learning_rate": 0.0001997126165797167,
"loss": 2.0442,
"step": 17
},
{
"epoch": 0.059850374064837904,
"grad_norm": 0.48991289734840393,
"learning_rate": 0.00019962469712828614,
"loss": 1.679,
"step": 18
},
{
"epoch": 0.06317539484621779,
"grad_norm": 0.47867172956466675,
"learning_rate": 0.0001995250862769342,
"loss": 1.6641,
"step": 19
},
{
"epoch": 0.06650041562759768,
"grad_norm": 0.49752330780029297,
"learning_rate": 0.00019941379571543596,
"loss": 1.5331,
"step": 20
},
{
"epoch": 0.06982543640897755,
"grad_norm": 0.49927300214767456,
"learning_rate": 0.00019929083850423225,
"loss": 1.5704,
"step": 21
},
{
"epoch": 0.07315045719035744,
"grad_norm": 0.5634847283363342,
"learning_rate": 0.00019915622907289694,
"loss": 1.9051,
"step": 22
},
{
"epoch": 0.07647547797173733,
"grad_norm": 0.5214512944221497,
"learning_rate": 0.00019900998321844367,
"loss": 1.756,
"step": 23
},
{
"epoch": 0.0798004987531172,
"grad_norm": 0.46316221356391907,
"learning_rate": 0.00019885211810347184,
"loss": 1.6153,
"step": 24
},
{
"epoch": 0.0831255195344971,
"grad_norm": 0.45869576930999756,
"learning_rate": 0.00019868265225415265,
"loss": 1.8899,
"step": 25
},
{
"epoch": 0.08645054031587697,
"grad_norm": 0.4824669063091278,
"learning_rate": 0.00019850160555805486,
"loss": 1.8861,
"step": 26
},
{
"epoch": 0.08977556109725686,
"grad_norm": 0.509224534034729,
"learning_rate": 0.000198308999261811,
"loss": 1.8507,
"step": 27
},
{
"epoch": 0.09310058187863675,
"grad_norm": 0.4441746771335602,
"learning_rate": 0.00019810485596862392,
"loss": 1.7326,
"step": 28
},
{
"epoch": 0.09642560266001662,
"grad_norm": 0.4595758318901062,
"learning_rate": 0.00019788919963561422,
"loss": 1.8283,
"step": 29
},
{
"epoch": 0.09975062344139651,
"grad_norm": 0.5222824215888977,
"learning_rate": 0.00019766205557100868,
"loss": 1.5678,
"step": 30
},
{
"epoch": 0.10307564422277639,
"grad_norm": 0.43890196084976196,
"learning_rate": 0.00019742345043117045,
"loss": 1.5899,
"step": 31
},
{
"epoch": 0.10640066500415628,
"grad_norm": 0.4542831778526306,
"learning_rate": 0.00019717341221747056,
"loss": 1.6733,
"step": 32
},
{
"epoch": 0.10972568578553615,
"grad_norm": 0.43134549260139465,
"learning_rate": 0.00019691197027300205,
"loss": 1.7386,
"step": 33
},
{
"epoch": 0.11305070656691604,
"grad_norm": 0.44071701169013977,
"learning_rate": 0.00019663915527913625,
"loss": 1.7685,
"step": 34
},
{
"epoch": 0.11637572734829593,
"grad_norm": 0.4880881607532501,
"learning_rate": 0.0001963549992519223,
"loss": 1.8461,
"step": 35
},
{
"epoch": 0.11970074812967581,
"grad_norm": 0.40884578227996826,
"learning_rate": 0.00019605953553832988,
"loss": 1.5538,
"step": 36
},
{
"epoch": 0.1230257689110557,
"grad_norm": 0.39413318037986755,
"learning_rate": 0.00019575279881233577,
"loss": 1.4222,
"step": 37
},
{
"epoch": 0.12635078969243557,
"grad_norm": 0.44478997588157654,
"learning_rate": 0.00019543482507085482,
"loss": 1.7247,
"step": 38
},
{
"epoch": 0.12967581047381546,
"grad_norm": 0.4295913875102997,
"learning_rate": 0.00019510565162951537,
"loss": 1.5788,
"step": 39
},
{
"epoch": 0.13300083125519535,
"grad_norm": 0.47360050678253174,
"learning_rate": 0.00019476531711828027,
"loss": 1.7429,
"step": 40
},
{
"epoch": 0.13632585203657524,
"grad_norm": 0.483909547328949,
"learning_rate": 0.00019441386147691335,
"loss": 1.6674,
"step": 41
},
{
"epoch": 0.1396508728179551,
"grad_norm": 0.47071558237075806,
"learning_rate": 0.0001940513259502924,
"loss": 1.8229,
"step": 42
},
{
"epoch": 0.142975893599335,
"grad_norm": 0.43929168581962585,
"learning_rate": 0.0001936777530835689,
"loss": 1.6562,
"step": 43
},
{
"epoch": 0.14630091438071488,
"grad_norm": 0.4329998791217804,
"learning_rate": 0.0001932931867171751,
"loss": 1.5274,
"step": 44
},
{
"epoch": 0.14962593516209477,
"grad_norm": 0.44375908374786377,
"learning_rate": 0.00019289767198167916,
"loss": 1.7084,
"step": 45
},
{
"epoch": 0.15295095594347466,
"grad_norm": 0.48119276762008667,
"learning_rate": 0.0001924912552924889,
"loss": 1.7645,
"step": 46
},
{
"epoch": 0.15627597672485452,
"grad_norm": 0.4040566384792328,
"learning_rate": 0.00019207398434440478,
"loss": 1.5925,
"step": 47
},
{
"epoch": 0.1596009975062344,
"grad_norm": 0.4708506464958191,
"learning_rate": 0.00019164590810602262,
"loss": 1.8461,
"step": 48
},
{
"epoch": 0.1629260182876143,
"grad_norm": 0.431772381067276,
"learning_rate": 0.000191207076813987,
"loss": 1.5356,
"step": 49
},
{
"epoch": 0.1662510390689942,
"grad_norm": 0.4952054023742676,
"learning_rate": 0.00019075754196709572,
"loss": 1.8034,
"step": 50
},
{
"epoch": 0.16957605985037408,
"grad_norm": 0.43522897362709045,
"learning_rate": 0.00019029735632025618,
"loss": 1.6717,
"step": 51
},
{
"epoch": 0.17290108063175394,
"grad_norm": 0.46861544251441956,
"learning_rate": 0.00018982657387829445,
"loss": 1.766,
"step": 52
},
{
"epoch": 0.17622610141313383,
"grad_norm": 0.44363775849342346,
"learning_rate": 0.00018934524988961738,
"loss": 1.5169,
"step": 53
},
{
"epoch": 0.17955112219451372,
"grad_norm": 0.41366782784461975,
"learning_rate": 0.00018885344083972914,
"loss": 1.6495,
"step": 54
},
{
"epoch": 0.1828761429758936,
"grad_norm": 0.4273390769958496,
"learning_rate": 0.0001883512044446023,
"loss": 1.5952,
"step": 55
},
{
"epoch": 0.1862011637572735,
"grad_norm": 0.4389772117137909,
"learning_rate": 0.00018783859964390464,
"loss": 1.7003,
"step": 56
},
{
"epoch": 0.18952618453865336,
"grad_norm": 0.480125367641449,
"learning_rate": 0.0001873156865940823,
"loss": 1.6503,
"step": 57
},
{
"epoch": 0.19285120532003325,
"grad_norm": 0.48973348736763,
"learning_rate": 0.00018678252666130013,
"loss": 1.737,
"step": 58
},
{
"epoch": 0.19617622610141314,
"grad_norm": 0.4558335840702057,
"learning_rate": 0.0001862391824142402,
"loss": 1.571,
"step": 59
},
{
"epoch": 0.19950124688279303,
"grad_norm": 0.45777326822280884,
"learning_rate": 0.00018568571761675893,
"loss": 1.6462,
"step": 60
},
{
"epoch": 0.2028262676641729,
"grad_norm": 0.4185212254524231,
"learning_rate": 0.00018512219722040425,
"loss": 1.5729,
"step": 61
},
{
"epoch": 0.20615128844555278,
"grad_norm": 0.4137243330478668,
"learning_rate": 0.0001845486873567932,
"loss": 1.675,
"step": 62
},
{
"epoch": 0.20947630922693267,
"grad_norm": 0.42468297481536865,
"learning_rate": 0.00018396525532985108,
"loss": 1.4519,
"step": 63
},
{
"epoch": 0.21280133000831256,
"grad_norm": 0.46751776337623596,
"learning_rate": 0.00018337196960791302,
"loss": 1.7264,
"step": 64
},
{
"epoch": 0.21612635078969245,
"grad_norm": 0.47722429037094116,
"learning_rate": 0.00018276889981568906,
"loss": 1.5392,
"step": 65
},
{
"epoch": 0.2194513715710723,
"grad_norm": 0.4753107726573944,
"learning_rate": 0.00018215611672609317,
"loss": 1.5328,
"step": 66
},
{
"epoch": 0.2227763923524522,
"grad_norm": 0.4401816129684448,
"learning_rate": 0.00018153369225193782,
"loss": 1.4793,
"step": 67
},
{
"epoch": 0.22610141313383209,
"grad_norm": 0.4473712146282196,
"learning_rate": 0.00018090169943749476,
"loss": 1.5596,
"step": 68
},
{
"epoch": 0.22942643391521197,
"grad_norm": 0.45505204796791077,
"learning_rate": 0.00018026021244992287,
"loss": 1.7437,
"step": 69
},
{
"epoch": 0.23275145469659186,
"grad_norm": 0.44190192222595215,
"learning_rate": 0.00017960930657056438,
"loss": 1.7401,
"step": 70
},
{
"epoch": 0.23607647547797173,
"grad_norm": 0.501592218875885,
"learning_rate": 0.0001789490581861102,
"loss": 1.7464,
"step": 71
},
{
"epoch": 0.23940149625935161,
"grad_norm": 0.43836328387260437,
"learning_rate": 0.00017827954477963557,
"loss": 1.7451,
"step": 72
},
{
"epoch": 0.2427265170407315,
"grad_norm": 0.611949622631073,
"learning_rate": 0.0001776008449215073,
"loss": 1.6921,
"step": 73
},
{
"epoch": 0.2460515378221114,
"grad_norm": 0.46015432476997375,
"learning_rate": 0.0001769130382601629,
"loss": 1.7985,
"step": 74
},
{
"epoch": 0.24937655860349128,
"grad_norm": 0.44316309690475464,
"learning_rate": 0.00017621620551276366,
"loss": 1.7806,
"step": 75
},
{
"epoch": 0.25270157938487114,
"grad_norm": 0.4749353229999542,
"learning_rate": 0.00017551042845572208,
"loss": 1.7349,
"step": 76
},
{
"epoch": 0.25602660016625106,
"grad_norm": 0.4712280333042145,
"learning_rate": 0.00017479578991510506,
"loss": 1.4129,
"step": 77
},
{
"epoch": 0.2593516209476309,
"grad_norm": 0.44466859102249146,
"learning_rate": 0.00017407237375691392,
"loss": 1.6819,
"step": 78
},
{
"epoch": 0.2626766417290108,
"grad_norm": 0.42531418800354004,
"learning_rate": 0.00017334026487724225,
"loss": 1.6154,
"step": 79
},
{
"epoch": 0.2660016625103907,
"grad_norm": 0.4512370228767395,
"learning_rate": 0.0001725995491923131,
"loss": 1.6736,
"step": 80
},
{
"epoch": 0.26932668329177056,
"grad_norm": 0.4131537079811096,
"learning_rate": 0.00017185031362839626,
"loss": 1.5468,
"step": 81
},
{
"epoch": 0.2726517040731505,
"grad_norm": 0.47616103291511536,
"learning_rate": 0.00017109264611160708,
"loss": 1.523,
"step": 82
},
{
"epoch": 0.27597672485453034,
"grad_norm": 0.4459686279296875,
"learning_rate": 0.000170326635557588,
"loss": 1.8612,
"step": 83
},
{
"epoch": 0.2793017456359102,
"grad_norm": 0.4500899612903595,
"learning_rate": 0.00016955237186107387,
"loss": 1.643,
"step": 84
},
{
"epoch": 0.2826267664172901,
"grad_norm": 0.44385287165641785,
"learning_rate": 0.00016876994588534234,
"loss": 1.3833,
"step": 85
},
{
"epoch": 0.28595178719867,
"grad_norm": 0.4063577950000763,
"learning_rate": 0.0001679794494515508,
"loss": 1.3494,
"step": 86
},
{
"epoch": 0.2892768079800499,
"grad_norm": 0.43013447523117065,
"learning_rate": 0.00016718097532796063,
"loss": 1.5205,
"step": 87
},
{
"epoch": 0.29260182876142976,
"grad_norm": 0.46770158410072327,
"learning_rate": 0.00016637461721905045,
"loss": 1.6897,
"step": 88
},
{
"epoch": 0.2959268495428096,
"grad_norm": 0.4841039478778839,
"learning_rate": 0.00016556046975451963,
"loss": 1.5793,
"step": 89
},
{
"epoch": 0.29925187032418954,
"grad_norm": 0.48426705598831177,
"learning_rate": 0.00016473862847818277,
"loss": 1.6988,
"step": 90
},
{
"epoch": 0.3025768911055694,
"grad_norm": 0.5768110752105713,
"learning_rate": 0.0001639091898367576,
"loss": 1.7846,
"step": 91
},
{
"epoch": 0.3059019118869493,
"grad_norm": 0.446196049451828,
"learning_rate": 0.00016307225116854622,
"loss": 1.7882,
"step": 92
},
{
"epoch": 0.3092269326683292,
"grad_norm": 0.4034564793109894,
"learning_rate": 0.00016222791069201207,
"loss": 1.6616,
"step": 93
},
{
"epoch": 0.31255195344970904,
"grad_norm": 0.424376517534256,
"learning_rate": 0.00016137626749425377,
"loss": 1.5353,
"step": 94
},
{
"epoch": 0.31587697423108896,
"grad_norm": 0.45510077476501465,
"learning_rate": 0.00016051742151937655,
"loss": 1.7947,
"step": 95
},
{
"epoch": 0.3192019950124688,
"grad_norm": 0.4815070331096649,
"learning_rate": 0.00015965147355676343,
"loss": 1.581,
"step": 96
},
{
"epoch": 0.32252701579384874,
"grad_norm": 0.4505084156990051,
"learning_rate": 0.00015877852522924732,
"loss": 1.6186,
"step": 97
},
{
"epoch": 0.3258520365752286,
"grad_norm": 0.4437141418457031,
"learning_rate": 0.0001578986789811849,
"loss": 1.6509,
"step": 98
},
{
"epoch": 0.32917705735660846,
"grad_norm": 0.4133874475955963,
"learning_rate": 0.00015701203806643433,
"loss": 1.7992,
"step": 99
},
{
"epoch": 0.3325020781379884,
"grad_norm": 0.4500593841075897,
"learning_rate": 0.00015611870653623825,
"loss": 1.6654,
"step": 100
},
{
"epoch": 0.33582709891936824,
"grad_norm": 0.4359726309776306,
"learning_rate": 0.00015521878922701246,
"loss": 1.6461,
"step": 101
},
{
"epoch": 0.33915211970074816,
"grad_norm": 0.40108025074005127,
"learning_rate": 0.00015431239174804328,
"loss": 1.5237,
"step": 102
},
{
"epoch": 0.342477140482128,
"grad_norm": 0.43869125843048096,
"learning_rate": 0.00015339962046909364,
"loss": 1.6909,
"step": 103
},
{
"epoch": 0.3458021612635079,
"grad_norm": 0.42006051540374756,
"learning_rate": 0.00015248058250792008,
"loss": 1.5046,
"step": 104
},
{
"epoch": 0.3491271820448878,
"grad_norm": 0.38756394386291504,
"learning_rate": 0.00015155538571770218,
"loss": 1.3747,
"step": 105
},
{
"epoch": 0.35245220282626766,
"grad_norm": 0.47784286737442017,
"learning_rate": 0.0001506241386743854,
"loss": 1.673,
"step": 106
},
{
"epoch": 0.3557772236076476,
"grad_norm": 0.4587322175502777,
"learning_rate": 0.00014968695066393923,
"loss": 1.7987,
"step": 107
},
{
"epoch": 0.35910224438902744,
"grad_norm": 0.42091092467308044,
"learning_rate": 0.00014874393166953192,
"loss": 1.5309,
"step": 108
},
{
"epoch": 0.3624272651704073,
"grad_norm": 0.47224530577659607,
"learning_rate": 0.00014779519235862365,
"loss": 1.7268,
"step": 109
},
{
"epoch": 0.3657522859517872,
"grad_norm": 0.44596192240715027,
"learning_rate": 0.00014684084406997903,
"loss": 1.7108,
"step": 110
},
{
"epoch": 0.3690773067331671,
"grad_norm": 0.4590005874633789,
"learning_rate": 0.0001458809988006011,
"loss": 1.638,
"step": 111
},
{
"epoch": 0.372402327514547,
"grad_norm": 0.43627721071243286,
"learning_rate": 0.00014491576919258792,
"loss": 1.6721,
"step": 112
},
{
"epoch": 0.37572734829592686,
"grad_norm": 0.41456034779548645,
"learning_rate": 0.00014394526851991364,
"loss": 1.6863,
"step": 113
},
{
"epoch": 0.3790523690773067,
"grad_norm": 0.4247894883155823,
"learning_rate": 0.0001429696106751352,
"loss": 1.5659,
"step": 114
},
{
"epoch": 0.38237738985868663,
"grad_norm": 0.4657272696495056,
"learning_rate": 0.00014198891015602646,
"loss": 1.4086,
"step": 115
},
{
"epoch": 0.3857024106400665,
"grad_norm": 0.4860394597053528,
"learning_rate": 0.0001410032820521416,
"loss": 1.4603,
"step": 116
},
{
"epoch": 0.38902743142144636,
"grad_norm": 0.41849544644355774,
"learning_rate": 0.00014001284203130868,
"loss": 1.3991,
"step": 117
},
{
"epoch": 0.3923524522028263,
"grad_norm": 0.4544629752635956,
"learning_rate": 0.00013901770632605547,
"loss": 1.8028,
"step": 118
},
{
"epoch": 0.39567747298420614,
"grad_norm": 0.5051787495613098,
"learning_rate": 0.0001380179917199692,
"loss": 1.8854,
"step": 119
},
{
"epoch": 0.39900249376558605,
"grad_norm": 0.41150030493736267,
"learning_rate": 0.00013701381553399145,
"loss": 1.6686,
"step": 120
},
{
"epoch": 0.4023275145469659,
"grad_norm": 0.4593510925769806,
"learning_rate": 0.0001360052956126499,
"loss": 1.5844,
"step": 121
},
{
"epoch": 0.4056525353283458,
"grad_norm": 0.42087090015411377,
"learning_rate": 0.00013499255031022885,
"loss": 1.4865,
"step": 122
},
{
"epoch": 0.4089775561097257,
"grad_norm": 0.4708739221096039,
"learning_rate": 0.00013397569847687984,
"loss": 1.7089,
"step": 123
},
{
"epoch": 0.41230257689110555,
"grad_norm": 0.4878352880477905,
"learning_rate": 0.00013295485944467405,
"loss": 1.8006,
"step": 124
},
{
"epoch": 0.41562759767248547,
"grad_norm": 0.43254002928733826,
"learning_rate": 0.000131930153013598,
"loss": 1.6949,
"step": 125
},
{
"epoch": 0.41895261845386533,
"grad_norm": 0.47519850730895996,
"learning_rate": 0.00013090169943749476,
"loss": 1.7601,
"step": 126
},
{
"epoch": 0.4222776392352452,
"grad_norm": 0.4135800898075104,
"learning_rate": 0.00012986961940995138,
"loss": 1.5955,
"step": 127
},
{
"epoch": 0.4256026600166251,
"grad_norm": 0.46267929673194885,
"learning_rate": 0.0001288340340501351,
"loss": 1.8398,
"step": 128
},
{
"epoch": 0.428927680798005,
"grad_norm": 0.43891721963882446,
"learning_rate": 0.00012779506488857945,
"loss": 1.4741,
"step": 129
},
{
"epoch": 0.4322527015793849,
"grad_norm": 0.4456429183483124,
"learning_rate": 0.00012675283385292212,
"loss": 1.7454,
"step": 130
},
{
"epoch": 0.43557772236076475,
"grad_norm": 0.4604743719100952,
"learning_rate": 0.00012570746325359607,
"loss": 1.8192,
"step": 131
},
{
"epoch": 0.4389027431421446,
"grad_norm": 0.46728062629699707,
"learning_rate": 0.00012465907576947622,
"loss": 1.7551,
"step": 132
},
{
"epoch": 0.44222776392352453,
"grad_norm": 0.436298668384552,
"learning_rate": 0.000123607794433482,
"loss": 1.6592,
"step": 133
},
{
"epoch": 0.4455527847049044,
"grad_norm": 0.39828214049339294,
"learning_rate": 0.00012255374261813944,
"loss": 1.4603,
"step": 134
},
{
"epoch": 0.4488778054862843,
"grad_norm": 0.4469813406467438,
"learning_rate": 0.00012149704402110243,
"loss": 1.6449,
"step": 135
},
{
"epoch": 0.45220282626766417,
"grad_norm": 0.4820503294467926,
"learning_rate": 0.0001204378226506365,
"loss": 1.8473,
"step": 136
},
{
"epoch": 0.45552784704904403,
"grad_norm": 0.49072131514549255,
"learning_rate": 0.00011937620281106585,
"loss": 1.6843,
"step": 137
},
{
"epoch": 0.45885286783042395,
"grad_norm": 0.48773854970932007,
"learning_rate": 0.00011831230908818563,
"loss": 1.625,
"step": 138
},
{
"epoch": 0.4621778886118038,
"grad_norm": 0.4438723623752594,
"learning_rate": 0.00011724626633464127,
"loss": 1.7558,
"step": 139
},
{
"epoch": 0.46550290939318373,
"grad_norm": 0.4389275014400482,
"learning_rate": 0.0001161781996552765,
"loss": 1.4621,
"step": 140
},
{
"epoch": 0.4688279301745636,
"grad_norm": 0.4611305296421051,
"learning_rate": 0.00011510823439245169,
"loss": 1.59,
"step": 141
},
{
"epoch": 0.47215295095594345,
"grad_norm": 0.43601059913635254,
"learning_rate": 0.00011403649611133444,
"loss": 1.7462,
"step": 142
},
{
"epoch": 0.47547797173732337,
"grad_norm": 0.41201236844062805,
"learning_rate": 0.00011296311058516389,
"loss": 1.5341,
"step": 143
},
{
"epoch": 0.47880299251870323,
"grad_norm": 0.46523982286453247,
"learning_rate": 0.00011188820378049065,
"loss": 1.6327,
"step": 144
},
{
"epoch": 0.48212801330008315,
"grad_norm": 0.42490893602371216,
"learning_rate": 0.00011081190184239419,
"loss": 1.6178,
"step": 145
},
{
"epoch": 0.485453034081463,
"grad_norm": 0.42238375544548035,
"learning_rate": 0.00010973433107967902,
"loss": 1.534,
"step": 146
},
{
"epoch": 0.48877805486284287,
"grad_norm": 0.48569226264953613,
"learning_rate": 0.00010865561795005177,
"loss": 1.5332,
"step": 147
},
{
"epoch": 0.4921030756442228,
"grad_norm": 0.4933275878429413,
"learning_rate": 0.00010757588904528106,
"loss": 1.5928,
"step": 148
},
{
"epoch": 0.49542809642560265,
"grad_norm": 0.4781058728694916,
"learning_rate": 0.00010649527107634108,
"loss": 1.6578,
"step": 149
},
{
"epoch": 0.49875311720698257,
"grad_norm": 0.4651820659637451,
"learning_rate": 0.00010541389085854176,
"loss": 1.6884,
"step": 150
},
{
"epoch": 0.5020781379883624,
"grad_norm": 0.4429711103439331,
"learning_rate": 0.00010433187529664623,
"loss": 1.6723,
"step": 151
},
{
"epoch": 0.5054031587697423,
"grad_norm": 0.4521614611148834,
"learning_rate": 0.00010324935136997806,
"loss": 1.6269,
"step": 152
},
{
"epoch": 0.5087281795511222,
"grad_norm": 0.4930736720561981,
"learning_rate": 0.00010216644611751975,
"loss": 1.7933,
"step": 153
},
{
"epoch": 0.5120532003325021,
"grad_norm": 0.4855606257915497,
"learning_rate": 0.000101083286623004,
"loss": 1.6702,
"step": 154
},
{
"epoch": 0.515378221113882,
"grad_norm": 0.4960128366947174,
"learning_rate": 0.0001,
"loss": 1.7428,
"step": 155
},
{
"epoch": 0.5187032418952618,
"grad_norm": 0.42107459902763367,
"learning_rate": 9.891671337699602e-05,
"loss": 1.6235,
"step": 156
},
{
"epoch": 0.5220282626766417,
"grad_norm": 0.4479861855506897,
"learning_rate": 9.783355388248027e-05,
"loss": 1.5158,
"step": 157
},
{
"epoch": 0.5253532834580216,
"grad_norm": 0.4954458177089691,
"learning_rate": 9.675064863002196e-05,
"loss": 1.6743,
"step": 158
},
{
"epoch": 0.5286783042394015,
"grad_norm": 0.5591014623641968,
"learning_rate": 9.56681247033538e-05,
"loss": 1.9691,
"step": 159
},
{
"epoch": 0.5320033250207814,
"grad_norm": 0.46626871824264526,
"learning_rate": 9.458610914145826e-05,
"loss": 1.5621,
"step": 160
},
{
"epoch": 0.5353283458021613,
"grad_norm": 0.4377134144306183,
"learning_rate": 9.350472892365892e-05,
"loss": 1.5524,
"step": 161
},
{
"epoch": 0.5386533665835411,
"grad_norm": 0.3984418511390686,
"learning_rate": 9.242411095471897e-05,
"loss": 1.6454,
"step": 162
},
{
"epoch": 0.541978387364921,
"grad_norm": 0.42802637815475464,
"learning_rate": 9.134438204994824e-05,
"loss": 1.4036,
"step": 163
},
{
"epoch": 0.545303408146301,
"grad_norm": 0.4567003846168518,
"learning_rate": 9.026566892032105e-05,
"loss": 1.6606,
"step": 164
},
{
"epoch": 0.5486284289276808,
"grad_norm": 0.45452797412872314,
"learning_rate": 8.918809815760585e-05,
"loss": 1.8219,
"step": 165
},
{
"epoch": 0.5519534497090607,
"grad_norm": 0.4367886781692505,
"learning_rate": 8.811179621950936e-05,
"loss": 1.5962,
"step": 166
},
{
"epoch": 0.5552784704904405,
"grad_norm": 0.4670146703720093,
"learning_rate": 8.703688941483616e-05,
"loss": 1.6382,
"step": 167
},
{
"epoch": 0.5586034912718204,
"grad_norm": 0.5069778561592102,
"learning_rate": 8.596350388866558e-05,
"loss": 1.7067,
"step": 168
},
{
"epoch": 0.5619285120532004,
"grad_norm": 0.4080033302307129,
"learning_rate": 8.489176560754834e-05,
"loss": 1.4192,
"step": 169
},
{
"epoch": 0.5652535328345802,
"grad_norm": 0.491526335477829,
"learning_rate": 8.382180034472353e-05,
"loss": 1.8687,
"step": 170
},
{
"epoch": 0.5685785536159601,
"grad_norm": 0.5429246425628662,
"learning_rate": 8.275373366535877e-05,
"loss": 1.776,
"step": 171
},
{
"epoch": 0.57190357439734,
"grad_norm": 0.4131667912006378,
"learning_rate": 8.168769091181438e-05,
"loss": 1.3345,
"step": 172
},
{
"epoch": 0.5752285951787198,
"grad_norm": 0.5055519342422485,
"learning_rate": 8.062379718893417e-05,
"loss": 1.7716,
"step": 173
},
{
"epoch": 0.5785536159600998,
"grad_norm": 0.4675292670726776,
"learning_rate": 7.956217734936353e-05,
"loss": 1.5941,
"step": 174
},
{
"epoch": 0.5818786367414797,
"grad_norm": 0.5096448659896851,
"learning_rate": 7.85029559788976e-05,
"loss": 1.9376,
"step": 175
},
{
"epoch": 0.5852036575228595,
"grad_norm": 0.4687637686729431,
"learning_rate": 7.744625738186059e-05,
"loss": 1.7242,
"step": 176
},
{
"epoch": 0.5885286783042394,
"grad_norm": 0.437148779630661,
"learning_rate": 7.639220556651799e-05,
"loss": 1.4993,
"step": 177
},
{
"epoch": 0.5918536990856192,
"grad_norm": 0.44125625491142273,
"learning_rate": 7.534092423052381e-05,
"loss": 1.5076,
"step": 178
},
{
"epoch": 0.5951787198669992,
"grad_norm": 0.4794883131980896,
"learning_rate": 7.42925367464039e-05,
"loss": 1.6401,
"step": 179
},
{
"epoch": 0.5985037406483791,
"grad_norm": 0.42347967624664307,
"learning_rate": 7.324716614707793e-05,
"loss": 1.444,
"step": 180
},
{
"epoch": 0.6018287614297589,
"grad_norm": 0.4843563437461853,
"learning_rate": 7.220493511142059e-05,
"loss": 1.7117,
"step": 181
},
{
"epoch": 0.6051537822111388,
"grad_norm": 0.48885542154312134,
"learning_rate": 7.116596594986494e-05,
"loss": 1.6799,
"step": 182
},
{
"epoch": 0.6084788029925187,
"grad_norm": 0.48835498094558716,
"learning_rate": 7.013038059004866e-05,
"loss": 1.7308,
"step": 183
},
{
"epoch": 0.6118038237738986,
"grad_norm": 0.38506001234054565,
"learning_rate": 6.909830056250527e-05,
"loss": 1.5766,
"step": 184
},
{
"epoch": 0.6151288445552785,
"grad_norm": 0.5520392656326294,
"learning_rate": 6.806984698640202e-05,
"loss": 1.5418,
"step": 185
},
{
"epoch": 0.6184538653366584,
"grad_norm": 0.4401935935020447,
"learning_rate": 6.704514055532597e-05,
"loss": 1.7715,
"step": 186
},
{
"epoch": 0.6217788861180382,
"grad_norm": 0.4164566695690155,
"learning_rate": 6.602430152312017e-05,
"loss": 1.4711,
"step": 187
},
{
"epoch": 0.6251039068994181,
"grad_norm": 0.4750818610191345,
"learning_rate": 6.500744968977116e-05,
"loss": 1.374,
"step": 188
},
{
"epoch": 0.628428927680798,
"grad_norm": 0.5478043556213379,
"learning_rate": 6.399470438735014e-05,
"loss": 1.7294,
"step": 189
},
{
"epoch": 0.6317539484621779,
"grad_norm": 0.4560893476009369,
"learning_rate": 6.298618446600856e-05,
"loss": 1.8216,
"step": 190
},
{
"epoch": 0.6350789692435578,
"grad_norm": 0.49942511320114136,
"learning_rate": 6.19820082800308e-05,
"loss": 1.6108,
"step": 191
},
{
"epoch": 0.6384039900249376,
"grad_norm": 0.3901759088039398,
"learning_rate": 6.0982293673944544e-05,
"loss": 1.4635,
"step": 192
},
{
"epoch": 0.6417290108063175,
"grad_norm": 0.45033466815948486,
"learning_rate": 5.9987157968691344e-05,
"loss": 1.5153,
"step": 193
},
{
"epoch": 0.6450540315876975,
"grad_norm": 0.44514134526252747,
"learning_rate": 5.899671794785839e-05,
"loss": 1.6015,
"step": 194
},
{
"epoch": 0.6483790523690773,
"grad_norm": 0.42773956060409546,
"learning_rate": 5.801108984397354e-05,
"loss": 1.6624,
"step": 195
},
{
"epoch": 0.6517040731504572,
"grad_norm": 0.42323529720306396,
"learning_rate": 5.703038932486484e-05,
"loss": 1.642,
"step": 196
},
{
"epoch": 0.6550290939318371,
"grad_norm": 0.4852340519428253,
"learning_rate": 5.605473148008638e-05,
"loss": 1.5533,
"step": 197
},
{
"epoch": 0.6583541147132169,
"grad_norm": 0.46353092789649963,
"learning_rate": 5.5084230807412126e-05,
"loss": 1.5137,
"step": 198
},
{
"epoch": 0.6616791354945969,
"grad_norm": 0.5486162304878235,
"learning_rate": 5.411900119939895e-05,
"loss": 1.5682,
"step": 199
},
{
"epoch": 0.6650041562759768,
"grad_norm": 0.4136289656162262,
"learning_rate": 5.3159155930021e-05,
"loss": 1.5902,
"step": 200
},
{
"epoch": 0.6683291770573566,
"grad_norm": 0.457292765378952,
"learning_rate": 5.2204807641376354e-05,
"loss": 1.6669,
"step": 201
},
{
"epoch": 0.6716541978387365,
"grad_norm": 0.4368407726287842,
"learning_rate": 5.12560683304681e-05,
"loss": 1.7747,
"step": 202
},
{
"epoch": 0.6749792186201163,
"grad_norm": 0.4596605598926544,
"learning_rate": 5.03130493360608e-05,
"loss": 1.5868,
"step": 203
},
{
"epoch": 0.6783042394014963,
"grad_norm": 0.437491238117218,
"learning_rate": 4.9375861325614606e-05,
"loss": 1.7614,
"step": 204
},
{
"epoch": 0.6816292601828762,
"grad_norm": 0.47249388694763184,
"learning_rate": 4.844461428229782e-05,
"loss": 1.582,
"step": 205
},
{
"epoch": 0.684954280964256,
"grad_norm": 0.44100067019462585,
"learning_rate": 4.751941749207995e-05,
"loss": 1.6814,
"step": 206
},
{
"epoch": 0.6882793017456359,
"grad_norm": 0.5000886917114258,
"learning_rate": 4.660037953090639e-05,
"loss": 1.6634,
"step": 207
},
{
"epoch": 0.6916043225270158,
"grad_norm": 0.4667086899280548,
"learning_rate": 4.5687608251956714e-05,
"loss": 1.7767,
"step": 208
},
{
"epoch": 0.6949293433083957,
"grad_norm": 0.4677750766277313,
"learning_rate": 4.4781210772987514e-05,
"loss": 1.785,
"step": 209
},
{
"epoch": 0.6982543640897756,
"grad_norm": 0.40729814767837524,
"learning_rate": 4.388129346376178e-05,
"loss": 1.5742,
"step": 210
},
{
"epoch": 0.7015793848711555,
"grad_norm": 0.4622965157032013,
"learning_rate": 4.298796193356566e-05,
"loss": 1.755,
"step": 211
},
{
"epoch": 0.7049044056525353,
"grad_norm": 0.42128920555114746,
"learning_rate": 4.210132101881516e-05,
"loss": 1.359,
"step": 212
},
{
"epoch": 0.7082294264339152,
"grad_norm": 0.4670293927192688,
"learning_rate": 4.12214747707527e-05,
"loss": 1.8743,
"step": 213
},
{
"epoch": 0.7115544472152951,
"grad_norm": 0.474398136138916,
"learning_rate": 4.034852644323661e-05,
"loss": 1.6977,
"step": 214
},
{
"epoch": 0.714879467996675,
"grad_norm": 0.5026089549064636,
"learning_rate": 3.948257848062351e-05,
"loss": 1.566,
"step": 215
},
{
"epoch": 0.7182044887780549,
"grad_norm": 0.40603697299957275,
"learning_rate": 3.862373250574626e-05,
"loss": 1.3894,
"step": 216
},
{
"epoch": 0.7215295095594347,
"grad_norm": 0.4771779179573059,
"learning_rate": 3.7772089307987936e-05,
"loss": 1.6296,
"step": 217
},
{
"epoch": 0.7248545303408146,
"grad_norm": 0.44347891211509705,
"learning_rate": 3.6927748831453836e-05,
"loss": 1.6663,
"step": 218
},
{
"epoch": 0.7281795511221946,
"grad_norm": 0.435149610042572,
"learning_rate": 3.609081016324243e-05,
"loss": 1.6662,
"step": 219
},
{
"epoch": 0.7315045719035744,
"grad_norm": 0.453782856464386,
"learning_rate": 3.5261371521817244e-05,
"loss": 1.7286,
"step": 220
},
{
"epoch": 0.7348295926849543,
"grad_norm": 0.42496827244758606,
"learning_rate": 3.44395302454804e-05,
"loss": 1.7376,
"step": 221
},
{
"epoch": 0.7381546134663342,
"grad_norm": 0.45447835326194763,
"learning_rate": 3.3625382780949574e-05,
"loss": 1.5055,
"step": 222
},
{
"epoch": 0.741479634247714,
"grad_norm": 0.5035948157310486,
"learning_rate": 3.28190246720394e-05,
"loss": 1.866,
"step": 223
},
{
"epoch": 0.744804655029094,
"grad_norm": 0.47680604457855225,
"learning_rate": 3.202055054844921e-05,
"loss": 1.9692,
"step": 224
},
{
"epoch": 0.7481296758104738,
"grad_norm": 0.45373663306236267,
"learning_rate": 3.123005411465766e-05,
"loss": 1.6879,
"step": 225
},
{
"epoch": 0.7514546965918537,
"grad_norm": 0.49925628304481506,
"learning_rate": 3.0447628138926156e-05,
"loss": 1.5313,
"step": 226
},
{
"epoch": 0.7547797173732336,
"grad_norm": 0.4820810556411743,
"learning_rate": 2.9673364442412e-05,
"loss": 1.6259,
"step": 227
},
{
"epoch": 0.7581047381546134,
"grad_norm": 0.5111257433891296,
"learning_rate": 2.890735388839295e-05,
"loss": 1.6068,
"step": 228
},
{
"epoch": 0.7614297589359933,
"grad_norm": 0.3893967568874359,
"learning_rate": 2.8149686371603767e-05,
"loss": 1.5461,
"step": 229
},
{
"epoch": 0.7647547797173733,
"grad_norm": 0.42585450410842896,
"learning_rate": 2.7400450807686938e-05,
"loss": 1.4092,
"step": 230
},
{
"epoch": 0.7680798004987531,
"grad_norm": 0.5068459510803223,
"learning_rate": 2.665973512275778e-05,
"loss": 1.8426,
"step": 231
},
{
"epoch": 0.771404821280133,
"grad_norm": 0.44372087717056274,
"learning_rate": 2.59276262430861e-05,
"loss": 1.5669,
"step": 232
},
{
"epoch": 0.7747298420615129,
"grad_norm": 0.4483433663845062,
"learning_rate": 2.520421008489494e-05,
"loss": 1.508,
"step": 233
},
{
"epoch": 0.7780548628428927,
"grad_norm": 0.4225240647792816,
"learning_rate": 2.4489571544277945e-05,
"loss": 1.4963,
"step": 234
},
{
"epoch": 0.7813798836242727,
"grad_norm": 0.4540765583515167,
"learning_rate": 2.3783794487236365e-05,
"loss": 1.7699,
"step": 235
},
{
"epoch": 0.7847049044056525,
"grad_norm": 0.5303469896316528,
"learning_rate": 2.308696173983711e-05,
"loss": 1.7887,
"step": 236
},
{
"epoch": 0.7880299251870324,
"grad_norm": 0.4368319809436798,
"learning_rate": 2.2399155078492694e-05,
"loss": 1.6762,
"step": 237
},
{
"epoch": 0.7913549459684123,
"grad_norm": 0.41934987902641296,
"learning_rate": 2.1720455220364444e-05,
"loss": 1.6372,
"step": 238
},
{
"epoch": 0.7946799667497921,
"grad_norm": 0.4291558861732483,
"learning_rate": 2.1050941813889836e-05,
"loss": 1.6668,
"step": 239
},
{
"epoch": 0.7980049875311721,
"grad_norm": 0.414044052362442,
"learning_rate": 2.0390693429435627e-05,
"loss": 1.6885,
"step": 240
},
{
"epoch": 0.801330008312552,
"grad_norm": 0.4342755377292633,
"learning_rate": 1.9739787550077116e-05,
"loss": 1.5082,
"step": 241
},
{
"epoch": 0.8046550290939318,
"grad_norm": 0.45343807339668274,
"learning_rate": 1.9098300562505266e-05,
"loss": 1.5635,
"step": 242
},
{
"epoch": 0.8079800498753117,
"grad_norm": 0.4498422145843506,
"learning_rate": 1.8466307748062205e-05,
"loss": 1.6047,
"step": 243
},
{
"epoch": 0.8113050706566916,
"grad_norm": 0.4087926149368286,
"learning_rate": 1.784388327390687e-05,
"loss": 1.3402,
"step": 244
},
{
"epoch": 0.8146300914380715,
"grad_norm": 0.42908143997192383,
"learning_rate": 1.7231100184310956e-05,
"loss": 1.5664,
"step": 245
},
{
"epoch": 0.8179551122194514,
"grad_norm": 0.4820065200328827,
"learning_rate": 1.6628030392087e-05,
"loss": 1.7218,
"step": 246
},
{
"epoch": 0.8212801330008312,
"grad_norm": 0.4803646206855774,
"learning_rate": 1.6034744670148972e-05,
"loss": 1.837,
"step": 247
},
{
"epoch": 0.8246051537822111,
"grad_norm": 0.4350035786628723,
"learning_rate": 1.5451312643206827e-05,
"loss": 1.5924,
"step": 248
},
{
"epoch": 0.827930174563591,
"grad_norm": 0.49933725595474243,
"learning_rate": 1.4877802779595762e-05,
"loss": 1.6023,
"step": 249
},
{
"epoch": 0.8312551953449709,
"grad_norm": 0.44506213068962097,
"learning_rate": 1.4314282383241096e-05,
"loss": 1.4533,
"step": 250
},
{
"epoch": 0.8345802161263508,
"grad_norm": 0.46771377325057983,
"learning_rate": 1.376081758575981e-05,
"loss": 1.7391,
"step": 251
},
{
"epoch": 0.8379052369077307,
"grad_norm": 0.44328737258911133,
"learning_rate": 1.3217473338699859e-05,
"loss": 1.6868,
"step": 252
},
{
"epoch": 0.8412302576891105,
"grad_norm": 0.4481683373451233,
"learning_rate": 1.2684313405917703e-05,
"loss": 1.4394,
"step": 253
},
{
"epoch": 0.8445552784704904,
"grad_norm": 0.452848881483078,
"learning_rate": 1.2161400356095375e-05,
"loss": 1.6657,
"step": 254
},
{
"epoch": 0.8478802992518704,
"grad_norm": 0.42388778924942017,
"learning_rate": 1.1648795555397719e-05,
"loss": 1.459,
"step": 255
},
{
"epoch": 0.8512053200332502,
"grad_norm": 0.43063634634017944,
"learning_rate": 1.1146559160270875e-05,
"loss": 1.6652,
"step": 256
},
{
"epoch": 0.8545303408146301,
"grad_norm": 0.40587228536605835,
"learning_rate": 1.0654750110382628e-05,
"loss": 1.5131,
"step": 257
},
{
"epoch": 0.85785536159601,
"grad_norm": 0.4573078751564026,
"learning_rate": 1.0173426121705576e-05,
"loss": 1.6047,
"step": 258
},
{
"epoch": 0.8611803823773898,
"grad_norm": 0.4255686104297638,
"learning_rate": 9.702643679743817e-06,
"loss": 1.493,
"step": 259
},
{
"epoch": 0.8645054031587698,
"grad_norm": 0.48064589500427246,
"learning_rate": 9.242458032904311e-06,
"loss": 1.6691,
"step": 260
},
{
"epoch": 0.8678304239401496,
"grad_norm": 0.4468303620815277,
"learning_rate": 8.792923186013024e-06,
"loss": 1.5707,
"step": 261
},
{
"epoch": 0.8711554447215295,
"grad_norm": 0.4417254328727722,
"learning_rate": 8.354091893977401e-06,
"loss": 1.5591,
"step": 262
},
{
"epoch": 0.8744804655029094,
"grad_norm": 0.42065221071243286,
"learning_rate": 7.926015655595254e-06,
"loss": 1.5657,
"step": 263
},
{
"epoch": 0.8778054862842892,
"grad_norm": 0.3902848958969116,
"learning_rate": 7.508744707511117e-06,
"loss": 1.5445,
"step": 264
},
{
"epoch": 0.8811305070656692,
"grad_norm": 0.41993579268455505,
"learning_rate": 7.102328018320858e-06,
"loss": 1.4065,
"step": 265
},
{
"epoch": 0.8844555278470491,
"grad_norm": 0.4170606732368469,
"learning_rate": 6.70681328282492e-06,
"loss": 1.5117,
"step": 266
},
{
"epoch": 0.8877805486284289,
"grad_norm": 0.513680636882782,
"learning_rate": 6.322246916431107e-06,
"loss": 1.9662,
"step": 267
},
{
"epoch": 0.8911055694098088,
"grad_norm": 0.43288302421569824,
"learning_rate": 5.948674049707603e-06,
"loss": 1.6208,
"step": 268
},
{
"epoch": 0.8944305901911886,
"grad_norm": 0.38253968954086304,
"learning_rate": 5.58613852308667e-06,
"loss": 1.5302,
"step": 269
},
{
"epoch": 0.8977556109725686,
"grad_norm": 0.4266990125179291,
"learning_rate": 5.2346828817197655e-06,
"loss": 1.6815,
"step": 270
},
{
"epoch": 0.9010806317539485,
"grad_norm": 0.561107873916626,
"learning_rate": 4.8943483704846475e-06,
"loss": 1.6608,
"step": 271
},
{
"epoch": 0.9044056525353283,
"grad_norm": 0.41741943359375,
"learning_rate": 4.565174929145188e-06,
"loss": 1.2898,
"step": 272
},
{
"epoch": 0.9077306733167082,
"grad_norm": 0.39722350239753723,
"learning_rate": 4.247201187664218e-06,
"loss": 1.585,
"step": 273
},
{
"epoch": 0.9110556940980881,
"grad_norm": 0.41877254843711853,
"learning_rate": 3.940464461670135e-06,
"loss": 1.605,
"step": 274
},
{
"epoch": 0.914380714879468,
"grad_norm": 0.5125119090080261,
"learning_rate": 3.6450007480777093e-06,
"loss": 1.5922,
"step": 275
},
{
"epoch": 0.9177057356608479,
"grad_norm": 0.43189626932144165,
"learning_rate": 3.360844720863765e-06,
"loss": 1.559,
"step": 276
},
{
"epoch": 0.9210307564422278,
"grad_norm": 0.44040048122406006,
"learning_rate": 3.0880297269979653e-06,
"loss": 1.67,
"step": 277
},
{
"epoch": 0.9243557772236076,
"grad_norm": 0.5034830570220947,
"learning_rate": 2.826587782529444e-06,
"loss": 1.9225,
"step": 278
},
{
"epoch": 0.9276807980049875,
"grad_norm": 0.4373987019062042,
"learning_rate": 2.576549568829578e-06,
"loss": 1.7428,
"step": 279
},
{
"epoch": 0.9310058187863675,
"grad_norm": 0.45258763432502747,
"learning_rate": 2.3379444289913342e-06,
"loss": 1.5951,
"step": 280
},
{
"epoch": 0.9343308395677473,
"grad_norm": 0.4411347210407257,
"learning_rate": 2.110800364385812e-06,
"loss": 1.5906,
"step": 281
},
{
"epoch": 0.9376558603491272,
"grad_norm": 0.4530499577522278,
"learning_rate": 1.8951440313760837e-06,
"loss": 1.4591,
"step": 282
},
{
"epoch": 0.940980881130507,
"grad_norm": 0.484295129776001,
"learning_rate": 1.6910007381890081e-06,
"loss": 1.6808,
"step": 283
},
{
"epoch": 0.9443059019118869,
"grad_norm": 0.43871116638183594,
"learning_rate": 1.4983944419451613e-06,
"loss": 1.5378,
"step": 284
},
{
"epoch": 0.9476309226932669,
"grad_norm": 0.4450673460960388,
"learning_rate": 1.317347745847386e-06,
"loss": 1.6353,
"step": 285
},
{
"epoch": 0.9509559434746467,
"grad_norm": 0.4276074171066284,
"learning_rate": 1.1478818965281911e-06,
"loss": 1.5403,
"step": 286
},
{
"epoch": 0.9542809642560266,
"grad_norm": 0.46902570128440857,
"learning_rate": 9.900167815563465e-07,
"loss": 1.5077,
"step": 287
},
{
"epoch": 0.9576059850374065,
"grad_norm": 0.42395344376564026,
"learning_rate": 8.437709271030603e-07,
"loss": 1.4276,
"step": 288
},
{
"epoch": 0.9609310058187863,
"grad_norm": 0.45644354820251465,
"learning_rate": 7.091614957677517e-07,
"loss": 1.6846,
"step": 289
},
{
"epoch": 0.9642560266001663,
"grad_norm": 0.5007442831993103,
"learning_rate": 5.862042845640403e-07,
"loss": 1.8228,
"step": 290
},
{
"epoch": 0.9675810473815462,
"grad_norm": 0.4374752640724182,
"learning_rate": 4.7491372306580627e-07,
"loss": 1.6439,
"step": 291
},
{
"epoch": 0.970906068162926,
"grad_norm": 0.42748919129371643,
"learning_rate": 3.7530287171387843e-07,
"loss": 1.5968,
"step": 292
},
{
"epoch": 0.9742310889443059,
"grad_norm": 0.3897336721420288,
"learning_rate": 2.873834202833159e-07,
"loss": 1.5657,
"step": 293
},
{
"epoch": 0.9775561097256857,
"grad_norm": 0.5000090599060059,
"learning_rate": 2.1116568651156076e-07,
"loss": 1.6331,
"step": 294
},
{
"epoch": 0.9808811305070657,
"grad_norm": 0.4116688668727875,
"learning_rate": 1.4665861488761813e-07,
"loss": 1.349,
"step": 295
},
{
"epoch": 0.9842061512884456,
"grad_norm": 0.45309168100357056,
"learning_rate": 9.386977560232879e-08,
"loss": 1.5287,
"step": 296
},
{
"epoch": 0.9875311720698254,
"grad_norm": 0.4364264905452728,
"learning_rate": 5.2805363660046734e-08,
"loss": 1.5738,
"step": 297
},
{
"epoch": 0.9908561928512053,
"grad_norm": 0.4273874759674072,
"learning_rate": 2.347019815158724e-08,
"loss": 1.585,
"step": 298
},
{
"epoch": 0.9941812136325852,
"grad_norm": 0.42349839210510254,
"learning_rate": 5.867721688690431e-09,
"loss": 1.6562,
"step": 299
},
{
"epoch": 0.9975062344139651,
"grad_norm": 0.46075183153152466,
"learning_rate": 0.0,
"loss": 1.592,
"step": 300
},
{
"epoch": 0.9975062344139651,
"eval_loss": 1.6082537174224854,
"eval_runtime": 15.2561,
"eval_samples_per_second": 33.233,
"eval_steps_per_second": 4.195,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 300,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1990397395992576.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}