Training in progress, step 150, checkpoint
cd4d8b2 verified
{
"best_metric": 0.004802440293133259,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 0.7330482590103848,
"eval_steps": 25,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004886988393402566,
"grad_norm": 3.496391773223877,
"learning_rate": 1.6666666666666667e-05,
"loss": 4.5908,
"step": 1
},
{
"epoch": 0.004886988393402566,
"eval_loss": 4.457808017730713,
"eval_runtime": 3.1608,
"eval_samples_per_second": 15.819,
"eval_steps_per_second": 4.113,
"step": 1
},
{
"epoch": 0.009773976786805132,
"grad_norm": 3.437105417251587,
"learning_rate": 3.3333333333333335e-05,
"loss": 4.5269,
"step": 2
},
{
"epoch": 0.014660965180207697,
"grad_norm": 3.467268705368042,
"learning_rate": 5e-05,
"loss": 4.551,
"step": 3
},
{
"epoch": 0.019547953573610263,
"grad_norm": 3.4430549144744873,
"learning_rate": 6.666666666666667e-05,
"loss": 4.4104,
"step": 4
},
{
"epoch": 0.02443494196701283,
"grad_norm": 3.4603631496429443,
"learning_rate": 8.333333333333334e-05,
"loss": 4.0045,
"step": 5
},
{
"epoch": 0.029321930360415395,
"grad_norm": 3.4329891204833984,
"learning_rate": 0.0001,
"loss": 3.3489,
"step": 6
},
{
"epoch": 0.03420891875381796,
"grad_norm": 3.1279687881469727,
"learning_rate": 9.998929121859592e-05,
"loss": 2.5521,
"step": 7
},
{
"epoch": 0.03909590714722053,
"grad_norm": 2.870373249053955,
"learning_rate": 9.99571699711836e-05,
"loss": 1.8669,
"step": 8
},
{
"epoch": 0.04398289554062309,
"grad_norm": 2.74849271774292,
"learning_rate": 9.990365154573717e-05,
"loss": 1.3343,
"step": 9
},
{
"epoch": 0.04886988393402566,
"grad_norm": 2.134420156478882,
"learning_rate": 9.982876141412856e-05,
"loss": 0.9228,
"step": 10
},
{
"epoch": 0.05375687232742822,
"grad_norm": 1.9005703926086426,
"learning_rate": 9.973253522000438e-05,
"loss": 0.6209,
"step": 11
},
{
"epoch": 0.05864386072083079,
"grad_norm": 1.5681957006454468,
"learning_rate": 9.961501876182148e-05,
"loss": 0.4086,
"step": 12
},
{
"epoch": 0.06353084911423336,
"grad_norm": 1.2829203605651855,
"learning_rate": 9.947626797104925e-05,
"loss": 0.3602,
"step": 13
},
{
"epoch": 0.06841783750763591,
"grad_norm": 2.018519878387451,
"learning_rate": 9.931634888554937e-05,
"loss": 0.3689,
"step": 14
},
{
"epoch": 0.07330482590103848,
"grad_norm": 1.200795292854309,
"learning_rate": 9.913533761814537e-05,
"loss": 0.2407,
"step": 15
},
{
"epoch": 0.07819181429444105,
"grad_norm": 0.7333146333694458,
"learning_rate": 9.893332032039701e-05,
"loss": 0.1828,
"step": 16
},
{
"epoch": 0.08307880268784362,
"grad_norm": 0.8203049302101135,
"learning_rate": 9.871039314159677e-05,
"loss": 0.1775,
"step": 17
},
{
"epoch": 0.08796579108124618,
"grad_norm": 0.40979278087615967,
"learning_rate": 9.846666218300807e-05,
"loss": 0.1498,
"step": 18
},
{
"epoch": 0.09285277947464875,
"grad_norm": 0.6932844519615173,
"learning_rate": 9.82022434473668e-05,
"loss": 0.1343,
"step": 19
},
{
"epoch": 0.09773976786805132,
"grad_norm": 0.3889353275299072,
"learning_rate": 9.791726278367022e-05,
"loss": 0.1287,
"step": 20
},
{
"epoch": 0.10262675626145389,
"grad_norm": 0.5370850563049316,
"learning_rate": 9.761185582727977e-05,
"loss": 0.121,
"step": 21
},
{
"epoch": 0.10751374465485644,
"grad_norm": 0.43492749333381653,
"learning_rate": 9.728616793536588e-05,
"loss": 0.1071,
"step": 22
},
{
"epoch": 0.11240073304825901,
"grad_norm": 0.2682703733444214,
"learning_rate": 9.694035411772594e-05,
"loss": 0.0857,
"step": 23
},
{
"epoch": 0.11728772144166158,
"grad_norm": 0.4315255582332611,
"learning_rate": 9.657457896300791e-05,
"loss": 0.0563,
"step": 24
},
{
"epoch": 0.12217470983506414,
"grad_norm": 0.5034452080726624,
"learning_rate": 9.618901656037514e-05,
"loss": 0.0362,
"step": 25
},
{
"epoch": 0.12217470983506414,
"eval_loss": 0.09082719683647156,
"eval_runtime": 3.232,
"eval_samples_per_second": 15.47,
"eval_steps_per_second": 4.022,
"step": 25
},
{
"epoch": 0.12706169822846672,
"grad_norm": 0.7888331413269043,
"learning_rate": 9.578385041664925e-05,
"loss": 0.1436,
"step": 26
},
{
"epoch": 0.13194868662186926,
"grad_norm": 0.6206626296043396,
"learning_rate": 9.535927336897098e-05,
"loss": 0.1505,
"step": 27
},
{
"epoch": 0.13683567501527183,
"grad_norm": 0.5963620543479919,
"learning_rate": 9.491548749301997e-05,
"loss": 0.1375,
"step": 28
},
{
"epoch": 0.1417226634086744,
"grad_norm": 0.642953634262085,
"learning_rate": 9.445270400683786e-05,
"loss": 0.1211,
"step": 29
},
{
"epoch": 0.14660965180207697,
"grad_norm": 0.3498934507369995,
"learning_rate": 9.397114317029975e-05,
"loss": 0.1147,
"step": 30
},
{
"epoch": 0.15149664019547954,
"grad_norm": 0.7011544704437256,
"learning_rate": 9.34710341802826e-05,
"loss": 0.0905,
"step": 31
},
{
"epoch": 0.1563836285888821,
"grad_norm": 0.299635112285614,
"learning_rate": 9.295261506157986e-05,
"loss": 0.0823,
"step": 32
},
{
"epoch": 0.16127061698228468,
"grad_norm": 0.36785322427749634,
"learning_rate": 9.241613255361455e-05,
"loss": 0.0777,
"step": 33
},
{
"epoch": 0.16615760537568725,
"grad_norm": 0.34238114953041077,
"learning_rate": 9.186184199300464e-05,
"loss": 0.0748,
"step": 34
},
{
"epoch": 0.1710445937690898,
"grad_norm": 0.1910071223974228,
"learning_rate": 9.129000719203672e-05,
"loss": 0.0674,
"step": 35
},
{
"epoch": 0.17593158216249236,
"grad_norm": 0.15098601579666138,
"learning_rate": 9.070090031310558e-05,
"loss": 0.0457,
"step": 36
},
{
"epoch": 0.18081857055589493,
"grad_norm": 0.30129632353782654,
"learning_rate": 9.009480173917968e-05,
"loss": 0.0323,
"step": 37
},
{
"epoch": 0.1857055589492975,
"grad_norm": 0.19990572333335876,
"learning_rate": 8.947199994035401e-05,
"loss": 0.0659,
"step": 38
},
{
"epoch": 0.19059254734270006,
"grad_norm": 0.31118127703666687,
"learning_rate": 8.883279133655399e-05,
"loss": 0.1034,
"step": 39
},
{
"epoch": 0.19547953573610263,
"grad_norm": 0.2536146640777588,
"learning_rate": 8.817748015645558e-05,
"loss": 0.0837,
"step": 40
},
{
"epoch": 0.2003665241295052,
"grad_norm": 0.2500099539756775,
"learning_rate": 8.7506378292689e-05,
"loss": 0.0794,
"step": 41
},
{
"epoch": 0.20525351252290777,
"grad_norm": 0.24145787954330444,
"learning_rate": 8.681980515339464e-05,
"loss": 0.0797,
"step": 42
},
{
"epoch": 0.2101405009163103,
"grad_norm": 0.2965352237224579,
"learning_rate": 8.611808751020213e-05,
"loss": 0.0714,
"step": 43
},
{
"epoch": 0.21502748930971288,
"grad_norm": 0.3214447498321533,
"learning_rate": 8.540155934270471e-05,
"loss": 0.065,
"step": 44
},
{
"epoch": 0.21991447770311545,
"grad_norm": 0.15810289978981018,
"learning_rate": 8.467056167950311e-05,
"loss": 0.056,
"step": 45
},
{
"epoch": 0.22480146609651802,
"grad_norm": 0.48764532804489136,
"learning_rate": 8.392544243589427e-05,
"loss": 0.0531,
"step": 46
},
{
"epoch": 0.2296884544899206,
"grad_norm": 0.27347925305366516,
"learning_rate": 8.316655624828267e-05,
"loss": 0.038,
"step": 47
},
{
"epoch": 0.23457544288332316,
"grad_norm": 0.11946756392717361,
"learning_rate": 8.239426430539243e-05,
"loss": 0.0319,
"step": 48
},
{
"epoch": 0.23946243127672573,
"grad_norm": 0.31073084473609924,
"learning_rate": 8.160893417636122e-05,
"loss": 0.0362,
"step": 49
},
{
"epoch": 0.24434941967012827,
"grad_norm": 0.2531058192253113,
"learning_rate": 8.081093963579707e-05,
"loss": 0.0174,
"step": 50
},
{
"epoch": 0.24434941967012827,
"eval_loss": 0.033168304711580276,
"eval_runtime": 3.2218,
"eval_samples_per_second": 15.519,
"eval_steps_per_second": 4.035,
"step": 50
},
{
"epoch": 0.24923640806353084,
"grad_norm": 0.4061261713504791,
"learning_rate": 8.000066048588211e-05,
"loss": 0.0797,
"step": 51
},
{
"epoch": 0.25412339645693344,
"grad_norm": 0.21066705882549286,
"learning_rate": 7.917848237560709e-05,
"loss": 0.0657,
"step": 52
},
{
"epoch": 0.259010384850336,
"grad_norm": 0.30424046516418457,
"learning_rate": 7.834479661722347e-05,
"loss": 0.0624,
"step": 53
},
{
"epoch": 0.2638973732437385,
"grad_norm": 0.32400649785995483,
"learning_rate": 7.75e-05,
"loss": 0.0554,
"step": 54
},
{
"epoch": 0.2687843616371411,
"grad_norm": 0.3764171004295349,
"learning_rate": 7.664449460137245e-05,
"loss": 0.0549,
"step": 55
},
{
"epoch": 0.27367135003054366,
"grad_norm": 0.2119298130273819,
"learning_rate": 7.577868759557654e-05,
"loss": 0.0487,
"step": 56
},
{
"epoch": 0.27855833842394623,
"grad_norm": 0.24470122158527374,
"learning_rate": 7.490299105985507e-05,
"loss": 0.0454,
"step": 57
},
{
"epoch": 0.2834453268173488,
"grad_norm": 0.2825168967247009,
"learning_rate": 7.401782177833148e-05,
"loss": 0.0414,
"step": 58
},
{
"epoch": 0.28833231521075137,
"grad_norm": 0.13654862344264984,
"learning_rate": 7.312360104364318e-05,
"loss": 0.0376,
"step": 59
},
{
"epoch": 0.29321930360415394,
"grad_norm": 0.18448443710803986,
"learning_rate": 7.222075445642904e-05,
"loss": 0.0339,
"step": 60
},
{
"epoch": 0.2981062919975565,
"grad_norm": 0.0950138047337532,
"learning_rate": 7.130971172276657e-05,
"loss": 0.02,
"step": 61
},
{
"epoch": 0.3029932803909591,
"grad_norm": 0.18261590600013733,
"learning_rate": 7.03909064496551e-05,
"loss": 0.0153,
"step": 62
},
{
"epoch": 0.30788026878436164,
"grad_norm": 0.1410977840423584,
"learning_rate": 6.946477593864228e-05,
"loss": 0.0378,
"step": 63
},
{
"epoch": 0.3127672571777642,
"grad_norm": 0.21909953653812408,
"learning_rate": 6.853176097769229e-05,
"loss": 0.0525,
"step": 64
},
{
"epoch": 0.3176542455711668,
"grad_norm": 0.2571473717689514,
"learning_rate": 6.759230563139466e-05,
"loss": 0.064,
"step": 65
},
{
"epoch": 0.32254123396456935,
"grad_norm": 0.17875425517559052,
"learning_rate": 6.664685702961344e-05,
"loss": 0.0489,
"step": 66
},
{
"epoch": 0.3274282223579719,
"grad_norm": 0.1534833163022995,
"learning_rate": 6.56958651546778e-05,
"loss": 0.0453,
"step": 67
},
{
"epoch": 0.3323152107513745,
"grad_norm": 0.1610105186700821,
"learning_rate": 6.473978262721463e-05,
"loss": 0.0397,
"step": 68
},
{
"epoch": 0.337202199144777,
"grad_norm": 0.16883207857608795,
"learning_rate": 6.377906449072578e-05,
"loss": 0.035,
"step": 69
},
{
"epoch": 0.3420891875381796,
"grad_norm": 0.16259914636611938,
"learning_rate": 6.281416799501188e-05,
"loss": 0.0289,
"step": 70
},
{
"epoch": 0.34697617593158214,
"grad_norm": 0.18017540872097015,
"learning_rate": 6.184555237854625e-05,
"loss": 0.0307,
"step": 71
},
{
"epoch": 0.3518631643249847,
"grad_norm": 0.15212522447109222,
"learning_rate": 6.087367864990233e-05,
"loss": 0.0278,
"step": 72
},
{
"epoch": 0.3567501527183873,
"grad_norm": 0.1106053963303566,
"learning_rate": 5.989900936833841e-05,
"loss": 0.019,
"step": 73
},
{
"epoch": 0.36163714111178985,
"grad_norm": 0.11007523536682129,
"learning_rate": 5.8922008423644624e-05,
"loss": 0.0181,
"step": 74
},
{
"epoch": 0.3665241295051924,
"grad_norm": 0.04825620353221893,
"learning_rate": 5.794314081535644e-05,
"loss": 0.0037,
"step": 75
},
{
"epoch": 0.3665241295051924,
"eval_loss": 0.017774144187569618,
"eval_runtime": 3.2279,
"eval_samples_per_second": 15.49,
"eval_steps_per_second": 4.027,
"step": 75
},
{
"epoch": 0.371411117898595,
"grad_norm": 0.3393837809562683,
"learning_rate": 5.696287243144013e-05,
"loss": 0.0564,
"step": 76
},
{
"epoch": 0.37629810629199756,
"grad_norm": 0.19871211051940918,
"learning_rate": 5.598166982655526e-05,
"loss": 0.0472,
"step": 77
},
{
"epoch": 0.3811850946854001,
"grad_norm": 0.20391109585762024,
"learning_rate": 5.500000000000001e-05,
"loss": 0.0435,
"step": 78
},
{
"epoch": 0.3860720830788027,
"grad_norm": 0.18788817524909973,
"learning_rate": 5.4018330173444754e-05,
"loss": 0.0379,
"step": 79
},
{
"epoch": 0.39095907147220527,
"grad_norm": 0.14316879212856293,
"learning_rate": 5.303712756855988e-05,
"loss": 0.0283,
"step": 80
},
{
"epoch": 0.39584605986560784,
"grad_norm": 0.17786382138729095,
"learning_rate": 5.205685918464356e-05,
"loss": 0.027,
"step": 81
},
{
"epoch": 0.4007330482590104,
"grad_norm": 0.21436955034732819,
"learning_rate": 5.107799157635538e-05,
"loss": 0.0309,
"step": 82
},
{
"epoch": 0.405620036652413,
"grad_norm": 0.16341635584831238,
"learning_rate": 5.0100990631661606e-05,
"loss": 0.0287,
"step": 83
},
{
"epoch": 0.41050702504581554,
"grad_norm": 0.19714505970478058,
"learning_rate": 4.912632135009769e-05,
"loss": 0.0267,
"step": 84
},
{
"epoch": 0.41539401343921806,
"grad_norm": 0.1616361290216446,
"learning_rate": 4.8154447621453744e-05,
"loss": 0.0217,
"step": 85
},
{
"epoch": 0.4202810018326206,
"grad_norm": 0.11600978672504425,
"learning_rate": 4.718583200498814e-05,
"loss": 0.0178,
"step": 86
},
{
"epoch": 0.4251679902260232,
"grad_norm": 0.10082818567752838,
"learning_rate": 4.6220935509274235e-05,
"loss": 0.0108,
"step": 87
},
{
"epoch": 0.43005497861942577,
"grad_norm": 0.21947574615478516,
"learning_rate": 4.526021737278538e-05,
"loss": 0.0339,
"step": 88
},
{
"epoch": 0.43494196701282833,
"grad_norm": 0.231426402926445,
"learning_rate": 4.430413484532222e-05,
"loss": 0.0479,
"step": 89
},
{
"epoch": 0.4398289554062309,
"grad_norm": 0.23115426301956177,
"learning_rate": 4.3353142970386564e-05,
"loss": 0.0427,
"step": 90
},
{
"epoch": 0.4447159437996335,
"grad_norm": 0.19273918867111206,
"learning_rate": 4.240769436860537e-05,
"loss": 0.0372,
"step": 91
},
{
"epoch": 0.44960293219303604,
"grad_norm": 0.17096419632434845,
"learning_rate": 4.146823902230772e-05,
"loss": 0.0293,
"step": 92
},
{
"epoch": 0.4544899205864386,
"grad_norm": 0.15599671006202698,
"learning_rate": 4.053522406135775e-05,
"loss": 0.0252,
"step": 93
},
{
"epoch": 0.4593769089798412,
"grad_norm": 0.14636379480361938,
"learning_rate": 3.960909355034491e-05,
"loss": 0.0289,
"step": 94
},
{
"epoch": 0.46426389737324375,
"grad_norm": 0.1349724531173706,
"learning_rate": 3.8690288277233435e-05,
"loss": 0.021,
"step": 95
},
{
"epoch": 0.4691508857666463,
"grad_norm": 0.18591056764125824,
"learning_rate": 3.777924554357096e-05,
"loss": 0.0206,
"step": 96
},
{
"epoch": 0.4740378741600489,
"grad_norm": 0.1168551817536354,
"learning_rate": 3.687639895635684e-05,
"loss": 0.017,
"step": 97
},
{
"epoch": 0.47892486255345146,
"grad_norm": 0.15066345036029816,
"learning_rate": 3.598217822166854e-05,
"loss": 0.0151,
"step": 98
},
{
"epoch": 0.483811850946854,
"grad_norm": 0.10822492092847824,
"learning_rate": 3.509700894014496e-05,
"loss": 0.0098,
"step": 99
},
{
"epoch": 0.48869883934025654,
"grad_norm": 0.09653550386428833,
"learning_rate": 3.422131240442349e-05,
"loss": 0.0064,
"step": 100
},
{
"epoch": 0.48869883934025654,
"eval_loss": 0.012017174623906612,
"eval_runtime": 3.2213,
"eval_samples_per_second": 15.522,
"eval_steps_per_second": 4.036,
"step": 100
},
{
"epoch": 0.4935858277336591,
"grad_norm": 0.24904842674732208,
"learning_rate": 3.3355505398627566e-05,
"loss": 0.0339,
"step": 101
},
{
"epoch": 0.4984728161270617,
"grad_norm": 0.2502872347831726,
"learning_rate": 3.250000000000001e-05,
"loss": 0.0348,
"step": 102
},
{
"epoch": 0.5033598045204642,
"grad_norm": 0.27058976888656616,
"learning_rate": 3.165520338277653e-05,
"loss": 0.0306,
"step": 103
},
{
"epoch": 0.5082467929138669,
"grad_norm": 0.16723230481147766,
"learning_rate": 3.082151762439293e-05,
"loss": 0.0215,
"step": 104
},
{
"epoch": 0.5131337813072694,
"grad_norm": 0.2476491630077362,
"learning_rate": 2.9999339514117912e-05,
"loss": 0.0243,
"step": 105
},
{
"epoch": 0.518020769700672,
"grad_norm": 0.16993577778339386,
"learning_rate": 2.9189060364202943e-05,
"loss": 0.022,
"step": 106
},
{
"epoch": 0.5229077580940745,
"grad_norm": 0.20630362629890442,
"learning_rate": 2.8391065823638806e-05,
"loss": 0.0214,
"step": 107
},
{
"epoch": 0.527794746487477,
"grad_norm": 0.18468116223812103,
"learning_rate": 2.760573569460757e-05,
"loss": 0.0224,
"step": 108
},
{
"epoch": 0.5326817348808797,
"grad_norm": 0.12725146114826202,
"learning_rate": 2.6833443751717347e-05,
"loss": 0.0101,
"step": 109
},
{
"epoch": 0.5375687232742822,
"grad_norm": 0.14481593668460846,
"learning_rate": 2.6074557564105727e-05,
"loss": 0.0165,
"step": 110
},
{
"epoch": 0.5424557116676848,
"grad_norm": 0.14558053016662598,
"learning_rate": 2.53294383204969e-05,
"loss": 0.0191,
"step": 111
},
{
"epoch": 0.5473427000610873,
"grad_norm": 0.09826002269983292,
"learning_rate": 2.459844065729529e-05,
"loss": 0.005,
"step": 112
},
{
"epoch": 0.5522296884544899,
"grad_norm": 0.2601858973503113,
"learning_rate": 2.3881912489797885e-05,
"loss": 0.0232,
"step": 113
},
{
"epoch": 0.5571166768478925,
"grad_norm": 0.2997516393661499,
"learning_rate": 2.3180194846605367e-05,
"loss": 0.0294,
"step": 114
},
{
"epoch": 0.5620036652412951,
"grad_norm": 0.2502405345439911,
"learning_rate": 2.2493621707311002e-05,
"loss": 0.0341,
"step": 115
},
{
"epoch": 0.5668906536346976,
"grad_norm": 0.21258173882961273,
"learning_rate": 2.1822519843544424e-05,
"loss": 0.0251,
"step": 116
},
{
"epoch": 0.5717776420281002,
"grad_norm": 0.2167203426361084,
"learning_rate": 2.1167208663446025e-05,
"loss": 0.0233,
"step": 117
},
{
"epoch": 0.5766646304215027,
"grad_norm": 0.20736785233020782,
"learning_rate": 2.0528000059645997e-05,
"loss": 0.0238,
"step": 118
},
{
"epoch": 0.5815516188149054,
"grad_norm": 0.18189823627471924,
"learning_rate": 1.9905198260820328e-05,
"loss": 0.0239,
"step": 119
},
{
"epoch": 0.5864386072083079,
"grad_norm": 0.17831675708293915,
"learning_rate": 1.9299099686894423e-05,
"loss": 0.0139,
"step": 120
},
{
"epoch": 0.5913255956017105,
"grad_norm": 0.17311929166316986,
"learning_rate": 1.8709992807963285e-05,
"loss": 0.0155,
"step": 121
},
{
"epoch": 0.596212583995113,
"grad_norm": 0.21193860471248627,
"learning_rate": 1.8138158006995364e-05,
"loss": 0.0138,
"step": 122
},
{
"epoch": 0.6010995723885155,
"grad_norm": 0.12439697980880737,
"learning_rate": 1.758386744638546e-05,
"loss": 0.0081,
"step": 123
},
{
"epoch": 0.6059865607819181,
"grad_norm": 0.1485549807548523,
"learning_rate": 1.7047384938420154e-05,
"loss": 0.0061,
"step": 124
},
{
"epoch": 0.6108735491753207,
"grad_norm": 0.0644276961684227,
"learning_rate": 1.6528965819717413e-05,
"loss": 0.002,
"step": 125
},
{
"epoch": 0.6108735491753207,
"eval_loss": 0.006207775324583054,
"eval_runtime": 3.2392,
"eval_samples_per_second": 15.436,
"eval_steps_per_second": 4.013,
"step": 125
},
{
"epoch": 0.6157605375687233,
"grad_norm": 0.2093576341867447,
"learning_rate": 1.602885682970026e-05,
"loss": 0.0323,
"step": 126
},
{
"epoch": 0.6206475259621258,
"grad_norm": 0.24124690890312195,
"learning_rate": 1.5547295993162156e-05,
"loss": 0.0263,
"step": 127
},
{
"epoch": 0.6255345143555284,
"grad_norm": 0.22997763752937317,
"learning_rate": 1.5084512506980026e-05,
"loss": 0.0246,
"step": 128
},
{
"epoch": 0.6304215027489309,
"grad_norm": 0.17232383787631989,
"learning_rate": 1.464072663102903e-05,
"loss": 0.024,
"step": 129
},
{
"epoch": 0.6353084911423336,
"grad_norm": 0.20387302339076996,
"learning_rate": 1.4216149583350754e-05,
"loss": 0.021,
"step": 130
},
{
"epoch": 0.6401954795357361,
"grad_norm": 0.19521893560886383,
"learning_rate": 1.3810983439624881e-05,
"loss": 0.0224,
"step": 131
},
{
"epoch": 0.6450824679291387,
"grad_norm": 0.16281966865062714,
"learning_rate": 1.3425421036992098e-05,
"loss": 0.0145,
"step": 132
},
{
"epoch": 0.6499694563225412,
"grad_norm": 0.13637928664684296,
"learning_rate": 1.305964588227407e-05,
"loss": 0.0133,
"step": 133
},
{
"epoch": 0.6548564447159438,
"grad_norm": 0.194955512881279,
"learning_rate": 1.2713832064634126e-05,
"loss": 0.0181,
"step": 134
},
{
"epoch": 0.6597434331093464,
"grad_norm": 0.12221532315015793,
"learning_rate": 1.2388144172720251e-05,
"loss": 0.009,
"step": 135
},
{
"epoch": 0.664630421502749,
"grad_norm": 0.14309658110141754,
"learning_rate": 1.2082737216329794e-05,
"loss": 0.01,
"step": 136
},
{
"epoch": 0.6695174098961515,
"grad_norm": 0.1093713566660881,
"learning_rate": 1.1797756552633215e-05,
"loss": 0.0051,
"step": 137
},
{
"epoch": 0.674404398289554,
"grad_norm": 0.2528100609779358,
"learning_rate": 1.1533337816991932e-05,
"loss": 0.0218,
"step": 138
},
{
"epoch": 0.6792913866829566,
"grad_norm": 0.27401408553123474,
"learning_rate": 1.1289606858403237e-05,
"loss": 0.0312,
"step": 139
},
{
"epoch": 0.6841783750763591,
"grad_norm": 0.20631706714630127,
"learning_rate": 1.1066679679603e-05,
"loss": 0.0255,
"step": 140
},
{
"epoch": 0.6890653634697618,
"grad_norm": 0.2128758430480957,
"learning_rate": 1.0864662381854632e-05,
"loss": 0.0194,
"step": 141
},
{
"epoch": 0.6939523518631643,
"grad_norm": 0.24871855974197388,
"learning_rate": 1.0683651114450641e-05,
"loss": 0.0204,
"step": 142
},
{
"epoch": 0.6988393402565669,
"grad_norm": 0.2161489576101303,
"learning_rate": 1.0523732028950771e-05,
"loss": 0.0172,
"step": 143
},
{
"epoch": 0.7037263286499694,
"grad_norm": 0.1810789704322815,
"learning_rate": 1.0384981238178534e-05,
"loss": 0.0137,
"step": 144
},
{
"epoch": 0.708613317043372,
"grad_norm": 0.1576724499464035,
"learning_rate": 1.0267464779995617e-05,
"loss": 0.014,
"step": 145
},
{
"epoch": 0.7135003054367746,
"grad_norm": 0.17585541307926178,
"learning_rate": 1.017123858587145e-05,
"loss": 0.0117,
"step": 146
},
{
"epoch": 0.7183872938301772,
"grad_norm": 0.13925491273403168,
"learning_rate": 1.0096348454262845e-05,
"loss": 0.0111,
"step": 147
},
{
"epoch": 0.7232742822235797,
"grad_norm": 0.11349561810493469,
"learning_rate": 1.00428300288164e-05,
"loss": 0.0082,
"step": 148
},
{
"epoch": 0.7281612706169823,
"grad_norm": 0.11688259989023209,
"learning_rate": 1.001070878140409e-05,
"loss": 0.0054,
"step": 149
},
{
"epoch": 0.7330482590103848,
"grad_norm": 0.05643425136804581,
"learning_rate": 1e-05,
"loss": 0.0018,
"step": 150
},
{
"epoch": 0.7330482590103848,
"eval_loss": 0.004802440293133259,
"eval_runtime": 3.2337,
"eval_samples_per_second": 15.462,
"eval_steps_per_second": 4.02,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 150,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6872173431947264e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}