{
"best_metric": 0.696293830871582,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.03715745471435207,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00018578727357176033,
"grad_norm": 4.046935558319092,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3446,
"step": 1
},
{
"epoch": 0.00018578727357176033,
"eval_loss": 1.5395567417144775,
"eval_runtime": 210.244,
"eval_samples_per_second": 43.121,
"eval_steps_per_second": 10.783,
"step": 1
},
{
"epoch": 0.00037157454714352065,
"grad_norm": 5.148815631866455,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.5422,
"step": 2
},
{
"epoch": 0.000557361820715281,
"grad_norm": 8.490533828735352,
"learning_rate": 2.4e-05,
"loss": 1.7292,
"step": 3
},
{
"epoch": 0.0007431490942870413,
"grad_norm": 29.692520141601562,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.6123,
"step": 4
},
{
"epoch": 0.0009289363678588017,
"grad_norm": 23.357440948486328,
"learning_rate": 4e-05,
"loss": 1.7596,
"step": 5
},
{
"epoch": 0.001114723641430562,
"grad_norm": 20.767066955566406,
"learning_rate": 4.8e-05,
"loss": 1.3862,
"step": 6
},
{
"epoch": 0.0013005109150023223,
"grad_norm": 2.4082024097442627,
"learning_rate": 5.6e-05,
"loss": 1.1814,
"step": 7
},
{
"epoch": 0.0014862981885740826,
"grad_norm": 2.5414106845855713,
"learning_rate": 6.400000000000001e-05,
"loss": 1.1491,
"step": 8
},
{
"epoch": 0.001672085462145843,
"grad_norm": 1.9966752529144287,
"learning_rate": 7.2e-05,
"loss": 1.0002,
"step": 9
},
{
"epoch": 0.0018578727357176034,
"grad_norm": 2.459282398223877,
"learning_rate": 8e-05,
"loss": 1.0005,
"step": 10
},
{
"epoch": 0.0020436600092893636,
"grad_norm": 1.952752709388733,
"learning_rate": 7.999453219969877e-05,
"loss": 0.8618,
"step": 11
},
{
"epoch": 0.002229447282861124,
"grad_norm": 1.8736774921417236,
"learning_rate": 7.997813029363704e-05,
"loss": 0.8635,
"step": 12
},
{
"epoch": 0.002415234556432884,
"grad_norm": 2.54675555229187,
"learning_rate": 7.99507987659322e-05,
"loss": 0.8823,
"step": 13
},
{
"epoch": 0.0026010218300046447,
"grad_norm": 2.0304737091064453,
"learning_rate": 7.991254508875098e-05,
"loss": 0.9139,
"step": 14
},
{
"epoch": 0.002786809103576405,
"grad_norm": 1.8423620462417603,
"learning_rate": 7.98633797202668e-05,
"loss": 0.8128,
"step": 15
},
{
"epoch": 0.0029725963771481652,
"grad_norm": 4.305455684661865,
"learning_rate": 7.980331610180046e-05,
"loss": 0.9064,
"step": 16
},
{
"epoch": 0.0031583836507199257,
"grad_norm": 3.134892225265503,
"learning_rate": 7.973237065414553e-05,
"loss": 0.901,
"step": 17
},
{
"epoch": 0.003344170924291686,
"grad_norm": 1.7530468702316284,
"learning_rate": 7.965056277307902e-05,
"loss": 0.8444,
"step": 18
},
{
"epoch": 0.0035299581978634463,
"grad_norm": 1.8036566972732544,
"learning_rate": 7.955791482405875e-05,
"loss": 0.9215,
"step": 19
},
{
"epoch": 0.0037157454714352067,
"grad_norm": 1.2477421760559082,
"learning_rate": 7.94544521361089e-05,
"loss": 0.7512,
"step": 20
},
{
"epoch": 0.0039015327450069672,
"grad_norm": 1.4336917400360107,
"learning_rate": 7.93402029948953e-05,
"loss": 0.7466,
"step": 21
},
{
"epoch": 0.004087320018578727,
"grad_norm": 1.7280688285827637,
"learning_rate": 7.921519863499239e-05,
"loss": 0.7877,
"step": 22
},
{
"epoch": 0.004273107292150488,
"grad_norm": 1.2563620805740356,
"learning_rate": 7.907947323134398e-05,
"loss": 0.6979,
"step": 23
},
{
"epoch": 0.004458894565722248,
"grad_norm": 1.380014181137085,
"learning_rate": 7.893306388992023e-05,
"loss": 0.7668,
"step": 24
},
{
"epoch": 0.004644681839294009,
"grad_norm": 1.2533624172210693,
"learning_rate": 7.877601063757323e-05,
"loss": 0.7324,
"step": 25
},
{
"epoch": 0.004830469112865768,
"grad_norm": 1.1784394979476929,
"learning_rate": 7.860835641109395e-05,
"loss": 0.595,
"step": 26
},
{
"epoch": 0.005016256386437529,
"grad_norm": 1.1498676538467407,
"learning_rate": 7.843014704547393e-05,
"loss": 0.6538,
"step": 27
},
{
"epoch": 0.005202043660009289,
"grad_norm": 1.195434331893921,
"learning_rate": 7.824143126137431e-05,
"loss": 0.6842,
"step": 28
},
{
"epoch": 0.00538783093358105,
"grad_norm": 1.0709013938903809,
"learning_rate": 7.804226065180615e-05,
"loss": 0.611,
"step": 29
},
{
"epoch": 0.00557361820715281,
"grad_norm": 1.418430209159851,
"learning_rate": 7.783268966802539e-05,
"loss": 0.9021,
"step": 30
},
{
"epoch": 0.00575940548072457,
"grad_norm": 1.3018062114715576,
"learning_rate": 7.761277560464645e-05,
"loss": 0.7383,
"step": 31
},
{
"epoch": 0.0059451927542963304,
"grad_norm": 1.2901121377944946,
"learning_rate": 7.738257858397844e-05,
"loss": 0.704,
"step": 32
},
{
"epoch": 0.006130980027868091,
"grad_norm": 1.2701770067214966,
"learning_rate": 7.71421615395883e-05,
"loss": 0.8416,
"step": 33
},
{
"epoch": 0.006316767301439851,
"grad_norm": 1.303396224975586,
"learning_rate": 7.68915901990954e-05,
"loss": 0.7444,
"step": 34
},
{
"epoch": 0.006502554575011612,
"grad_norm": 1.3861703872680664,
"learning_rate": 7.663093306620231e-05,
"loss": 0.7191,
"step": 35
},
{
"epoch": 0.006688341848583372,
"grad_norm": 1.503480315208435,
"learning_rate": 7.636026140196651e-05,
"loss": 0.8693,
"step": 36
},
{
"epoch": 0.006874129122155132,
"grad_norm": 1.307245135307312,
"learning_rate": 7.607964920531837e-05,
"loss": 0.7712,
"step": 37
},
{
"epoch": 0.0070599163957268925,
"grad_norm": 1.2363077402114868,
"learning_rate": 7.578917319283055e-05,
"loss": 0.7318,
"step": 38
},
{
"epoch": 0.007245703669298653,
"grad_norm": 1.183120608329773,
"learning_rate": 7.548891277774448e-05,
"loss": 0.8098,
"step": 39
},
{
"epoch": 0.0074314909428704135,
"grad_norm": 1.3283722400665283,
"learning_rate": 7.517895004825956e-05,
"loss": 0.7582,
"step": 40
},
{
"epoch": 0.007617278216442174,
"grad_norm": 1.2799726724624634,
"learning_rate": 7.48593697450911e-05,
"loss": 0.8915,
"step": 41
},
{
"epoch": 0.0078030654900139345,
"grad_norm": 1.4513181447982788,
"learning_rate": 7.453025923830296e-05,
"loss": 0.8696,
"step": 42
},
{
"epoch": 0.007988852763585694,
"grad_norm": 1.2473115921020508,
"learning_rate": 7.419170850342156e-05,
"loss": 0.892,
"step": 43
},
{
"epoch": 0.008174640037157455,
"grad_norm": 1.3187384605407715,
"learning_rate": 7.384381009683742e-05,
"loss": 0.9073,
"step": 44
},
{
"epoch": 0.008360427310729215,
"grad_norm": 1.25960111618042,
"learning_rate": 7.348665913050115e-05,
"loss": 0.9463,
"step": 45
},
{
"epoch": 0.008546214584300976,
"grad_norm": 1.1852588653564453,
"learning_rate": 7.312035324592081e-05,
"loss": 0.8179,
"step": 46
},
{
"epoch": 0.008732001857872736,
"grad_norm": 1.2183889150619507,
"learning_rate": 7.274499258746771e-05,
"loss": 0.7668,
"step": 47
},
{
"epoch": 0.008917789131444497,
"grad_norm": 1.3253717422485352,
"learning_rate": 7.236067977499791e-05,
"loss": 0.7596,
"step": 48
},
{
"epoch": 0.009103576405016257,
"grad_norm": 1.3234570026397705,
"learning_rate": 7.196751987579699e-05,
"loss": 0.7846,
"step": 49
},
{
"epoch": 0.009289363678588018,
"grad_norm": 1.4833699464797974,
"learning_rate": 7.156562037585576e-05,
"loss": 0.8507,
"step": 50
},
{
"epoch": 0.009289363678588018,
"eval_loss": 0.7784072160720825,
"eval_runtime": 210.9709,
"eval_samples_per_second": 42.973,
"eval_steps_per_second": 10.746,
"step": 50
},
{
"epoch": 0.009475150952159776,
"grad_norm": 1.1448761224746704,
"learning_rate": 7.11550911504845e-05,
"loss": 0.7059,
"step": 51
},
{
"epoch": 0.009660938225731537,
"grad_norm": 1.5964897871017456,
"learning_rate": 7.073604443427437e-05,
"loss": 1.0514,
"step": 52
},
{
"epoch": 0.009846725499303297,
"grad_norm": 1.4827876091003418,
"learning_rate": 7.03085947904134e-05,
"loss": 1.0776,
"step": 53
},
{
"epoch": 0.010032512772875058,
"grad_norm": 1.5401029586791992,
"learning_rate": 6.987285907936617e-05,
"loss": 1.0623,
"step": 54
},
{
"epoch": 0.010218300046446818,
"grad_norm": 1.462876319885254,
"learning_rate": 6.942895642692527e-05,
"loss": 1.0345,
"step": 55
},
{
"epoch": 0.010404087320018579,
"grad_norm": 1.5211715698242188,
"learning_rate": 6.897700819164357e-05,
"loss": 1.0968,
"step": 56
},
{
"epoch": 0.01058987459359034,
"grad_norm": 1.2720927000045776,
"learning_rate": 6.851713793165589e-05,
"loss": 0.8471,
"step": 57
},
{
"epoch": 0.0107756618671621,
"grad_norm": 1.2138237953186035,
"learning_rate": 6.804947137089955e-05,
"loss": 0.7272,
"step": 58
},
{
"epoch": 0.01096144914073386,
"grad_norm": 1.2674663066864014,
"learning_rate": 6.757413636474263e-05,
"loss": 0.6568,
"step": 59
},
{
"epoch": 0.01114723641430562,
"grad_norm": 0.9728902578353882,
"learning_rate": 6.709126286502965e-05,
"loss": 0.707,
"step": 60
},
{
"epoch": 0.011333023687877381,
"grad_norm": 1.2180075645446777,
"learning_rate": 6.660098288455393e-05,
"loss": 0.8588,
"step": 61
},
{
"epoch": 0.01151881096144914,
"grad_norm": 1.1713515520095825,
"learning_rate": 6.610343046096674e-05,
"loss": 0.7691,
"step": 62
},
{
"epoch": 0.0117045982350209,
"grad_norm": 1.0399553775787354,
"learning_rate": 6.559874162013267e-05,
"loss": 0.7024,
"step": 63
},
{
"epoch": 0.011890385508592661,
"grad_norm": 0.8005794286727905,
"learning_rate": 6.508705433894149e-05,
"loss": 0.5976,
"step": 64
},
{
"epoch": 0.012076172782164421,
"grad_norm": 1.0212355852127075,
"learning_rate": 6.456850850758673e-05,
"loss": 0.7279,
"step": 65
},
{
"epoch": 0.012261960055736182,
"grad_norm": 0.9839227795600891,
"learning_rate": 6.404324589132101e-05,
"loss": 0.714,
"step": 66
},
{
"epoch": 0.012447747329307942,
"grad_norm": 0.9003210067749023,
"learning_rate": 6.351141009169893e-05,
"loss": 0.7392,
"step": 67
},
{
"epoch": 0.012633534602879703,
"grad_norm": 0.9685829877853394,
"learning_rate": 6.297314650731775e-05,
"loss": 0.697,
"step": 68
},
{
"epoch": 0.012819321876451463,
"grad_norm": 0.8940137624740601,
"learning_rate": 6.242860229406692e-05,
"loss": 0.5469,
"step": 69
},
{
"epoch": 0.013005109150023224,
"grad_norm": 0.9139478206634521,
"learning_rate": 6.18779263248971e-05,
"loss": 0.6808,
"step": 70
},
{
"epoch": 0.013190896423594984,
"grad_norm": 1.0126328468322754,
"learning_rate": 6.132126914911976e-05,
"loss": 0.6446,
"step": 71
},
{
"epoch": 0.013376683697166745,
"grad_norm": 0.7734840512275696,
"learning_rate": 6.075878295124861e-05,
"loss": 0.6272,
"step": 72
},
{
"epoch": 0.013562470970738504,
"grad_norm": 0.7684584856033325,
"learning_rate": 6.019062150939376e-05,
"loss": 0.5738,
"step": 73
},
{
"epoch": 0.013748258244310264,
"grad_norm": 0.9124207496643066,
"learning_rate": 5.9616940153220336e-05,
"loss": 0.623,
"step": 74
},
{
"epoch": 0.013934045517882025,
"grad_norm": 0.8309489488601685,
"learning_rate": 5.903789572148295e-05,
"loss": 0.5815,
"step": 75
},
{
"epoch": 0.014119832791453785,
"grad_norm": 0.7745351791381836,
"learning_rate": 5.845364651914752e-05,
"loss": 0.6165,
"step": 76
},
{
"epoch": 0.014305620065025546,
"grad_norm": 0.9630031585693359,
"learning_rate": 5.786435227411227e-05,
"loss": 0.6886,
"step": 77
},
{
"epoch": 0.014491407338597306,
"grad_norm": 0.8420267105102539,
"learning_rate": 5.727017409353971e-05,
"loss": 0.6423,
"step": 78
},
{
"epoch": 0.014677194612169066,
"grad_norm": 0.9119953513145447,
"learning_rate": 5.667127441981162e-05,
"loss": 0.7206,
"step": 79
},
{
"epoch": 0.014862981885740827,
"grad_norm": 1.015648365020752,
"learning_rate": 5.606781698611879e-05,
"loss": 0.6322,
"step": 80
},
{
"epoch": 0.015048769159312587,
"grad_norm": 0.9261860251426697,
"learning_rate": 5.5459966771698096e-05,
"loss": 0.6806,
"step": 81
},
{
"epoch": 0.015234556432884348,
"grad_norm": 0.8736683130264282,
"learning_rate": 5.4847889956728834e-05,
"loss": 0.674,
"step": 82
},
{
"epoch": 0.015420343706456108,
"grad_norm": 0.9856055378913879,
"learning_rate": 5.423175387690067e-05,
"loss": 0.8235,
"step": 83
},
{
"epoch": 0.015606130980027869,
"grad_norm": 1.0767531394958496,
"learning_rate": 5.361172697766573e-05,
"loss": 0.778,
"step": 84
},
{
"epoch": 0.01579191825359963,
"grad_norm": 0.8056624531745911,
"learning_rate": 5.298797876818735e-05,
"loss": 0.617,
"step": 85
},
{
"epoch": 0.015977705527171388,
"grad_norm": 1.071303367614746,
"learning_rate": 5.23606797749979e-05,
"loss": 0.7675,
"step": 86
},
{
"epoch": 0.01616349280074315,
"grad_norm": 0.9511001110076904,
"learning_rate": 5.17300014953786e-05,
"loss": 0.7353,
"step": 87
},
{
"epoch": 0.01634928007431491,
"grad_norm": 0.9310784339904785,
"learning_rate": 5.109611635047379e-05,
"loss": 0.6954,
"step": 88
},
{
"epoch": 0.01653506734788667,
"grad_norm": 1.0314819812774658,
"learning_rate": 5.04591976381528e-05,
"loss": 0.7053,
"step": 89
},
{
"epoch": 0.01672085462145843,
"grad_norm": 0.9734024405479431,
"learning_rate": 4.981941948563197e-05,
"loss": 0.7357,
"step": 90
},
{
"epoch": 0.01690664189503019,
"grad_norm": 0.9812660217285156,
"learning_rate": 4.9176956801870065e-05,
"loss": 0.705,
"step": 91
},
{
"epoch": 0.01709242916860195,
"grad_norm": 1.061806082725525,
"learning_rate": 4.853198522974988e-05,
"loss": 0.7836,
"step": 92
},
{
"epoch": 0.01727821644217371,
"grad_norm": 1.190076470375061,
"learning_rate": 4.788468109805921e-05,
"loss": 0.8644,
"step": 93
},
{
"epoch": 0.017464003715745472,
"grad_norm": 1.0145090818405151,
"learning_rate": 4.7235221373284407e-05,
"loss": 0.7877,
"step": 94
},
{
"epoch": 0.01764979098931723,
"grad_norm": 1.1526635885238647,
"learning_rate": 4.658378361122936e-05,
"loss": 0.8445,
"step": 95
},
{
"epoch": 0.017835578262888993,
"grad_norm": 1.0449837446212769,
"learning_rate": 4.593054590847368e-05,
"loss": 0.8328,
"step": 96
},
{
"epoch": 0.018021365536460752,
"grad_norm": 1.0370270013809204,
"learning_rate": 4.5275686853682765e-05,
"loss": 0.7437,
"step": 97
},
{
"epoch": 0.018207152810032514,
"grad_norm": 1.0356221199035645,
"learning_rate": 4.4619385478783456e-05,
"loss": 0.7405,
"step": 98
},
{
"epoch": 0.018392940083604273,
"grad_norm": 1.0345137119293213,
"learning_rate": 4.396182121001852e-05,
"loss": 0.6983,
"step": 99
},
{
"epoch": 0.018578727357176035,
"grad_norm": 1.3793245553970337,
"learning_rate": 4.33031738188933e-05,
"loss": 0.9343,
"step": 100
},
{
"epoch": 0.018578727357176035,
"eval_loss": 0.7299540638923645,
"eval_runtime": 210.9599,
"eval_samples_per_second": 42.975,
"eval_steps_per_second": 10.746,
"step": 100
},
{
"epoch": 0.018764514630747794,
"grad_norm": 1.1361013650894165,
"learning_rate": 4.264362337302798e-05,
"loss": 0.6842,
"step": 101
},
{
"epoch": 0.018950301904319553,
"grad_norm": 1.120656132698059,
"learning_rate": 4.1983350186928894e-05,
"loss": 0.943,
"step": 102
},
{
"epoch": 0.019136089177891315,
"grad_norm": 1.1091196537017822,
"learning_rate": 4.132253477269233e-05,
"loss": 0.8101,
"step": 103
},
{
"epoch": 0.019321876451463074,
"grad_norm": 1.1574037075042725,
"learning_rate": 4.0661357790654345e-05,
"loss": 0.9537,
"step": 104
},
{
"epoch": 0.019507663725034836,
"grad_norm": 1.3250733613967896,
"learning_rate": 4e-05,
"loss": 1.0247,
"step": 105
},
{
"epoch": 0.019693450998606594,
"grad_norm": 1.2618422508239746,
"learning_rate": 3.933864220934566e-05,
"loss": 0.9238,
"step": 106
},
{
"epoch": 0.019879238272178357,
"grad_norm": 1.1964272260665894,
"learning_rate": 3.8677465227307676e-05,
"loss": 0.8356,
"step": 107
},
{
"epoch": 0.020065025545750115,
"grad_norm": 0.930941104888916,
"learning_rate": 3.8016649813071106e-05,
"loss": 0.7564,
"step": 108
},
{
"epoch": 0.020250812819321878,
"grad_norm": 0.948442280292511,
"learning_rate": 3.735637662697203e-05,
"loss": 0.6836,
"step": 109
},
{
"epoch": 0.020436600092893636,
"grad_norm": 0.9409064650535583,
"learning_rate": 3.669682618110671e-05,
"loss": 0.696,
"step": 110
},
{
"epoch": 0.0206223873664654,
"grad_norm": 0.9388203024864197,
"learning_rate": 3.6038178789981494e-05,
"loss": 0.6616,
"step": 111
},
{
"epoch": 0.020808174640037157,
"grad_norm": 0.8305265307426453,
"learning_rate": 3.538061452121656e-05,
"loss": 0.6802,
"step": 112
},
{
"epoch": 0.020993961913608916,
"grad_norm": 0.8969584107398987,
"learning_rate": 3.472431314631724e-05,
"loss": 0.667,
"step": 113
},
{
"epoch": 0.02117974918718068,
"grad_norm": 0.8265141844749451,
"learning_rate": 3.406945409152632e-05,
"loss": 0.5857,
"step": 114
},
{
"epoch": 0.021365536460752437,
"grad_norm": 0.9097794890403748,
"learning_rate": 3.341621638877064e-05,
"loss": 0.7731,
"step": 115
},
{
"epoch": 0.0215513237343242,
"grad_norm": 0.8234865665435791,
"learning_rate": 3.276477862671562e-05,
"loss": 0.6853,
"step": 116
},
{
"epoch": 0.021737111007895958,
"grad_norm": 0.8503565788269043,
"learning_rate": 3.21153189019408e-05,
"loss": 0.5874,
"step": 117
},
{
"epoch": 0.02192289828146772,
"grad_norm": 0.7727727890014648,
"learning_rate": 3.146801477025013e-05,
"loss": 0.6099,
"step": 118
},
{
"epoch": 0.02210868555503948,
"grad_norm": 0.808000385761261,
"learning_rate": 3.082304319812994e-05,
"loss": 0.6345,
"step": 119
},
{
"epoch": 0.02229447282861124,
"grad_norm": 0.7325134873390198,
"learning_rate": 3.0180580514368037e-05,
"loss": 0.5591,
"step": 120
},
{
"epoch": 0.022480260102183,
"grad_norm": 0.7028451561927795,
"learning_rate": 2.9540802361847212e-05,
"loss": 0.512,
"step": 121
},
{
"epoch": 0.022666047375754762,
"grad_norm": 0.7961578369140625,
"learning_rate": 2.890388364952623e-05,
"loss": 0.5754,
"step": 122
},
{
"epoch": 0.02285183464932652,
"grad_norm": 0.7276977896690369,
"learning_rate": 2.8269998504621416e-05,
"loss": 0.5613,
"step": 123
},
{
"epoch": 0.02303762192289828,
"grad_norm": 0.858888566493988,
"learning_rate": 2.7639320225002108e-05,
"loss": 0.655,
"step": 124
},
{
"epoch": 0.023223409196470042,
"grad_norm": 0.9296563863754272,
"learning_rate": 2.7012021231812666e-05,
"loss": 0.6636,
"step": 125
},
{
"epoch": 0.0234091964700418,
"grad_norm": 0.7812833189964294,
"learning_rate": 2.638827302233428e-05,
"loss": 0.6386,
"step": 126
},
{
"epoch": 0.023594983743613563,
"grad_norm": 0.9057222604751587,
"learning_rate": 2.576824612309934e-05,
"loss": 0.638,
"step": 127
},
{
"epoch": 0.023780771017185322,
"grad_norm": 0.8555361032485962,
"learning_rate": 2.5152110043271166e-05,
"loss": 0.6836,
"step": 128
},
{
"epoch": 0.023966558290757084,
"grad_norm": 0.8539828062057495,
"learning_rate": 2.454003322830192e-05,
"loss": 0.7038,
"step": 129
},
{
"epoch": 0.024152345564328843,
"grad_norm": 0.8139870166778564,
"learning_rate": 2.393218301388123e-05,
"loss": 0.4949,
"step": 130
},
{
"epoch": 0.024338132837900605,
"grad_norm": 0.8350996375083923,
"learning_rate": 2.3328725580188395e-05,
"loss": 0.6847,
"step": 131
},
{
"epoch": 0.024523920111472364,
"grad_norm": 0.8771671056747437,
"learning_rate": 2.272982590646029e-05,
"loss": 0.6576,
"step": 132
},
{
"epoch": 0.024709707385044126,
"grad_norm": 0.9145622253417969,
"learning_rate": 2.2135647725887744e-05,
"loss": 0.6714,
"step": 133
},
{
"epoch": 0.024895494658615885,
"grad_norm": 0.8157410621643066,
"learning_rate": 2.1546353480852495e-05,
"loss": 0.6085,
"step": 134
},
{
"epoch": 0.025081281932187643,
"grad_norm": 0.860339879989624,
"learning_rate": 2.096210427851706e-05,
"loss": 0.5648,
"step": 135
},
{
"epoch": 0.025267069205759406,
"grad_norm": 0.8176294565200806,
"learning_rate": 2.038305984677969e-05,
"loss": 0.6053,
"step": 136
},
{
"epoch": 0.025452856479331164,
"grad_norm": 0.9252009391784668,
"learning_rate": 1.9809378490606264e-05,
"loss": 0.6228,
"step": 137
},
{
"epoch": 0.025638643752902927,
"grad_norm": 0.8035367727279663,
"learning_rate": 1.9241217048751406e-05,
"loss": 0.6502,
"step": 138
},
{
"epoch": 0.025824431026474685,
"grad_norm": 0.9396884441375732,
"learning_rate": 1.867873085088026e-05,
"loss": 0.6928,
"step": 139
},
{
"epoch": 0.026010218300046448,
"grad_norm": 0.973107635974884,
"learning_rate": 1.8122073675102935e-05,
"loss": 0.7169,
"step": 140
},
{
"epoch": 0.026196005573618206,
"grad_norm": 0.9451408982276917,
"learning_rate": 1.75713977059331e-05,
"loss": 0.6835,
"step": 141
},
{
"epoch": 0.02638179284718997,
"grad_norm": 0.9551781415939331,
"learning_rate": 1.702685349268226e-05,
"loss": 0.7596,
"step": 142
},
{
"epoch": 0.026567580120761727,
"grad_norm": 0.9721214175224304,
"learning_rate": 1.648858990830108e-05,
"loss": 0.804,
"step": 143
},
{
"epoch": 0.02675336739433349,
"grad_norm": 1.019667148590088,
"learning_rate": 1.5956754108678996e-05,
"loss": 0.8623,
"step": 144
},
{
"epoch": 0.02693915466790525,
"grad_norm": 1.0824493169784546,
"learning_rate": 1.5431491492413288e-05,
"loss": 0.8455,
"step": 145
},
{
"epoch": 0.027124941941477007,
"grad_norm": 0.9800060987472534,
"learning_rate": 1.491294566105852e-05,
"loss": 0.8264,
"step": 146
},
{
"epoch": 0.02731072921504877,
"grad_norm": 0.9747028350830078,
"learning_rate": 1.4401258379867335e-05,
"loss": 0.7024,
"step": 147
},
{
"epoch": 0.027496516488620528,
"grad_norm": 0.8642198443412781,
"learning_rate": 1.3896569539033253e-05,
"loss": 0.7009,
"step": 148
},
{
"epoch": 0.02768230376219229,
"grad_norm": 0.8279868960380554,
"learning_rate": 1.3399017115446067e-05,
"loss": 0.6983,
"step": 149
},
{
"epoch": 0.02786809103576405,
"grad_norm": 1.2232890129089355,
"learning_rate": 1.2908737134970367e-05,
"loss": 0.8388,
"step": 150
},
{
"epoch": 0.02786809103576405,
"eval_loss": 0.7057402729988098,
"eval_runtime": 210.9537,
"eval_samples_per_second": 42.976,
"eval_steps_per_second": 10.746,
"step": 150
},
{
"epoch": 0.02805387830933581,
"grad_norm": 0.8277557492256165,
"learning_rate": 1.242586363525737e-05,
"loss": 0.6744,
"step": 151
},
{
"epoch": 0.02823966558290757,
"grad_norm": 1.129407286643982,
"learning_rate": 1.1950528629100457e-05,
"loss": 0.8988,
"step": 152
},
{
"epoch": 0.028425452856479332,
"grad_norm": 1.1117703914642334,
"learning_rate": 1.1482862068344121e-05,
"loss": 0.8987,
"step": 153
},
{
"epoch": 0.02861124013005109,
"grad_norm": 1.274944543838501,
"learning_rate": 1.1022991808356442e-05,
"loss": 0.9714,
"step": 154
},
{
"epoch": 0.028797027403622853,
"grad_norm": 1.413684368133545,
"learning_rate": 1.0571043573074737e-05,
"loss": 1.0464,
"step": 155
},
{
"epoch": 0.028982814677194612,
"grad_norm": 1.2533186674118042,
"learning_rate": 1.0127140920633857e-05,
"loss": 0.8553,
"step": 156
},
{
"epoch": 0.029168601950766374,
"grad_norm": 0.9504323601722717,
"learning_rate": 9.69140520958662e-06,
"loss": 0.6454,
"step": 157
},
{
"epoch": 0.029354389224338133,
"grad_norm": 0.9204007387161255,
"learning_rate": 9.263955565725648e-06,
"loss": 0.733,
"step": 158
},
{
"epoch": 0.02954017649790989,
"grad_norm": 0.8115749359130859,
"learning_rate": 8.844908849515509e-06,
"loss": 0.6411,
"step": 159
},
{
"epoch": 0.029725963771481654,
"grad_norm": 0.7680659294128418,
"learning_rate": 8.434379624144261e-06,
"loss": 0.6213,
"step": 160
},
{
"epoch": 0.029911751045053413,
"grad_norm": 0.7348408699035645,
"learning_rate": 8.032480124203013e-06,
"loss": 0.608,
"step": 161
},
{
"epoch": 0.030097538318625175,
"grad_norm": 0.690196692943573,
"learning_rate": 7.639320225002106e-06,
"loss": 0.5074,
"step": 162
},
{
"epoch": 0.030283325592196934,
"grad_norm": 0.7912430167198181,
"learning_rate": 7.255007412532307e-06,
"loss": 0.6236,
"step": 163
},
{
"epoch": 0.030469112865768696,
"grad_norm": 0.8454386591911316,
"learning_rate": 6.8796467540791986e-06,
"loss": 0.773,
"step": 164
},
{
"epoch": 0.030654900139340455,
"grad_norm": 0.7565322518348694,
"learning_rate": 6.513340869498859e-06,
"loss": 0.5278,
"step": 165
},
{
"epoch": 0.030840687412912217,
"grad_norm": 0.7427991032600403,
"learning_rate": 6.1561899031625794e-06,
"loss": 0.5895,
"step": 166
},
{
"epoch": 0.031026474686483976,
"grad_norm": 0.72712242603302,
"learning_rate": 5.808291496578435e-06,
"loss": 0.554,
"step": 167
},
{
"epoch": 0.031212261960055738,
"grad_norm": 0.8168418407440186,
"learning_rate": 5.469740761697044e-06,
"loss": 0.5795,
"step": 168
},
{
"epoch": 0.0313980492336275,
"grad_norm": 0.7900062203407288,
"learning_rate": 5.140630254908905e-06,
"loss": 0.6155,
"step": 169
},
{
"epoch": 0.03158383650719926,
"grad_norm": 0.7631322741508484,
"learning_rate": 4.821049951740442e-06,
"loss": 0.6395,
"step": 170
},
{
"epoch": 0.031769623780771014,
"grad_norm": 0.8723105788230896,
"learning_rate": 4.511087222255528e-06,
"loss": 0.7083,
"step": 171
},
{
"epoch": 0.031955411054342776,
"grad_norm": 0.8694934248924255,
"learning_rate": 4.2108268071694616e-06,
"loss": 0.6848,
"step": 172
},
{
"epoch": 0.03214119832791454,
"grad_norm": 0.8055874109268188,
"learning_rate": 3.9203507946816445e-06,
"loss": 0.6301,
"step": 173
},
{
"epoch": 0.0323269856014863,
"grad_norm": 0.782102644443512,
"learning_rate": 3.6397385980335e-06,
"loss": 0.5799,
"step": 174
},
{
"epoch": 0.032512772875058056,
"grad_norm": 0.9123784303665161,
"learning_rate": 3.3690669337977e-06,
"loss": 0.6572,
"step": 175
},
{
"epoch": 0.03269856014862982,
"grad_norm": 0.8065102100372314,
"learning_rate": 3.1084098009046106e-06,
"loss": 0.6309,
"step": 176
},
{
"epoch": 0.03288434742220158,
"grad_norm": 0.7722126841545105,
"learning_rate": 2.8578384604117217e-06,
"loss": 0.5781,
"step": 177
},
{
"epoch": 0.03307013469577334,
"grad_norm": 0.8441624641418457,
"learning_rate": 2.6174214160215704e-06,
"loss": 0.625,
"step": 178
},
{
"epoch": 0.0332559219693451,
"grad_norm": 0.7183513045310974,
"learning_rate": 2.3872243953535535e-06,
"loss": 0.5938,
"step": 179
},
{
"epoch": 0.03344170924291686,
"grad_norm": 0.8442609310150146,
"learning_rate": 2.1673103319746146e-06,
"loss": 0.6409,
"step": 180
},
{
"epoch": 0.03362749651648862,
"grad_norm": 0.7444936633110046,
"learning_rate": 1.957739348193859e-06,
"loss": 0.6137,
"step": 181
},
{
"epoch": 0.03381328379006038,
"grad_norm": 0.851841390132904,
"learning_rate": 1.7585687386256944e-06,
"loss": 0.5644,
"step": 182
},
{
"epoch": 0.03399907106363214,
"grad_norm": 0.7763927578926086,
"learning_rate": 1.5698529545260744e-06,
"loss": 0.556,
"step": 183
},
{
"epoch": 0.0341848583372039,
"grad_norm": 0.7218007445335388,
"learning_rate": 1.3916435889060575e-06,
"loss": 0.5211,
"step": 184
},
{
"epoch": 0.034370645610775664,
"grad_norm": 0.8408937454223633,
"learning_rate": 1.2239893624267852e-06,
"loss": 0.6682,
"step": 185
},
{
"epoch": 0.03455643288434742,
"grad_norm": 0.8628000020980835,
"learning_rate": 1.0669361100797704e-06,
"loss": 0.6878,
"step": 186
},
{
"epoch": 0.03474222015791918,
"grad_norm": 0.8714439272880554,
"learning_rate": 9.205267686560293e-07,
"loss": 0.6547,
"step": 187
},
{
"epoch": 0.034928007431490944,
"grad_norm": 0.7619119882583618,
"learning_rate": 7.848013650076258e-07,
"loss": 0.5769,
"step": 188
},
{
"epoch": 0.035113794705062706,
"grad_norm": 0.9454699158668518,
"learning_rate": 6.597970051047053e-07,
"loss": 0.6948,
"step": 189
},
{
"epoch": 0.03529958197863446,
"grad_norm": 0.7809498906135559,
"learning_rate": 5.455478638911071e-07,
"loss": 0.657,
"step": 190
},
{
"epoch": 0.035485369252206224,
"grad_norm": 0.9400784373283386,
"learning_rate": 4.420851759412603e-07,
"loss": 0.723,
"step": 191
},
{
"epoch": 0.035671156525777986,
"grad_norm": 1.0285460948944092,
"learning_rate": 3.4943722692099224e-07,
"loss": 0.8145,
"step": 192
},
{
"epoch": 0.03585694379934974,
"grad_norm": 1.0024358034133911,
"learning_rate": 2.676293458544743e-07,
"loss": 0.8062,
"step": 193
},
{
"epoch": 0.036042731072921504,
"grad_norm": 0.9210175275802612,
"learning_rate": 1.9668389819954338e-07,
"loss": 0.7595,
"step": 194
},
{
"epoch": 0.036228518346493266,
"grad_norm": 1.0221508741378784,
"learning_rate": 1.3662027973320614e-07,
"loss": 0.7836,
"step": 195
},
{
"epoch": 0.03641430562006503,
"grad_norm": 0.9825165271759033,
"learning_rate": 8.745491124901861e-08,
"loss": 0.8122,
"step": 196
},
{
"epoch": 0.03660009289363678,
"grad_norm": 1.2274169921875,
"learning_rate": 4.920123406781052e-08,
"loss": 0.9055,
"step": 197
},
{
"epoch": 0.036785880167208546,
"grad_norm": 1.0650230646133423,
"learning_rate": 2.1869706362958044e-08,
"loss": 0.7219,
"step": 198
},
{
"epoch": 0.03697166744078031,
"grad_norm": 1.0528210401535034,
"learning_rate": 5.467800301239834e-09,
"loss": 0.6992,
"step": 199
},
{
"epoch": 0.03715745471435207,
"grad_norm": 1.143416166305542,
"learning_rate": 0.0,
"loss": 0.808,
"step": 200
},
{
"epoch": 0.03715745471435207,
"eval_loss": 0.696293830871582,
"eval_runtime": 210.9668,
"eval_samples_per_second": 42.974,
"eval_steps_per_second": 10.746,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 4,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.895574321423974e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}