{
"best_metric": 0.46117889881134033,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.1085334418667752,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000542667209333876,
"grad_norm": 1.8354359865188599,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.8492,
"step": 1
},
{
"epoch": 0.000542667209333876,
"eval_loss": 3.3348822593688965,
"eval_runtime": 109.0843,
"eval_samples_per_second": 28.455,
"eval_steps_per_second": 14.228,
"step": 1
},
{
"epoch": 0.001085334418667752,
"grad_norm": 3.162593364715576,
"learning_rate": 6.666666666666667e-06,
"loss": 2.3208,
"step": 2
},
{
"epoch": 0.001628001628001628,
"grad_norm": 3.748394727706909,
"learning_rate": 1e-05,
"loss": 2.4477,
"step": 3
},
{
"epoch": 0.002170668837335504,
"grad_norm": 4.363120079040527,
"learning_rate": 1.3333333333333333e-05,
"loss": 2.4994,
"step": 4
},
{
"epoch": 0.00271333604666938,
"grad_norm": 4.346236705780029,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.6665,
"step": 5
},
{
"epoch": 0.003256003256003256,
"grad_norm": 4.28872537612915,
"learning_rate": 2e-05,
"loss": 2.5443,
"step": 6
},
{
"epoch": 0.003798670465337132,
"grad_norm": 4.303534984588623,
"learning_rate": 2.3333333333333336e-05,
"loss": 2.3335,
"step": 7
},
{
"epoch": 0.004341337674671008,
"grad_norm": 3.9452812671661377,
"learning_rate": 2.6666666666666667e-05,
"loss": 2.257,
"step": 8
},
{
"epoch": 0.004884004884004884,
"grad_norm": 4.098211288452148,
"learning_rate": 3e-05,
"loss": 2.0106,
"step": 9
},
{
"epoch": 0.00542667209333876,
"grad_norm": 4.80372953414917,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.6557,
"step": 10
},
{
"epoch": 0.005969339302672636,
"grad_norm": 5.134768486022949,
"learning_rate": 3.6666666666666666e-05,
"loss": 1.4699,
"step": 11
},
{
"epoch": 0.006512006512006512,
"grad_norm": 5.335248947143555,
"learning_rate": 4e-05,
"loss": 1.5766,
"step": 12
},
{
"epoch": 0.007054673721340388,
"grad_norm": 5.714169979095459,
"learning_rate": 4.3333333333333334e-05,
"loss": 1.0583,
"step": 13
},
{
"epoch": 0.007597340930674264,
"grad_norm": 4.347225189208984,
"learning_rate": 4.666666666666667e-05,
"loss": 0.6941,
"step": 14
},
{
"epoch": 0.00814000814000814,
"grad_norm": 2.579446792602539,
"learning_rate": 5e-05,
"loss": 0.3363,
"step": 15
},
{
"epoch": 0.008682675349342016,
"grad_norm": 2.0687601566314697,
"learning_rate": 5.333333333333333e-05,
"loss": 0.1997,
"step": 16
},
{
"epoch": 0.009225342558675891,
"grad_norm": 2.1691503524780273,
"learning_rate": 5.666666666666667e-05,
"loss": 0.3088,
"step": 17
},
{
"epoch": 0.009768009768009768,
"grad_norm": 11.303365707397461,
"learning_rate": 6e-05,
"loss": 1.2146,
"step": 18
},
{
"epoch": 0.010310676977343645,
"grad_norm": 5.85835599899292,
"learning_rate": 6.333333333333333e-05,
"loss": 0.7783,
"step": 19
},
{
"epoch": 0.01085334418667752,
"grad_norm": 3.3280720710754395,
"learning_rate": 6.666666666666667e-05,
"loss": 0.5964,
"step": 20
},
{
"epoch": 0.011396011396011397,
"grad_norm": 3.908247947692871,
"learning_rate": 7e-05,
"loss": 0.4214,
"step": 21
},
{
"epoch": 0.011938678605345272,
"grad_norm": 7.787004470825195,
"learning_rate": 7.333333333333333e-05,
"loss": 0.4716,
"step": 22
},
{
"epoch": 0.012481345814679148,
"grad_norm": 7.936584949493408,
"learning_rate": 7.666666666666667e-05,
"loss": 0.4301,
"step": 23
},
{
"epoch": 0.013024013024013023,
"grad_norm": 5.566396713256836,
"learning_rate": 8e-05,
"loss": 0.727,
"step": 24
},
{
"epoch": 0.0135666802333469,
"grad_norm": 3.142136335372925,
"learning_rate": 8.333333333333334e-05,
"loss": 0.5904,
"step": 25
},
{
"epoch": 0.014109347442680775,
"grad_norm": 2.2857656478881836,
"learning_rate": 8.666666666666667e-05,
"loss": 0.401,
"step": 26
},
{
"epoch": 0.014652014652014652,
"grad_norm": 1.4375312328338623,
"learning_rate": 9e-05,
"loss": 0.2759,
"step": 27
},
{
"epoch": 0.015194681861348529,
"grad_norm": 1.1307240724563599,
"learning_rate": 9.333333333333334e-05,
"loss": 0.2203,
"step": 28
},
{
"epoch": 0.015737349070682406,
"grad_norm": 1.402246356010437,
"learning_rate": 9.666666666666667e-05,
"loss": 0.3203,
"step": 29
},
{
"epoch": 0.01628001628001628,
"grad_norm": 5.896515846252441,
"learning_rate": 0.0001,
"loss": 0.621,
"step": 30
},
{
"epoch": 0.016822683489350156,
"grad_norm": 3.45434308052063,
"learning_rate": 9.999146252290264e-05,
"loss": 0.4411,
"step": 31
},
{
"epoch": 0.017365350698684032,
"grad_norm": 2.1787056922912598,
"learning_rate": 9.996585300715116e-05,
"loss": 0.302,
"step": 32
},
{
"epoch": 0.01790801790801791,
"grad_norm": 1.774258017539978,
"learning_rate": 9.99231801983717e-05,
"loss": 0.1932,
"step": 33
},
{
"epoch": 0.018450685117351782,
"grad_norm": 2.169252634048462,
"learning_rate": 9.986345866928941e-05,
"loss": 0.3516,
"step": 34
},
{
"epoch": 0.01899335232668566,
"grad_norm": 2.699979543685913,
"learning_rate": 9.978670881475172e-05,
"loss": 0.5089,
"step": 35
},
{
"epoch": 0.019536019536019536,
"grad_norm": 2.209733724594116,
"learning_rate": 9.96929568447637e-05,
"loss": 0.3997,
"step": 36
},
{
"epoch": 0.020078686745353413,
"grad_norm": 2.640620470046997,
"learning_rate": 9.958223477553714e-05,
"loss": 0.5742,
"step": 37
},
{
"epoch": 0.02062135395468729,
"grad_norm": 2.7024731636047363,
"learning_rate": 9.94545804185573e-05,
"loss": 0.4846,
"step": 38
},
{
"epoch": 0.021164021164021163,
"grad_norm": 3.163442850112915,
"learning_rate": 9.931003736767013e-05,
"loss": 0.4393,
"step": 39
},
{
"epoch": 0.02170668837335504,
"grad_norm": 2.5975422859191895,
"learning_rate": 9.91486549841951e-05,
"loss": 0.3196,
"step": 40
},
{
"epoch": 0.022249355582688916,
"grad_norm": 3.849959135055542,
"learning_rate": 9.89704883800683e-05,
"loss": 0.2137,
"step": 41
},
{
"epoch": 0.022792022792022793,
"grad_norm": 1.7151565551757812,
"learning_rate": 9.877559839902184e-05,
"loss": 0.1716,
"step": 42
},
{
"epoch": 0.023334690001356666,
"grad_norm": 2.8859987258911133,
"learning_rate": 9.85640515958057e-05,
"loss": 0.434,
"step": 43
},
{
"epoch": 0.023877357210690543,
"grad_norm": 6.60436487197876,
"learning_rate": 9.833592021345937e-05,
"loss": 0.529,
"step": 44
},
{
"epoch": 0.02442002442002442,
"grad_norm": 3.7714180946350098,
"learning_rate": 9.809128215864097e-05,
"loss": 0.1852,
"step": 45
},
{
"epoch": 0.024962691629358297,
"grad_norm": 1.2581069469451904,
"learning_rate": 9.783022097502204e-05,
"loss": 0.0727,
"step": 46
},
{
"epoch": 0.025505358838692174,
"grad_norm": 0.7298163175582886,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0407,
"step": 47
},
{
"epoch": 0.026048026048026047,
"grad_norm": 0.47441840171813965,
"learning_rate": 9.725919140804099e-05,
"loss": 0.0686,
"step": 48
},
{
"epoch": 0.026590693257359924,
"grad_norm": 0.4721538722515106,
"learning_rate": 9.694941803075283e-05,
"loss": 0.0462,
"step": 49
},
{
"epoch": 0.0271333604666938,
"grad_norm": 3.0532617568969727,
"learning_rate": 9.662361147021779e-05,
"loss": 0.3636,
"step": 50
},
{
"epoch": 0.0271333604666938,
"eval_loss": 1.24636971950531,
"eval_runtime": 109.441,
"eval_samples_per_second": 28.362,
"eval_steps_per_second": 14.181,
"step": 50
},
{
"epoch": 0.027676027676027677,
"grad_norm": 3.458024501800537,
"learning_rate": 9.628188298907782e-05,
"loss": 1.8542,
"step": 51
},
{
"epoch": 0.02821869488536155,
"grad_norm": 3.7774367332458496,
"learning_rate": 9.592434928729616e-05,
"loss": 1.969,
"step": 52
},
{
"epoch": 0.028761362094695427,
"grad_norm": 3.341789484024048,
"learning_rate": 9.555113246230442e-05,
"loss": 1.9199,
"step": 53
},
{
"epoch": 0.029304029304029304,
"grad_norm": 2.504328489303589,
"learning_rate": 9.516235996730645e-05,
"loss": 1.7206,
"step": 54
},
{
"epoch": 0.02984669651336318,
"grad_norm": 2.0271387100219727,
"learning_rate": 9.475816456775313e-05,
"loss": 1.3525,
"step": 55
},
{
"epoch": 0.030389363722697058,
"grad_norm": 1.4228510856628418,
"learning_rate": 9.43386842960031e-05,
"loss": 1.2227,
"step": 56
},
{
"epoch": 0.03093203093203093,
"grad_norm": 1.3967491388320923,
"learning_rate": 9.39040624041849e-05,
"loss": 1.138,
"step": 57
},
{
"epoch": 0.03147469814136481,
"grad_norm": 0.9970569014549255,
"learning_rate": 9.345444731527642e-05,
"loss": 1.0437,
"step": 58
},
{
"epoch": 0.032017365350698684,
"grad_norm": 0.9406002163887024,
"learning_rate": 9.298999257241863e-05,
"loss": 0.9546,
"step": 59
},
{
"epoch": 0.03256003256003256,
"grad_norm": 1.0668526887893677,
"learning_rate": 9.251085678648072e-05,
"loss": 0.8642,
"step": 60
},
{
"epoch": 0.03310269976936644,
"grad_norm": 0.9564623832702637,
"learning_rate": 9.201720358189464e-05,
"loss": 0.4615,
"step": 61
},
{
"epoch": 0.03364536697870031,
"grad_norm": 0.7209700345993042,
"learning_rate": 9.150920154077754e-05,
"loss": 0.536,
"step": 62
},
{
"epoch": 0.03418803418803419,
"grad_norm": 0.9375424385070801,
"learning_rate": 9.098702414536107e-05,
"loss": 0.7291,
"step": 63
},
{
"epoch": 0.034730701397368065,
"grad_norm": 0.9395576119422913,
"learning_rate": 9.045084971874738e-05,
"loss": 0.4634,
"step": 64
},
{
"epoch": 0.03527336860670194,
"grad_norm": 0.6775506734848022,
"learning_rate": 8.9900861364012e-05,
"loss": 0.333,
"step": 65
},
{
"epoch": 0.03581603581603582,
"grad_norm": 0.7822219133377075,
"learning_rate": 8.933724690167417e-05,
"loss": 0.197,
"step": 66
},
{
"epoch": 0.03635870302536969,
"grad_norm": 0.6001350283622742,
"learning_rate": 8.876019880555649e-05,
"loss": 0.2904,
"step": 67
},
{
"epoch": 0.036901370234703565,
"grad_norm": 0.35400617122650146,
"learning_rate": 8.816991413705516e-05,
"loss": 0.1933,
"step": 68
},
{
"epoch": 0.037444037444037445,
"grad_norm": 1.2720855474472046,
"learning_rate": 8.756659447784368e-05,
"loss": 0.7092,
"step": 69
},
{
"epoch": 0.03798670465337132,
"grad_norm": 1.343387246131897,
"learning_rate": 8.695044586103296e-05,
"loss": 0.4298,
"step": 70
},
{
"epoch": 0.0385293718627052,
"grad_norm": 1.2000885009765625,
"learning_rate": 8.632167870081121e-05,
"loss": 0.2887,
"step": 71
},
{
"epoch": 0.03907203907203907,
"grad_norm": 0.7588382959365845,
"learning_rate": 8.568050772058762e-05,
"loss": 0.4491,
"step": 72
},
{
"epoch": 0.039614706281372945,
"grad_norm": 0.8610361218452454,
"learning_rate": 8.502715187966455e-05,
"loss": 0.247,
"step": 73
},
{
"epoch": 0.040157373490706826,
"grad_norm": 1.5872234106063843,
"learning_rate": 8.436183429846313e-05,
"loss": 0.2978,
"step": 74
},
{
"epoch": 0.0407000407000407,
"grad_norm": 1.328898310661316,
"learning_rate": 8.368478218232787e-05,
"loss": 0.4512,
"step": 75
},
{
"epoch": 0.04124270790937458,
"grad_norm": 1.1457730531692505,
"learning_rate": 8.299622674393614e-05,
"loss": 0.5961,
"step": 76
},
{
"epoch": 0.04178537511870845,
"grad_norm": 1.1928189992904663,
"learning_rate": 8.229640312433937e-05,
"loss": 0.2556,
"step": 77
},
{
"epoch": 0.042328042328042326,
"grad_norm": 0.9460683465003967,
"learning_rate": 8.158555031266254e-05,
"loss": 0.2041,
"step": 78
},
{
"epoch": 0.042870709537376206,
"grad_norm": 0.7704111337661743,
"learning_rate": 8.086391106448965e-05,
"loss": 0.2108,
"step": 79
},
{
"epoch": 0.04341337674671008,
"grad_norm": 0.714647114276886,
"learning_rate": 8.013173181896283e-05,
"loss": 0.2097,
"step": 80
},
{
"epoch": 0.04395604395604396,
"grad_norm": 1.409079670906067,
"learning_rate": 7.938926261462366e-05,
"loss": 0.2898,
"step": 81
},
{
"epoch": 0.04449871116537783,
"grad_norm": 1.393788456916809,
"learning_rate": 7.863675700402526e-05,
"loss": 0.3064,
"step": 82
},
{
"epoch": 0.045041378374711706,
"grad_norm": 1.120473861694336,
"learning_rate": 7.787447196714427e-05,
"loss": 0.218,
"step": 83
},
{
"epoch": 0.045584045584045586,
"grad_norm": 0.7864734530448914,
"learning_rate": 7.710266782362247e-05,
"loss": 0.1783,
"step": 84
},
{
"epoch": 0.04612671279337946,
"grad_norm": 1.4955497980117798,
"learning_rate": 7.63216081438678e-05,
"loss": 0.3466,
"step": 85
},
{
"epoch": 0.04666938000271333,
"grad_norm": 1.4786607027053833,
"learning_rate": 7.553155965904535e-05,
"loss": 0.3539,
"step": 86
},
{
"epoch": 0.04721204721204721,
"grad_norm": 1.4631497859954834,
"learning_rate": 7.473279216998895e-05,
"loss": 0.3824,
"step": 87
},
{
"epoch": 0.047754714421381086,
"grad_norm": 1.5295202732086182,
"learning_rate": 7.392557845506432e-05,
"loss": 0.3619,
"step": 88
},
{
"epoch": 0.04829738163071497,
"grad_norm": 0.977811336517334,
"learning_rate": 7.311019417701566e-05,
"loss": 0.3502,
"step": 89
},
{
"epoch": 0.04884004884004884,
"grad_norm": 1.178490400314331,
"learning_rate": 7.228691778882693e-05,
"loss": 0.3345,
"step": 90
},
{
"epoch": 0.04938271604938271,
"grad_norm": 0.8995553255081177,
"learning_rate": 7.145603043863045e-05,
"loss": 0.3611,
"step": 91
},
{
"epoch": 0.049925383258716594,
"grad_norm": 0.9408777356147766,
"learning_rate": 7.061781587369519e-05,
"loss": 0.1358,
"step": 92
},
{
"epoch": 0.05046805046805047,
"grad_norm": 0.9388291835784912,
"learning_rate": 6.977256034352712e-05,
"loss": 0.0947,
"step": 93
},
{
"epoch": 0.05101071767738435,
"grad_norm": 1.260951280593872,
"learning_rate": 6.892055250211552e-05,
"loss": 0.3262,
"step": 94
},
{
"epoch": 0.05155338488671822,
"grad_norm": 2.3727824687957764,
"learning_rate": 6.806208330935766e-05,
"loss": 0.2905,
"step": 95
},
{
"epoch": 0.052096052096052094,
"grad_norm": 1.8929994106292725,
"learning_rate": 6.719744593169641e-05,
"loss": 0.0723,
"step": 96
},
{
"epoch": 0.052638719305385974,
"grad_norm": 0.7458711266517639,
"learning_rate": 6.632693564200416e-05,
"loss": 0.0405,
"step": 97
},
{
"epoch": 0.05318138651471985,
"grad_norm": 0.3814379572868347,
"learning_rate": 6.545084971874738e-05,
"loss": 0.028,
"step": 98
},
{
"epoch": 0.05372405372405373,
"grad_norm": 0.41902849078178406,
"learning_rate": 6.456948734446624e-05,
"loss": 0.0304,
"step": 99
},
{
"epoch": 0.0542667209333876,
"grad_norm": 2.0934085845947266,
"learning_rate": 6.368314950360415e-05,
"loss": 0.365,
"step": 100
},
{
"epoch": 0.0542667209333876,
"eval_loss": 1.2894482612609863,
"eval_runtime": 109.2762,
"eval_samples_per_second": 28.405,
"eval_steps_per_second": 14.203,
"step": 100
},
{
"epoch": 0.054809388142721474,
"grad_norm": 4.360636234283447,
"learning_rate": 6.279213887972179e-05,
"loss": 2.1731,
"step": 101
},
{
"epoch": 0.055352055352055354,
"grad_norm": 4.3935866355896,
"learning_rate": 6.189675975213094e-05,
"loss": 2.3281,
"step": 102
},
{
"epoch": 0.05589472256138923,
"grad_norm": 3.9898064136505127,
"learning_rate": 6.099731789198344e-05,
"loss": 2.1225,
"step": 103
},
{
"epoch": 0.0564373897707231,
"grad_norm": 3.203814744949341,
"learning_rate": 6.009412045785051e-05,
"loss": 2.0483,
"step": 104
},
{
"epoch": 0.05698005698005698,
"grad_norm": 2.7947652339935303,
"learning_rate": 5.918747589082853e-05,
"loss": 1.773,
"step": 105
},
{
"epoch": 0.057522724189390854,
"grad_norm": 2.35221529006958,
"learning_rate": 5.82776938092065e-05,
"loss": 1.4951,
"step": 106
},
{
"epoch": 0.058065391398724735,
"grad_norm": 1.9279205799102783,
"learning_rate": 5.736508490273188e-05,
"loss": 1.4358,
"step": 107
},
{
"epoch": 0.05860805860805861,
"grad_norm": 1.6778484582901,
"learning_rate": 5.644996082651017e-05,
"loss": 1.1536,
"step": 108
},
{
"epoch": 0.05915072581739248,
"grad_norm": 1.3214651346206665,
"learning_rate": 5.553263409457504e-05,
"loss": 1.0911,
"step": 109
},
{
"epoch": 0.05969339302672636,
"grad_norm": 1.1080145835876465,
"learning_rate": 5.4613417973165106e-05,
"loss": 0.9185,
"step": 110
},
{
"epoch": 0.060236060236060235,
"grad_norm": 1.219183325767517,
"learning_rate": 5.3692626373743706e-05,
"loss": 0.4164,
"step": 111
},
{
"epoch": 0.060778727445394115,
"grad_norm": 0.7684482932090759,
"learning_rate": 5.27705737457985e-05,
"loss": 0.578,
"step": 112
},
{
"epoch": 0.06132139465472799,
"grad_norm": 0.7245362401008606,
"learning_rate": 5.184757496945726e-05,
"loss": 0.6867,
"step": 113
},
{
"epoch": 0.06186406186406186,
"grad_norm": 0.7537386417388916,
"learning_rate": 5.092394524795649e-05,
"loss": 0.3152,
"step": 114
},
{
"epoch": 0.06240672907339574,
"grad_norm": 0.7625346779823303,
"learning_rate": 5e-05,
"loss": 0.2146,
"step": 115
},
{
"epoch": 0.06294939628272962,
"grad_norm": 0.7598631978034973,
"learning_rate": 4.907605475204352e-05,
"loss": 0.1776,
"step": 116
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.6567097902297974,
"learning_rate": 4.8152425030542766e-05,
"loss": 0.2142,
"step": 117
},
{
"epoch": 0.06403473070139737,
"grad_norm": 0.6322920918464661,
"learning_rate": 4.72294262542015e-05,
"loss": 0.5704,
"step": 118
},
{
"epoch": 0.06457739791073125,
"grad_norm": 0.7434191703796387,
"learning_rate": 4.6307373626256306e-05,
"loss": 0.4056,
"step": 119
},
{
"epoch": 0.06512006512006512,
"grad_norm": 0.7763521671295166,
"learning_rate": 4.5386582026834906e-05,
"loss": 0.411,
"step": 120
},
{
"epoch": 0.065662732329399,
"grad_norm": 0.6602493524551392,
"learning_rate": 4.446736590542497e-05,
"loss": 0.3652,
"step": 121
},
{
"epoch": 0.06620539953873288,
"grad_norm": 0.6829279065132141,
"learning_rate": 4.3550039173489845e-05,
"loss": 0.1626,
"step": 122
},
{
"epoch": 0.06674806674806674,
"grad_norm": 0.7579967379570007,
"learning_rate": 4.2634915097268115e-05,
"loss": 0.1324,
"step": 123
},
{
"epoch": 0.06729073395740062,
"grad_norm": 0.6911221742630005,
"learning_rate": 4.1722306190793495e-05,
"loss": 0.3195,
"step": 124
},
{
"epoch": 0.0678334011667345,
"grad_norm": 0.6620950102806091,
"learning_rate": 4.0812524109171476e-05,
"loss": 0.4545,
"step": 125
},
{
"epoch": 0.06837606837606838,
"grad_norm": 0.7261799573898315,
"learning_rate": 3.99058795421495e-05,
"loss": 0.208,
"step": 126
},
{
"epoch": 0.06891873558540225,
"grad_norm": 0.6014336943626404,
"learning_rate": 3.9002682108016585e-05,
"loss": 0.2196,
"step": 127
},
{
"epoch": 0.06946140279473613,
"grad_norm": 0.7216453552246094,
"learning_rate": 3.8103240247869075e-05,
"loss": 0.146,
"step": 128
},
{
"epoch": 0.07000407000407001,
"grad_norm": 0.7673162817955017,
"learning_rate": 3.720786112027822e-05,
"loss": 0.245,
"step": 129
},
{
"epoch": 0.07054673721340388,
"grad_norm": 1.1044118404388428,
"learning_rate": 3.631685049639586e-05,
"loss": 0.3419,
"step": 130
},
{
"epoch": 0.07108940442273776,
"grad_norm": 1.402040719985962,
"learning_rate": 3.543051265553377e-05,
"loss": 0.2472,
"step": 131
},
{
"epoch": 0.07163207163207164,
"grad_norm": 1.219019889831543,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.1868,
"step": 132
},
{
"epoch": 0.0721747388414055,
"grad_norm": 0.914679229259491,
"learning_rate": 3.367306435799584e-05,
"loss": 0.1656,
"step": 133
},
{
"epoch": 0.07271740605073938,
"grad_norm": 0.9330466985702515,
"learning_rate": 3.2802554068303596e-05,
"loss": 0.2796,
"step": 134
},
{
"epoch": 0.07326007326007326,
"grad_norm": 1.7541531324386597,
"learning_rate": 3.1937916690642356e-05,
"loss": 0.263,
"step": 135
},
{
"epoch": 0.07380274046940713,
"grad_norm": 1.3093783855438232,
"learning_rate": 3.107944749788449e-05,
"loss": 0.2625,
"step": 136
},
{
"epoch": 0.07434540767874101,
"grad_norm": 1.9325745105743408,
"learning_rate": 3.0227439656472877e-05,
"loss": 0.3429,
"step": 137
},
{
"epoch": 0.07488807488807489,
"grad_norm": 1.6673192977905273,
"learning_rate": 2.9382184126304834e-05,
"loss": 0.2903,
"step": 138
},
{
"epoch": 0.07543074209740877,
"grad_norm": 0.6153419613838196,
"learning_rate": 2.8543969561369556e-05,
"loss": 0.3469,
"step": 139
},
{
"epoch": 0.07597340930674264,
"grad_norm": 0.7528604865074158,
"learning_rate": 2.771308221117309e-05,
"loss": 0.2047,
"step": 140
},
{
"epoch": 0.07651607651607652,
"grad_norm": 0.7970343232154846,
"learning_rate": 2.688980582298435e-05,
"loss": 0.1811,
"step": 141
},
{
"epoch": 0.0770587437254104,
"grad_norm": 1.1138691902160645,
"learning_rate": 2.607442154493568e-05,
"loss": 0.1813,
"step": 142
},
{
"epoch": 0.07760141093474426,
"grad_norm": 1.0628737211227417,
"learning_rate": 2.5267207830011068e-05,
"loss": 0.3945,
"step": 143
},
{
"epoch": 0.07814407814407814,
"grad_norm": 2.7330329418182373,
"learning_rate": 2.446844034095466e-05,
"loss": 0.3657,
"step": 144
},
{
"epoch": 0.07868674535341202,
"grad_norm": 2.918161153793335,
"learning_rate": 2.3678391856132204e-05,
"loss": 0.2343,
"step": 145
},
{
"epoch": 0.07922941256274589,
"grad_norm": 1.6331360340118408,
"learning_rate": 2.2897332176377528e-05,
"loss": 0.122,
"step": 146
},
{
"epoch": 0.07977207977207977,
"grad_norm": 1.2085009813308716,
"learning_rate": 2.2125528032855724e-05,
"loss": 0.091,
"step": 147
},
{
"epoch": 0.08031474698141365,
"grad_norm": 0.8547497391700745,
"learning_rate": 2.136324299597474e-05,
"loss": 0.0648,
"step": 148
},
{
"epoch": 0.08085741419074752,
"grad_norm": 0.6054078340530396,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0454,
"step": 149
},
{
"epoch": 0.0814000814000814,
"grad_norm": 2.0531108379364014,
"learning_rate": 1.9868268181037185e-05,
"loss": 0.3661,
"step": 150
},
{
"epoch": 0.0814000814000814,
"eval_loss": 0.7246484756469727,
"eval_runtime": 109.2416,
"eval_samples_per_second": 28.414,
"eval_steps_per_second": 14.207,
"step": 150
},
{
"epoch": 0.08194274860941528,
"grad_norm": 2.4585351943969727,
"learning_rate": 1.9136088935510362e-05,
"loss": 1.6484,
"step": 151
},
{
"epoch": 0.08248541581874916,
"grad_norm": 2.7452824115753174,
"learning_rate": 1.8414449687337464e-05,
"loss": 1.7274,
"step": 152
},
{
"epoch": 0.08302808302808302,
"grad_norm": 2.7809834480285645,
"learning_rate": 1.7703596875660645e-05,
"loss": 1.6597,
"step": 153
},
{
"epoch": 0.0835707502374169,
"grad_norm": 2.423109769821167,
"learning_rate": 1.700377325606388e-05,
"loss": 1.6053,
"step": 154
},
{
"epoch": 0.08411341744675079,
"grad_norm": 2.2255589962005615,
"learning_rate": 1.631521781767214e-05,
"loss": 1.4753,
"step": 155
},
{
"epoch": 0.08465608465608465,
"grad_norm": 1.9394181966781616,
"learning_rate": 1.5638165701536868e-05,
"loss": 1.2572,
"step": 156
},
{
"epoch": 0.08519875186541853,
"grad_norm": 1.7988132238388062,
"learning_rate": 1.4972848120335453e-05,
"loss": 1.3386,
"step": 157
},
{
"epoch": 0.08574141907475241,
"grad_norm": 1.8108716011047363,
"learning_rate": 1.4319492279412388e-05,
"loss": 1.042,
"step": 158
},
{
"epoch": 0.08628408628408628,
"grad_norm": 1.5393013954162598,
"learning_rate": 1.3678321299188801e-05,
"loss": 1.0092,
"step": 159
},
{
"epoch": 0.08682675349342016,
"grad_norm": 1.369483232498169,
"learning_rate": 1.3049554138967051e-05,
"loss": 0.9053,
"step": 160
},
{
"epoch": 0.08736942070275404,
"grad_norm": 1.5199528932571411,
"learning_rate": 1.2433405522156332e-05,
"loss": 0.4746,
"step": 161
},
{
"epoch": 0.08791208791208792,
"grad_norm": 1.314212679862976,
"learning_rate": 1.183008586294485e-05,
"loss": 0.4789,
"step": 162
},
{
"epoch": 0.08845475512142179,
"grad_norm": 1.0513840913772583,
"learning_rate": 1.1239801194443506e-05,
"loss": 0.7773,
"step": 163
},
{
"epoch": 0.08899742233075567,
"grad_norm": 0.9221037030220032,
"learning_rate": 1.066275309832584e-05,
"loss": 0.613,
"step": 164
},
{
"epoch": 0.08954008954008955,
"grad_norm": 0.8674269914627075,
"learning_rate": 1.0099138635988026e-05,
"loss": 0.3583,
"step": 165
},
{
"epoch": 0.09008275674942341,
"grad_norm": 0.7268597483634949,
"learning_rate": 9.549150281252633e-06,
"loss": 0.2715,
"step": 166
},
{
"epoch": 0.09062542395875729,
"grad_norm": 1.0291173458099365,
"learning_rate": 9.012975854638949e-06,
"loss": 0.1534,
"step": 167
},
{
"epoch": 0.09116809116809117,
"grad_norm": 0.6670011878013611,
"learning_rate": 8.490798459222476e-06,
"loss": 0.4197,
"step": 168
},
{
"epoch": 0.09171075837742504,
"grad_norm": 0.7457062602043152,
"learning_rate": 7.982796418105371e-06,
"loss": 0.4144,
"step": 169
},
{
"epoch": 0.09225342558675892,
"grad_norm": 0.6587404012680054,
"learning_rate": 7.489143213519301e-06,
"loss": 0.5097,
"step": 170
},
{
"epoch": 0.0927960927960928,
"grad_norm": 0.794647753238678,
"learning_rate": 7.010007427581378e-06,
"loss": 0.3255,
"step": 171
},
{
"epoch": 0.09333876000542667,
"grad_norm": 0.5329033732414246,
"learning_rate": 6.5455526847235825e-06,
"loss": 0.255,
"step": 172
},
{
"epoch": 0.09388142721476055,
"grad_norm": 0.5956745743751526,
"learning_rate": 6.0959375958151045e-06,
"loss": 0.1974,
"step": 173
},
{
"epoch": 0.09442409442409443,
"grad_norm": 0.5087482333183289,
"learning_rate": 5.6613157039969055e-06,
"loss": 0.318,
"step": 174
},
{
"epoch": 0.0949667616334283,
"grad_norm": 0.6372755169868469,
"learning_rate": 5.241835432246889e-06,
"loss": 0.5034,
"step": 175
},
{
"epoch": 0.09550942884276217,
"grad_norm": 0.6690561175346375,
"learning_rate": 4.837640032693558e-06,
"loss": 0.2968,
"step": 176
},
{
"epoch": 0.09605209605209605,
"grad_norm": 0.7963606119155884,
"learning_rate": 4.448867537695578e-06,
"loss": 0.2768,
"step": 177
},
{
"epoch": 0.09659476326142993,
"grad_norm": 0.7704768776893616,
"learning_rate": 4.075650712703849e-06,
"loss": 0.3783,
"step": 178
},
{
"epoch": 0.0971374304707638,
"grad_norm": 0.7436457276344299,
"learning_rate": 3.71811701092219e-06,
"loss": 0.3566,
"step": 179
},
{
"epoch": 0.09768009768009768,
"grad_norm": 1.0820454359054565,
"learning_rate": 3.376388529782215e-06,
"loss": 0.2758,
"step": 180
},
{
"epoch": 0.09822276488943156,
"grad_norm": 1.1378484964370728,
"learning_rate": 3.0505819692471792e-06,
"loss": 0.2739,
"step": 181
},
{
"epoch": 0.09876543209876543,
"grad_norm": 1.144866704940796,
"learning_rate": 2.7408085919590264e-06,
"loss": 0.3067,
"step": 182
},
{
"epoch": 0.0993080993080993,
"grad_norm": 1.2071819305419922,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.2637,
"step": 183
},
{
"epoch": 0.09985076651743319,
"grad_norm": 0.7118948698043823,
"learning_rate": 2.1697790249779636e-06,
"loss": 0.2811,
"step": 184
},
{
"epoch": 0.10039343372676705,
"grad_norm": 0.7794045805931091,
"learning_rate": 1.908717841359048e-06,
"loss": 0.2323,
"step": 185
},
{
"epoch": 0.10093610093610093,
"grad_norm": 0.7099078893661499,
"learning_rate": 1.6640797865406288e-06,
"loss": 0.2769,
"step": 186
},
{
"epoch": 0.10147876814543481,
"grad_norm": 0.579207181930542,
"learning_rate": 1.4359484041943038e-06,
"loss": 0.2418,
"step": 187
},
{
"epoch": 0.1020214353547687,
"grad_norm": 0.9049748778343201,
"learning_rate": 1.2244016009781701e-06,
"loss": 0.2772,
"step": 188
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.7932896614074707,
"learning_rate": 1.0295116199317057e-06,
"loss": 0.3465,
"step": 189
},
{
"epoch": 0.10310676977343644,
"grad_norm": 1.252280592918396,
"learning_rate": 8.513450158049108e-07,
"loss": 0.267,
"step": 190
},
{
"epoch": 0.10364943698277032,
"grad_norm": 0.9156208038330078,
"learning_rate": 6.899626323298713e-07,
"loss": 0.2506,
"step": 191
},
{
"epoch": 0.10419210419210419,
"grad_norm": 1.0016757249832153,
"learning_rate": 5.454195814427021e-07,
"loss": 0.1856,
"step": 192
},
{
"epoch": 0.10473477140143807,
"grad_norm": 1.0268343687057495,
"learning_rate": 4.177652244628627e-07,
"loss": 0.2485,
"step": 193
},
{
"epoch": 0.10527743861077195,
"grad_norm": 1.2182773351669312,
"learning_rate": 3.0704315523631953e-07,
"loss": 0.3693,
"step": 194
},
{
"epoch": 0.10582010582010581,
"grad_norm": 1.4859917163848877,
"learning_rate": 2.1329118524827662e-07,
"loss": 0.34,
"step": 195
},
{
"epoch": 0.1063627730294397,
"grad_norm": 1.299428939819336,
"learning_rate": 1.3654133071059893e-07,
"loss": 0.1563,
"step": 196
},
{
"epoch": 0.10690544023877357,
"grad_norm": 1.2561206817626953,
"learning_rate": 7.681980162830282e-08,
"loss": 0.1746,
"step": 197
},
{
"epoch": 0.10744810744810745,
"grad_norm": 1.2994426488876343,
"learning_rate": 3.4146992848854695e-08,
"loss": 0.1565,
"step": 198
},
{
"epoch": 0.10799077465744132,
"grad_norm": 1.261816382408142,
"learning_rate": 8.537477097364522e-09,
"loss": 0.1866,
"step": 199
},
{
"epoch": 0.1085334418667752,
"grad_norm": 1.7563239336013794,
"learning_rate": 0.0,
"loss": 0.4114,
"step": 200
},
{
"epoch": 0.1085334418667752,
"eval_loss": 0.46117889881134033,
"eval_runtime": 109.2594,
"eval_samples_per_second": 28.409,
"eval_steps_per_second": 14.205,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.449698231975936e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}