Training in progress, step 119, checkpoint
7c46c45 (verified)
{
"best_metric": 0.32027119398117065,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 2.9937106918238996,
"eval_steps": 50,
"global_step": 119,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025157232704402517,
"grad_norm": 10.445202827453613,
"learning_rate": 1e-05,
"loss": 1.8113,
"step": 1
},
{
"epoch": 0.025157232704402517,
"eval_loss": 2.903780698776245,
"eval_runtime": 1.3578,
"eval_samples_per_second": 49.344,
"eval_steps_per_second": 12.52,
"step": 1
},
{
"epoch": 0.050314465408805034,
"grad_norm": 18.712600708007812,
"learning_rate": 2e-05,
"loss": 2.412,
"step": 2
},
{
"epoch": 0.07547169811320754,
"grad_norm": 13.591891288757324,
"learning_rate": 3e-05,
"loss": 2.6184,
"step": 3
},
{
"epoch": 0.10062893081761007,
"grad_norm": 6.334252834320068,
"learning_rate": 4e-05,
"loss": 2.2968,
"step": 4
},
{
"epoch": 0.12578616352201258,
"grad_norm": 8.629500389099121,
"learning_rate": 5e-05,
"loss": 2.6715,
"step": 5
},
{
"epoch": 0.1509433962264151,
"grad_norm": 6.428257465362549,
"learning_rate": 6e-05,
"loss": 2.3585,
"step": 6
},
{
"epoch": 0.1761006289308176,
"grad_norm": 6.464021682739258,
"learning_rate": 7e-05,
"loss": 2.2552,
"step": 7
},
{
"epoch": 0.20125786163522014,
"grad_norm": 14.248180389404297,
"learning_rate": 8e-05,
"loss": 3.8528,
"step": 8
},
{
"epoch": 0.22641509433962265,
"grad_norm": 15.20981216430664,
"learning_rate": 9e-05,
"loss": 3.4106,
"step": 9
},
{
"epoch": 0.25157232704402516,
"grad_norm": 5.780796051025391,
"learning_rate": 0.0001,
"loss": 1.1669,
"step": 10
},
{
"epoch": 0.27672955974842767,
"grad_norm": 5.746275424957275,
"learning_rate": 9.997923381619256e-05,
"loss": 1.1579,
"step": 11
},
{
"epoch": 0.3018867924528302,
"grad_norm": 2.8907012939453125,
"learning_rate": 9.991695251414583e-05,
"loss": 1.0849,
"step": 12
},
{
"epoch": 0.3270440251572327,
"grad_norm": 1.9942095279693604,
"learning_rate": 9.981320782765846e-05,
"loss": 1.0375,
"step": 13
},
{
"epoch": 0.3522012578616352,
"grad_norm": 2.2817931175231934,
"learning_rate": 9.966808593197959e-05,
"loss": 0.9995,
"step": 14
},
{
"epoch": 0.37735849056603776,
"grad_norm": 2.029221773147583,
"learning_rate": 9.948170737222762e-05,
"loss": 0.8702,
"step": 15
},
{
"epoch": 0.4025157232704403,
"grad_norm": 2.2605092525482178,
"learning_rate": 9.925422696325975e-05,
"loss": 0.8997,
"step": 16
},
{
"epoch": 0.4276729559748428,
"grad_norm": 6.524753093719482,
"learning_rate": 9.898583366107538e-05,
"loss": 1.0832,
"step": 17
},
{
"epoch": 0.4528301886792453,
"grad_norm": 4.861238479614258,
"learning_rate": 9.867675040586034e-05,
"loss": 1.2063,
"step": 18
},
{
"epoch": 0.4779874213836478,
"grad_norm": 1.9360491037368774,
"learning_rate": 9.83272339368022e-05,
"loss": 0.8503,
"step": 19
},
{
"epoch": 0.5031446540880503,
"grad_norm": 1.9203150272369385,
"learning_rate": 9.793757457883062e-05,
"loss": 0.8055,
"step": 20
},
{
"epoch": 0.5283018867924528,
"grad_norm": 1.335469126701355,
"learning_rate": 9.750809600145954e-05,
"loss": 0.8518,
"step": 21
},
{
"epoch": 0.5534591194968553,
"grad_norm": 1.137628197669983,
"learning_rate": 9.703915494993215e-05,
"loss": 0.7453,
"step": 22
},
{
"epoch": 0.5786163522012578,
"grad_norm": 1.0536383390426636,
"learning_rate": 9.653114094889127e-05,
"loss": 0.7192,
"step": 23
},
{
"epoch": 0.6037735849056604,
"grad_norm": 1.5118601322174072,
"learning_rate": 9.598447597882181e-05,
"loss": 0.7093,
"step": 24
},
{
"epoch": 0.6289308176100629,
"grad_norm": 1.6779704093933105,
"learning_rate": 9.539961412553375e-05,
"loss": 0.7177,
"step": 25
},
{
"epoch": 0.6540880503144654,
"grad_norm": 2.0891356468200684,
"learning_rate": 9.477704120297697e-05,
"loss": 0.8265,
"step": 26
},
{
"epoch": 0.6792452830188679,
"grad_norm": 3.018578052520752,
"learning_rate": 9.411727434970121e-05,
"loss": 1.1182,
"step": 27
},
{
"epoch": 0.7044025157232704,
"grad_norm": 1.017041563987732,
"learning_rate": 9.34208615992963e-05,
"loss": 0.6662,
"step": 28
},
{
"epoch": 0.7295597484276729,
"grad_norm": 1.0053677558898926,
"learning_rate": 9.268838142516943e-05,
"loss": 0.5747,
"step": 29
},
{
"epoch": 0.7547169811320755,
"grad_norm": 1.353121280670166,
"learning_rate": 9.192044226003789e-05,
"loss": 0.6665,
"step": 30
},
{
"epoch": 0.779874213836478,
"grad_norm": 1.0210703611373901,
"learning_rate": 9.111768199053588e-05,
"loss": 0.5584,
"step": 31
},
{
"epoch": 0.8050314465408805,
"grad_norm": 0.9525967836380005,
"learning_rate": 9.028076742735583e-05,
"loss": 0.5162,
"step": 32
},
{
"epoch": 0.8301886792452831,
"grad_norm": 1.384174108505249,
"learning_rate": 8.941039375136371e-05,
"loss": 0.6165,
"step": 33
},
{
"epoch": 0.8553459119496856,
"grad_norm": 1.3128784894943237,
"learning_rate": 8.850728393614902e-05,
"loss": 0.6381,
"step": 34
},
{
"epoch": 0.8805031446540881,
"grad_norm": 2.184060573577881,
"learning_rate": 8.75721881474886e-05,
"loss": 0.6739,
"step": 35
},
{
"epoch": 0.9056603773584906,
"grad_norm": 3.2247838973999023,
"learning_rate": 8.660588312022344e-05,
"loss": 1.0203,
"step": 36
},
{
"epoch": 0.9308176100628931,
"grad_norm": 1.1204508543014526,
"learning_rate": 8.560917151306593e-05,
"loss": 0.6185,
"step": 37
},
{
"epoch": 0.9559748427672956,
"grad_norm": 1.065934658050537,
"learning_rate": 8.458288124187359e-05,
"loss": 0.4873,
"step": 38
},
{
"epoch": 0.9811320754716981,
"grad_norm": 1.583658218383789,
"learning_rate": 8.352786479194288e-05,
"loss": 0.4867,
"step": 39
},
{
"epoch": 1.0062893081761006,
"grad_norm": 2.8070600032806396,
"learning_rate": 8.244499850989452e-05,
"loss": 0.8313,
"step": 40
},
{
"epoch": 1.0314465408805031,
"grad_norm": 0.8271421194076538,
"learning_rate": 8.133518187573862e-05,
"loss": 0.4702,
"step": 41
},
{
"epoch": 1.0566037735849056,
"grad_norm": 0.8070173263549805,
"learning_rate": 8.019933675572389e-05,
"loss": 0.4361,
"step": 42
},
{
"epoch": 1.0817610062893082,
"grad_norm": 2.6176536083221436,
"learning_rate": 7.903840663659186e-05,
"loss": 0.4341,
"step": 43
},
{
"epoch": 1.1069182389937107,
"grad_norm": 0.9174895286560059,
"learning_rate": 7.785335584187219e-05,
"loss": 0.3583,
"step": 44
},
{
"epoch": 1.1320754716981132,
"grad_norm": 0.8380503058433533,
"learning_rate": 7.664516873086987e-05,
"loss": 0.3898,
"step": 45
},
{
"epoch": 1.1572327044025157,
"grad_norm": 0.9413546919822693,
"learning_rate": 7.541484888100974e-05,
"loss": 0.4148,
"step": 46
},
{
"epoch": 1.1823899371069182,
"grad_norm": 1.0667158365249634,
"learning_rate": 7.416341825421754e-05,
"loss": 0.4358,
"step": 47
},
{
"epoch": 1.2075471698113207,
"grad_norm": 1.6355628967285156,
"learning_rate": 7.289191634803003e-05,
"loss": 0.535,
"step": 48
},
{
"epoch": 1.2327044025157232,
"grad_norm": 2.1079909801483154,
"learning_rate": 7.160139933213898e-05,
"loss": 0.5971,
"step": 49
},
{
"epoch": 1.2578616352201257,
"grad_norm": 0.9688711166381836,
"learning_rate": 7.029293917108678e-05,
"loss": 0.3642,
"step": 50
},
{
"epoch": 1.2578616352201257,
"eval_loss": 0.40250465273857117,
"eval_runtime": 1.357,
"eval_samples_per_second": 49.373,
"eval_steps_per_second": 12.527,
"step": 50
},
{
"epoch": 1.2830188679245282,
"grad_norm": 0.8682835102081299,
"learning_rate": 6.896762273384178e-05,
"loss": 0.3127,
"step": 51
},
{
"epoch": 1.3081761006289307,
"grad_norm": 0.8948720693588257,
"learning_rate": 6.762655089099353e-05,
"loss": 0.3749,
"step": 52
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.8554229736328125,
"learning_rate": 6.627083760031754e-05,
"loss": 0.3637,
"step": 53
},
{
"epoch": 1.3584905660377358,
"grad_norm": 0.9757639765739441,
"learning_rate": 6.490160898146918e-05,
"loss": 0.2847,
"step": 54
},
{
"epoch": 1.3836477987421385,
"grad_norm": 1.135493516921997,
"learning_rate": 6.35200023805754e-05,
"loss": 0.3686,
"step": 55
},
{
"epoch": 1.408805031446541,
"grad_norm": 1.8949108123779297,
"learning_rate": 6.212716542550112e-05,
"loss": 0.3382,
"step": 56
},
{
"epoch": 1.4339622641509435,
"grad_norm": 1.7161883115768433,
"learning_rate": 6.0724255072575275e-05,
"loss": 0.4808,
"step": 57
},
{
"epoch": 1.459119496855346,
"grad_norm": 1.8359076976776123,
"learning_rate": 5.931243664556803e-05,
"loss": 0.5259,
"step": 58
},
{
"epoch": 1.4842767295597485,
"grad_norm": 0.7648423314094543,
"learning_rate": 5.78928828677177e-05,
"loss": 0.3778,
"step": 59
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.8067821860313416,
"learning_rate": 5.646677288761132e-05,
"loss": 0.3448,
"step": 60
},
{
"epoch": 1.5345911949685536,
"grad_norm": 0.7813786864280701,
"learning_rate": 5.503529129972792e-05,
"loss": 0.339,
"step": 61
},
{
"epoch": 1.559748427672956,
"grad_norm": 0.7792041897773743,
"learning_rate": 5.359962716045835e-05,
"loss": 0.2644,
"step": 62
},
{
"epoch": 1.5849056603773586,
"grad_norm": 0.9082090258598328,
"learning_rate": 5.21609730004187e-05,
"loss": 0.2565,
"step": 63
},
{
"epoch": 1.610062893081761,
"grad_norm": 0.9733455777168274,
"learning_rate": 5.072052383387786e-05,
"loss": 0.3213,
"step": 64
},
{
"epoch": 1.6352201257861636,
"grad_norm": 0.960435152053833,
"learning_rate": 4.927947616612215e-05,
"loss": 0.3019,
"step": 65
},
{
"epoch": 1.6603773584905661,
"grad_norm": 1.6576495170593262,
"learning_rate": 4.7839026999581296e-05,
"loss": 0.4513,
"step": 66
},
{
"epoch": 1.6855345911949686,
"grad_norm": 1.8107895851135254,
"learning_rate": 4.640037283954165e-05,
"loss": 0.4877,
"step": 67
},
{
"epoch": 1.7106918238993711,
"grad_norm": 0.639157772064209,
"learning_rate": 4.496470870027209e-05,
"loss": 0.2969,
"step": 68
},
{
"epoch": 1.7358490566037736,
"grad_norm": 0.8809495568275452,
"learning_rate": 4.3533227112388694e-05,
"loss": 0.3093,
"step": 69
},
{
"epoch": 1.7610062893081762,
"grad_norm": 0.8104914426803589,
"learning_rate": 4.21071171322823e-05,
"loss": 0.3098,
"step": 70
},
{
"epoch": 1.7861635220125787,
"grad_norm": 0.8440978527069092,
"learning_rate": 4.0687563354431984e-05,
"loss": 0.2953,
"step": 71
},
{
"epoch": 1.8113207547169812,
"grad_norm": 0.8337366580963135,
"learning_rate": 3.927574492742473e-05,
"loss": 0.2859,
"step": 72
},
{
"epoch": 1.8364779874213837,
"grad_norm": 0.8279858231544495,
"learning_rate": 3.78728345744989e-05,
"loss": 0.2481,
"step": 73
},
{
"epoch": 1.8616352201257862,
"grad_norm": 0.9654554724693298,
"learning_rate": 3.6479997619424605e-05,
"loss": 0.2819,
"step": 74
},
{
"epoch": 1.8867924528301887,
"grad_norm": 1.6905914545059204,
"learning_rate": 3.5098391018530816e-05,
"loss": 0.5995,
"step": 75
},
{
"epoch": 1.9119496855345912,
"grad_norm": 1.8605477809906006,
"learning_rate": 3.3729162399682456e-05,
"loss": 0.4759,
"step": 76
},
{
"epoch": 1.9371069182389937,
"grad_norm": 0.7602652907371521,
"learning_rate": 3.237344910900648e-05,
"loss": 0.3107,
"step": 77
},
{
"epoch": 1.9622641509433962,
"grad_norm": 0.8071399331092834,
"learning_rate": 3.103237726615822e-05,
"loss": 0.2663,
"step": 78
},
{
"epoch": 1.9874213836477987,
"grad_norm": 1.6272022724151611,
"learning_rate": 2.9707060828913225e-05,
"loss": 0.409,
"step": 79
},
{
"epoch": 2.0125786163522013,
"grad_norm": 1.3061223030090332,
"learning_rate": 2.839860066786103e-05,
"loss": 0.4409,
"step": 80
},
{
"epoch": 2.0377358490566038,
"grad_norm": 0.5977674126625061,
"learning_rate": 2.710808365197e-05,
"loss": 0.2442,
"step": 81
},
{
"epoch": 2.0628930817610063,
"grad_norm": 0.6834774017333984,
"learning_rate": 2.5836581745782475e-05,
"loss": 0.2757,
"step": 82
},
{
"epoch": 2.088050314465409,
"grad_norm": 0.7066735625267029,
"learning_rate": 2.4585151118990286e-05,
"loss": 0.2006,
"step": 83
},
{
"epoch": 2.1132075471698113,
"grad_norm": 0.6501676440238953,
"learning_rate": 2.3354831269130133e-05,
"loss": 0.2125,
"step": 84
},
{
"epoch": 2.138364779874214,
"grad_norm": 0.6182856559753418,
"learning_rate": 2.2146644158127827e-05,
"loss": 0.1671,
"step": 85
},
{
"epoch": 2.1635220125786163,
"grad_norm": 0.8084297180175781,
"learning_rate": 2.0961593363408156e-05,
"loss": 0.2286,
"step": 86
},
{
"epoch": 2.188679245283019,
"grad_norm": 0.7915740609169006,
"learning_rate": 1.980066324427613e-05,
"loss": 0.186,
"step": 87
},
{
"epoch": 2.2138364779874213,
"grad_norm": 1.525448203086853,
"learning_rate": 1.8664818124261374e-05,
"loss": 0.3382,
"step": 88
},
{
"epoch": 2.238993710691824,
"grad_norm": 1.2538678646087646,
"learning_rate": 1.7555001490105488e-05,
"loss": 0.3341,
"step": 89
},
{
"epoch": 2.2641509433962264,
"grad_norm": 0.5898963212966919,
"learning_rate": 1.6472135208057126e-05,
"loss": 0.1937,
"step": 90
},
{
"epoch": 2.289308176100629,
"grad_norm": 0.7369837760925293,
"learning_rate": 1.541711875812641e-05,
"loss": 0.2273,
"step": 91
},
{
"epoch": 2.3144654088050314,
"grad_norm": 0.7151015996932983,
"learning_rate": 1.439082848693406e-05,
"loss": 0.2349,
"step": 92
},
{
"epoch": 2.339622641509434,
"grad_norm": 0.6623278260231018,
"learning_rate": 1.339411687977657e-05,
"loss": 0.1702,
"step": 93
},
{
"epoch": 2.3647798742138364,
"grad_norm": 0.731364369392395,
"learning_rate": 1.2427811852511395e-05,
"loss": 0.2054,
"step": 94
},
{
"epoch": 2.389937106918239,
"grad_norm": 1.0562483072280884,
"learning_rate": 1.1492716063850973e-05,
"loss": 0.2254,
"step": 95
},
{
"epoch": 2.4150943396226414,
"grad_norm": 1.070011854171753,
"learning_rate": 1.0589606248636292e-05,
"loss": 0.2412,
"step": 96
},
{
"epoch": 2.440251572327044,
"grad_norm": 1.4945615530014038,
"learning_rate": 9.719232572644187e-06,
"loss": 0.297,
"step": 97
},
{
"epoch": 2.4654088050314464,
"grad_norm": 1.1410236358642578,
"learning_rate": 8.882318009464125e-06,
"loss": 0.3131,
"step": 98
},
{
"epoch": 2.490566037735849,
"grad_norm": 0.6251071095466614,
"learning_rate": 8.079557739962128e-06,
"loss": 0.2219,
"step": 99
},
{
"epoch": 2.5157232704402515,
"grad_norm": 0.7983537912368774,
"learning_rate": 7.31161857483057e-06,
"loss": 0.2371,
"step": 100
},
{
"epoch": 2.5157232704402515,
"eval_loss": 0.32027119398117065,
"eval_runtime": 1.3603,
"eval_samples_per_second": 49.255,
"eval_steps_per_second": 12.498,
"step": 100
},
{
"epoch": 2.540880503144654,
"grad_norm": 0.7304105758666992,
"learning_rate": 6.579138400703716e-06,
"loss": 0.2125,
"step": 101
},
{
"epoch": 2.5660377358490565,
"grad_norm": 0.6842655539512634,
"learning_rate": 5.882725650298787e-06,
"loss": 0.176,
"step": 102
},
{
"epoch": 2.591194968553459,
"grad_norm": 0.8605983853340149,
"learning_rate": 5.222958797023036e-06,
"loss": 0.2162,
"step": 103
},
{
"epoch": 2.6163522012578615,
"grad_norm": 0.7604213356971741,
"learning_rate": 4.600385874466256e-06,
"loss": 0.1974,
"step": 104
},
{
"epoch": 2.641509433962264,
"grad_norm": 1.07295560836792,
"learning_rate": 4.015524021178196e-06,
"loss": 0.2702,
"step": 105
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.6208587884902954,
"learning_rate": 3.4688590511087304e-06,
"loss": 0.3159,
"step": 106
},
{
"epoch": 2.691823899371069,
"grad_norm": 1.2537561655044556,
"learning_rate": 2.9608450500678565e-06,
"loss": 0.3131,
"step": 107
},
{
"epoch": 2.7169811320754715,
"grad_norm": 0.7774904370307922,
"learning_rate": 2.4919039985404626e-06,
"loss": 0.2692,
"step": 108
},
{
"epoch": 2.742138364779874,
"grad_norm": 0.7653168439865112,
"learning_rate": 2.0624254211693894e-06,
"loss": 0.2267,
"step": 109
},
{
"epoch": 2.767295597484277,
"grad_norm": 0.7065545320510864,
"learning_rate": 1.6727660631977893e-06,
"loss": 0.2014,
"step": 110
},
{
"epoch": 2.7924528301886795,
"grad_norm": 0.7866085767745972,
"learning_rate": 1.3232495941396639e-06,
"loss": 0.1873,
"step": 111
},
{
"epoch": 2.817610062893082,
"grad_norm": 0.7286617159843445,
"learning_rate": 1.014166338924627e-06,
"loss": 0.165,
"step": 112
},
{
"epoch": 2.8427672955974845,
"grad_norm": 0.9083012938499451,
"learning_rate": 7.457730367402549e-07,
"loss": 0.23,
"step": 113
},
{
"epoch": 2.867924528301887,
"grad_norm": 0.7802666425704956,
"learning_rate": 5.18292627772382e-07,
"loss": 0.1645,
"step": 114
},
{
"epoch": 2.8930817610062896,
"grad_norm": 1.510367512702942,
"learning_rate": 3.3191406802041693e-07,
"loss": 0.3319,
"step": 115
},
{
"epoch": 2.918238993710692,
"grad_norm": 1.282609462738037,
"learning_rate": 1.8679217234154334e-07,
"loss": 0.3489,
"step": 116
},
{
"epoch": 2.9433962264150946,
"grad_norm": 0.7282307744026184,
"learning_rate": 8.304748585417078e-08,
"loss": 0.234,
"step": 117
},
{
"epoch": 2.968553459119497,
"grad_norm": 3.293064832687378,
"learning_rate": 2.076618380744133e-08,
"loss": 0.1896,
"step": 118
},
{
"epoch": 2.9937106918238996,
"grad_norm": 1.658855676651001,
"learning_rate": 0.0,
"loss": 0.3107,
"step": 119
}
],
"logging_steps": 1,
"max_steps": 119,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.531779857403085e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
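
Below is a minimal sketch in plain Python of how the state shown above can be consumed: it recovers the best checkpoint, separates training from evaluation entries in log_history, and checks that the logged learning rates are consistent with a 10-step linear warmup to 1e-4 followed by cosine decay to 0. The local filename and the warmup/peak/total values in lr_at are assumptions inferred from the logged values, not fields read from this file.

import json
import math

# Load the trainer state shown above (the local path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# The best checkpoint and its eval loss are recorded at the top level.
print(state["best_model_checkpoint"], state["best_metric"])
# -> miner_id_24/checkpoint-100 0.32027119398117065

# log_history mixes training entries (with "loss") and eval entries (with "eval_loss").
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]
print(len(train_log), "training steps,", len(eval_log), "evaluations")
for e in eval_log:
    print(f"step {e['step']:>3}: eval_loss = {e['eval_loss']:.4f}")

# The learning_rate column appears consistent with 10 linear warmup steps to a
# 1e-4 peak, then cosine decay to 0 over the remaining 109 steps (an inference
# from the logged values; the scheduler config itself is not part of this file).
def lr_at(step, peak=1e-4, warmup=10, total=119):
    if step <= warmup:
        return peak * step / warmup
    return peak * 0.5 * (1 + math.cos(math.pi * (step - warmup) / (total - warmup)))

print(lr_at(11))   # ~9.99792e-05, matching the value logged at step 11
print(lr_at(119))  # 0.0, matching the final step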