{
"best_metric": 11.032732963562012,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.004519671871822106,
"eval_steps": 5,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.519671871822106e-05,
"grad_norm": 0.741265594959259,
"learning_rate": 2e-05,
"loss": 44.3865,
"step": 1
},
{
"epoch": 4.519671871822106e-05,
"eval_loss": 11.093368530273438,
"eval_runtime": 175.5908,
"eval_samples_per_second": 53.061,
"eval_steps_per_second": 26.533,
"step": 1
},
{
"epoch": 9.039343743644212e-05,
"grad_norm": 0.6581929326057434,
"learning_rate": 4e-05,
"loss": 44.3813,
"step": 2
},
{
"epoch": 0.00013559015615466317,
"grad_norm": 0.6729432344436646,
"learning_rate": 6e-05,
"loss": 44.3393,
"step": 3
},
{
"epoch": 0.00018078687487288423,
"grad_norm": 0.6872175335884094,
"learning_rate": 8e-05,
"loss": 44.3795,
"step": 4
},
{
"epoch": 0.00022598359359110527,
"grad_norm": 0.704067051410675,
"learning_rate": 0.0001,
"loss": 44.389,
"step": 5
},
{
"epoch": 0.00022598359359110527,
"eval_loss": 11.093063354492188,
"eval_runtime": 176.3334,
"eval_samples_per_second": 52.837,
"eval_steps_per_second": 26.422,
"step": 5
},
{
"epoch": 0.00027118031230932634,
"grad_norm": 0.6682418584823608,
"learning_rate": 0.00012,
"loss": 44.3529,
"step": 6
},
{
"epoch": 0.0003163770310275474,
"grad_norm": 0.6353705525398254,
"learning_rate": 0.00014,
"loss": 44.4355,
"step": 7
},
{
"epoch": 0.00036157374974576847,
"grad_norm": 0.6866922974586487,
"learning_rate": 0.00016,
"loss": 44.4036,
"step": 8
},
{
"epoch": 0.00040677046846398953,
"grad_norm": 0.7315343618392944,
"learning_rate": 0.00018,
"loss": 44.3756,
"step": 9
},
{
"epoch": 0.00045196718718221055,
"grad_norm": 0.6867555379867554,
"learning_rate": 0.0002,
"loss": 44.3678,
"step": 10
},
{
"epoch": 0.00045196718718221055,
"eval_loss": 11.091917037963867,
"eval_runtime": 176.1396,
"eval_samples_per_second": 52.896,
"eval_steps_per_second": 26.451,
"step": 10
},
{
"epoch": 0.0004971639059004316,
"grad_norm": 0.7067858576774597,
"learning_rate": 0.0001999979446958366,
"loss": 44.3933,
"step": 11
},
{
"epoch": 0.0005423606246186527,
"grad_norm": 0.7694055438041687,
"learning_rate": 0.00019999177886783194,
"loss": 44.3476,
"step": 12
},
{
"epoch": 0.0005875573433368737,
"grad_norm": 0.6980550289154053,
"learning_rate": 0.00019998150276943902,
"loss": 44.3621,
"step": 13
},
{
"epoch": 0.0006327540620550948,
"grad_norm": 0.7399426698684692,
"learning_rate": 0.000199967116823068,
"loss": 44.3727,
"step": 14
},
{
"epoch": 0.0006779507807733159,
"grad_norm": 0.6623771786689758,
"learning_rate": 0.0001999486216200688,
"loss": 44.3563,
"step": 15
},
{
"epoch": 0.0006779507807733159,
"eval_loss": 11.089905738830566,
"eval_runtime": 176.0326,
"eval_samples_per_second": 52.928,
"eval_steps_per_second": 26.467,
"step": 15
},
{
"epoch": 0.0007231474994915369,
"grad_norm": 0.6647756695747375,
"learning_rate": 0.00019992601792070679,
"loss": 44.3454,
"step": 16
},
{
"epoch": 0.000768344218209758,
"grad_norm": 0.7416101694107056,
"learning_rate": 0.00019989930665413147,
"loss": 44.3249,
"step": 17
},
{
"epoch": 0.0008135409369279791,
"grad_norm": 0.6351829171180725,
"learning_rate": 0.00019986848891833845,
"loss": 44.37,
"step": 18
},
{
"epoch": 0.0008587376556462001,
"grad_norm": 0.6839431524276733,
"learning_rate": 0.0001998335659801241,
"loss": 44.3472,
"step": 19
},
{
"epoch": 0.0009039343743644211,
"grad_norm": 0.6762228608131409,
"learning_rate": 0.00019979453927503364,
"loss": 44.3507,
"step": 20
},
{
"epoch": 0.0009039343743644211,
"eval_loss": 11.087591171264648,
"eval_runtime": 176.1534,
"eval_samples_per_second": 52.891,
"eval_steps_per_second": 26.449,
"step": 20
},
{
"epoch": 0.0009491310930826422,
"grad_norm": 0.7993413209915161,
"learning_rate": 0.00019975141040730207,
"loss": 44.288,
"step": 21
},
{
"epoch": 0.0009943278118008632,
"grad_norm": 0.6926490664482117,
"learning_rate": 0.0001997041811497882,
"loss": 44.3672,
"step": 22
},
{
"epoch": 0.0010395245305190844,
"grad_norm": 0.7373084425926208,
"learning_rate": 0.00019965285344390184,
"loss": 44.3927,
"step": 23
},
{
"epoch": 0.0010847212492373054,
"grad_norm": 0.6655643582344055,
"learning_rate": 0.00019959742939952392,
"loss": 44.3481,
"step": 24
},
{
"epoch": 0.0011299179679555265,
"grad_norm": 0.7115928530693054,
"learning_rate": 0.00019953791129491983,
"loss": 44.3368,
"step": 25
},
{
"epoch": 0.0011299179679555265,
"eval_loss": 11.085227012634277,
"eval_runtime": 175.877,
"eval_samples_per_second": 52.975,
"eval_steps_per_second": 26.49,
"step": 25
},
{
"epoch": 0.0011751146866737475,
"grad_norm": 0.7096830010414124,
"learning_rate": 0.00019947430157664576,
"loss": 44.3735,
"step": 26
},
{
"epoch": 0.0012203114053919684,
"grad_norm": 0.6747312545776367,
"learning_rate": 0.00019940660285944803,
"loss": 44.3323,
"step": 27
},
{
"epoch": 0.0012655081241101896,
"grad_norm": 0.7371957302093506,
"learning_rate": 0.00019933481792615583,
"loss": 44.2951,
"step": 28
},
{
"epoch": 0.0013107048428284106,
"grad_norm": 0.7316697239875793,
"learning_rate": 0.0001992589497275665,
"loss": 44.3097,
"step": 29
},
{
"epoch": 0.0013559015615466317,
"grad_norm": 0.6886783838272095,
"learning_rate": 0.0001991790013823246,
"loss": 44.3137,
"step": 30
},
{
"epoch": 0.0013559015615466317,
"eval_loss": 11.082609176635742,
"eval_runtime": 176.2695,
"eval_samples_per_second": 52.857,
"eval_steps_per_second": 26.431,
"step": 30
},
{
"epoch": 0.0014010982802648527,
"grad_norm": 0.7027749419212341,
"learning_rate": 0.00019909497617679348,
"loss": 44.3391,
"step": 31
},
{
"epoch": 0.0014462949989830739,
"grad_norm": 0.735598087310791,
"learning_rate": 0.0001990068775649202,
"loss": 44.3645,
"step": 32
},
{
"epoch": 0.0014914917177012948,
"grad_norm": 0.7152600288391113,
"learning_rate": 0.00019891470916809362,
"loss": 44.3478,
"step": 33
},
{
"epoch": 0.001536688436419516,
"grad_norm": 0.6983291506767273,
"learning_rate": 0.00019881847477499557,
"loss": 44.3252,
"step": 34
},
{
"epoch": 0.001581885155137737,
"grad_norm": 0.6892045140266418,
"learning_rate": 0.00019871817834144504,
"loss": 44.2998,
"step": 35
},
{
"epoch": 0.001581885155137737,
"eval_loss": 11.079712867736816,
"eval_runtime": 176.1378,
"eval_samples_per_second": 52.896,
"eval_steps_per_second": 26.451,
"step": 35
},
{
"epoch": 0.0016270818738559581,
"grad_norm": 0.7166262865066528,
"learning_rate": 0.0001986138239902355,
"loss": 44.3485,
"step": 36
},
{
"epoch": 0.001672278592574179,
"grad_norm": 0.7545002102851868,
"learning_rate": 0.0001985054160109657,
"loss": 44.2613,
"step": 37
},
{
"epoch": 0.0017174753112924003,
"grad_norm": 0.7944263219833374,
"learning_rate": 0.00019839295885986296,
"loss": 44.2665,
"step": 38
},
{
"epoch": 0.0017626720300106212,
"grad_norm": 0.7216903567314148,
"learning_rate": 0.0001982764571596004,
"loss": 44.3546,
"step": 39
},
{
"epoch": 0.0018078687487288422,
"grad_norm": 0.7492774128913879,
"learning_rate": 0.00019815591569910654,
"loss": 44.3223,
"step": 40
},
{
"epoch": 0.0018078687487288422,
"eval_loss": 11.076553344726562,
"eval_runtime": 176.1866,
"eval_samples_per_second": 52.881,
"eval_steps_per_second": 26.444,
"step": 40
},
{
"epoch": 0.0018530654674470634,
"grad_norm": 0.8118460774421692,
"learning_rate": 0.00019803133943336874,
"loss": 44.3122,
"step": 41
},
{
"epoch": 0.0018982621861652843,
"grad_norm": 0.7527559399604797,
"learning_rate": 0.0001979027334832293,
"loss": 44.3061,
"step": 42
},
{
"epoch": 0.0019434589048835055,
"grad_norm": 0.7425262331962585,
"learning_rate": 0.00019777010313517518,
"loss": 44.2408,
"step": 43
},
{
"epoch": 0.0019886556236017264,
"grad_norm": 0.753101646900177,
"learning_rate": 0.00019763345384112043,
"loss": 44.3362,
"step": 44
},
{
"epoch": 0.0020338523423199476,
"grad_norm": 0.767737090587616,
"learning_rate": 0.00019749279121818235,
"loss": 44.2864,
"step": 45
},
{
"epoch": 0.0020338523423199476,
"eval_loss": 11.072389602661133,
"eval_runtime": 175.9667,
"eval_samples_per_second": 52.948,
"eval_steps_per_second": 26.477,
"step": 45
},
{
"epoch": 0.002079049061038169,
"grad_norm": 0.7275786995887756,
"learning_rate": 0.00019734812104845047,
"loss": 44.3542,
"step": 46
},
{
"epoch": 0.0021242457797563895,
"grad_norm": 0.6908650994300842,
"learning_rate": 0.00019719944927874881,
"loss": 44.3377,
"step": 47
},
{
"epoch": 0.0021694424984746107,
"grad_norm": 0.7260599136352539,
"learning_rate": 0.0001970467820203915,
"loss": 44.2621,
"step": 48
},
{
"epoch": 0.002214639217192832,
"grad_norm": 0.7138715982437134,
"learning_rate": 0.00019689012554893154,
"loss": 44.2338,
"step": 49
},
{
"epoch": 0.002259835935911053,
"grad_norm": 0.7867954969406128,
"learning_rate": 0.00019672948630390294,
"loss": 44.3044,
"step": 50
},
{
"epoch": 0.002259835935911053,
"eval_loss": 11.067892074584961,
"eval_runtime": 176.5244,
"eval_samples_per_second": 52.78,
"eval_steps_per_second": 26.393,
"step": 50
},
{
"epoch": 0.002305032654629274,
"grad_norm": 0.7787512540817261,
"learning_rate": 0.00019656487088855592,
"loss": 44.2918,
"step": 51
},
{
"epoch": 0.002350229373347495,
"grad_norm": 0.7184544801712036,
"learning_rate": 0.00019639628606958533,
"loss": 44.2751,
"step": 52
},
{
"epoch": 0.002395426092065716,
"grad_norm": 0.7348573803901672,
"learning_rate": 0.0001962237387768529,
"loss": 44.246,
"step": 53
},
{
"epoch": 0.002440622810783937,
"grad_norm": 0.7713965773582458,
"learning_rate": 0.00019604723610310194,
"loss": 44.3292,
"step": 54
},
{
"epoch": 0.002485819529502158,
"grad_norm": 0.8040369749069214,
"learning_rate": 0.00019586678530366606,
"loss": 44.2155,
"step": 55
},
{
"epoch": 0.002485819529502158,
"eval_loss": 11.062650680541992,
"eval_runtime": 176.278,
"eval_samples_per_second": 52.854,
"eval_steps_per_second": 26.43,
"step": 55
},
{
"epoch": 0.0025310162482203792,
"grad_norm": 0.7459877133369446,
"learning_rate": 0.00019568239379617088,
"loss": 44.2188,
"step": 56
},
{
"epoch": 0.0025762129669386004,
"grad_norm": 0.8008533716201782,
"learning_rate": 0.00019549406916022905,
"loss": 44.226,
"step": 57
},
{
"epoch": 0.002621409685656821,
"grad_norm": 0.7918010354042053,
"learning_rate": 0.00019530181913712872,
"loss": 44.287,
"step": 58
},
{
"epoch": 0.0026666064043750423,
"grad_norm": 0.7287217974662781,
"learning_rate": 0.00019510565162951537,
"loss": 44.2581,
"step": 59
},
{
"epoch": 0.0027118031230932635,
"grad_norm": 0.7925474643707275,
"learning_rate": 0.00019490557470106686,
"loss": 44.2277,
"step": 60
},
{
"epoch": 0.0027118031230932635,
"eval_loss": 11.05736255645752,
"eval_runtime": 176.2465,
"eval_samples_per_second": 52.863,
"eval_steps_per_second": 26.435,
"step": 60
},
{
"epoch": 0.0027569998418114847,
"grad_norm": 0.8553807735443115,
"learning_rate": 0.00019470159657616215,
"loss": 44.2439,
"step": 61
},
{
"epoch": 0.0028021965605297054,
"grad_norm": 0.7586395740509033,
"learning_rate": 0.00019449372563954293,
"loss": 44.1943,
"step": 62
},
{
"epoch": 0.0028473932792479266,
"grad_norm": 0.7628232836723328,
"learning_rate": 0.0001942819704359693,
"loss": 44.2594,
"step": 63
},
{
"epoch": 0.0028925899979661478,
"grad_norm": 0.718551754951477,
"learning_rate": 0.00019406633966986828,
"loss": 44.2302,
"step": 64
},
{
"epoch": 0.002937786716684369,
"grad_norm": 0.7625423073768616,
"learning_rate": 0.00019384684220497605,
"loss": 44.1989,
"step": 65
},
{
"epoch": 0.002937786716684369,
"eval_loss": 11.051901817321777,
"eval_runtime": 176.1759,
"eval_samples_per_second": 52.885,
"eval_steps_per_second": 26.445,
"step": 65
},
{
"epoch": 0.0029829834354025897,
"grad_norm": 0.7891851663589478,
"learning_rate": 0.00019362348706397373,
"loss": 44.2199,
"step": 66
},
{
"epoch": 0.003028180154120811,
"grad_norm": 0.6770808100700378,
"learning_rate": 0.00019339628342811632,
"loss": 44.1689,
"step": 67
},
{
"epoch": 0.003073376872839032,
"grad_norm": 0.7498692870140076,
"learning_rate": 0.0001931652406368554,
"loss": 44.1741,
"step": 68
},
{
"epoch": 0.0031185735915572528,
"grad_norm": 0.7661782503128052,
"learning_rate": 0.0001929303681874552,
"loss": 44.2123,
"step": 69
},
{
"epoch": 0.003163770310275474,
"grad_norm": 0.6438837647438049,
"learning_rate": 0.0001926916757346022,
"loss": 44.1718,
"step": 70
},
{
"epoch": 0.003163770310275474,
"eval_loss": 11.046669960021973,
"eval_runtime": 176.3634,
"eval_samples_per_second": 52.828,
"eval_steps_per_second": 26.417,
"step": 70
},
{
"epoch": 0.003208967028993695,
"grad_norm": 0.7522275447845459,
"learning_rate": 0.00019244917309000817,
"loss": 44.2246,
"step": 71
},
{
"epoch": 0.0032541637477119163,
"grad_norm": 0.7135974168777466,
"learning_rate": 0.00019220287022200707,
"loss": 44.2111,
"step": 72
},
{
"epoch": 0.003299360466430137,
"grad_norm": 0.7275662422180176,
"learning_rate": 0.0001919527772551451,
"loss": 44.1464,
"step": 73
},
{
"epoch": 0.003344557185148358,
"grad_norm": 0.6742229461669922,
"learning_rate": 0.00019169890446976454,
"loss": 44.2105,
"step": 74
},
{
"epoch": 0.0033897539038665794,
"grad_norm": 0.6085646152496338,
"learning_rate": 0.00019144126230158127,
"loss": 44.0926,
"step": 75
},
{
"epoch": 0.0033897539038665794,
"eval_loss": 11.042237281799316,
"eval_runtime": 176.114,
"eval_samples_per_second": 52.903,
"eval_steps_per_second": 26.454,
"step": 75
},
{
"epoch": 0.0034349506225848005,
"grad_norm": 0.7245734333992004,
"learning_rate": 0.0001911798613412557,
"loss": 44.2154,
"step": 76
},
{
"epoch": 0.0034801473413030213,
"grad_norm": 0.7311281561851501,
"learning_rate": 0.0001909147123339575,
"loss": 44.1687,
"step": 77
},
{
"epoch": 0.0035253440600212425,
"grad_norm": 0.6399495601654053,
"learning_rate": 0.0001906458261789238,
"loss": 44.1596,
"step": 78
},
{
"epoch": 0.0035705407787394636,
"grad_norm": 0.5650178790092468,
"learning_rate": 0.00019037321392901136,
"loss": 44.1466,
"step": 79
},
{
"epoch": 0.0036157374974576844,
"grad_norm": 0.6039579510688782,
"learning_rate": 0.0001900968867902419,
"loss": 44.1955,
"step": 80
},
{
"epoch": 0.0036157374974576844,
"eval_loss": 11.038910865783691,
"eval_runtime": 176.3853,
"eval_samples_per_second": 52.822,
"eval_steps_per_second": 26.414,
"step": 80
},
{
"epoch": 0.0036609342161759055,
"grad_norm": 0.7481367588043213,
"learning_rate": 0.0001898168561213419,
"loss": 44.2182,
"step": 81
},
{
"epoch": 0.0037061309348941267,
"grad_norm": 0.628414511680603,
"learning_rate": 0.0001895331334332753,
"loss": 44.1519,
"step": 82
},
{
"epoch": 0.003751327653612348,
"grad_norm": 0.658549964427948,
"learning_rate": 0.0001892457303887706,
"loss": 44.1364,
"step": 83
},
{
"epoch": 0.0037965243723305686,
"grad_norm": 0.5245007276535034,
"learning_rate": 0.0001889546588018412,
"loss": 44.1079,
"step": 84
},
{
"epoch": 0.00384172109104879,
"grad_norm": 0.5555324554443359,
"learning_rate": 0.00018865993063730004,
"loss": 44.1445,
"step": 85
},
{
"epoch": 0.00384172109104879,
"eval_loss": 11.036417007446289,
"eval_runtime": 176.131,
"eval_samples_per_second": 52.898,
"eval_steps_per_second": 26.452,
"step": 85
},
{
"epoch": 0.003886917809767011,
"grad_norm": 0.43622660636901855,
"learning_rate": 0.00018836155801026753,
"loss": 44.1515,
"step": 86
},
{
"epoch": 0.003932114528485232,
"grad_norm": 0.578544020652771,
"learning_rate": 0.0001880595531856738,
"loss": 44.0766,
"step": 87
},
{
"epoch": 0.003977311247203453,
"grad_norm": 0.598685085773468,
"learning_rate": 0.00018775392857775432,
"loss": 44.1756,
"step": 88
},
{
"epoch": 0.004022507965921674,
"grad_norm": 0.5733134150505066,
"learning_rate": 0.00018744469674953956,
"loss": 44.1756,
"step": 89
},
{
"epoch": 0.004067704684639895,
"grad_norm": 0.5177151560783386,
"learning_rate": 0.00018713187041233896,
"loss": 44.173,
"step": 90
},
{
"epoch": 0.004067704684639895,
"eval_loss": 11.034589767456055,
"eval_runtime": 176.3402,
"eval_samples_per_second": 52.835,
"eval_steps_per_second": 26.421,
"step": 90
},
{
"epoch": 0.004112901403358116,
"grad_norm": 0.5208268761634827,
"learning_rate": 0.00018681546242521786,
"loss": 44.1346,
"step": 91
},
{
"epoch": 0.004158098122076338,
"grad_norm": 0.6029201149940491,
"learning_rate": 0.00018649548579446936,
"loss": 44.152,
"step": 92
},
{
"epoch": 0.004203294840794558,
"grad_norm": 0.468414843082428,
"learning_rate": 0.0001861719536730795,
"loss": 44.117,
"step": 93
},
{
"epoch": 0.004248491559512779,
"grad_norm": 0.3942670226097107,
"learning_rate": 0.00018584487936018661,
"loss": 44.137,
"step": 94
},
{
"epoch": 0.004293688278231,
"grad_norm": 0.49822431802749634,
"learning_rate": 0.00018551427630053463,
"loss": 44.119,
"step": 95
},
{
"epoch": 0.004293688278231,
"eval_loss": 11.03354549407959,
"eval_runtime": 176.0642,
"eval_samples_per_second": 52.918,
"eval_steps_per_second": 26.462,
"step": 95
},
{
"epoch": 0.004338884996949221,
"grad_norm": 0.5527846813201904,
"learning_rate": 0.00018518015808392045,
"loss": 44.0893,
"step": 96
},
{
"epoch": 0.004384081715667443,
"grad_norm": 0.5725367665290833,
"learning_rate": 0.00018484253844463526,
"loss": 44.1162,
"step": 97
},
{
"epoch": 0.004429278434385664,
"grad_norm": 0.49278348684310913,
"learning_rate": 0.00018450143126090015,
"loss": 44.1031,
"step": 98
},
{
"epoch": 0.004474475153103885,
"grad_norm": 0.4361265301704407,
"learning_rate": 0.00018415685055429533,
"loss": 44.1386,
"step": 99
},
{
"epoch": 0.004519671871822106,
"grad_norm": 0.397714763879776,
"learning_rate": 0.00018380881048918405,
"loss": 44.1072,
"step": 100
},
{
"epoch": 0.004519671871822106,
"eval_loss": 11.032732963562012,
"eval_runtime": 176.1844,
"eval_samples_per_second": 52.882,
"eval_steps_per_second": 26.444,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1050673152000.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}