{
"best_metric": 11.046669960021973,
"best_model_checkpoint": "miner_id_24/checkpoint-70",
"epoch": 0.003163770310275474,
"eval_steps": 5,
"global_step": 70,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.519671871822106e-05,
"grad_norm": 0.741265594959259,
"learning_rate": 2e-05,
"loss": 44.3865,
"step": 1
},
{
"epoch": 4.519671871822106e-05,
"eval_loss": 11.093368530273438,
"eval_runtime": 175.5908,
"eval_samples_per_second": 53.061,
"eval_steps_per_second": 26.533,
"step": 1
},
{
"epoch": 9.039343743644212e-05,
"grad_norm": 0.6581929326057434,
"learning_rate": 4e-05,
"loss": 44.3813,
"step": 2
},
{
"epoch": 0.00013559015615466317,
"grad_norm": 0.6729432344436646,
"learning_rate": 6e-05,
"loss": 44.3393,
"step": 3
},
{
"epoch": 0.00018078687487288423,
"grad_norm": 0.6872175335884094,
"learning_rate": 8e-05,
"loss": 44.3795,
"step": 4
},
{
"epoch": 0.00022598359359110527,
"grad_norm": 0.704067051410675,
"learning_rate": 0.0001,
"loss": 44.389,
"step": 5
},
{
"epoch": 0.00022598359359110527,
"eval_loss": 11.093063354492188,
"eval_runtime": 176.3334,
"eval_samples_per_second": 52.837,
"eval_steps_per_second": 26.422,
"step": 5
},
{
"epoch": 0.00027118031230932634,
"grad_norm": 0.6682418584823608,
"learning_rate": 0.00012,
"loss": 44.3529,
"step": 6
},
{
"epoch": 0.0003163770310275474,
"grad_norm": 0.6353705525398254,
"learning_rate": 0.00014,
"loss": 44.4355,
"step": 7
},
{
"epoch": 0.00036157374974576847,
"grad_norm": 0.6866922974586487,
"learning_rate": 0.00016,
"loss": 44.4036,
"step": 8
},
{
"epoch": 0.00040677046846398953,
"grad_norm": 0.7315343618392944,
"learning_rate": 0.00018,
"loss": 44.3756,
"step": 9
},
{
"epoch": 0.00045196718718221055,
"grad_norm": 0.6867555379867554,
"learning_rate": 0.0002,
"loss": 44.3678,
"step": 10
},
{
"epoch": 0.00045196718718221055,
"eval_loss": 11.091917037963867,
"eval_runtime": 176.1396,
"eval_samples_per_second": 52.896,
"eval_steps_per_second": 26.451,
"step": 10
},
{
"epoch": 0.0004971639059004316,
"grad_norm": 0.7067858576774597,
"learning_rate": 0.0001999979446958366,
"loss": 44.3933,
"step": 11
},
{
"epoch": 0.0005423606246186527,
"grad_norm": 0.7694055438041687,
"learning_rate": 0.00019999177886783194,
"loss": 44.3476,
"step": 12
},
{
"epoch": 0.0005875573433368737,
"grad_norm": 0.6980550289154053,
"learning_rate": 0.00019998150276943902,
"loss": 44.3621,
"step": 13
},
{
"epoch": 0.0006327540620550948,
"grad_norm": 0.7399426698684692,
"learning_rate": 0.000199967116823068,
"loss": 44.3727,
"step": 14
},
{
"epoch": 0.0006779507807733159,
"grad_norm": 0.6623771786689758,
"learning_rate": 0.0001999486216200688,
"loss": 44.3563,
"step": 15
},
{
"epoch": 0.0006779507807733159,
"eval_loss": 11.089905738830566,
"eval_runtime": 176.0326,
"eval_samples_per_second": 52.928,
"eval_steps_per_second": 26.467,
"step": 15
},
{
"epoch": 0.0007231474994915369,
"grad_norm": 0.6647756695747375,
"learning_rate": 0.00019992601792070679,
"loss": 44.3454,
"step": 16
},
{
"epoch": 0.000768344218209758,
"grad_norm": 0.7416101694107056,
"learning_rate": 0.00019989930665413147,
"loss": 44.3249,
"step": 17
},
{
"epoch": 0.0008135409369279791,
"grad_norm": 0.6351829171180725,
"learning_rate": 0.00019986848891833845,
"loss": 44.37,
"step": 18
},
{
"epoch": 0.0008587376556462001,
"grad_norm": 0.6839431524276733,
"learning_rate": 0.0001998335659801241,
"loss": 44.3472,
"step": 19
},
{
"epoch": 0.0009039343743644211,
"grad_norm": 0.6762228608131409,
"learning_rate": 0.00019979453927503364,
"loss": 44.3507,
"step": 20
},
{
"epoch": 0.0009039343743644211,
"eval_loss": 11.087591171264648,
"eval_runtime": 176.1534,
"eval_samples_per_second": 52.891,
"eval_steps_per_second": 26.449,
"step": 20
},
{
"epoch": 0.0009491310930826422,
"grad_norm": 0.7993413209915161,
"learning_rate": 0.00019975141040730207,
"loss": 44.288,
"step": 21
},
{
"epoch": 0.0009943278118008632,
"grad_norm": 0.6926490664482117,
"learning_rate": 0.0001997041811497882,
"loss": 44.3672,
"step": 22
},
{
"epoch": 0.0010395245305190844,
"grad_norm": 0.7373084425926208,
"learning_rate": 0.00019965285344390184,
"loss": 44.3927,
"step": 23
},
{
"epoch": 0.0010847212492373054,
"grad_norm": 0.6655643582344055,
"learning_rate": 0.00019959742939952392,
"loss": 44.3481,
"step": 24
},
{
"epoch": 0.0011299179679555265,
"grad_norm": 0.7115928530693054,
"learning_rate": 0.00019953791129491983,
"loss": 44.3368,
"step": 25
},
{
"epoch": 0.0011299179679555265,
"eval_loss": 11.085227012634277,
"eval_runtime": 175.877,
"eval_samples_per_second": 52.975,
"eval_steps_per_second": 26.49,
"step": 25
},
{
"epoch": 0.0011751146866737475,
"grad_norm": 0.7096830010414124,
"learning_rate": 0.00019947430157664576,
"loss": 44.3735,
"step": 26
},
{
"epoch": 0.0012203114053919684,
"grad_norm": 0.6747312545776367,
"learning_rate": 0.00019940660285944803,
"loss": 44.3323,
"step": 27
},
{
"epoch": 0.0012655081241101896,
"grad_norm": 0.7371957302093506,
"learning_rate": 0.00019933481792615583,
"loss": 44.2951,
"step": 28
},
{
"epoch": 0.0013107048428284106,
"grad_norm": 0.7316697239875793,
"learning_rate": 0.0001992589497275665,
"loss": 44.3097,
"step": 29
},
{
"epoch": 0.0013559015615466317,
"grad_norm": 0.6886783838272095,
"learning_rate": 0.0001991790013823246,
"loss": 44.3137,
"step": 30
},
{
"epoch": 0.0013559015615466317,
"eval_loss": 11.082609176635742,
"eval_runtime": 176.2695,
"eval_samples_per_second": 52.857,
"eval_steps_per_second": 26.431,
"step": 30
},
{
"epoch": 0.0014010982802648527,
"grad_norm": 0.7027749419212341,
"learning_rate": 0.00019909497617679348,
"loss": 44.3391,
"step": 31
},
{
"epoch": 0.0014462949989830739,
"grad_norm": 0.735598087310791,
"learning_rate": 0.0001990068775649202,
"loss": 44.3645,
"step": 32
},
{
"epoch": 0.0014914917177012948,
"grad_norm": 0.7152600288391113,
"learning_rate": 0.00019891470916809362,
"loss": 44.3478,
"step": 33
},
{
"epoch": 0.001536688436419516,
"grad_norm": 0.6983291506767273,
"learning_rate": 0.00019881847477499557,
"loss": 44.3252,
"step": 34
},
{
"epoch": 0.001581885155137737,
"grad_norm": 0.6892045140266418,
"learning_rate": 0.00019871817834144504,
"loss": 44.2998,
"step": 35
},
{
"epoch": 0.001581885155137737,
"eval_loss": 11.079712867736816,
"eval_runtime": 176.1378,
"eval_samples_per_second": 52.896,
"eval_steps_per_second": 26.451,
"step": 35
},
{
"epoch": 0.0016270818738559581,
"grad_norm": 0.7166262865066528,
"learning_rate": 0.0001986138239902355,
"loss": 44.3485,
"step": 36
},
{
"epoch": 0.001672278592574179,
"grad_norm": 0.7545002102851868,
"learning_rate": 0.0001985054160109657,
"loss": 44.2613,
"step": 37
},
{
"epoch": 0.0017174753112924003,
"grad_norm": 0.7944263219833374,
"learning_rate": 0.00019839295885986296,
"loss": 44.2665,
"step": 38
},
{
"epoch": 0.0017626720300106212,
"grad_norm": 0.7216903567314148,
"learning_rate": 0.0001982764571596004,
"loss": 44.3546,
"step": 39
},
{
"epoch": 0.0018078687487288422,
"grad_norm": 0.7492774128913879,
"learning_rate": 0.00019815591569910654,
"loss": 44.3223,
"step": 40
},
{
"epoch": 0.0018078687487288422,
"eval_loss": 11.076553344726562,
"eval_runtime": 176.1866,
"eval_samples_per_second": 52.881,
"eval_steps_per_second": 26.444,
"step": 40
},
{
"epoch": 0.0018530654674470634,
"grad_norm": 0.8118460774421692,
"learning_rate": 0.00019803133943336874,
"loss": 44.3122,
"step": 41
},
{
"epoch": 0.0018982621861652843,
"grad_norm": 0.7527559399604797,
"learning_rate": 0.0001979027334832293,
"loss": 44.3061,
"step": 42
},
{
"epoch": 0.0019434589048835055,
"grad_norm": 0.7425262331962585,
"learning_rate": 0.00019777010313517518,
"loss": 44.2408,
"step": 43
},
{
"epoch": 0.0019886556236017264,
"grad_norm": 0.753101646900177,
"learning_rate": 0.00019763345384112043,
"loss": 44.3362,
"step": 44
},
{
"epoch": 0.0020338523423199476,
"grad_norm": 0.767737090587616,
"learning_rate": 0.00019749279121818235,
"loss": 44.2864,
"step": 45
},
{
"epoch": 0.0020338523423199476,
"eval_loss": 11.072389602661133,
"eval_runtime": 175.9667,
"eval_samples_per_second": 52.948,
"eval_steps_per_second": 26.477,
"step": 45
},
{
"epoch": 0.002079049061038169,
"grad_norm": 0.7275786995887756,
"learning_rate": 0.00019734812104845047,
"loss": 44.3542,
"step": 46
},
{
"epoch": 0.0021242457797563895,
"grad_norm": 0.6908650994300842,
"learning_rate": 0.00019719944927874881,
"loss": 44.3377,
"step": 47
},
{
"epoch": 0.0021694424984746107,
"grad_norm": 0.7260599136352539,
"learning_rate": 0.0001970467820203915,
"loss": 44.2621,
"step": 48
},
{
"epoch": 0.002214639217192832,
"grad_norm": 0.7138715982437134,
"learning_rate": 0.00019689012554893154,
"loss": 44.2338,
"step": 49
},
{
"epoch": 0.002259835935911053,
"grad_norm": 0.7867954969406128,
"learning_rate": 0.00019672948630390294,
"loss": 44.3044,
"step": 50
},
{
"epoch": 0.002259835935911053,
"eval_loss": 11.067892074584961,
"eval_runtime": 176.5244,
"eval_samples_per_second": 52.78,
"eval_steps_per_second": 26.393,
"step": 50
},
{
"epoch": 0.002305032654629274,
"grad_norm": 0.7787512540817261,
"learning_rate": 0.00019656487088855592,
"loss": 44.2918,
"step": 51
},
{
"epoch": 0.002350229373347495,
"grad_norm": 0.7184544801712036,
"learning_rate": 0.00019639628606958533,
"loss": 44.2751,
"step": 52
},
{
"epoch": 0.002395426092065716,
"grad_norm": 0.7348573803901672,
"learning_rate": 0.0001962237387768529,
"loss": 44.246,
"step": 53
},
{
"epoch": 0.002440622810783937,
"grad_norm": 0.7713965773582458,
"learning_rate": 0.00019604723610310194,
"loss": 44.3292,
"step": 54
},
{
"epoch": 0.002485819529502158,
"grad_norm": 0.8040369749069214,
"learning_rate": 0.00019586678530366606,
"loss": 44.2155,
"step": 55
},
{
"epoch": 0.002485819529502158,
"eval_loss": 11.062650680541992,
"eval_runtime": 176.278,
"eval_samples_per_second": 52.854,
"eval_steps_per_second": 26.43,
"step": 55
},
{
"epoch": 0.0025310162482203792,
"grad_norm": 0.7459877133369446,
"learning_rate": 0.00019568239379617088,
"loss": 44.2188,
"step": 56
},
{
"epoch": 0.0025762129669386004,
"grad_norm": 0.8008533716201782,
"learning_rate": 0.00019549406916022905,
"loss": 44.226,
"step": 57
},
{
"epoch": 0.002621409685656821,
"grad_norm": 0.7918010354042053,
"learning_rate": 0.00019530181913712872,
"loss": 44.287,
"step": 58
},
{
"epoch": 0.0026666064043750423,
"grad_norm": 0.7287217974662781,
"learning_rate": 0.00019510565162951537,
"loss": 44.2581,
"step": 59
},
{
"epoch": 0.0027118031230932635,
"grad_norm": 0.7925474643707275,
"learning_rate": 0.00019490557470106686,
"loss": 44.2277,
"step": 60
},
{
"epoch": 0.0027118031230932635,
"eval_loss": 11.05736255645752,
"eval_runtime": 176.2465,
"eval_samples_per_second": 52.863,
"eval_steps_per_second": 26.435,
"step": 60
},
{
"epoch": 0.0027569998418114847,
"grad_norm": 0.8553807735443115,
"learning_rate": 0.00019470159657616215,
"loss": 44.2439,
"step": 61
},
{
"epoch": 0.0028021965605297054,
"grad_norm": 0.7586395740509033,
"learning_rate": 0.00019449372563954293,
"loss": 44.1943,
"step": 62
},
{
"epoch": 0.0028473932792479266,
"grad_norm": 0.7628232836723328,
"learning_rate": 0.0001942819704359693,
"loss": 44.2594,
"step": 63
},
{
"epoch": 0.0028925899979661478,
"grad_norm": 0.718551754951477,
"learning_rate": 0.00019406633966986828,
"loss": 44.2302,
"step": 64
},
{
"epoch": 0.002937786716684369,
"grad_norm": 0.7625423073768616,
"learning_rate": 0.00019384684220497605,
"loss": 44.1989,
"step": 65
},
{
"epoch": 0.002937786716684369,
"eval_loss": 11.051901817321777,
"eval_runtime": 176.1759,
"eval_samples_per_second": 52.885,
"eval_steps_per_second": 26.445,
"step": 65
},
{
"epoch": 0.0029829834354025897,
"grad_norm": 0.7891851663589478,
"learning_rate": 0.00019362348706397373,
"loss": 44.2199,
"step": 66
},
{
"epoch": 0.003028180154120811,
"grad_norm": 0.6770808100700378,
"learning_rate": 0.00019339628342811632,
"loss": 44.1689,
"step": 67
},
{
"epoch": 0.003073376872839032,
"grad_norm": 0.7498692870140076,
"learning_rate": 0.0001931652406368554,
"loss": 44.1741,
"step": 68
},
{
"epoch": 0.0031185735915572528,
"grad_norm": 0.7661782503128052,
"learning_rate": 0.0001929303681874552,
"loss": 44.2123,
"step": 69
},
{
"epoch": 0.003163770310275474,
"grad_norm": 0.6438837647438049,
"learning_rate": 0.0001926916757346022,
"loss": 44.1718,
"step": 70
},
{
"epoch": 0.003163770310275474,
"eval_loss": 11.046669960021973,
"eval_runtime": 176.3634,
"eval_samples_per_second": 52.828,
"eval_steps_per_second": 26.417,
"step": 70
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 735471206400.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
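
For reference, a minimal sketch (not part of the checkpoint itself) of how the state above can be inspected: it assumes the JSON is saved locally as trainer_state.json (an illustrative path) and simply splits log_history into training-loss and eval-loss records using only the standard library.

# Minimal sketch: inspect a Transformers trainer_state.json checkpoint.
# The file path "trainer_state.json" is an assumption for illustration.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes two kinds of entries: training records (with "loss")
# and evaluation records (with "eval_loss"); separate them here.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss {state['best_metric']:.4f} "
      f"at {state['best_model_checkpoint']}")
for entry in eval_log:
    print(f"step {entry['step']:>3}: eval_loss={entry['eval_loss']:.4f}")

# Early-stopping configuration recorded by the EarlyStoppingCallback.
es = state["stateful_callbacks"]["EarlyStoppingCallback"]
print("early stopping:", es["args"], es["attributes"])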