Training in progress, step 70, checkpoint (commit fe87256, verified)
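The JSON below is the trainer_state.json that the Hugging Face Transformers Trainer writes into each checkpoint directory; it records the per-step training log, the periodic evaluations, and the callback state. A minimal sketch for inspecting it offline (the checkpoint path is assumed from best_model_checkpoint; the field names come from the file itself):

```python
import json

# Path assumed from "best_model_checkpoint" in the file below.
with open("miner_id_24/checkpoint-70/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("best eval_loss:", state["best_metric"])          # 0.8046... (reached at step 70)
print("eval every", state["eval_steps"], "steps:", eval_curve)
print("last logged train loss:", train_curve[-1])       # (70, 1.2009)
```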
{
"best_metric": 0.8046614527702332,
"best_model_checkpoint": "miner_id_24/checkpoint-70",
"epoch": 0.16706443914081145,
"eval_steps": 5,
"global_step": 70,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002386634844868735,
"grad_norm": 1.13600492477417,
"learning_rate": 2e-05,
"loss": 1.612,
"step": 1
},
{
"epoch": 0.002386634844868735,
"eval_loss": 1.3748971223831177,
"eval_runtime": 52.3445,
"eval_samples_per_second": 3.381,
"eval_steps_per_second": 1.7,
"step": 1
},
{
"epoch": 0.00477326968973747,
"grad_norm": 0.5665156245231628,
"learning_rate": 4e-05,
"loss": 0.8491,
"step": 2
},
{
"epoch": 0.007159904534606206,
"grad_norm": 0.6333050727844238,
"learning_rate": 6e-05,
"loss": 1.1232,
"step": 3
},
{
"epoch": 0.00954653937947494,
"grad_norm": 0.7455923557281494,
"learning_rate": 8e-05,
"loss": 1.3785,
"step": 4
},
{
"epoch": 0.011933174224343675,
"grad_norm": 0.7020689845085144,
"learning_rate": 0.0001,
"loss": 1.1252,
"step": 5
},
{
"epoch": 0.011933174224343675,
"eval_loss": 1.3356304168701172,
"eval_runtime": 53.3626,
"eval_samples_per_second": 3.317,
"eval_steps_per_second": 1.668,
"step": 5
},
{
"epoch": 0.014319809069212411,
"grad_norm": 1.1946194171905518,
"learning_rate": 0.00012,
"loss": 1.4196,
"step": 6
},
{
"epoch": 0.016706443914081145,
"grad_norm": 0.7819809913635254,
"learning_rate": 0.00014,
"loss": 1.1684,
"step": 7
},
{
"epoch": 0.01909307875894988,
"grad_norm": 0.698846161365509,
"learning_rate": 0.00016,
"loss": 1.1637,
"step": 8
},
{
"epoch": 0.021479713603818614,
"grad_norm": 0.8251622915267944,
"learning_rate": 0.00018,
"loss": 1.284,
"step": 9
},
{
"epoch": 0.02386634844868735,
"grad_norm": 0.3362489640712738,
"learning_rate": 0.0002,
"loss": 0.3899,
"step": 10
},
{
"epoch": 0.02386634844868735,
"eval_loss": 1.0437217950820923,
"eval_runtime": 52.5447,
"eval_samples_per_second": 3.369,
"eval_steps_per_second": 1.694,
"step": 10
},
{
"epoch": 0.026252983293556086,
"grad_norm": 0.6162827610969543,
"learning_rate": 0.0001999979446958366,
"loss": 1.037,
"step": 11
},
{
"epoch": 0.028639618138424822,
"grad_norm": 0.7912341356277466,
"learning_rate": 0.00019999177886783194,
"loss": 0.9055,
"step": 12
},
{
"epoch": 0.031026252983293555,
"grad_norm": 0.7281938791275024,
"learning_rate": 0.00019998150276943902,
"loss": 1.1855,
"step": 13
},
{
"epoch": 0.03341288782816229,
"grad_norm": 0.5928153991699219,
"learning_rate": 0.000199967116823068,
"loss": 0.6797,
"step": 14
},
{
"epoch": 0.03579952267303103,
"grad_norm": 0.5825729370117188,
"learning_rate": 0.0001999486216200688,
"loss": 1.4463,
"step": 15
},
{
"epoch": 0.03579952267303103,
"eval_loss": 0.9187370538711548,
"eval_runtime": 52.4578,
"eval_samples_per_second": 3.374,
"eval_steps_per_second": 1.697,
"step": 15
},
{
"epoch": 0.03818615751789976,
"grad_norm": 0.6294780969619751,
"learning_rate": 0.00019992601792070679,
"loss": 0.8622,
"step": 16
},
{
"epoch": 0.0405727923627685,
"grad_norm": 0.7047346830368042,
"learning_rate": 0.00019989930665413147,
"loss": 0.9514,
"step": 17
},
{
"epoch": 0.04295942720763723,
"grad_norm": 0.6274527907371521,
"learning_rate": 0.00019986848891833845,
"loss": 1.1009,
"step": 18
},
{
"epoch": 0.045346062052505964,
"grad_norm": 0.6628918647766113,
"learning_rate": 0.0001998335659801241,
"loss": 0.7443,
"step": 19
},
{
"epoch": 0.0477326968973747,
"grad_norm": 0.7120680809020996,
"learning_rate": 0.00019979453927503364,
"loss": 0.7956,
"step": 20
},
{
"epoch": 0.0477326968973747,
"eval_loss": 0.8748182654380798,
"eval_runtime": 52.5712,
"eval_samples_per_second": 3.367,
"eval_steps_per_second": 1.693,
"step": 20
},
{
"epoch": 0.050119331742243436,
"grad_norm": 0.6338897943496704,
"learning_rate": 0.00019975141040730207,
"loss": 0.9699,
"step": 21
},
{
"epoch": 0.05250596658711217,
"grad_norm": 0.5438985228538513,
"learning_rate": 0.0001997041811497882,
"loss": 0.6738,
"step": 22
},
{
"epoch": 0.05489260143198091,
"grad_norm": 0.5119079351425171,
"learning_rate": 0.00019965285344390184,
"loss": 0.7623,
"step": 23
},
{
"epoch": 0.057279236276849645,
"grad_norm": 0.39392197132110596,
"learning_rate": 0.00019959742939952392,
"loss": 0.868,
"step": 24
},
{
"epoch": 0.059665871121718374,
"grad_norm": 0.7678072452545166,
"learning_rate": 0.00019953791129491983,
"loss": 0.9788,
"step": 25
},
{
"epoch": 0.059665871121718374,
"eval_loss": 0.8621673583984375,
"eval_runtime": 52.6342,
"eval_samples_per_second": 3.363,
"eval_steps_per_second": 1.691,
"step": 25
},
{
"epoch": 0.06205250596658711,
"grad_norm": 0.774450421333313,
"learning_rate": 0.00019947430157664576,
"loss": 1.307,
"step": 26
},
{
"epoch": 0.06443914081145585,
"grad_norm": 0.5224051475524902,
"learning_rate": 0.00019940660285944803,
"loss": 0.9499,
"step": 27
},
{
"epoch": 0.06682577565632458,
"grad_norm": 0.4764106869697571,
"learning_rate": 0.00019933481792615583,
"loss": 0.7441,
"step": 28
},
{
"epoch": 0.06921241050119331,
"grad_norm": 0.6187341809272766,
"learning_rate": 0.0001992589497275665,
"loss": 0.8829,
"step": 29
},
{
"epoch": 0.07159904534606205,
"grad_norm": 0.5497674345970154,
"learning_rate": 0.0001991790013823246,
"loss": 1.1846,
"step": 30
},
{
"epoch": 0.07159904534606205,
"eval_loss": 0.8491736054420471,
"eval_runtime": 52.5491,
"eval_samples_per_second": 3.368,
"eval_steps_per_second": 1.694,
"step": 30
},
{
"epoch": 0.07398568019093078,
"grad_norm": 0.6167699098587036,
"learning_rate": 0.00019909497617679348,
"loss": 0.8677,
"step": 31
},
{
"epoch": 0.07637231503579953,
"grad_norm": 0.4538175165653229,
"learning_rate": 0.0001990068775649202,
"loss": 0.5806,
"step": 32
},
{
"epoch": 0.07875894988066826,
"grad_norm": 0.5126294493675232,
"learning_rate": 0.00019891470916809362,
"loss": 0.7348,
"step": 33
},
{
"epoch": 0.081145584725537,
"grad_norm": 0.5069904923439026,
"learning_rate": 0.00019881847477499557,
"loss": 0.7255,
"step": 34
},
{
"epoch": 0.08353221957040573,
"grad_norm": 0.43907415866851807,
"learning_rate": 0.00019871817834144504,
"loss": 0.8456,
"step": 35
},
{
"epoch": 0.08353221957040573,
"eval_loss": 0.8385397791862488,
"eval_runtime": 52.4798,
"eval_samples_per_second": 3.373,
"eval_steps_per_second": 1.696,
"step": 35
},
{
"epoch": 0.08591885441527446,
"grad_norm": 0.5554007887840271,
"learning_rate": 0.0001986138239902355,
"loss": 0.9552,
"step": 36
},
{
"epoch": 0.0883054892601432,
"grad_norm": 0.7507627606391907,
"learning_rate": 0.0001985054160109657,
"loss": 0.9233,
"step": 37
},
{
"epoch": 0.09069212410501193,
"grad_norm": 0.8395837545394897,
"learning_rate": 0.00019839295885986296,
"loss": 1.0273,
"step": 38
},
{
"epoch": 0.09307875894988067,
"grad_norm": 0.5746117830276489,
"learning_rate": 0.0001982764571596004,
"loss": 0.6501,
"step": 39
},
{
"epoch": 0.0954653937947494,
"grad_norm": 0.5443254113197327,
"learning_rate": 0.00019815591569910654,
"loss": 0.6094,
"step": 40
},
{
"epoch": 0.0954653937947494,
"eval_loss": 0.8293895721435547,
"eval_runtime": 52.4747,
"eval_samples_per_second": 3.373,
"eval_steps_per_second": 1.696,
"step": 40
},
{
"epoch": 0.09785202863961814,
"grad_norm": 0.7453851699829102,
"learning_rate": 0.00019803133943336874,
"loss": 1.1035,
"step": 41
},
{
"epoch": 0.10023866348448687,
"grad_norm": 0.5359771251678467,
"learning_rate": 0.0001979027334832293,
"loss": 0.9259,
"step": 42
},
{
"epoch": 0.1026252983293556,
"grad_norm": 0.6819587349891663,
"learning_rate": 0.00019777010313517518,
"loss": 0.802,
"step": 43
},
{
"epoch": 0.10501193317422435,
"grad_norm": 0.6070658564567566,
"learning_rate": 0.00019763345384112043,
"loss": 0.8852,
"step": 44
},
{
"epoch": 0.10739856801909307,
"grad_norm": 0.5346612334251404,
"learning_rate": 0.00019749279121818235,
"loss": 0.5703,
"step": 45
},
{
"epoch": 0.10739856801909307,
"eval_loss": 0.8229199647903442,
"eval_runtime": 52.5648,
"eval_samples_per_second": 3.367,
"eval_steps_per_second": 1.693,
"step": 45
},
{
"epoch": 0.10978520286396182,
"grad_norm": 0.5736297369003296,
"learning_rate": 0.00019734812104845047,
"loss": 0.8594,
"step": 46
},
{
"epoch": 0.11217183770883055,
"grad_norm": 0.40257322788238525,
"learning_rate": 0.00019719944927874881,
"loss": 0.6683,
"step": 47
},
{
"epoch": 0.11455847255369929,
"grad_norm": 0.7019076943397522,
"learning_rate": 0.0001970467820203915,
"loss": 1.1878,
"step": 48
},
{
"epoch": 0.11694510739856802,
"grad_norm": 0.7357410788536072,
"learning_rate": 0.00019689012554893154,
"loss": 0.8202,
"step": 49
},
{
"epoch": 0.11933174224343675,
"grad_norm": 0.5880741477012634,
"learning_rate": 0.00019672948630390294,
"loss": 0.7308,
"step": 50
},
{
"epoch": 0.11933174224343675,
"eval_loss": 0.8160228729248047,
"eval_runtime": 52.5516,
"eval_samples_per_second": 3.368,
"eval_steps_per_second": 1.694,
"step": 50
},
{
"epoch": 0.12171837708830549,
"grad_norm": 0.5184401273727417,
"learning_rate": 0.00019656487088855592,
"loss": 1.107,
"step": 51
},
{
"epoch": 0.12410501193317422,
"grad_norm": 0.42614543437957764,
"learning_rate": 0.00019639628606958533,
"loss": 0.6306,
"step": 52
},
{
"epoch": 0.12649164677804295,
"grad_norm": 0.2956556975841522,
"learning_rate": 0.0001962237387768529,
"loss": 0.7877,
"step": 53
},
{
"epoch": 0.1288782816229117,
"grad_norm": 0.5040286779403687,
"learning_rate": 0.00019604723610310194,
"loss": 0.7839,
"step": 54
},
{
"epoch": 0.13126491646778043,
"grad_norm": 0.4167059063911438,
"learning_rate": 0.00019586678530366606,
"loss": 0.86,
"step": 55
},
{
"epoch": 0.13126491646778043,
"eval_loss": 0.8153512477874756,
"eval_runtime": 52.5424,
"eval_samples_per_second": 3.369,
"eval_steps_per_second": 1.694,
"step": 55
},
{
"epoch": 0.13365155131264916,
"grad_norm": 0.5592871308326721,
"learning_rate": 0.00019568239379617088,
"loss": 0.7068,
"step": 56
},
{
"epoch": 0.1360381861575179,
"grad_norm": 0.5967664122581482,
"learning_rate": 0.00019549406916022905,
"loss": 0.9163,
"step": 57
},
{
"epoch": 0.13842482100238662,
"grad_norm": 0.4031658470630646,
"learning_rate": 0.00019530181913712872,
"loss": 0.7602,
"step": 58
},
{
"epoch": 0.14081145584725538,
"grad_norm": 0.6687312722206116,
"learning_rate": 0.00019510565162951537,
"loss": 0.8132,
"step": 59
},
{
"epoch": 0.1431980906921241,
"grad_norm": 0.5811920166015625,
"learning_rate": 0.00019490557470106686,
"loss": 0.7686,
"step": 60
},
{
"epoch": 0.1431980906921241,
"eval_loss": 0.8120850920677185,
"eval_runtime": 52.5863,
"eval_samples_per_second": 3.366,
"eval_steps_per_second": 1.692,
"step": 60
},
{
"epoch": 0.14558472553699284,
"grad_norm": 0.4957926273345947,
"learning_rate": 0.00019470159657616215,
"loss": 0.5649,
"step": 61
},
{
"epoch": 0.14797136038186157,
"grad_norm": 0.8247906565666199,
"learning_rate": 0.00019449372563954293,
"loss": 1.1628,
"step": 62
},
{
"epoch": 0.15035799522673032,
"grad_norm": 0.502251148223877,
"learning_rate": 0.0001942819704359693,
"loss": 0.7712,
"step": 63
},
{
"epoch": 0.15274463007159905,
"grad_norm": 0.3308824300765991,
"learning_rate": 0.00019406633966986828,
"loss": 0.5516,
"step": 64
},
{
"epoch": 0.15513126491646778,
"grad_norm": 0.5597787499427795,
"learning_rate": 0.00019384684220497605,
"loss": 0.556,
"step": 65
},
{
"epoch": 0.15513126491646778,
"eval_loss": 0.8096857666969299,
"eval_runtime": 52.5376,
"eval_samples_per_second": 3.369,
"eval_steps_per_second": 1.694,
"step": 65
},
{
"epoch": 0.1575178997613365,
"grad_norm": 0.6705149412155151,
"learning_rate": 0.00019362348706397373,
"loss": 1.0559,
"step": 66
},
{
"epoch": 0.15990453460620524,
"grad_norm": 0.5521374940872192,
"learning_rate": 0.00019339628342811632,
"loss": 0.7597,
"step": 67
},
{
"epoch": 0.162291169451074,
"grad_norm": 0.5938752293586731,
"learning_rate": 0.0001931652406368554,
"loss": 0.8473,
"step": 68
},
{
"epoch": 0.16467780429594273,
"grad_norm": 0.5228351950645447,
"learning_rate": 0.0001929303681874552,
"loss": 0.7279,
"step": 69
},
{
"epoch": 0.16706443914081145,
"grad_norm": 0.6063805818557739,
"learning_rate": 0.0001926916757346022,
"loss": 1.2009,
"step": 70
},
{
"epoch": 0.16706443914081145,
"eval_loss": 0.8046614527702332,
"eval_runtime": 52.6343,
"eval_samples_per_second": 3.363,
"eval_steps_per_second": 1.691,
"step": 70
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.946242191130624e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
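For context, a hedged sketch of the TrainingArguments / EarlyStoppingCallback setup that would produce the bookkeeping above. The logging, evaluation, and save intervals, batch size, step limit, and early-stopping settings are read directly from this file; the output directory follows from best_model_checkpoint; the peak learning rate of 2e-4 is the value reached at step 10. The 10-step warmup and cosine decay are assumptions inferred from the learning-rate trace, and the model and datasets are not recorded in this file at all.

```python
from transformers import TrainingArguments, EarlyStoppingCallback

# Values commented "from trainer_state.json" are read from the file above;
# warmup_steps and lr_scheduler_type are inferred, not recorded in the file.
args = TrainingArguments(
    output_dir="miner_id_24",           # best_model_checkpoint: miner_id_24/checkpoint-70
    per_device_train_batch_size=2,      # from trainer_state.json: train_batch_size
    num_train_epochs=2,                 # from trainer_state.json: num_train_epochs
    max_steps=500,                      # from trainer_state.json: max_steps (takes precedence over epochs)
    logging_steps=1,                    # from trainer_state.json: logging_steps
    eval_strategy="steps",              # older transformers releases call this evaluation_strategy
    eval_steps=5,                       # from trainer_state.json: eval_steps
    save_steps=10,                      # from trainer_state.json: save_steps
    learning_rate=2e-4,                 # peak LR reached at step 10 in log_history
    warmup_steps=10,                    # assumption: linear ramp 2e-05 -> 2e-04 over steps 1-10
    lr_scheduler_type="cosine",         # assumption: matches the post-warmup LR decay
    load_best_model_at_end=True,        # needed for best_metric / best_model_checkpoint tracking
    metric_for_best_model="eval_loss",  # best_metric equals the eval_loss at step 70
    greater_is_better=False,
)

early_stop = EarlyStoppingCallback(
    early_stopping_patience=1,          # from stateful_callbacks.EarlyStoppingCallback.args
    early_stopping_threshold=0.0,
)

# Trainer wiring (model and datasets are not specified in this file):
# Trainer(model=..., args=args, train_dataset=..., eval_dataset=...,
#         callbacks=[early_stop]).train()
```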