{
  "best_metric": 0.8046614527702332,
  "best_model_checkpoint": "miner_id_24/checkpoint-70",
  "epoch": 0.16706443914081145,
  "eval_steps": 5,
  "global_step": 70,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002386634844868735,
      "grad_norm": 1.13600492477417,
      "learning_rate": 2e-05,
      "loss": 1.612,
      "step": 1
    },
    {
      "epoch": 0.002386634844868735,
      "eval_loss": 1.3748971223831177,
      "eval_runtime": 52.3445,
      "eval_samples_per_second": 3.381,
      "eval_steps_per_second": 1.7,
      "step": 1
    },
    {
      "epoch": 0.00477326968973747,
      "grad_norm": 0.5665156245231628,
      "learning_rate": 4e-05,
      "loss": 0.8491,
      "step": 2
    },
    {
      "epoch": 0.007159904534606206,
      "grad_norm": 0.6333050727844238,
      "learning_rate": 6e-05,
      "loss": 1.1232,
      "step": 3
    },
    {
      "epoch": 0.00954653937947494,
      "grad_norm": 0.7455923557281494,
      "learning_rate": 8e-05,
      "loss": 1.3785,
      "step": 4
    },
    {
      "epoch": 0.011933174224343675,
      "grad_norm": 0.7020689845085144,
      "learning_rate": 0.0001,
      "loss": 1.1252,
      "step": 5
    },
    {
      "epoch": 0.011933174224343675,
      "eval_loss": 1.3356304168701172,
      "eval_runtime": 53.3626,
      "eval_samples_per_second": 3.317,
      "eval_steps_per_second": 1.668,
      "step": 5
    },
    {
      "epoch": 0.014319809069212411,
      "grad_norm": 1.1946194171905518,
      "learning_rate": 0.00012,
      "loss": 1.4196,
      "step": 6
    },
    {
      "epoch": 0.016706443914081145,
      "grad_norm": 0.7819809913635254,
      "learning_rate": 0.00014,
      "loss": 1.1684,
      "step": 7
    },
    {
      "epoch": 0.01909307875894988,
      "grad_norm": 0.698846161365509,
      "learning_rate": 0.00016,
      "loss": 1.1637,
      "step": 8
    },
    {
      "epoch": 0.021479713603818614,
      "grad_norm": 0.8251622915267944,
      "learning_rate": 0.00018,
      "loss": 1.284,
      "step": 9
    },
    {
      "epoch": 0.02386634844868735,
      "grad_norm": 0.3362489640712738,
      "learning_rate": 0.0002,
      "loss": 0.3899,
      "step": 10
    },
    {
      "epoch": 0.02386634844868735,
      "eval_loss": 1.0437217950820923,
      "eval_runtime": 52.5447,
      "eval_samples_per_second": 3.369,
      "eval_steps_per_second": 1.694,
      "step": 10
    },
    {
      "epoch": 0.026252983293556086,
      "grad_norm": 0.6162827610969543,
      "learning_rate": 0.0001999979446958366,
      "loss": 1.037,
      "step": 11
    },
    {
      "epoch": 0.028639618138424822,
      "grad_norm": 0.7912341356277466,
      "learning_rate": 0.00019999177886783194,
      "loss": 0.9055,
      "step": 12
    },
    {
      "epoch": 0.031026252983293555,
      "grad_norm": 0.7281938791275024,
      "learning_rate": 0.00019998150276943902,
      "loss": 1.1855,
      "step": 13
    },
    {
      "epoch": 0.03341288782816229,
      "grad_norm": 0.5928153991699219,
      "learning_rate": 0.000199967116823068,
      "loss": 0.6797,
      "step": 14
    },
    {
      "epoch": 0.03579952267303103,
      "grad_norm": 0.5825729370117188,
      "learning_rate": 0.0001999486216200688,
      "loss": 1.4463,
      "step": 15
    },
    {
      "epoch": 0.03579952267303103,
      "eval_loss": 0.9187370538711548,
      "eval_runtime": 52.4578,
      "eval_samples_per_second": 3.374,
      "eval_steps_per_second": 1.697,
      "step": 15
    },
    {
      "epoch": 0.03818615751789976,
      "grad_norm": 0.6294780969619751,
      "learning_rate": 0.00019992601792070679,
      "loss": 0.8622,
      "step": 16
    },
    {
      "epoch": 0.0405727923627685,
      "grad_norm": 0.7047346830368042,
      "learning_rate": 0.00019989930665413147,
      "loss": 0.9514,
      "step": 17
    },
    {
      "epoch": 0.04295942720763723,
      "grad_norm": 0.6274527907371521,
      "learning_rate": 0.00019986848891833845,
      "loss": 1.1009,
      "step": 18
    },
    {
      "epoch": 0.045346062052505964,
      "grad_norm": 0.6628918647766113,
      "learning_rate": 0.0001998335659801241,
      "loss": 0.7443,
      "step": 19
    },
    {
      "epoch": 0.0477326968973747,
      "grad_norm": 0.7120680809020996,
      "learning_rate": 0.00019979453927503364,
      "loss": 0.7956,
      "step": 20
    },
    {
      "epoch": 0.0477326968973747,
      "eval_loss": 0.8748182654380798,
      "eval_runtime": 52.5712,
      "eval_samples_per_second": 3.367,
      "eval_steps_per_second": 1.693,
      "step": 20
    },
    {
      "epoch": 0.050119331742243436,
      "grad_norm": 0.6338897943496704,
      "learning_rate": 0.00019975141040730207,
      "loss": 0.9699,
      "step": 21
    },
    {
      "epoch": 0.05250596658711217,
      "grad_norm": 0.5438985228538513,
      "learning_rate": 0.0001997041811497882,
      "loss": 0.6738,
      "step": 22
    },
    {
      "epoch": 0.05489260143198091,
      "grad_norm": 0.5119079351425171,
      "learning_rate": 0.00019965285344390184,
      "loss": 0.7623,
      "step": 23
    },
    {
      "epoch": 0.057279236276849645,
      "grad_norm": 0.39392197132110596,
      "learning_rate": 0.00019959742939952392,
      "loss": 0.868,
      "step": 24
    },
    {
      "epoch": 0.059665871121718374,
      "grad_norm": 0.7678072452545166,
      "learning_rate": 0.00019953791129491983,
      "loss": 0.9788,
      "step": 25
    },
    {
      "epoch": 0.059665871121718374,
      "eval_loss": 0.8621673583984375,
      "eval_runtime": 52.6342,
      "eval_samples_per_second": 3.363,
      "eval_steps_per_second": 1.691,
      "step": 25
    },
    {
      "epoch": 0.06205250596658711,
      "grad_norm": 0.774450421333313,
      "learning_rate": 0.00019947430157664576,
      "loss": 1.307,
      "step": 26
    },
    {
      "epoch": 0.06443914081145585,
      "grad_norm": 0.5224051475524902,
      "learning_rate": 0.00019940660285944803,
      "loss": 0.9499,
      "step": 27
    },
    {
      "epoch": 0.06682577565632458,
      "grad_norm": 0.4764106869697571,
      "learning_rate": 0.00019933481792615583,
      "loss": 0.7441,
      "step": 28
    },
    {
      "epoch": 0.06921241050119331,
      "grad_norm": 0.6187341809272766,
      "learning_rate": 0.0001992589497275665,
      "loss": 0.8829,
      "step": 29
    },
    {
      "epoch": 0.07159904534606205,
      "grad_norm": 0.5497674345970154,
      "learning_rate": 0.0001991790013823246,
      "loss": 1.1846,
      "step": 30
    },
    {
      "epoch": 0.07159904534606205,
      "eval_loss": 0.8491736054420471,
      "eval_runtime": 52.5491,
      "eval_samples_per_second": 3.368,
      "eval_steps_per_second": 1.694,
      "step": 30
    },
    {
      "epoch": 0.07398568019093078,
      "grad_norm": 0.6167699098587036,
      "learning_rate": 0.00019909497617679348,
      "loss": 0.8677,
      "step": 31
    },
    {
      "epoch": 0.07637231503579953,
      "grad_norm": 0.4538175165653229,
      "learning_rate": 0.0001990068775649202,
      "loss": 0.5806,
      "step": 32
    },
    {
      "epoch": 0.07875894988066826,
      "grad_norm": 0.5126294493675232,
      "learning_rate": 0.00019891470916809362,
      "loss": 0.7348,
      "step": 33
    },
    {
      "epoch": 0.081145584725537,
      "grad_norm": 0.5069904923439026,
      "learning_rate": 0.00019881847477499557,
      "loss": 0.7255,
      "step": 34
    },
    {
      "epoch": 0.08353221957040573,
      "grad_norm": 0.43907415866851807,
      "learning_rate": 0.00019871817834144504,
      "loss": 0.8456,
      "step": 35
    },
    {
      "epoch": 0.08353221957040573,
      "eval_loss": 0.8385397791862488,
      "eval_runtime": 52.4798,
      "eval_samples_per_second": 3.373,
      "eval_steps_per_second": 1.696,
      "step": 35
    },
    {
      "epoch": 0.08591885441527446,
      "grad_norm": 0.5554007887840271,
      "learning_rate": 0.0001986138239902355,
      "loss": 0.9552,
      "step": 36
    },
    {
      "epoch": 0.0883054892601432,
      "grad_norm": 0.7507627606391907,
      "learning_rate": 0.0001985054160109657,
      "loss": 0.9233,
      "step": 37
    },
    {
      "epoch": 0.09069212410501193,
      "grad_norm": 0.8395837545394897,
      "learning_rate": 0.00019839295885986296,
      "loss": 1.0273,
      "step": 38
    },
    {
      "epoch": 0.09307875894988067,
      "grad_norm": 0.5746117830276489,
      "learning_rate": 0.0001982764571596004,
      "loss": 0.6501,
      "step": 39
    },
    {
      "epoch": 0.0954653937947494,
      "grad_norm": 0.5443254113197327,
      "learning_rate": 0.00019815591569910654,
      "loss": 0.6094,
      "step": 40
    },
    {
      "epoch": 0.0954653937947494,
      "eval_loss": 0.8293895721435547,
      "eval_runtime": 52.4747,
      "eval_samples_per_second": 3.373,
      "eval_steps_per_second": 1.696,
      "step": 40
    },
    {
      "epoch": 0.09785202863961814,
      "grad_norm": 0.7453851699829102,
      "learning_rate": 0.00019803133943336874,
      "loss": 1.1035,
      "step": 41
    },
    {
      "epoch": 0.10023866348448687,
      "grad_norm": 0.5359771251678467,
      "learning_rate": 0.0001979027334832293,
      "loss": 0.9259,
      "step": 42
    },
    {
      "epoch": 0.1026252983293556,
      "grad_norm": 0.6819587349891663,
      "learning_rate": 0.00019777010313517518,
      "loss": 0.802,
      "step": 43
    },
    {
      "epoch": 0.10501193317422435,
      "grad_norm": 0.6070658564567566,
      "learning_rate": 0.00019763345384112043,
      "loss": 0.8852,
      "step": 44
    },
    {
      "epoch": 0.10739856801909307,
      "grad_norm": 0.5346612334251404,
      "learning_rate": 0.00019749279121818235,
      "loss": 0.5703,
      "step": 45
    },
    {
      "epoch": 0.10739856801909307,
      "eval_loss": 0.8229199647903442,
      "eval_runtime": 52.5648,
      "eval_samples_per_second": 3.367,
      "eval_steps_per_second": 1.693,
      "step": 45
    },
    {
      "epoch": 0.10978520286396182,
      "grad_norm": 0.5736297369003296,
      "learning_rate": 0.00019734812104845047,
      "loss": 0.8594,
      "step": 46
    },
    {
      "epoch": 0.11217183770883055,
      "grad_norm": 0.40257322788238525,
      "learning_rate": 0.00019719944927874881,
      "loss": 0.6683,
      "step": 47
    },
    {
      "epoch": 0.11455847255369929,
      "grad_norm": 0.7019076943397522,
      "learning_rate": 0.0001970467820203915,
      "loss": 1.1878,
      "step": 48
    },
    {
      "epoch": 0.11694510739856802,
      "grad_norm": 0.7357410788536072,
      "learning_rate": 0.00019689012554893154,
      "loss": 0.8202,
      "step": 49
    },
    {
      "epoch": 0.11933174224343675,
      "grad_norm": 0.5880741477012634,
      "learning_rate": 0.00019672948630390294,
      "loss": 0.7308,
      "step": 50
    },
    {
      "epoch": 0.11933174224343675,
      "eval_loss": 0.8160228729248047,
      "eval_runtime": 52.5516,
      "eval_samples_per_second": 3.368,
      "eval_steps_per_second": 1.694,
      "step": 50
    },
    {
      "epoch": 0.12171837708830549,
      "grad_norm": 0.5184401273727417,
      "learning_rate": 0.00019656487088855592,
      "loss": 1.107,
      "step": 51
    },
    {
      "epoch": 0.12410501193317422,
      "grad_norm": 0.42614543437957764,
      "learning_rate": 0.00019639628606958533,
      "loss": 0.6306,
      "step": 52
    },
    {
      "epoch": 0.12649164677804295,
      "grad_norm": 0.2956556975841522,
      "learning_rate": 0.0001962237387768529,
      "loss": 0.7877,
      "step": 53
    },
    {
      "epoch": 0.1288782816229117,
      "grad_norm": 0.5040286779403687,
      "learning_rate": 0.00019604723610310194,
      "loss": 0.7839,
      "step": 54
    },
    {
      "epoch": 0.13126491646778043,
      "grad_norm": 0.4167059063911438,
      "learning_rate": 0.00019586678530366606,
      "loss": 0.86,
      "step": 55
    },
    {
      "epoch": 0.13126491646778043,
      "eval_loss": 0.8153512477874756,
      "eval_runtime": 52.5424,
      "eval_samples_per_second": 3.369,
      "eval_steps_per_second": 1.694,
      "step": 55
    },
    {
      "epoch": 0.13365155131264916,
      "grad_norm": 0.5592871308326721,
      "learning_rate": 0.00019568239379617088,
      "loss": 0.7068,
      "step": 56
    },
    {
      "epoch": 0.1360381861575179,
      "grad_norm": 0.5967664122581482,
      "learning_rate": 0.00019549406916022905,
      "loss": 0.9163,
      "step": 57
    },
    {
      "epoch": 0.13842482100238662,
      "grad_norm": 0.4031658470630646,
      "learning_rate": 0.00019530181913712872,
      "loss": 0.7602,
      "step": 58
    },
    {
      "epoch": 0.14081145584725538,
      "grad_norm": 0.6687312722206116,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.8132,
      "step": 59
    },
    {
      "epoch": 0.1431980906921241,
      "grad_norm": 0.5811920166015625,
      "learning_rate": 0.00019490557470106686,
      "loss": 0.7686,
      "step": 60
    },
    {
      "epoch": 0.1431980906921241,
      "eval_loss": 0.8120850920677185,
      "eval_runtime": 52.5863,
      "eval_samples_per_second": 3.366,
      "eval_steps_per_second": 1.692,
      "step": 60
    },
    {
      "epoch": 0.14558472553699284,
      "grad_norm": 0.4957926273345947,
      "learning_rate": 0.00019470159657616215,
      "loss": 0.5649,
      "step": 61
    },
    {
      "epoch": 0.14797136038186157,
      "grad_norm": 0.8247906565666199,
      "learning_rate": 0.00019449372563954293,
      "loss": 1.1628,
      "step": 62
    },
    {
      "epoch": 0.15035799522673032,
      "grad_norm": 0.502251148223877,
      "learning_rate": 0.0001942819704359693,
      "loss": 0.7712,
      "step": 63
    },
    {
      "epoch": 0.15274463007159905,
      "grad_norm": 0.3308824300765991,
      "learning_rate": 0.00019406633966986828,
      "loss": 0.5516,
      "step": 64
    },
    {
      "epoch": 0.15513126491646778,
      "grad_norm": 0.5597787499427795,
      "learning_rate": 0.00019384684220497605,
      "loss": 0.556,
      "step": 65
    },
    {
      "epoch": 0.15513126491646778,
      "eval_loss": 0.8096857666969299,
      "eval_runtime": 52.5376,
      "eval_samples_per_second": 3.369,
      "eval_steps_per_second": 1.694,
      "step": 65
    },
    {
      "epoch": 0.1575178997613365,
      "grad_norm": 0.6705149412155151,
      "learning_rate": 0.00019362348706397373,
      "loss": 1.0559,
      "step": 66
    },
    {
      "epoch": 0.15990453460620524,
      "grad_norm": 0.5521374940872192,
      "learning_rate": 0.00019339628342811632,
      "loss": 0.7597,
      "step": 67
    },
    {
      "epoch": 0.162291169451074,
      "grad_norm": 0.5938752293586731,
      "learning_rate": 0.0001931652406368554,
      "loss": 0.8473,
      "step": 68
    },
    {
      "epoch": 0.16467780429594273,
      "grad_norm": 0.5228351950645447,
      "learning_rate": 0.0001929303681874552,
      "loss": 0.7279,
      "step": 69
    },
    {
      "epoch": 0.16706443914081145,
      "grad_norm": 0.6063805818557739,
      "learning_rate": 0.0001926916757346022,
      "loss": 1.2009,
      "step": 70
    },
    {
      "epoch": 0.16706443914081145,
      "eval_loss": 0.8046614527702332,
      "eval_runtime": 52.6343,
      "eval_samples_per_second": 3.363,
      "eval_steps_per_second": 1.691,
      "step": 70
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 10,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 1,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.946242191130624e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}