{ "best_metric": 0.8046614527702332, "best_model_checkpoint": "miner_id_24/checkpoint-70", "epoch": 0.16706443914081145, "eval_steps": 5, "global_step": 70, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002386634844868735, "grad_norm": 1.13600492477417, "learning_rate": 2e-05, "loss": 1.612, "step": 1 }, { "epoch": 0.002386634844868735, "eval_loss": 1.3748971223831177, "eval_runtime": 52.3445, "eval_samples_per_second": 3.381, "eval_steps_per_second": 1.7, "step": 1 }, { "epoch": 0.00477326968973747, "grad_norm": 0.5665156245231628, "learning_rate": 4e-05, "loss": 0.8491, "step": 2 }, { "epoch": 0.007159904534606206, "grad_norm": 0.6333050727844238, "learning_rate": 6e-05, "loss": 1.1232, "step": 3 }, { "epoch": 0.00954653937947494, "grad_norm": 0.7455923557281494, "learning_rate": 8e-05, "loss": 1.3785, "step": 4 }, { "epoch": 0.011933174224343675, "grad_norm": 0.7020689845085144, "learning_rate": 0.0001, "loss": 1.1252, "step": 5 }, { "epoch": 0.011933174224343675, "eval_loss": 1.3356304168701172, "eval_runtime": 53.3626, "eval_samples_per_second": 3.317, "eval_steps_per_second": 1.668, "step": 5 }, { "epoch": 0.014319809069212411, "grad_norm": 1.1946194171905518, "learning_rate": 0.00012, "loss": 1.4196, "step": 6 }, { "epoch": 0.016706443914081145, "grad_norm": 0.7819809913635254, "learning_rate": 0.00014, "loss": 1.1684, "step": 7 }, { "epoch": 0.01909307875894988, "grad_norm": 0.698846161365509, "learning_rate": 0.00016, "loss": 1.1637, "step": 8 }, { "epoch": 0.021479713603818614, "grad_norm": 0.8251622915267944, "learning_rate": 0.00018, "loss": 1.284, "step": 9 }, { "epoch": 0.02386634844868735, "grad_norm": 0.3362489640712738, "learning_rate": 0.0002, "loss": 0.3899, "step": 10 }, { "epoch": 0.02386634844868735, "eval_loss": 1.0437217950820923, "eval_runtime": 52.5447, "eval_samples_per_second": 3.369, "eval_steps_per_second": 1.694, "step": 10 }, { "epoch": 0.026252983293556086, "grad_norm": 0.6162827610969543, "learning_rate": 0.0001999979446958366, "loss": 1.037, "step": 11 }, { "epoch": 0.028639618138424822, "grad_norm": 0.7912341356277466, "learning_rate": 0.00019999177886783194, "loss": 0.9055, "step": 12 }, { "epoch": 0.031026252983293555, "grad_norm": 0.7281938791275024, "learning_rate": 0.00019998150276943902, "loss": 1.1855, "step": 13 }, { "epoch": 0.03341288782816229, "grad_norm": 0.5928153991699219, "learning_rate": 0.000199967116823068, "loss": 0.6797, "step": 14 }, { "epoch": 0.03579952267303103, "grad_norm": 0.5825729370117188, "learning_rate": 0.0001999486216200688, "loss": 1.4463, "step": 15 }, { "epoch": 0.03579952267303103, "eval_loss": 0.9187370538711548, "eval_runtime": 52.4578, "eval_samples_per_second": 3.374, "eval_steps_per_second": 1.697, "step": 15 }, { "epoch": 0.03818615751789976, "grad_norm": 0.6294780969619751, "learning_rate": 0.00019992601792070679, "loss": 0.8622, "step": 16 }, { "epoch": 0.0405727923627685, "grad_norm": 0.7047346830368042, "learning_rate": 0.00019989930665413147, "loss": 0.9514, "step": 17 }, { "epoch": 0.04295942720763723, "grad_norm": 0.6274527907371521, "learning_rate": 0.00019986848891833845, "loss": 1.1009, "step": 18 }, { "epoch": 0.045346062052505964, "grad_norm": 0.6628918647766113, "learning_rate": 0.0001998335659801241, "loss": 0.7443, "step": 19 }, { "epoch": 0.0477326968973747, "grad_norm": 0.7120680809020996, "learning_rate": 0.00019979453927503364, "loss": 0.7956, "step": 20 }, { "epoch": 0.0477326968973747, "eval_loss": 0.8748182654380798, "eval_runtime": 52.5712, "eval_samples_per_second": 3.367, "eval_steps_per_second": 1.693, "step": 20 }, { "epoch": 0.050119331742243436, "grad_norm": 0.6338897943496704, "learning_rate": 0.00019975141040730207, "loss": 0.9699, "step": 21 }, { "epoch": 0.05250596658711217, "grad_norm": 0.5438985228538513, "learning_rate": 0.0001997041811497882, "loss": 0.6738, "step": 22 }, { "epoch": 0.05489260143198091, "grad_norm": 0.5119079351425171, "learning_rate": 0.00019965285344390184, "loss": 0.7623, "step": 23 }, { "epoch": 0.057279236276849645, "grad_norm": 0.39392197132110596, "learning_rate": 0.00019959742939952392, "loss": 0.868, "step": 24 }, { "epoch": 0.059665871121718374, "grad_norm": 0.7678072452545166, "learning_rate": 0.00019953791129491983, "loss": 0.9788, "step": 25 }, { "epoch": 0.059665871121718374, "eval_loss": 0.8621673583984375, "eval_runtime": 52.6342, "eval_samples_per_second": 3.363, "eval_steps_per_second": 1.691, "step": 25 }, { "epoch": 0.06205250596658711, "grad_norm": 0.774450421333313, "learning_rate": 0.00019947430157664576, "loss": 1.307, "step": 26 }, { "epoch": 0.06443914081145585, "grad_norm": 0.5224051475524902, "learning_rate": 0.00019940660285944803, "loss": 0.9499, "step": 27 }, { "epoch": 0.06682577565632458, "grad_norm": 0.4764106869697571, "learning_rate": 0.00019933481792615583, "loss": 0.7441, "step": 28 }, { "epoch": 0.06921241050119331, "grad_norm": 0.6187341809272766, "learning_rate": 0.0001992589497275665, "loss": 0.8829, "step": 29 }, { "epoch": 0.07159904534606205, "grad_norm": 0.5497674345970154, "learning_rate": 0.0001991790013823246, "loss": 1.1846, "step": 30 }, { "epoch": 0.07159904534606205, "eval_loss": 0.8491736054420471, "eval_runtime": 52.5491, "eval_samples_per_second": 3.368, "eval_steps_per_second": 1.694, "step": 30 }, { "epoch": 0.07398568019093078, "grad_norm": 0.6167699098587036, "learning_rate": 0.00019909497617679348, "loss": 0.8677, "step": 31 }, { "epoch": 0.07637231503579953, "grad_norm": 0.4538175165653229, "learning_rate": 0.0001990068775649202, "loss": 0.5806, "step": 32 }, { "epoch": 0.07875894988066826, "grad_norm": 0.5126294493675232, "learning_rate": 0.00019891470916809362, "loss": 0.7348, "step": 33 }, { "epoch": 0.081145584725537, "grad_norm": 0.5069904923439026, "learning_rate": 0.00019881847477499557, "loss": 0.7255, "step": 34 }, { "epoch": 0.08353221957040573, "grad_norm": 0.43907415866851807, "learning_rate": 0.00019871817834144504, "loss": 0.8456, "step": 35 }, { "epoch": 0.08353221957040573, "eval_loss": 0.8385397791862488, "eval_runtime": 52.4798, "eval_samples_per_second": 3.373, "eval_steps_per_second": 1.696, "step": 35 }, { "epoch": 0.08591885441527446, "grad_norm": 0.5554007887840271, "learning_rate": 0.0001986138239902355, "loss": 0.9552, "step": 36 }, { "epoch": 0.0883054892601432, "grad_norm": 0.7507627606391907, "learning_rate": 0.0001985054160109657, "loss": 0.9233, "step": 37 }, { "epoch": 0.09069212410501193, "grad_norm": 0.8395837545394897, "learning_rate": 0.00019839295885986296, "loss": 1.0273, "step": 38 }, { "epoch": 0.09307875894988067, "grad_norm": 0.5746117830276489, "learning_rate": 0.0001982764571596004, "loss": 0.6501, "step": 39 }, { "epoch": 0.0954653937947494, "grad_norm": 0.5443254113197327, "learning_rate": 0.00019815591569910654, "loss": 0.6094, "step": 40 }, { "epoch": 0.0954653937947494, "eval_loss": 0.8293895721435547, "eval_runtime": 52.4747, "eval_samples_per_second": 3.373, "eval_steps_per_second": 1.696, "step": 40 }, { "epoch": 0.09785202863961814, "grad_norm": 0.7453851699829102, "learning_rate": 0.00019803133943336874, "loss": 1.1035, "step": 41 }, { "epoch": 0.10023866348448687, "grad_norm": 0.5359771251678467, "learning_rate": 0.0001979027334832293, "loss": 0.9259, "step": 42 }, { "epoch": 0.1026252983293556, "grad_norm": 0.6819587349891663, "learning_rate": 0.00019777010313517518, "loss": 0.802, "step": 43 }, { "epoch": 0.10501193317422435, "grad_norm": 0.6070658564567566, "learning_rate": 0.00019763345384112043, "loss": 0.8852, "step": 44 }, { "epoch": 0.10739856801909307, "grad_norm": 0.5346612334251404, "learning_rate": 0.00019749279121818235, "loss": 0.5703, "step": 45 }, { "epoch": 0.10739856801909307, "eval_loss": 0.8229199647903442, "eval_runtime": 52.5648, "eval_samples_per_second": 3.367, "eval_steps_per_second": 1.693, "step": 45 }, { "epoch": 0.10978520286396182, "grad_norm": 0.5736297369003296, "learning_rate": 0.00019734812104845047, "loss": 0.8594, "step": 46 }, { "epoch": 0.11217183770883055, "grad_norm": 0.40257322788238525, "learning_rate": 0.00019719944927874881, "loss": 0.6683, "step": 47 }, { "epoch": 0.11455847255369929, "grad_norm": 0.7019076943397522, "learning_rate": 0.0001970467820203915, "loss": 1.1878, "step": 48 }, { "epoch": 0.11694510739856802, "grad_norm": 0.7357410788536072, "learning_rate": 0.00019689012554893154, "loss": 0.8202, "step": 49 }, { "epoch": 0.11933174224343675, "grad_norm": 0.5880741477012634, "learning_rate": 0.00019672948630390294, "loss": 0.7308, "step": 50 }, { "epoch": 0.11933174224343675, "eval_loss": 0.8160228729248047, "eval_runtime": 52.5516, "eval_samples_per_second": 3.368, "eval_steps_per_second": 1.694, "step": 50 }, { "epoch": 0.12171837708830549, "grad_norm": 0.5184401273727417, "learning_rate": 0.00019656487088855592, "loss": 1.107, "step": 51 }, { "epoch": 0.12410501193317422, "grad_norm": 0.42614543437957764, "learning_rate": 0.00019639628606958533, "loss": 0.6306, "step": 52 }, { "epoch": 0.12649164677804295, "grad_norm": 0.2956556975841522, "learning_rate": 0.0001962237387768529, "loss": 0.7877, "step": 53 }, { "epoch": 0.1288782816229117, "grad_norm": 0.5040286779403687, "learning_rate": 0.00019604723610310194, "loss": 0.7839, "step": 54 }, { "epoch": 0.13126491646778043, "grad_norm": 0.4167059063911438, "learning_rate": 0.00019586678530366606, "loss": 0.86, "step": 55 }, { "epoch": 0.13126491646778043, "eval_loss": 0.8153512477874756, "eval_runtime": 52.5424, "eval_samples_per_second": 3.369, "eval_steps_per_second": 1.694, "step": 55 }, { "epoch": 0.13365155131264916, "grad_norm": 0.5592871308326721, "learning_rate": 0.00019568239379617088, "loss": 0.7068, "step": 56 }, { "epoch": 0.1360381861575179, "grad_norm": 0.5967664122581482, "learning_rate": 0.00019549406916022905, "loss": 0.9163, "step": 57 }, { "epoch": 0.13842482100238662, "grad_norm": 0.4031658470630646, "learning_rate": 0.00019530181913712872, "loss": 0.7602, "step": 58 }, { "epoch": 0.14081145584725538, "grad_norm": 0.6687312722206116, "learning_rate": 0.00019510565162951537, "loss": 0.8132, "step": 59 }, { "epoch": 0.1431980906921241, "grad_norm": 0.5811920166015625, "learning_rate": 0.00019490557470106686, "loss": 0.7686, "step": 60 }, { "epoch": 0.1431980906921241, "eval_loss": 0.8120850920677185, "eval_runtime": 52.5863, "eval_samples_per_second": 3.366, "eval_steps_per_second": 1.692, "step": 60 }, { "epoch": 0.14558472553699284, "grad_norm": 0.4957926273345947, "learning_rate": 0.00019470159657616215, "loss": 0.5649, "step": 61 }, { "epoch": 0.14797136038186157, "grad_norm": 0.8247906565666199, "learning_rate": 0.00019449372563954293, "loss": 1.1628, "step": 62 }, { "epoch": 0.15035799522673032, "grad_norm": 0.502251148223877, "learning_rate": 0.0001942819704359693, "loss": 0.7712, "step": 63 }, { "epoch": 0.15274463007159905, "grad_norm": 0.3308824300765991, "learning_rate": 0.00019406633966986828, "loss": 0.5516, "step": 64 }, { "epoch": 0.15513126491646778, "grad_norm": 0.5597787499427795, "learning_rate": 0.00019384684220497605, "loss": 0.556, "step": 65 }, { "epoch": 0.15513126491646778, "eval_loss": 0.8096857666969299, "eval_runtime": 52.5376, "eval_samples_per_second": 3.369, "eval_steps_per_second": 1.694, "step": 65 }, { "epoch": 0.1575178997613365, "grad_norm": 0.6705149412155151, "learning_rate": 0.00019362348706397373, "loss": 1.0559, "step": 66 }, { "epoch": 0.15990453460620524, "grad_norm": 0.5521374940872192, "learning_rate": 0.00019339628342811632, "loss": 0.7597, "step": 67 }, { "epoch": 0.162291169451074, "grad_norm": 0.5938752293586731, "learning_rate": 0.0001931652406368554, "loss": 0.8473, "step": 68 }, { "epoch": 0.16467780429594273, "grad_norm": 0.5228351950645447, "learning_rate": 0.0001929303681874552, "loss": 0.7279, "step": 69 }, { "epoch": 0.16706443914081145, "grad_norm": 0.6063805818557739, "learning_rate": 0.0001926916757346022, "loss": 1.2009, "step": 70 }, { "epoch": 0.16706443914081145, "eval_loss": 0.8046614527702332, "eval_runtime": 52.6343, "eval_samples_per_second": 3.363, "eval_steps_per_second": 1.691, "step": 70 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.946242191130624e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }