{ "best_metric": 0.32027119398117065, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 2.9937106918238996, "eval_steps": 50, "global_step": 119, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025157232704402517, "grad_norm": 10.445202827453613, "learning_rate": 1e-05, "loss": 1.8113, "step": 1 }, { "epoch": 0.025157232704402517, "eval_loss": 2.903780698776245, "eval_runtime": 1.3578, "eval_samples_per_second": 49.344, "eval_steps_per_second": 12.52, "step": 1 }, { "epoch": 0.050314465408805034, "grad_norm": 18.712600708007812, "learning_rate": 2e-05, "loss": 2.412, "step": 2 }, { "epoch": 0.07547169811320754, "grad_norm": 13.591891288757324, "learning_rate": 3e-05, "loss": 2.6184, "step": 3 }, { "epoch": 0.10062893081761007, "grad_norm": 6.334252834320068, "learning_rate": 4e-05, "loss": 2.2968, "step": 4 }, { "epoch": 0.12578616352201258, "grad_norm": 8.629500389099121, "learning_rate": 5e-05, "loss": 2.6715, "step": 5 }, { "epoch": 0.1509433962264151, "grad_norm": 6.428257465362549, "learning_rate": 6e-05, "loss": 2.3585, "step": 6 }, { "epoch": 0.1761006289308176, "grad_norm": 6.464021682739258, "learning_rate": 7e-05, "loss": 2.2552, "step": 7 }, { "epoch": 0.20125786163522014, "grad_norm": 14.248180389404297, "learning_rate": 8e-05, "loss": 3.8528, "step": 8 }, { "epoch": 0.22641509433962265, "grad_norm": 15.20981216430664, "learning_rate": 9e-05, "loss": 3.4106, "step": 9 }, { "epoch": 0.25157232704402516, "grad_norm": 5.780796051025391, "learning_rate": 0.0001, "loss": 1.1669, "step": 10 }, { "epoch": 0.27672955974842767, "grad_norm": 5.746275424957275, "learning_rate": 9.997923381619256e-05, "loss": 1.1579, "step": 11 }, { "epoch": 0.3018867924528302, "grad_norm": 2.8907012939453125, "learning_rate": 9.991695251414583e-05, "loss": 1.0849, "step": 12 }, { "epoch": 0.3270440251572327, "grad_norm": 1.9942095279693604, "learning_rate": 9.981320782765846e-05, "loss": 1.0375, "step": 13 }, { "epoch": 0.3522012578616352, "grad_norm": 2.2817931175231934, "learning_rate": 9.966808593197959e-05, "loss": 0.9995, "step": 14 }, { "epoch": 0.37735849056603776, "grad_norm": 2.029221773147583, "learning_rate": 9.948170737222762e-05, "loss": 0.8702, "step": 15 }, { "epoch": 0.4025157232704403, "grad_norm": 2.2605092525482178, "learning_rate": 9.925422696325975e-05, "loss": 0.8997, "step": 16 }, { "epoch": 0.4276729559748428, "grad_norm": 6.524753093719482, "learning_rate": 9.898583366107538e-05, "loss": 1.0832, "step": 17 }, { "epoch": 0.4528301886792453, "grad_norm": 4.861238479614258, "learning_rate": 9.867675040586034e-05, "loss": 1.2063, "step": 18 }, { "epoch": 0.4779874213836478, "grad_norm": 1.9360491037368774, "learning_rate": 9.83272339368022e-05, "loss": 0.8503, "step": 19 }, { "epoch": 0.5031446540880503, "grad_norm": 1.9203150272369385, "learning_rate": 9.793757457883062e-05, "loss": 0.8055, "step": 20 }, { "epoch": 0.5283018867924528, "grad_norm": 1.335469126701355, "learning_rate": 9.750809600145954e-05, "loss": 0.8518, "step": 21 }, { "epoch": 0.5534591194968553, "grad_norm": 1.137628197669983, "learning_rate": 9.703915494993215e-05, "loss": 0.7453, "step": 22 }, { "epoch": 0.5786163522012578, "grad_norm": 1.0536383390426636, "learning_rate": 9.653114094889127e-05, "loss": 0.7192, "step": 23 }, { "epoch": 0.6037735849056604, "grad_norm": 1.5118601322174072, "learning_rate": 9.598447597882181e-05, "loss": 0.7093, "step": 24 }, { "epoch": 0.6289308176100629, "grad_norm": 1.6779704093933105, "learning_rate": 9.539961412553375e-05, "loss": 0.7177, "step": 25 }, { "epoch": 0.6540880503144654, "grad_norm": 2.0891356468200684, "learning_rate": 9.477704120297697e-05, "loss": 0.8265, "step": 26 }, { "epoch": 0.6792452830188679, "grad_norm": 3.018578052520752, "learning_rate": 9.411727434970121e-05, "loss": 1.1182, "step": 27 }, { "epoch": 0.7044025157232704, "grad_norm": 1.017041563987732, "learning_rate": 9.34208615992963e-05, "loss": 0.6662, "step": 28 }, { "epoch": 0.7295597484276729, "grad_norm": 1.0053677558898926, "learning_rate": 9.268838142516943e-05, "loss": 0.5747, "step": 29 }, { "epoch": 0.7547169811320755, "grad_norm": 1.353121280670166, "learning_rate": 9.192044226003789e-05, "loss": 0.6665, "step": 30 }, { "epoch": 0.779874213836478, "grad_norm": 1.0210703611373901, "learning_rate": 9.111768199053588e-05, "loss": 0.5584, "step": 31 }, { "epoch": 0.8050314465408805, "grad_norm": 0.9525967836380005, "learning_rate": 9.028076742735583e-05, "loss": 0.5162, "step": 32 }, { "epoch": 0.8301886792452831, "grad_norm": 1.384174108505249, "learning_rate": 8.941039375136371e-05, "loss": 0.6165, "step": 33 }, { "epoch": 0.8553459119496856, "grad_norm": 1.3128784894943237, "learning_rate": 8.850728393614902e-05, "loss": 0.6381, "step": 34 }, { "epoch": 0.8805031446540881, "grad_norm": 2.184060573577881, "learning_rate": 8.75721881474886e-05, "loss": 0.6739, "step": 35 }, { "epoch": 0.9056603773584906, "grad_norm": 3.2247838973999023, "learning_rate": 8.660588312022344e-05, "loss": 1.0203, "step": 36 }, { "epoch": 0.9308176100628931, "grad_norm": 1.1204508543014526, "learning_rate": 8.560917151306593e-05, "loss": 0.6185, "step": 37 }, { "epoch": 0.9559748427672956, "grad_norm": 1.065934658050537, "learning_rate": 8.458288124187359e-05, "loss": 0.4873, "step": 38 }, { "epoch": 0.9811320754716981, "grad_norm": 1.583658218383789, "learning_rate": 8.352786479194288e-05, "loss": 0.4867, "step": 39 }, { "epoch": 1.0062893081761006, "grad_norm": 2.8070600032806396, "learning_rate": 8.244499850989452e-05, "loss": 0.8313, "step": 40 }, { "epoch": 1.0314465408805031, "grad_norm": 0.8271421194076538, "learning_rate": 8.133518187573862e-05, "loss": 0.4702, "step": 41 }, { "epoch": 1.0566037735849056, "grad_norm": 0.8070173263549805, "learning_rate": 8.019933675572389e-05, "loss": 0.4361, "step": 42 }, { "epoch": 1.0817610062893082, "grad_norm": 2.6176536083221436, "learning_rate": 7.903840663659186e-05, "loss": 0.4341, "step": 43 }, { "epoch": 1.1069182389937107, "grad_norm": 0.9174895286560059, "learning_rate": 7.785335584187219e-05, "loss": 0.3583, "step": 44 }, { "epoch": 1.1320754716981132, "grad_norm": 0.8380503058433533, "learning_rate": 7.664516873086987e-05, "loss": 0.3898, "step": 45 }, { "epoch": 1.1572327044025157, "grad_norm": 0.9413546919822693, "learning_rate": 7.541484888100974e-05, "loss": 0.4148, "step": 46 }, { "epoch": 1.1823899371069182, "grad_norm": 1.0667158365249634, "learning_rate": 7.416341825421754e-05, "loss": 0.4358, "step": 47 }, { "epoch": 1.2075471698113207, "grad_norm": 1.6355628967285156, "learning_rate": 7.289191634803003e-05, "loss": 0.535, "step": 48 }, { "epoch": 1.2327044025157232, "grad_norm": 2.1079909801483154, "learning_rate": 7.160139933213898e-05, "loss": 0.5971, "step": 49 }, { "epoch": 1.2578616352201257, "grad_norm": 0.9688711166381836, "learning_rate": 7.029293917108678e-05, "loss": 0.3642, "step": 50 }, { "epoch": 1.2578616352201257, "eval_loss": 0.40250465273857117, "eval_runtime": 1.357, "eval_samples_per_second": 49.373, "eval_steps_per_second": 12.527, "step": 50 }, { "epoch": 1.2830188679245282, "grad_norm": 0.8682835102081299, "learning_rate": 6.896762273384178e-05, "loss": 0.3127, "step": 51 }, { "epoch": 1.3081761006289307, "grad_norm": 0.8948720693588257, "learning_rate": 6.762655089099353e-05, "loss": 0.3749, "step": 52 }, { "epoch": 1.3333333333333333, "grad_norm": 0.8554229736328125, "learning_rate": 6.627083760031754e-05, "loss": 0.3637, "step": 53 }, { "epoch": 1.3584905660377358, "grad_norm": 0.9757639765739441, "learning_rate": 6.490160898146918e-05, "loss": 0.2847, "step": 54 }, { "epoch": 1.3836477987421385, "grad_norm": 1.135493516921997, "learning_rate": 6.35200023805754e-05, "loss": 0.3686, "step": 55 }, { "epoch": 1.408805031446541, "grad_norm": 1.8949108123779297, "learning_rate": 6.212716542550112e-05, "loss": 0.3382, "step": 56 }, { "epoch": 1.4339622641509435, "grad_norm": 1.7161883115768433, "learning_rate": 6.0724255072575275e-05, "loss": 0.4808, "step": 57 }, { "epoch": 1.459119496855346, "grad_norm": 1.8359076976776123, "learning_rate": 5.931243664556803e-05, "loss": 0.5259, "step": 58 }, { "epoch": 1.4842767295597485, "grad_norm": 0.7648423314094543, "learning_rate": 5.78928828677177e-05, "loss": 0.3778, "step": 59 }, { "epoch": 1.509433962264151, "grad_norm": 0.8067821860313416, "learning_rate": 5.646677288761132e-05, "loss": 0.3448, "step": 60 }, { "epoch": 1.5345911949685536, "grad_norm": 0.7813786864280701, "learning_rate": 5.503529129972792e-05, "loss": 0.339, "step": 61 }, { "epoch": 1.559748427672956, "grad_norm": 0.7792041897773743, "learning_rate": 5.359962716045835e-05, "loss": 0.2644, "step": 62 }, { "epoch": 1.5849056603773586, "grad_norm": 0.9082090258598328, "learning_rate": 5.21609730004187e-05, "loss": 0.2565, "step": 63 }, { "epoch": 1.610062893081761, "grad_norm": 0.9733455777168274, "learning_rate": 5.072052383387786e-05, "loss": 0.3213, "step": 64 }, { "epoch": 1.6352201257861636, "grad_norm": 0.960435152053833, "learning_rate": 4.927947616612215e-05, "loss": 0.3019, "step": 65 }, { "epoch": 1.6603773584905661, "grad_norm": 1.6576495170593262, "learning_rate": 4.7839026999581296e-05, "loss": 0.4513, "step": 66 }, { "epoch": 1.6855345911949686, "grad_norm": 1.8107895851135254, "learning_rate": 4.640037283954165e-05, "loss": 0.4877, "step": 67 }, { "epoch": 1.7106918238993711, "grad_norm": 0.639157772064209, "learning_rate": 4.496470870027209e-05, "loss": 0.2969, "step": 68 }, { "epoch": 1.7358490566037736, "grad_norm": 0.8809495568275452, "learning_rate": 4.3533227112388694e-05, "loss": 0.3093, "step": 69 }, { "epoch": 1.7610062893081762, "grad_norm": 0.8104914426803589, "learning_rate": 4.21071171322823e-05, "loss": 0.3098, "step": 70 }, { "epoch": 1.7861635220125787, "grad_norm": 0.8440978527069092, "learning_rate": 4.0687563354431984e-05, "loss": 0.2953, "step": 71 }, { "epoch": 1.8113207547169812, "grad_norm": 0.8337366580963135, "learning_rate": 3.927574492742473e-05, "loss": 0.2859, "step": 72 }, { "epoch": 1.8364779874213837, "grad_norm": 0.8279858231544495, "learning_rate": 3.78728345744989e-05, "loss": 0.2481, "step": 73 }, { "epoch": 1.8616352201257862, "grad_norm": 0.9654554724693298, "learning_rate": 3.6479997619424605e-05, "loss": 0.2819, "step": 74 }, { "epoch": 1.8867924528301887, "grad_norm": 1.6905914545059204, "learning_rate": 3.5098391018530816e-05, "loss": 0.5995, "step": 75 }, { "epoch": 1.9119496855345912, "grad_norm": 1.8605477809906006, "learning_rate": 3.3729162399682456e-05, "loss": 0.4759, "step": 76 }, { "epoch": 1.9371069182389937, "grad_norm": 0.7602652907371521, "learning_rate": 3.237344910900648e-05, "loss": 0.3107, "step": 77 }, { "epoch": 1.9622641509433962, "grad_norm": 0.8071399331092834, "learning_rate": 3.103237726615822e-05, "loss": 0.2663, "step": 78 }, { "epoch": 1.9874213836477987, "grad_norm": 1.6272022724151611, "learning_rate": 2.9707060828913225e-05, "loss": 0.409, "step": 79 }, { "epoch": 2.0125786163522013, "grad_norm": 1.3061223030090332, "learning_rate": 2.839860066786103e-05, "loss": 0.4409, "step": 80 }, { "epoch": 2.0377358490566038, "grad_norm": 0.5977674126625061, "learning_rate": 2.710808365197e-05, "loss": 0.2442, "step": 81 }, { "epoch": 2.0628930817610063, "grad_norm": 0.6834774017333984, "learning_rate": 2.5836581745782475e-05, "loss": 0.2757, "step": 82 }, { "epoch": 2.088050314465409, "grad_norm": 0.7066735625267029, "learning_rate": 2.4585151118990286e-05, "loss": 0.2006, "step": 83 }, { "epoch": 2.1132075471698113, "grad_norm": 0.6501676440238953, "learning_rate": 2.3354831269130133e-05, "loss": 0.2125, "step": 84 }, { "epoch": 2.138364779874214, "grad_norm": 0.6182856559753418, "learning_rate": 2.2146644158127827e-05, "loss": 0.1671, "step": 85 }, { "epoch": 2.1635220125786163, "grad_norm": 0.8084297180175781, "learning_rate": 2.0961593363408156e-05, "loss": 0.2286, "step": 86 }, { "epoch": 2.188679245283019, "grad_norm": 0.7915740609169006, "learning_rate": 1.980066324427613e-05, "loss": 0.186, "step": 87 }, { "epoch": 2.2138364779874213, "grad_norm": 1.525448203086853, "learning_rate": 1.8664818124261374e-05, "loss": 0.3382, "step": 88 }, { "epoch": 2.238993710691824, "grad_norm": 1.2538678646087646, "learning_rate": 1.7555001490105488e-05, "loss": 0.3341, "step": 89 }, { "epoch": 2.2641509433962264, "grad_norm": 0.5898963212966919, "learning_rate": 1.6472135208057126e-05, "loss": 0.1937, "step": 90 }, { "epoch": 2.289308176100629, "grad_norm": 0.7369837760925293, "learning_rate": 1.541711875812641e-05, "loss": 0.2273, "step": 91 }, { "epoch": 2.3144654088050314, "grad_norm": 0.7151015996932983, "learning_rate": 1.439082848693406e-05, "loss": 0.2349, "step": 92 }, { "epoch": 2.339622641509434, "grad_norm": 0.6623278260231018, "learning_rate": 1.339411687977657e-05, "loss": 0.1702, "step": 93 }, { "epoch": 2.3647798742138364, "grad_norm": 0.731364369392395, "learning_rate": 1.2427811852511395e-05, "loss": 0.2054, "step": 94 }, { "epoch": 2.389937106918239, "grad_norm": 1.0562483072280884, "learning_rate": 1.1492716063850973e-05, "loss": 0.2254, "step": 95 }, { "epoch": 2.4150943396226414, "grad_norm": 1.070011854171753, "learning_rate": 1.0589606248636292e-05, "loss": 0.2412, "step": 96 }, { "epoch": 2.440251572327044, "grad_norm": 1.4945615530014038, "learning_rate": 9.719232572644187e-06, "loss": 0.297, "step": 97 }, { "epoch": 2.4654088050314464, "grad_norm": 1.1410236358642578, "learning_rate": 8.882318009464125e-06, "loss": 0.3131, "step": 98 }, { "epoch": 2.490566037735849, "grad_norm": 0.6251071095466614, "learning_rate": 8.079557739962128e-06, "loss": 0.2219, "step": 99 }, { "epoch": 2.5157232704402515, "grad_norm": 0.7983537912368774, "learning_rate": 7.31161857483057e-06, "loss": 0.2371, "step": 100 }, { "epoch": 2.5157232704402515, "eval_loss": 0.32027119398117065, "eval_runtime": 1.3603, "eval_samples_per_second": 49.255, "eval_steps_per_second": 12.498, "step": 100 }, { "epoch": 2.540880503144654, "grad_norm": 0.7304105758666992, "learning_rate": 6.579138400703716e-06, "loss": 0.2125, "step": 101 }, { "epoch": 2.5660377358490565, "grad_norm": 0.6842655539512634, "learning_rate": 5.882725650298787e-06, "loss": 0.176, "step": 102 }, { "epoch": 2.591194968553459, "grad_norm": 0.8605983853340149, "learning_rate": 5.222958797023036e-06, "loss": 0.2162, "step": 103 }, { "epoch": 2.6163522012578615, "grad_norm": 0.7604213356971741, "learning_rate": 4.600385874466256e-06, "loss": 0.1974, "step": 104 }, { "epoch": 2.641509433962264, "grad_norm": 1.07295560836792, "learning_rate": 4.015524021178196e-06, "loss": 0.2702, "step": 105 }, { "epoch": 2.6666666666666665, "grad_norm": 1.6208587884902954, "learning_rate": 3.4688590511087304e-06, "loss": 0.3159, "step": 106 }, { "epoch": 2.691823899371069, "grad_norm": 1.2537561655044556, "learning_rate": 2.9608450500678565e-06, "loss": 0.3131, "step": 107 }, { "epoch": 2.7169811320754715, "grad_norm": 0.7774904370307922, "learning_rate": 2.4919039985404626e-06, "loss": 0.2692, "step": 108 }, { "epoch": 2.742138364779874, "grad_norm": 0.7653168439865112, "learning_rate": 2.0624254211693894e-06, "loss": 0.2267, "step": 109 }, { "epoch": 2.767295597484277, "grad_norm": 0.7065545320510864, "learning_rate": 1.6727660631977893e-06, "loss": 0.2014, "step": 110 }, { "epoch": 2.7924528301886795, "grad_norm": 0.7866085767745972, "learning_rate": 1.3232495941396639e-06, "loss": 0.1873, "step": 111 }, { "epoch": 2.817610062893082, "grad_norm": 0.7286617159843445, "learning_rate": 1.014166338924627e-06, "loss": 0.165, "step": 112 }, { "epoch": 2.8427672955974845, "grad_norm": 0.9083012938499451, "learning_rate": 7.457730367402549e-07, "loss": 0.23, "step": 113 }, { "epoch": 2.867924528301887, "grad_norm": 0.7802666425704956, "learning_rate": 5.18292627772382e-07, "loss": 0.1645, "step": 114 }, { "epoch": 2.8930817610062896, "grad_norm": 1.510367512702942, "learning_rate": 3.3191406802041693e-07, "loss": 0.3319, "step": 115 }, { "epoch": 2.918238993710692, "grad_norm": 1.282609462738037, "learning_rate": 1.8679217234154334e-07, "loss": 0.3489, "step": 116 }, { "epoch": 2.9433962264150946, "grad_norm": 0.7282307744026184, "learning_rate": 8.304748585417078e-08, "loss": 0.234, "step": 117 }, { "epoch": 2.968553459119497, "grad_norm": 3.293064832687378, "learning_rate": 2.076618380744133e-08, "loss": 0.1896, "step": 118 }, { "epoch": 2.9937106918238996, "grad_norm": 1.658855676651001, "learning_rate": 0.0, "loss": 0.3107, "step": 119 } ], "logging_steps": 1, "max_steps": 119, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.531779857403085e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }