{ "best_metric": 0.8136073350906372, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 1.8772491989154547, "eval_steps": 25, "global_step": 119, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01577520335223071, "grad_norm": 0.5629048347473145, "learning_rate": 2e-05, "loss": 2.0216, "step": 1 }, { "epoch": 0.01577520335223071, "eval_loss": 1.9757364988327026, "eval_runtime": 0.9944, "eval_samples_per_second": 50.282, "eval_steps_per_second": 13.073, "step": 1 }, { "epoch": 0.03155040670446142, "grad_norm": 0.608745276927948, "learning_rate": 4e-05, "loss": 2.0373, "step": 2 }, { "epoch": 0.047325610056692134, "grad_norm": 0.6430224180221558, "learning_rate": 6e-05, "loss": 2.0016, "step": 3 }, { "epoch": 0.06310081340892285, "grad_norm": 0.6356592774391174, "learning_rate": 8e-05, "loss": 2.0053, "step": 4 }, { "epoch": 0.07887601676115356, "grad_norm": 0.7383120656013489, "learning_rate": 0.0001, "loss": 1.8959, "step": 5 }, { "epoch": 0.09465122011338427, "grad_norm": 0.8844524025917053, "learning_rate": 9.998291381612281e-05, "loss": 1.8298, "step": 6 }, { "epoch": 0.11042642346561499, "grad_norm": 0.8834840655326843, "learning_rate": 9.993166823949923e-05, "loss": 1.5862, "step": 7 }, { "epoch": 0.1262016268178457, "grad_norm": 0.8253166675567627, "learning_rate": 9.984630218530014e-05, "loss": 1.4341, "step": 8 }, { "epoch": 0.14197683017007642, "grad_norm": 0.6953087449073792, "learning_rate": 9.972688047930772e-05, "loss": 1.4089, "step": 9 }, { "epoch": 0.15775203352230713, "grad_norm": 0.6863667368888855, "learning_rate": 9.957349380868764e-05, "loss": 1.3484, "step": 10 }, { "epoch": 0.17352723687453783, "grad_norm": 0.5870503783226013, "learning_rate": 9.938625865312251e-05, "loss": 1.2239, "step": 11 }, { "epoch": 0.18930244022676854, "grad_norm": 0.5060000419616699, "learning_rate": 9.916531719635881e-05, "loss": 1.1336, "step": 12 }, { "epoch": 0.20507764357899927, "grad_norm": 0.5288716554641724, "learning_rate": 9.891083721823461e-05, "loss": 1.1815, "step": 13 }, { "epoch": 0.22085284693122997, "grad_norm": 0.5995634198188782, "learning_rate": 9.862301196726987e-05, "loss": 1.1409, "step": 14 }, { "epoch": 0.23662805028346068, "grad_norm": 0.5360584855079651, "learning_rate": 9.830206001391626e-05, "loss": 1.15, "step": 15 }, { "epoch": 0.2524032536356914, "grad_norm": 0.4800111651420593, "learning_rate": 9.794822508457784e-05, "loss": 1.1158, "step": 16 }, { "epoch": 0.2681784569879221, "grad_norm": 0.3739275634288788, "learning_rate": 9.756177587652856e-05, "loss": 1.0791, "step": 17 }, { "epoch": 0.28395366034015285, "grad_norm": 0.39843374490737915, "learning_rate": 9.714300585386747e-05, "loss": 1.0447, "step": 18 }, { "epoch": 0.29972886369238355, "grad_norm": 0.33565428853034973, "learning_rate": 9.669223302466608e-05, "loss": 1.0964, "step": 19 }, { "epoch": 0.31550406704461426, "grad_norm": 0.3283925950527191, "learning_rate": 9.620979969947759e-05, "loss": 1.0396, "step": 20 }, { "epoch": 0.33127927039684496, "grad_norm": 0.2815157175064087, "learning_rate": 9.5696072231391e-05, "loss": 1.0363, "step": 21 }, { "epoch": 0.34705447374907566, "grad_norm": 0.3215339779853821, "learning_rate": 9.515144073782774e-05, "loss": 1.0186, "step": 22 }, { "epoch": 0.36282967710130637, "grad_norm": 0.28140565752983093, "learning_rate": 9.4576318804292e-05, "loss": 0.9579, "step": 23 }, { "epoch": 0.3786048804535371, "grad_norm": 0.2168906331062317, "learning_rate": 9.397114317029975e-05, "loss": 0.8863, "step": 24 }, { "epoch": 0.39438008380576783, "grad_norm": 0.2551526725292206, "learning_rate": 9.333637339772472e-05, "loss": 0.9752, "step": 25 }, { "epoch": 0.39438008380576783, "eval_loss": 0.9391992688179016, "eval_runtime": 0.9989, "eval_samples_per_second": 50.057, "eval_steps_per_second": 13.015, "step": 25 }, { "epoch": 0.41015528715799854, "grad_norm": 0.4114759862422943, "learning_rate": 9.267249152181379e-05, "loss": 1.0444, "step": 26 }, { "epoch": 0.42593049051022924, "grad_norm": 0.4204542934894562, "learning_rate": 9.198000168513604e-05, "loss": 1.0266, "step": 27 }, { "epoch": 0.44170569386245995, "grad_norm": 0.3284596800804138, "learning_rate": 9.125942975474403e-05, "loss": 1.0057, "step": 28 }, { "epoch": 0.45748089721469065, "grad_norm": 0.2836671471595764, "learning_rate": 9.051132292283771e-05, "loss": 1.0299, "step": 29 }, { "epoch": 0.47325610056692136, "grad_norm": 0.22336989641189575, "learning_rate": 8.973624929123445e-05, "loss": 0.9243, "step": 30 }, { "epoch": 0.48903130391915206, "grad_norm": 0.24765413999557495, "learning_rate": 8.893479743996034e-05, "loss": 0.9547, "step": 31 }, { "epoch": 0.5048065072713828, "grad_norm": 0.2682327330112457, "learning_rate": 8.810757598029093e-05, "loss": 0.9908, "step": 32 }, { "epoch": 0.5205817106236135, "grad_norm": 0.22742092609405518, "learning_rate": 8.725521309258031e-05, "loss": 0.938, "step": 33 }, { "epoch": 0.5363569139758442, "grad_norm": 0.2611725330352783, "learning_rate": 8.637835604922979e-05, "loss": 0.9504, "step": 34 }, { "epoch": 0.5521321173280749, "grad_norm": 0.23787076771259308, "learning_rate": 8.547767072315835e-05, "loss": 0.9262, "step": 35 }, { "epoch": 0.5679073206803057, "grad_norm": 0.17593003809452057, "learning_rate": 8.455384108214805e-05, "loss": 0.9063, "step": 36 }, { "epoch": 0.5836825240325364, "grad_norm": 0.1923958659172058, "learning_rate": 8.360756866944858e-05, "loss": 0.8857, "step": 37 }, { "epoch": 0.5994577273847671, "grad_norm": 0.22798652946949005, "learning_rate": 8.263957207103507e-05, "loss": 0.9305, "step": 38 }, { "epoch": 0.6152329307369978, "grad_norm": 0.26211604475975037, "learning_rate": 8.165058636992411e-05, "loss": 0.9826, "step": 39 }, { "epoch": 0.6310081340892285, "grad_norm": 0.23822127282619476, "learning_rate": 8.064136258796198e-05, "loss": 0.9387, "step": 40 }, { "epoch": 0.6467833374414592, "grad_norm": 0.18069308996200562, "learning_rate": 7.961266711550922e-05, "loss": 0.9238, "step": 41 }, { "epoch": 0.6625585407936899, "grad_norm": 0.1730770766735077, "learning_rate": 7.856528112945452e-05, "loss": 0.9561, "step": 42 }, { "epoch": 0.6783337441459206, "grad_norm": 0.21277105808258057, "learning_rate": 7.75e-05, "loss": 0.9098, "step": 43 }, { "epoch": 0.6941089474981513, "grad_norm": 0.26214319467544556, "learning_rate": 7.641763268666831e-05, "loss": 0.8907, "step": 44 }, { "epoch": 0.709884150850382, "grad_norm": 0.26092496514320374, "learning_rate": 7.531900112399004e-05, "loss": 0.8726, "step": 45 }, { "epoch": 0.7256593542026127, "grad_norm": 0.19305089116096497, "learning_rate": 7.420493959733816e-05, "loss": 0.9292, "step": 46 }, { "epoch": 0.7414345575548434, "grad_norm": 0.18803110718727112, "learning_rate": 7.307629410938363e-05, "loss": 0.8926, "step": 47 }, { "epoch": 0.7572097609070741, "grad_norm": 0.18940120935440063, "learning_rate": 7.193392173765261e-05, "loss": 0.8856, "step": 48 }, { "epoch": 0.7729849642593049, "grad_norm": 0.20543551445007324, "learning_rate": 7.077868998367395e-05, "loss": 0.8136, "step": 49 }, { "epoch": 0.7887601676115357, "grad_norm": 0.2140730768442154, "learning_rate": 6.961147611421075e-05, "loss": 0.7804, "step": 50 }, { "epoch": 0.7887601676115357, "eval_loss": 0.8551968932151794, "eval_runtime": 0.9944, "eval_samples_per_second": 50.281, "eval_steps_per_second": 13.073, "step": 50 }, { "epoch": 0.8045353709637664, "grad_norm": 0.9923269152641296, "learning_rate": 6.843316649507626e-05, "loss": 1.0079, "step": 51 }, { "epoch": 0.8203105743159971, "grad_norm": 0.41361427307128906, "learning_rate": 6.724465591804008e-05, "loss": 0.8992, "step": 52 }, { "epoch": 0.8360857776682278, "grad_norm": 0.22296248376369476, "learning_rate": 6.604684692133597e-05, "loss": 0.9351, "step": 53 }, { "epoch": 0.8518609810204585, "grad_norm": 0.1803758591413498, "learning_rate": 6.484064910428692e-05, "loss": 0.8927, "step": 54 }, { "epoch": 0.8676361843726892, "grad_norm": 0.31560787558555603, "learning_rate": 6.362697843656823e-05, "loss": 0.8899, "step": 55 }, { "epoch": 0.8834113877249199, "grad_norm": 0.32848024368286133, "learning_rate": 6.240675656263303e-05, "loss": 0.8834, "step": 56 }, { "epoch": 0.8991865910771506, "grad_norm": 0.24373023211956024, "learning_rate": 6.118091010182837e-05, "loss": 0.8762, "step": 57 }, { "epoch": 0.9149617944293813, "grad_norm": 0.20689459145069122, "learning_rate": 5.995036994473357e-05, "loss": 0.8839, "step": 58 }, { "epoch": 0.930736997781612, "grad_norm": 0.16526438295841217, "learning_rate": 5.8716070546254966e-05, "loss": 0.8477, "step": 59 }, { "epoch": 0.9465122011338427, "grad_norm": 0.2062149941921234, "learning_rate": 5.747894921601396e-05, "loss": 0.856, "step": 60 }, { "epoch": 0.9622874044860734, "grad_norm": 0.2273935228586197, "learning_rate": 5.62399454065673e-05, "loss": 0.8561, "step": 61 }, { "epoch": 0.9780626078383041, "grad_norm": 0.21312715113162994, "learning_rate": 5.500000000000001e-05, "loss": 0.8319, "step": 62 }, { "epoch": 0.9938378111905348, "grad_norm": 0.2801797688007355, "learning_rate": 5.376005459343272e-05, "loss": 0.8702, "step": 63 }, { "epoch": 1.0096130145427655, "grad_norm": 0.40973302721977234, "learning_rate": 5.2521050783986046e-05, "loss": 1.4046, "step": 64 }, { "epoch": 1.0253882178949962, "grad_norm": 0.299440860748291, "learning_rate": 5.128392945374505e-05, "loss": 0.893, "step": 65 }, { "epoch": 1.041163421247227, "grad_norm": 0.23597873747348785, "learning_rate": 5.004963005526644e-05, "loss": 0.9389, "step": 66 }, { "epoch": 1.0569386245994576, "grad_norm": 0.18155038356781006, "learning_rate": 4.881908989817163e-05, "loss": 0.8973, "step": 67 }, { "epoch": 1.0727138279516883, "grad_norm": 0.27366727590560913, "learning_rate": 4.7593243437366975e-05, "loss": 0.8746, "step": 68 }, { "epoch": 1.088489031303919, "grad_norm": 0.3317013084888458, "learning_rate": 4.6373021563431784e-05, "loss": 0.8609, "step": 69 }, { "epoch": 1.1042642346561498, "grad_norm": 0.35338032245635986, "learning_rate": 4.515935089571309e-05, "loss": 0.8783, "step": 70 }, { "epoch": 1.1200394380083805, "grad_norm": 0.24775464832782745, "learning_rate": 4.395315307866405e-05, "loss": 0.8472, "step": 71 }, { "epoch": 1.1358146413606112, "grad_norm": 0.20005960762500763, "learning_rate": 4.275534408195991e-05, "loss": 0.8236, "step": 72 }, { "epoch": 1.151589844712842, "grad_norm": 0.19893726706504822, "learning_rate": 4.156683350492376e-05, "loss": 0.8198, "step": 73 }, { "epoch": 1.1673650480650728, "grad_norm": 0.20478643476963043, "learning_rate": 4.0388523885789256e-05, "loss": 0.844, "step": 74 }, { "epoch": 1.1831402514173035, "grad_norm": 0.19046536087989807, "learning_rate": 3.922131001632606e-05, "loss": 0.7865, "step": 75 }, { "epoch": 1.1831402514173035, "eval_loss": 0.8290574550628662, "eval_runtime": 1.0154, "eval_samples_per_second": 49.241, "eval_steps_per_second": 12.803, "step": 75 }, { "epoch": 1.1989154547695342, "grad_norm": 0.2349046766757965, "learning_rate": 3.8066078262347406e-05, "loss": 0.8493, "step": 76 }, { "epoch": 1.214690658121765, "grad_norm": 0.3539714217185974, "learning_rate": 3.692370589061639e-05, "loss": 0.9299, "step": 77 }, { "epoch": 1.2304658614739956, "grad_norm": 0.33250951766967773, "learning_rate": 3.579506040266184e-05, "loss": 0.9057, "step": 78 }, { "epoch": 1.2462410648262263, "grad_norm": 0.3029455542564392, "learning_rate": 3.468099887600999e-05, "loss": 0.8519, "step": 79 }, { "epoch": 1.262016268178457, "grad_norm": 0.21666352450847626, "learning_rate": 3.358236731333169e-05, "loss": 0.9029, "step": 80 }, { "epoch": 1.2777914715306877, "grad_norm": 0.18449752032756805, "learning_rate": 3.250000000000001e-05, "loss": 0.8616, "step": 81 }, { "epoch": 1.2935666748829184, "grad_norm": 0.18198740482330322, "learning_rate": 3.14347188705455e-05, "loss": 0.8339, "step": 82 }, { "epoch": 1.3093418782351491, "grad_norm": 0.20003017783164978, "learning_rate": 3.0387332884490805e-05, "loss": 0.8547, "step": 83 }, { "epoch": 1.3251170815873798, "grad_norm": 0.3120158910751343, "learning_rate": 2.9358637412038027e-05, "loss": 0.8515, "step": 84 }, { "epoch": 1.3408922849396105, "grad_norm": 0.289815753698349, "learning_rate": 2.8349413630075906e-05, "loss": 0.8998, "step": 85 }, { "epoch": 1.3566674882918413, "grad_norm": 0.31206515431404114, "learning_rate": 2.736042792896495e-05, "loss": 0.8424, "step": 86 }, { "epoch": 1.372442691644072, "grad_norm": 0.32302799820899963, "learning_rate": 2.639243133055145e-05, "loss": 0.8251, "step": 87 }, { "epoch": 1.3882178949963027, "grad_norm": 0.27142566442489624, "learning_rate": 2.5446158917851958e-05, "loss": 0.7932, "step": 88 }, { "epoch": 1.4039930983485334, "grad_norm": 0.17222517728805542, "learning_rate": 2.4522329276841663e-05, "loss": 0.8706, "step": 89 }, { "epoch": 1.419768301700764, "grad_norm": 0.24352790415287018, "learning_rate": 2.362164395077021e-05, "loss": 0.8768, "step": 90 }, { "epoch": 1.4355435050529948, "grad_norm": 0.27131199836730957, "learning_rate": 2.2744786907419703e-05, "loss": 0.9101, "step": 91 }, { "epoch": 1.4513187084052255, "grad_norm": 0.25456181168556213, "learning_rate": 2.189242401970908e-05, "loss": 0.9031, "step": 92 }, { "epoch": 1.4670939117574562, "grad_norm": 0.23848816752433777, "learning_rate": 2.1065202560039677e-05, "loss": 0.8784, "step": 93 }, { "epoch": 1.4828691151096869, "grad_norm": 0.23838257789611816, "learning_rate": 2.026375070876556e-05, "loss": 0.8914, "step": 94 }, { "epoch": 1.4986443184619176, "grad_norm": 0.21126312017440796, "learning_rate": 1.9488677077162295e-05, "loss": 0.8688, "step": 95 }, { "epoch": 1.5144195218141485, "grad_norm": 0.20426392555236816, "learning_rate": 1.8740570245255984e-05, "loss": 0.8531, "step": 96 }, { "epoch": 1.5301947251663792, "grad_norm": 0.18937963247299194, "learning_rate": 1.8019998314863974e-05, "loss": 0.9092, "step": 97 }, { "epoch": 1.54596992851861, "grad_norm": 0.1737300157546997, "learning_rate": 1.7327508478186218e-05, "loss": 0.8571, "step": 98 }, { "epoch": 1.5617451318708406, "grad_norm": 0.17040841281414032, "learning_rate": 1.6663626602275288e-05, "loss": 0.8698, "step": 99 }, { "epoch": 1.5775203352230713, "grad_norm": 0.1793239712715149, "learning_rate": 1.602885682970026e-05, "loss": 0.7618, "step": 100 }, { "epoch": 1.5775203352230713, "eval_loss": 0.8136073350906372, "eval_runtime": 0.9915, "eval_samples_per_second": 50.43, "eval_steps_per_second": 13.112, "step": 100 }, { "epoch": 1.593295538575302, "grad_norm": 0.23437979817390442, "learning_rate": 1.5423681195707997e-05, "loss": 0.8343, "step": 101 }, { "epoch": 1.6090707419275327, "grad_norm": 0.23647752404212952, "learning_rate": 1.484855926217227e-05, "loss": 0.9165, "step": 102 }, { "epoch": 1.6248459452797634, "grad_norm": 0.18379107117652893, "learning_rate": 1.4303927768609015e-05, "loss": 0.8544, "step": 103 }, { "epoch": 1.6406211486319942, "grad_norm": 0.18732428550720215, "learning_rate": 1.3790200300522413e-05, "loss": 0.875, "step": 104 }, { "epoch": 1.6563963519842249, "grad_norm": 0.16876737773418427, "learning_rate": 1.330776697533392e-05, "loss": 0.8525, "step": 105 }, { "epoch": 1.6721715553364556, "grad_norm": 0.1615445613861084, "learning_rate": 1.2856994146132542e-05, "loss": 0.8207, "step": 106 }, { "epoch": 1.6879467586886863, "grad_norm": 0.17801810801029205, "learning_rate": 1.2438224123471442e-05, "loss": 0.8531, "step": 107 }, { "epoch": 1.703721962040917, "grad_norm": 0.18149054050445557, "learning_rate": 1.2051774915422163e-05, "loss": 0.9099, "step": 108 }, { "epoch": 1.7194971653931477, "grad_norm": 0.1865503042936325, "learning_rate": 1.1697939986083733e-05, "loss": 0.8604, "step": 109 }, { "epoch": 1.7352723687453784, "grad_norm": 0.18773400783538818, "learning_rate": 1.1376988032730134e-05, "loss": 0.8526, "step": 110 }, { "epoch": 1.751047572097609, "grad_norm": 0.19192053377628326, "learning_rate": 1.1089162781765398e-05, "loss": 0.833, "step": 111 }, { "epoch": 1.7668227754498398, "grad_norm": 0.23631104826927185, "learning_rate": 1.0834682803641197e-05, "loss": 0.8052, "step": 112 }, { "epoch": 1.7825979788020705, "grad_norm": 0.22724592685699463, "learning_rate": 1.0613741346877497e-05, "loss": 0.804, "step": 113 }, { "epoch": 1.7983731821543012, "grad_norm": 0.1648131012916565, "learning_rate": 1.0426506191312355e-05, "loss": 0.8892, "step": 114 }, { "epoch": 1.814148385506532, "grad_norm": 0.19587408006191254, "learning_rate": 1.0273119520692275e-05, "loss": 0.8797, "step": 115 }, { "epoch": 1.8299235888587626, "grad_norm": 0.20268595218658447, "learning_rate": 1.0153697814699859e-05, "loss": 0.8665, "step": 116 }, { "epoch": 1.8456987922109933, "grad_norm": 0.2090834677219391, "learning_rate": 1.0068331760500774e-05, "loss": 0.8841, "step": 117 }, { "epoch": 1.861473995563224, "grad_norm": 0.18287914991378784, "learning_rate": 1.0017086183877188e-05, "loss": 0.8637, "step": 118 }, { "epoch": 1.8772491989154547, "grad_norm": 0.18256083130836487, "learning_rate": 1e-05, "loss": 0.8543, "step": 119 } ], "logging_steps": 1, "max_steps": 119, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 20, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3299091053509345e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }