{ "best_metric": 1.3449122905731201, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 1.1450094161958568, "eval_steps": 25, "global_step": 133, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008609093354856066, "grad_norm": 0.6988075971603394, "learning_rate": 2e-05, "loss": 1.7571, "step": 1 }, { "epoch": 0.008609093354856066, "eval_loss": 3.2656900882720947, "eval_runtime": 1.2466, "eval_samples_per_second": 40.11, "eval_steps_per_second": 10.429, "step": 1 }, { "epoch": 0.017218186709712133, "grad_norm": 0.7088323831558228, "learning_rate": 4e-05, "loss": 2.0087, "step": 2 }, { "epoch": 0.0258272800645682, "grad_norm": 1.009822130203247, "learning_rate": 6e-05, "loss": 2.4702, "step": 3 }, { "epoch": 0.034436373419424265, "grad_norm": 0.9133837819099426, "learning_rate": 8e-05, "loss": 2.5742, "step": 4 }, { "epoch": 0.04304546677428033, "grad_norm": 1.0774202346801758, "learning_rate": 0.0001, "loss": 2.4651, "step": 5 }, { "epoch": 0.0516545601291364, "grad_norm": 1.940169095993042, "learning_rate": 9.99864468413292e-05, "loss": 2.7402, "step": 6 }, { "epoch": 0.060263653483992465, "grad_norm": 2.5114591121673584, "learning_rate": 9.994579552923277e-05, "loss": 2.36, "step": 7 }, { "epoch": 0.06887274683884853, "grad_norm": 2.1416642665863037, "learning_rate": 9.987807055054106e-05, "loss": 2.1412, "step": 8 }, { "epoch": 0.0774818401937046, "grad_norm": 2.493290424346924, "learning_rate": 9.978331270024886e-05, "loss": 2.134, "step": 9 }, { "epoch": 0.08609093354856066, "grad_norm": 2.5377631187438965, "learning_rate": 9.966157905694196e-05, "loss": 2.2798, "step": 10 }, { "epoch": 0.09470002690341674, "grad_norm": 5.070113658905029, "learning_rate": 9.951294294841516e-05, "loss": 2.0949, "step": 11 }, { "epoch": 0.1033091202582728, "grad_norm": 8.32091999053955, "learning_rate": 9.933749390750235e-05, "loss": 2.669, "step": 12 }, { "epoch": 0.11191821361312887, "grad_norm": 10.361486434936523, "learning_rate": 9.913533761814537e-05, "loss": 2.7766, "step": 13 }, { "epoch": 0.12052730696798493, "grad_norm": 5.311871528625488, "learning_rate": 9.890659585173379e-05, "loss": 1.494, "step": 14 }, { "epoch": 0.129136400322841, "grad_norm": 4.676233291625977, "learning_rate": 9.865140639375449e-05, "loss": 1.5863, "step": 15 }, { "epoch": 0.13774549367769706, "grad_norm": 3.6029069423675537, "learning_rate": 9.83699229607948e-05, "loss": 1.7019, "step": 16 }, { "epoch": 0.14635458703255314, "grad_norm": 1.6760152578353882, "learning_rate": 9.80623151079494e-05, "loss": 1.6638, "step": 17 }, { "epoch": 0.1549636803874092, "grad_norm": 1.4253727197647095, "learning_rate": 9.772876812668666e-05, "loss": 1.7801, "step": 18 }, { "epoch": 0.16357277374226525, "grad_norm": 1.3295799493789673, "learning_rate": 9.736948293323593e-05, "loss": 1.7038, "step": 19 }, { "epoch": 0.17218186709712133, "grad_norm": 0.7724640965461731, "learning_rate": 9.698467594756325e-05, "loss": 1.7746, "step": 20 }, { "epoch": 0.1807909604519774, "grad_norm": 0.8214246034622192, "learning_rate": 9.657457896300791e-05, "loss": 1.5971, "step": 21 }, { "epoch": 0.18940005380683347, "grad_norm": 1.1587809324264526, "learning_rate": 9.613943900665889e-05, "loss": 1.7559, "step": 22 }, { "epoch": 0.19800914716168955, "grad_norm": 1.8553802967071533, "learning_rate": 9.567951819055496e-05, "loss": 1.7612, "step": 23 }, { "epoch": 0.2066182405165456, "grad_norm": 2.004556655883789, "learning_rate": 9.519509355379818e-05, "loss": 1.5969, "step": 24 }, { "epoch": 0.21522733387140167, "grad_norm": 5.224137783050537, "learning_rate": 9.468645689567598e-05, "loss": 2.0711, "step": 25 }, { "epoch": 0.21522733387140167, "eval_loss": 1.508137822151184, "eval_runtime": 1.2723, "eval_samples_per_second": 39.3, "eval_steps_per_second": 10.218, "step": 25 }, { "epoch": 0.22383642722625774, "grad_norm": 1.341894268989563, "learning_rate": 9.415391459989203e-05, "loss": 1.2867, "step": 26 }, { "epoch": 0.2324455205811138, "grad_norm": 2.224653720855713, "learning_rate": 9.359778745001225e-05, "loss": 1.2927, "step": 27 }, { "epoch": 0.24105461393596986, "grad_norm": 1.4196522235870361, "learning_rate": 9.301841043623682e-05, "loss": 1.3711, "step": 28 }, { "epoch": 0.24966370729082593, "grad_norm": 1.2161178588867188, "learning_rate": 9.241613255361455e-05, "loss": 1.4576, "step": 29 }, { "epoch": 0.258272800645682, "grad_norm": 0.8192944526672363, "learning_rate": 9.179131659182127e-05, "loss": 1.6426, "step": 30 }, { "epoch": 0.2668818940005381, "grad_norm": 0.6419580578804016, "learning_rate": 9.114433891662902e-05, "loss": 1.7142, "step": 31 }, { "epoch": 0.2754909873553941, "grad_norm": 0.41701650619506836, "learning_rate": 9.047558924319729e-05, "loss": 1.6853, "step": 32 }, { "epoch": 0.2841000807102502, "grad_norm": 0.4797891080379486, "learning_rate": 8.978547040132317e-05, "loss": 1.6622, "step": 33 }, { "epoch": 0.29270917406510627, "grad_norm": 0.737724781036377, "learning_rate": 8.907439809279181e-05, "loss": 1.6734, "step": 34 }, { "epoch": 0.3013182674199623, "grad_norm": 1.0522786378860474, "learning_rate": 8.834280064097317e-05, "loss": 1.6301, "step": 35 }, { "epoch": 0.3099273607748184, "grad_norm": 1.4976811408996582, "learning_rate": 8.759111873281603e-05, "loss": 1.6521, "step": 36 }, { "epoch": 0.31853645412967446, "grad_norm": 3.2661497592926025, "learning_rate": 8.681980515339464e-05, "loss": 1.5304, "step": 37 }, { "epoch": 0.3271455474845305, "grad_norm": 2.8426661491394043, "learning_rate": 8.602932451316802e-05, "loss": 1.437, "step": 38 }, { "epoch": 0.3357546408393866, "grad_norm": 0.49562180042266846, "learning_rate": 8.522015296811584e-05, "loss": 1.2391, "step": 39 }, { "epoch": 0.34436373419424265, "grad_norm": 0.5359828472137451, "learning_rate": 8.439277793291995e-05, "loss": 1.2585, "step": 40 }, { "epoch": 0.35297282754909876, "grad_norm": 0.8592618107795715, "learning_rate": 8.354769778736406e-05, "loss": 1.3682, "step": 41 }, { "epoch": 0.3615819209039548, "grad_norm": 0.8251994848251343, "learning_rate": 8.268542157612821e-05, "loss": 1.548, "step": 42 }, { "epoch": 0.37019101425881085, "grad_norm": 0.9783174991607666, "learning_rate": 8.180646870215952e-05, "loss": 1.7041, "step": 43 }, { "epoch": 0.37880010761366695, "grad_norm": 0.894888699054718, "learning_rate": 8.091136861380305e-05, "loss": 1.8391, "step": 44 }, { "epoch": 0.387409200968523, "grad_norm": 0.5933730006217957, "learning_rate": 8.000066048588211e-05, "loss": 1.6974, "step": 45 }, { "epoch": 0.3960182943233791, "grad_norm": 0.7440256476402283, "learning_rate": 7.907489289491939e-05, "loss": 1.6231, "step": 46 }, { "epoch": 0.40462738767823514, "grad_norm": 0.7255629897117615, "learning_rate": 7.813462348869497e-05, "loss": 1.6172, "step": 47 }, { "epoch": 0.4132364810330912, "grad_norm": 1.1230436563491821, "learning_rate": 7.71804186503403e-05, "loss": 1.5745, "step": 48 }, { "epoch": 0.4218455743879473, "grad_norm": 1.6526938676834106, "learning_rate": 7.62128531571699e-05, "loss": 1.2586, "step": 49 }, { "epoch": 0.43045466774280333, "grad_norm": 5.730405330657959, "learning_rate": 7.523250983445731e-05, "loss": 1.7199, "step": 50 }, { "epoch": 0.43045466774280333, "eval_loss": 1.389930248260498, "eval_runtime": 1.2729, "eval_samples_per_second": 39.28, "eval_steps_per_second": 10.213, "step": 50 }, { "epoch": 0.4390637610976594, "grad_norm": 0.4678877890110016, "learning_rate": 7.42399792043627e-05, "loss": 1.2294, "step": 51 }, { "epoch": 0.4476728544525155, "grad_norm": 0.5002795457839966, "learning_rate": 7.323585913022454e-05, "loss": 1.2342, "step": 52 }, { "epoch": 0.4562819478073715, "grad_norm": 0.3534197509288788, "learning_rate": 7.222075445642904e-05, "loss": 1.2975, "step": 53 }, { "epoch": 0.4648910411622276, "grad_norm": 0.6102612018585205, "learning_rate": 7.119527664407447e-05, "loss": 1.4773, "step": 54 }, { "epoch": 0.47350013451708367, "grad_norm": 0.5064122080802917, "learning_rate": 7.01600434026499e-05, "loss": 1.5257, "step": 55 }, { "epoch": 0.4821092278719397, "grad_norm": 0.6477398872375488, "learning_rate": 6.911567831795013e-05, "loss": 1.7135, "step": 56 }, { "epoch": 0.4907183212267958, "grad_norm": 1.0539360046386719, "learning_rate": 6.80628104764508e-05, "loss": 1.8241, "step": 57 }, { "epoch": 0.49932741458165186, "grad_norm": 0.7702855467796326, "learning_rate": 6.700207408637044e-05, "loss": 1.7362, "step": 58 }, { "epoch": 0.5079365079365079, "grad_norm": 0.6455403566360474, "learning_rate": 6.593410809564689e-05, "loss": 1.5381, "step": 59 }, { "epoch": 0.516545601291364, "grad_norm": 0.6673574447631836, "learning_rate": 6.485955580705913e-05, "loss": 1.4796, "step": 60 }, { "epoch": 0.5251546946462201, "grad_norm": 0.8242542743682861, "learning_rate": 6.377906449072578e-05, "loss": 1.6654, "step": 61 }, { "epoch": 0.5337637880010762, "grad_norm": 1.4092378616333008, "learning_rate": 6.269328499421356e-05, "loss": 1.2351, "step": 62 }, { "epoch": 0.5423728813559322, "grad_norm": 2.419718027114868, "learning_rate": 6.160287135049127e-05, "loss": 1.4315, "step": 63 }, { "epoch": 0.5509819747107882, "grad_norm": 0.38671913743019104, "learning_rate": 6.050848038396473e-05, "loss": 1.2274, "step": 64 }, { "epoch": 0.5595910680656443, "grad_norm": 0.5623155832290649, "learning_rate": 5.941077131483025e-05, "loss": 1.3062, "step": 65 }, { "epoch": 0.5682001614205004, "grad_norm": 0.6458035111427307, "learning_rate": 5.831040536198504e-05, "loss": 1.4318, "step": 66 }, { "epoch": 0.5768092547753565, "grad_norm": 0.6504884958267212, "learning_rate": 5.720804534473382e-05, "loss": 1.3897, "step": 67 }, { "epoch": 0.5854183481302125, "grad_norm": 0.4223068356513977, "learning_rate": 5.610435528353106e-05, "loss": 1.5331, "step": 68 }, { "epoch": 0.5940274414850686, "grad_norm": 0.5046920776367188, "learning_rate": 5.500000000000001e-05, "loss": 1.6225, "step": 69 }, { "epoch": 0.6026365348399246, "grad_norm": 0.41651174426078796, "learning_rate": 5.389564471646895e-05, "loss": 1.7376, "step": 70 }, { "epoch": 0.6112456281947808, "grad_norm": 0.32924169301986694, "learning_rate": 5.27919546552662e-05, "loss": 1.5401, "step": 71 }, { "epoch": 0.6198547215496368, "grad_norm": 0.4280257821083069, "learning_rate": 5.168959463801497e-05, "loss": 1.5662, "step": 72 }, { "epoch": 0.6284638149044929, "grad_norm": 0.6656383275985718, "learning_rate": 5.058922868516978e-05, "loss": 1.4713, "step": 73 }, { "epoch": 0.6370729082593489, "grad_norm": 0.8646160960197449, "learning_rate": 4.9491519616035276e-05, "loss": 1.2566, "step": 74 }, { "epoch": 0.645682001614205, "grad_norm": 2.5206289291381836, "learning_rate": 4.839712864950873e-05, "loss": 1.7236, "step": 75 }, { "epoch": 0.645682001614205, "eval_loss": 1.349289894104004, "eval_runtime": 1.2727, "eval_samples_per_second": 39.288, "eval_steps_per_second": 10.215, "step": 75 }, { "epoch": 0.654291094969061, "grad_norm": 0.4360639750957489, "learning_rate": 4.730671500578645e-05, "loss": 1.1383, "step": 76 }, { "epoch": 0.6629001883239172, "grad_norm": 0.8731722235679626, "learning_rate": 4.6220935509274235e-05, "loss": 1.4032, "step": 77 }, { "epoch": 0.6715092816787732, "grad_norm": 0.7142120003700256, "learning_rate": 4.5140444192940864e-05, "loss": 1.1904, "step": 78 }, { "epoch": 0.6801183750336293, "grad_norm": 0.594018280506134, "learning_rate": 4.406589190435313e-05, "loss": 1.3872, "step": 79 }, { "epoch": 0.6887274683884853, "grad_norm": 0.6022002696990967, "learning_rate": 4.2997925913629577e-05, "loss": 1.5956, "step": 80 }, { "epoch": 0.6973365617433414, "grad_norm": 0.5471949577331543, "learning_rate": 4.19371895235492e-05, "loss": 1.6525, "step": 81 }, { "epoch": 0.7059456550981975, "grad_norm": 0.3283829391002655, "learning_rate": 4.0884321682049884e-05, "loss": 1.772, "step": 82 }, { "epoch": 0.7145547484530536, "grad_norm": 0.34206530451774597, "learning_rate": 3.98399565973501e-05, "loss": 1.6938, "step": 83 }, { "epoch": 0.7231638418079096, "grad_norm": 0.35002151131629944, "learning_rate": 3.880472335592553e-05, "loss": 1.418, "step": 84 }, { "epoch": 0.7317729351627656, "grad_norm": 0.7404176592826843, "learning_rate": 3.777924554357096e-05, "loss": 1.5774, "step": 85 }, { "epoch": 0.7403820285176217, "grad_norm": 0.8380143046379089, "learning_rate": 3.676414086977546e-05, "loss": 1.3188, "step": 86 }, { "epoch": 0.7489911218724778, "grad_norm": 2.1834990978240967, "learning_rate": 3.576002079563732e-05, "loss": 1.4621, "step": 87 }, { "epoch": 0.7576002152273339, "grad_norm": 2.3319005966186523, "learning_rate": 3.4767490165542704e-05, "loss": 1.5594, "step": 88 }, { "epoch": 0.7662093085821899, "grad_norm": 0.3592979311943054, "learning_rate": 3.378714684283011e-05, "loss": 1.1, "step": 89 }, { "epoch": 0.774818401937046, "grad_norm": 0.49761757254600525, "learning_rate": 3.281958134965972e-05, "loss": 1.3531, "step": 90 }, { "epoch": 0.783427495291902, "grad_norm": 0.3277381658554077, "learning_rate": 3.186537651130503e-05, "loss": 1.3467, "step": 91 }, { "epoch": 0.7920365886467582, "grad_norm": 0.3256728947162628, "learning_rate": 3.0925107105080636e-05, "loss": 1.5374, "step": 92 }, { "epoch": 0.8006456820016142, "grad_norm": 0.35263001918792725, "learning_rate": 2.9999339514117912e-05, "loss": 1.5367, "step": 93 }, { "epoch": 0.8092547753564703, "grad_norm": 0.3698779344558716, "learning_rate": 2.9088631386196964e-05, "loss": 1.7344, "step": 94 }, { "epoch": 0.8178638687113263, "grad_norm": 0.445311576128006, "learning_rate": 2.8193531297840503e-05, "loss": 1.7141, "step": 95 }, { "epoch": 0.8264729620661824, "grad_norm": 0.4353031814098358, "learning_rate": 2.73145784238718e-05, "loss": 1.5168, "step": 96 }, { "epoch": 0.8350820554210385, "grad_norm": 0.6268022060394287, "learning_rate": 2.645230221263596e-05, "loss": 1.4016, "step": 97 }, { "epoch": 0.8436911487758946, "grad_norm": 0.5284622311592102, "learning_rate": 2.560722206708006e-05, "loss": 1.5741, "step": 98 }, { "epoch": 0.8523002421307506, "grad_norm": 0.7828362584114075, "learning_rate": 2.4779847031884175e-05, "loss": 1.243, "step": 99 }, { "epoch": 0.8609093354856067, "grad_norm": 3.621532678604126, "learning_rate": 2.397067548683199e-05, "loss": 1.5976, "step": 100 }, { "epoch": 0.8609093354856067, "eval_loss": 1.3449122905731201, "eval_runtime": 1.2724, "eval_samples_per_second": 39.295, "eval_steps_per_second": 10.217, "step": 100 }, { "epoch": 0.8695184288404627, "grad_norm": 0.3012229800224304, "learning_rate": 2.3180194846605367e-05, "loss": 1.176, "step": 101 }, { "epoch": 0.8781275221953188, "grad_norm": 0.4754287004470825, "learning_rate": 2.2408881267183997e-05, "loss": 1.1958, "step": 102 }, { "epoch": 0.8867366155501749, "grad_norm": 0.43265655636787415, "learning_rate": 2.165719935902685e-05, "loss": 1.3262, "step": 103 }, { "epoch": 0.895345708905031, "grad_norm": 0.5260616540908813, "learning_rate": 2.09256019072082e-05, "loss": 1.3721, "step": 104 }, { "epoch": 0.903954802259887, "grad_norm": 0.5602609515190125, "learning_rate": 2.0214529598676836e-05, "loss": 1.401, "step": 105 }, { "epoch": 0.912563895614743, "grad_norm": 0.29336562752723694, "learning_rate": 1.952441075680272e-05, "loss": 1.6924, "step": 106 }, { "epoch": 0.9211729889695991, "grad_norm": 0.9488304853439331, "learning_rate": 1.8855661083370986e-05, "loss": 1.8012, "step": 107 }, { "epoch": 0.9297820823244553, "grad_norm": 0.3932758867740631, "learning_rate": 1.820868340817874e-05, "loss": 1.6428, "step": 108 }, { "epoch": 0.9383911756793113, "grad_norm": 0.3379191756248474, "learning_rate": 1.758386744638546e-05, "loss": 1.3678, "step": 109 }, { "epoch": 0.9470002690341673, "grad_norm": 0.5376018285751343, "learning_rate": 1.698158956376318e-05, "loss": 1.6057, "step": 110 }, { "epoch": 0.9556093623890234, "grad_norm": 0.6705049872398376, "learning_rate": 1.6402212549987762e-05, "loss": 1.5497, "step": 111 }, { "epoch": 0.9642184557438794, "grad_norm": 1.5708343982696533, "learning_rate": 1.584608540010799e-05, "loss": 1.4589, "step": 112 }, { "epoch": 0.9728275490987356, "grad_norm": 2.8929443359375, "learning_rate": 1.531354310432403e-05, "loss": 1.5784, "step": 113 }, { "epoch": 0.9814366424535916, "grad_norm": 0.3657113313674927, "learning_rate": 1.4804906446201816e-05, "loss": 1.3912, "step": 114 }, { "epoch": 0.9900457358084477, "grad_norm": 0.3794941306114197, "learning_rate": 1.4320481809445051e-05, "loss": 1.5847, "step": 115 }, { "epoch": 0.9986548291633037, "grad_norm": 0.7362991571426392, "learning_rate": 1.386056099334112e-05, "loss": 1.399, "step": 116 }, { "epoch": 1.0072639225181599, "grad_norm": 0.8029009103775024, "learning_rate": 1.3425421036992098e-05, "loss": 1.2215, "step": 117 }, { "epoch": 1.0158730158730158, "grad_norm": 0.5080808997154236, "learning_rate": 1.3015324052436753e-05, "loss": 1.2015, "step": 118 }, { "epoch": 1.024482109227872, "grad_norm": 0.44496291875839233, "learning_rate": 1.2630517066764069e-05, "loss": 1.2138, "step": 119 }, { "epoch": 1.033091202582728, "grad_norm": 0.4348479211330414, "learning_rate": 1.227123187331335e-05, "loss": 1.2767, "step": 120 }, { "epoch": 1.041700295937584, "grad_norm": 0.37992164492607117, "learning_rate": 1.1937684892050604e-05, "loss": 1.5242, "step": 121 }, { "epoch": 1.0503093892924402, "grad_norm": 0.32971861958503723, "learning_rate": 1.1630077039205209e-05, "loss": 1.5498, "step": 122 }, { "epoch": 1.0589184826472962, "grad_norm": 0.5224172472953796, "learning_rate": 1.1348593606245522e-05, "loss": 1.6984, "step": 123 }, { "epoch": 1.0675275760021523, "grad_norm": 0.43070971965789795, "learning_rate": 1.109340414826622e-05, "loss": 1.5932, "step": 124 }, { "epoch": 1.0761366693570082, "grad_norm": 0.4774491786956787, "learning_rate": 1.0864662381854632e-05, "loss": 1.4308, "step": 125 }, { "epoch": 1.0761366693570082, "eval_loss": 1.3410676717758179, "eval_runtime": 1.2741, "eval_samples_per_second": 39.242, "eval_steps_per_second": 10.203, "step": 125 }, { "epoch": 1.0847457627118644, "grad_norm": 0.5184400677680969, "learning_rate": 1.0662506092497646e-05, "loss": 1.4641, "step": 126 }, { "epoch": 1.0933548560667206, "grad_norm": 0.5525245666503906, "learning_rate": 1.0487057051584856e-05, "loss": 1.5545, "step": 127 }, { "epoch": 1.1019639494215765, "grad_norm": 1.609927773475647, "learning_rate": 1.0338420943058053e-05, "loss": 1.3439, "step": 128 }, { "epoch": 1.1105730427764327, "grad_norm": 2.2938551902770996, "learning_rate": 1.0216687299751144e-05, "loss": 1.4817, "step": 129 }, { "epoch": 1.1191821361312886, "grad_norm": 0.45292142033576965, "learning_rate": 1.0121929449458941e-05, "loss": 1.1242, "step": 130 }, { "epoch": 1.1277912294861447, "grad_norm": 0.4423352777957916, "learning_rate": 1.0054204470767243e-05, "loss": 1.1672, "step": 131 }, { "epoch": 1.136400322841001, "grad_norm": 0.33851832151412964, "learning_rate": 1.0013553158670811e-05, "loss": 1.2433, "step": 132 }, { "epoch": 1.1450094161958568, "grad_norm": 0.3434777855873108, "learning_rate": 1e-05, "loss": 1.4094, "step": 133 } ], "logging_steps": 1, "max_steps": 133, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8256510115053568e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }