{ "best_metric": 0.696293830871582, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.03715745471435207, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018578727357176033, "grad_norm": 4.046935558319092, "learning_rate": 8.000000000000001e-06, "loss": 1.3446, "step": 1 }, { "epoch": 0.00018578727357176033, "eval_loss": 1.5395567417144775, "eval_runtime": 210.244, "eval_samples_per_second": 43.121, "eval_steps_per_second": 10.783, "step": 1 }, { "epoch": 0.00037157454714352065, "grad_norm": 5.148815631866455, "learning_rate": 1.6000000000000003e-05, "loss": 1.5422, "step": 2 }, { "epoch": 0.000557361820715281, "grad_norm": 8.490533828735352, "learning_rate": 2.4e-05, "loss": 1.7292, "step": 3 }, { "epoch": 0.0007431490942870413, "grad_norm": 29.692520141601562, "learning_rate": 3.2000000000000005e-05, "loss": 1.6123, "step": 4 }, { "epoch": 0.0009289363678588017, "grad_norm": 23.357440948486328, "learning_rate": 4e-05, "loss": 1.7596, "step": 5 }, { "epoch": 0.001114723641430562, "grad_norm": 20.767066955566406, "learning_rate": 4.8e-05, "loss": 1.3862, "step": 6 }, { "epoch": 0.0013005109150023223, "grad_norm": 2.4082024097442627, "learning_rate": 5.6e-05, "loss": 1.1814, "step": 7 }, { "epoch": 0.0014862981885740826, "grad_norm": 2.5414106845855713, "learning_rate": 6.400000000000001e-05, "loss": 1.1491, "step": 8 }, { "epoch": 0.001672085462145843, "grad_norm": 1.9966752529144287, "learning_rate": 7.2e-05, "loss": 1.0002, "step": 9 }, { "epoch": 0.0018578727357176034, "grad_norm": 2.459282398223877, "learning_rate": 8e-05, "loss": 1.0005, "step": 10 }, { "epoch": 0.0020436600092893636, "grad_norm": 1.952752709388733, "learning_rate": 7.999453219969877e-05, "loss": 0.8618, "step": 11 }, { "epoch": 0.002229447282861124, "grad_norm": 1.8736774921417236, "learning_rate": 7.997813029363704e-05, "loss": 0.8635, "step": 12 }, { "epoch": 0.002415234556432884, "grad_norm": 2.54675555229187, "learning_rate": 7.99507987659322e-05, "loss": 0.8823, "step": 13 }, { "epoch": 0.0026010218300046447, "grad_norm": 2.0304737091064453, "learning_rate": 7.991254508875098e-05, "loss": 0.9139, "step": 14 }, { "epoch": 0.002786809103576405, "grad_norm": 1.8423620462417603, "learning_rate": 7.98633797202668e-05, "loss": 0.8128, "step": 15 }, { "epoch": 0.0029725963771481652, "grad_norm": 4.305455684661865, "learning_rate": 7.980331610180046e-05, "loss": 0.9064, "step": 16 }, { "epoch": 0.0031583836507199257, "grad_norm": 3.134892225265503, "learning_rate": 7.973237065414553e-05, "loss": 0.901, "step": 17 }, { "epoch": 0.003344170924291686, "grad_norm": 1.7530468702316284, "learning_rate": 7.965056277307902e-05, "loss": 0.8444, "step": 18 }, { "epoch": 0.0035299581978634463, "grad_norm": 1.8036566972732544, "learning_rate": 7.955791482405875e-05, "loss": 0.9215, "step": 19 }, { "epoch": 0.0037157454714352067, "grad_norm": 1.2477421760559082, "learning_rate": 7.94544521361089e-05, "loss": 0.7512, "step": 20 }, { "epoch": 0.0039015327450069672, "grad_norm": 1.4336917400360107, "learning_rate": 7.93402029948953e-05, "loss": 0.7466, "step": 21 }, { "epoch": 0.004087320018578727, "grad_norm": 1.7280688285827637, "learning_rate": 7.921519863499239e-05, "loss": 0.7877, "step": 22 }, { "epoch": 0.004273107292150488, "grad_norm": 1.2563620805740356, "learning_rate": 7.907947323134398e-05, "loss": 0.6979, "step": 23 }, { "epoch": 0.004458894565722248, "grad_norm": 1.380014181137085, "learning_rate": 7.893306388992023e-05, "loss": 0.7668, "step": 24 }, { "epoch": 0.004644681839294009, "grad_norm": 1.2533624172210693, "learning_rate": 7.877601063757323e-05, "loss": 0.7324, "step": 25 }, { "epoch": 0.004830469112865768, "grad_norm": 1.1784394979476929, "learning_rate": 7.860835641109395e-05, "loss": 0.595, "step": 26 }, { "epoch": 0.005016256386437529, "grad_norm": 1.1498676538467407, "learning_rate": 7.843014704547393e-05, "loss": 0.6538, "step": 27 }, { "epoch": 0.005202043660009289, "grad_norm": 1.195434331893921, "learning_rate": 7.824143126137431e-05, "loss": 0.6842, "step": 28 }, { "epoch": 0.00538783093358105, "grad_norm": 1.0709013938903809, "learning_rate": 7.804226065180615e-05, "loss": 0.611, "step": 29 }, { "epoch": 0.00557361820715281, "grad_norm": 1.418430209159851, "learning_rate": 7.783268966802539e-05, "loss": 0.9021, "step": 30 }, { "epoch": 0.00575940548072457, "grad_norm": 1.3018062114715576, "learning_rate": 7.761277560464645e-05, "loss": 0.7383, "step": 31 }, { "epoch": 0.0059451927542963304, "grad_norm": 1.2901121377944946, "learning_rate": 7.738257858397844e-05, "loss": 0.704, "step": 32 }, { "epoch": 0.006130980027868091, "grad_norm": 1.2701770067214966, "learning_rate": 7.71421615395883e-05, "loss": 0.8416, "step": 33 }, { "epoch": 0.006316767301439851, "grad_norm": 1.303396224975586, "learning_rate": 7.68915901990954e-05, "loss": 0.7444, "step": 34 }, { "epoch": 0.006502554575011612, "grad_norm": 1.3861703872680664, "learning_rate": 7.663093306620231e-05, "loss": 0.7191, "step": 35 }, { "epoch": 0.006688341848583372, "grad_norm": 1.503480315208435, "learning_rate": 7.636026140196651e-05, "loss": 0.8693, "step": 36 }, { "epoch": 0.006874129122155132, "grad_norm": 1.307245135307312, "learning_rate": 7.607964920531837e-05, "loss": 0.7712, "step": 37 }, { "epoch": 0.0070599163957268925, "grad_norm": 1.2363077402114868, "learning_rate": 7.578917319283055e-05, "loss": 0.7318, "step": 38 }, { "epoch": 0.007245703669298653, "grad_norm": 1.183120608329773, "learning_rate": 7.548891277774448e-05, "loss": 0.8098, "step": 39 }, { "epoch": 0.0074314909428704135, "grad_norm": 1.3283722400665283, "learning_rate": 7.517895004825956e-05, "loss": 0.7582, "step": 40 }, { "epoch": 0.007617278216442174, "grad_norm": 1.2799726724624634, "learning_rate": 7.48593697450911e-05, "loss": 0.8915, "step": 41 }, { "epoch": 0.0078030654900139345, "grad_norm": 1.4513181447982788, "learning_rate": 7.453025923830296e-05, "loss": 0.8696, "step": 42 }, { "epoch": 0.007988852763585694, "grad_norm": 1.2473115921020508, "learning_rate": 7.419170850342156e-05, "loss": 0.892, "step": 43 }, { "epoch": 0.008174640037157455, "grad_norm": 1.3187384605407715, "learning_rate": 7.384381009683742e-05, "loss": 0.9073, "step": 44 }, { "epoch": 0.008360427310729215, "grad_norm": 1.25960111618042, "learning_rate": 7.348665913050115e-05, "loss": 0.9463, "step": 45 }, { "epoch": 0.008546214584300976, "grad_norm": 1.1852588653564453, "learning_rate": 7.312035324592081e-05, "loss": 0.8179, "step": 46 }, { "epoch": 0.008732001857872736, "grad_norm": 1.2183889150619507, "learning_rate": 7.274499258746771e-05, "loss": 0.7668, "step": 47 }, { "epoch": 0.008917789131444497, "grad_norm": 1.3253717422485352, "learning_rate": 7.236067977499791e-05, "loss": 0.7596, "step": 48 }, { "epoch": 0.009103576405016257, "grad_norm": 1.3234570026397705, "learning_rate": 7.196751987579699e-05, "loss": 0.7846, "step": 49 }, { "epoch": 0.009289363678588018, "grad_norm": 1.4833699464797974, "learning_rate": 7.156562037585576e-05, "loss": 0.8507, "step": 50 }, { "epoch": 0.009289363678588018, "eval_loss": 0.7784072160720825, "eval_runtime": 210.9709, "eval_samples_per_second": 42.973, "eval_steps_per_second": 10.746, "step": 50 }, { "epoch": 0.009475150952159776, "grad_norm": 1.1448761224746704, "learning_rate": 7.11550911504845e-05, "loss": 0.7059, "step": 51 }, { "epoch": 0.009660938225731537, "grad_norm": 1.5964897871017456, "learning_rate": 7.073604443427437e-05, "loss": 1.0514, "step": 52 }, { "epoch": 0.009846725499303297, "grad_norm": 1.4827876091003418, "learning_rate": 7.03085947904134e-05, "loss": 1.0776, "step": 53 }, { "epoch": 0.010032512772875058, "grad_norm": 1.5401029586791992, "learning_rate": 6.987285907936617e-05, "loss": 1.0623, "step": 54 }, { "epoch": 0.010218300046446818, "grad_norm": 1.462876319885254, "learning_rate": 6.942895642692527e-05, "loss": 1.0345, "step": 55 }, { "epoch": 0.010404087320018579, "grad_norm": 1.5211715698242188, "learning_rate": 6.897700819164357e-05, "loss": 1.0968, "step": 56 }, { "epoch": 0.01058987459359034, "grad_norm": 1.2720927000045776, "learning_rate": 6.851713793165589e-05, "loss": 0.8471, "step": 57 }, { "epoch": 0.0107756618671621, "grad_norm": 1.2138237953186035, "learning_rate": 6.804947137089955e-05, "loss": 0.7272, "step": 58 }, { "epoch": 0.01096144914073386, "grad_norm": 1.2674663066864014, "learning_rate": 6.757413636474263e-05, "loss": 0.6568, "step": 59 }, { "epoch": 0.01114723641430562, "grad_norm": 0.9728902578353882, "learning_rate": 6.709126286502965e-05, "loss": 0.707, "step": 60 }, { "epoch": 0.011333023687877381, "grad_norm": 1.2180075645446777, "learning_rate": 6.660098288455393e-05, "loss": 0.8588, "step": 61 }, { "epoch": 0.01151881096144914, "grad_norm": 1.1713515520095825, "learning_rate": 6.610343046096674e-05, "loss": 0.7691, "step": 62 }, { "epoch": 0.0117045982350209, "grad_norm": 1.0399553775787354, "learning_rate": 6.559874162013267e-05, "loss": 0.7024, "step": 63 }, { "epoch": 0.011890385508592661, "grad_norm": 0.8005794286727905, "learning_rate": 6.508705433894149e-05, "loss": 0.5976, "step": 64 }, { "epoch": 0.012076172782164421, "grad_norm": 1.0212355852127075, "learning_rate": 6.456850850758673e-05, "loss": 0.7279, "step": 65 }, { "epoch": 0.012261960055736182, "grad_norm": 0.9839227795600891, "learning_rate": 6.404324589132101e-05, "loss": 0.714, "step": 66 }, { "epoch": 0.012447747329307942, "grad_norm": 0.9003210067749023, "learning_rate": 6.351141009169893e-05, "loss": 0.7392, "step": 67 }, { "epoch": 0.012633534602879703, "grad_norm": 0.9685829877853394, "learning_rate": 6.297314650731775e-05, "loss": 0.697, "step": 68 }, { "epoch": 0.012819321876451463, "grad_norm": 0.8940137624740601, "learning_rate": 6.242860229406692e-05, "loss": 0.5469, "step": 69 }, { "epoch": 0.013005109150023224, "grad_norm": 0.9139478206634521, "learning_rate": 6.18779263248971e-05, "loss": 0.6808, "step": 70 }, { "epoch": 0.013190896423594984, "grad_norm": 1.0126328468322754, "learning_rate": 6.132126914911976e-05, "loss": 0.6446, "step": 71 }, { "epoch": 0.013376683697166745, "grad_norm": 0.7734840512275696, "learning_rate": 6.075878295124861e-05, "loss": 0.6272, "step": 72 }, { "epoch": 0.013562470970738504, "grad_norm": 0.7684584856033325, "learning_rate": 6.019062150939376e-05, "loss": 0.5738, "step": 73 }, { "epoch": 0.013748258244310264, "grad_norm": 0.9124207496643066, "learning_rate": 5.9616940153220336e-05, "loss": 0.623, "step": 74 }, { "epoch": 0.013934045517882025, "grad_norm": 0.8309489488601685, "learning_rate": 5.903789572148295e-05, "loss": 0.5815, "step": 75 }, { "epoch": 0.014119832791453785, "grad_norm": 0.7745351791381836, "learning_rate": 5.845364651914752e-05, "loss": 0.6165, "step": 76 }, { "epoch": 0.014305620065025546, "grad_norm": 0.9630031585693359, "learning_rate": 5.786435227411227e-05, "loss": 0.6886, "step": 77 }, { "epoch": 0.014491407338597306, "grad_norm": 0.8420267105102539, "learning_rate": 5.727017409353971e-05, "loss": 0.6423, "step": 78 }, { "epoch": 0.014677194612169066, "grad_norm": 0.9119953513145447, "learning_rate": 5.667127441981162e-05, "loss": 0.7206, "step": 79 }, { "epoch": 0.014862981885740827, "grad_norm": 1.015648365020752, "learning_rate": 5.606781698611879e-05, "loss": 0.6322, "step": 80 }, { "epoch": 0.015048769159312587, "grad_norm": 0.9261860251426697, "learning_rate": 5.5459966771698096e-05, "loss": 0.6806, "step": 81 }, { "epoch": 0.015234556432884348, "grad_norm": 0.8736683130264282, "learning_rate": 5.4847889956728834e-05, "loss": 0.674, "step": 82 }, { "epoch": 0.015420343706456108, "grad_norm": 0.9856055378913879, "learning_rate": 5.423175387690067e-05, "loss": 0.8235, "step": 83 }, { "epoch": 0.015606130980027869, "grad_norm": 1.0767531394958496, "learning_rate": 5.361172697766573e-05, "loss": 0.778, "step": 84 }, { "epoch": 0.01579191825359963, "grad_norm": 0.8056624531745911, "learning_rate": 5.298797876818735e-05, "loss": 0.617, "step": 85 }, { "epoch": 0.015977705527171388, "grad_norm": 1.071303367614746, "learning_rate": 5.23606797749979e-05, "loss": 0.7675, "step": 86 }, { "epoch": 0.01616349280074315, "grad_norm": 0.9511001110076904, "learning_rate": 5.17300014953786e-05, "loss": 0.7353, "step": 87 }, { "epoch": 0.01634928007431491, "grad_norm": 0.9310784339904785, "learning_rate": 5.109611635047379e-05, "loss": 0.6954, "step": 88 }, { "epoch": 0.01653506734788667, "grad_norm": 1.0314819812774658, "learning_rate": 5.04591976381528e-05, "loss": 0.7053, "step": 89 }, { "epoch": 0.01672085462145843, "grad_norm": 0.9734024405479431, "learning_rate": 4.981941948563197e-05, "loss": 0.7357, "step": 90 }, { "epoch": 0.01690664189503019, "grad_norm": 0.9812660217285156, "learning_rate": 4.9176956801870065e-05, "loss": 0.705, "step": 91 }, { "epoch": 0.01709242916860195, "grad_norm": 1.061806082725525, "learning_rate": 4.853198522974988e-05, "loss": 0.7836, "step": 92 }, { "epoch": 0.01727821644217371, "grad_norm": 1.190076470375061, "learning_rate": 4.788468109805921e-05, "loss": 0.8644, "step": 93 }, { "epoch": 0.017464003715745472, "grad_norm": 1.0145090818405151, "learning_rate": 4.7235221373284407e-05, "loss": 0.7877, "step": 94 }, { "epoch": 0.01764979098931723, "grad_norm": 1.1526635885238647, "learning_rate": 4.658378361122936e-05, "loss": 0.8445, "step": 95 }, { "epoch": 0.017835578262888993, "grad_norm": 1.0449837446212769, "learning_rate": 4.593054590847368e-05, "loss": 0.8328, "step": 96 }, { "epoch": 0.018021365536460752, "grad_norm": 1.0370270013809204, "learning_rate": 4.5275686853682765e-05, "loss": 0.7437, "step": 97 }, { "epoch": 0.018207152810032514, "grad_norm": 1.0356221199035645, "learning_rate": 4.4619385478783456e-05, "loss": 0.7405, "step": 98 }, { "epoch": 0.018392940083604273, "grad_norm": 1.0345137119293213, "learning_rate": 4.396182121001852e-05, "loss": 0.6983, "step": 99 }, { "epoch": 0.018578727357176035, "grad_norm": 1.3793245553970337, "learning_rate": 4.33031738188933e-05, "loss": 0.9343, "step": 100 }, { "epoch": 0.018578727357176035, "eval_loss": 0.7299540638923645, "eval_runtime": 210.9599, "eval_samples_per_second": 42.975, "eval_steps_per_second": 10.746, "step": 100 }, { "epoch": 0.018764514630747794, "grad_norm": 1.1361013650894165, "learning_rate": 4.264362337302798e-05, "loss": 0.6842, "step": 101 }, { "epoch": 0.018950301904319553, "grad_norm": 1.120656132698059, "learning_rate": 4.1983350186928894e-05, "loss": 0.943, "step": 102 }, { "epoch": 0.019136089177891315, "grad_norm": 1.1091196537017822, "learning_rate": 4.132253477269233e-05, "loss": 0.8101, "step": 103 }, { "epoch": 0.019321876451463074, "grad_norm": 1.1574037075042725, "learning_rate": 4.0661357790654345e-05, "loss": 0.9537, "step": 104 }, { "epoch": 0.019507663725034836, "grad_norm": 1.3250733613967896, "learning_rate": 4e-05, "loss": 1.0247, "step": 105 }, { "epoch": 0.019693450998606594, "grad_norm": 1.2618422508239746, "learning_rate": 3.933864220934566e-05, "loss": 0.9238, "step": 106 }, { "epoch": 0.019879238272178357, "grad_norm": 1.1964272260665894, "learning_rate": 3.8677465227307676e-05, "loss": 0.8356, "step": 107 }, { "epoch": 0.020065025545750115, "grad_norm": 0.930941104888916, "learning_rate": 3.8016649813071106e-05, "loss": 0.7564, "step": 108 }, { "epoch": 0.020250812819321878, "grad_norm": 0.948442280292511, "learning_rate": 3.735637662697203e-05, "loss": 0.6836, "step": 109 }, { "epoch": 0.020436600092893636, "grad_norm": 0.9409064650535583, "learning_rate": 3.669682618110671e-05, "loss": 0.696, "step": 110 }, { "epoch": 0.0206223873664654, "grad_norm": 0.9388203024864197, "learning_rate": 3.6038178789981494e-05, "loss": 0.6616, "step": 111 }, { "epoch": 0.020808174640037157, "grad_norm": 0.8305265307426453, "learning_rate": 3.538061452121656e-05, "loss": 0.6802, "step": 112 }, { "epoch": 0.020993961913608916, "grad_norm": 0.8969584107398987, "learning_rate": 3.472431314631724e-05, "loss": 0.667, "step": 113 }, { "epoch": 0.02117974918718068, "grad_norm": 0.8265141844749451, "learning_rate": 3.406945409152632e-05, "loss": 0.5857, "step": 114 }, { "epoch": 0.021365536460752437, "grad_norm": 0.9097794890403748, "learning_rate": 3.341621638877064e-05, "loss": 0.7731, "step": 115 }, { "epoch": 0.0215513237343242, "grad_norm": 0.8234865665435791, "learning_rate": 3.276477862671562e-05, "loss": 0.6853, "step": 116 }, { "epoch": 0.021737111007895958, "grad_norm": 0.8503565788269043, "learning_rate": 3.21153189019408e-05, "loss": 0.5874, "step": 117 }, { "epoch": 0.02192289828146772, "grad_norm": 0.7727727890014648, "learning_rate": 3.146801477025013e-05, "loss": 0.6099, "step": 118 }, { "epoch": 0.02210868555503948, "grad_norm": 0.808000385761261, "learning_rate": 3.082304319812994e-05, "loss": 0.6345, "step": 119 }, { "epoch": 0.02229447282861124, "grad_norm": 0.7325134873390198, "learning_rate": 3.0180580514368037e-05, "loss": 0.5591, "step": 120 }, { "epoch": 0.022480260102183, "grad_norm": 0.7028451561927795, "learning_rate": 2.9540802361847212e-05, "loss": 0.512, "step": 121 }, { "epoch": 0.022666047375754762, "grad_norm": 0.7961578369140625, "learning_rate": 2.890388364952623e-05, "loss": 0.5754, "step": 122 }, { "epoch": 0.02285183464932652, "grad_norm": 0.7276977896690369, "learning_rate": 2.8269998504621416e-05, "loss": 0.5613, "step": 123 }, { "epoch": 0.02303762192289828, "grad_norm": 0.858888566493988, "learning_rate": 2.7639320225002108e-05, "loss": 0.655, "step": 124 }, { "epoch": 0.023223409196470042, "grad_norm": 0.9296563863754272, "learning_rate": 2.7012021231812666e-05, "loss": 0.6636, "step": 125 }, { "epoch": 0.0234091964700418, "grad_norm": 0.7812833189964294, "learning_rate": 2.638827302233428e-05, "loss": 0.6386, "step": 126 }, { "epoch": 0.023594983743613563, "grad_norm": 0.9057222604751587, "learning_rate": 2.576824612309934e-05, "loss": 0.638, "step": 127 }, { "epoch": 0.023780771017185322, "grad_norm": 0.8555361032485962, "learning_rate": 2.5152110043271166e-05, "loss": 0.6836, "step": 128 }, { "epoch": 0.023966558290757084, "grad_norm": 0.8539828062057495, "learning_rate": 2.454003322830192e-05, "loss": 0.7038, "step": 129 }, { "epoch": 0.024152345564328843, "grad_norm": 0.8139870166778564, "learning_rate": 2.393218301388123e-05, "loss": 0.4949, "step": 130 }, { "epoch": 0.024338132837900605, "grad_norm": 0.8350996375083923, "learning_rate": 2.3328725580188395e-05, "loss": 0.6847, "step": 131 }, { "epoch": 0.024523920111472364, "grad_norm": 0.8771671056747437, "learning_rate": 2.272982590646029e-05, "loss": 0.6576, "step": 132 }, { "epoch": 0.024709707385044126, "grad_norm": 0.9145622253417969, "learning_rate": 2.2135647725887744e-05, "loss": 0.6714, "step": 133 }, { "epoch": 0.024895494658615885, "grad_norm": 0.8157410621643066, "learning_rate": 2.1546353480852495e-05, "loss": 0.6085, "step": 134 }, { "epoch": 0.025081281932187643, "grad_norm": 0.860339879989624, "learning_rate": 2.096210427851706e-05, "loss": 0.5648, "step": 135 }, { "epoch": 0.025267069205759406, "grad_norm": 0.8176294565200806, "learning_rate": 2.038305984677969e-05, "loss": 0.6053, "step": 136 }, { "epoch": 0.025452856479331164, "grad_norm": 0.9252009391784668, "learning_rate": 1.9809378490606264e-05, "loss": 0.6228, "step": 137 }, { "epoch": 0.025638643752902927, "grad_norm": 0.8035367727279663, "learning_rate": 1.9241217048751406e-05, "loss": 0.6502, "step": 138 }, { "epoch": 0.025824431026474685, "grad_norm": 0.9396884441375732, "learning_rate": 1.867873085088026e-05, "loss": 0.6928, "step": 139 }, { "epoch": 0.026010218300046448, "grad_norm": 0.973107635974884, "learning_rate": 1.8122073675102935e-05, "loss": 0.7169, "step": 140 }, { "epoch": 0.026196005573618206, "grad_norm": 0.9451408982276917, "learning_rate": 1.75713977059331e-05, "loss": 0.6835, "step": 141 }, { "epoch": 0.02638179284718997, "grad_norm": 0.9551781415939331, "learning_rate": 1.702685349268226e-05, "loss": 0.7596, "step": 142 }, { "epoch": 0.026567580120761727, "grad_norm": 0.9721214175224304, "learning_rate": 1.648858990830108e-05, "loss": 0.804, "step": 143 }, { "epoch": 0.02675336739433349, "grad_norm": 1.019667148590088, "learning_rate": 1.5956754108678996e-05, "loss": 0.8623, "step": 144 }, { "epoch": 0.02693915466790525, "grad_norm": 1.0824493169784546, "learning_rate": 1.5431491492413288e-05, "loss": 0.8455, "step": 145 }, { "epoch": 0.027124941941477007, "grad_norm": 0.9800060987472534, "learning_rate": 1.491294566105852e-05, "loss": 0.8264, "step": 146 }, { "epoch": 0.02731072921504877, "grad_norm": 0.9747028350830078, "learning_rate": 1.4401258379867335e-05, "loss": 0.7024, "step": 147 }, { "epoch": 0.027496516488620528, "grad_norm": 0.8642198443412781, "learning_rate": 1.3896569539033253e-05, "loss": 0.7009, "step": 148 }, { "epoch": 0.02768230376219229, "grad_norm": 0.8279868960380554, "learning_rate": 1.3399017115446067e-05, "loss": 0.6983, "step": 149 }, { "epoch": 0.02786809103576405, "grad_norm": 1.2232890129089355, "learning_rate": 1.2908737134970367e-05, "loss": 0.8388, "step": 150 }, { "epoch": 0.02786809103576405, "eval_loss": 0.7057402729988098, "eval_runtime": 210.9537, "eval_samples_per_second": 42.976, "eval_steps_per_second": 10.746, "step": 150 }, { "epoch": 0.02805387830933581, "grad_norm": 0.8277557492256165, "learning_rate": 1.242586363525737e-05, "loss": 0.6744, "step": 151 }, { "epoch": 0.02823966558290757, "grad_norm": 1.129407286643982, "learning_rate": 1.1950528629100457e-05, "loss": 0.8988, "step": 152 }, { "epoch": 0.028425452856479332, "grad_norm": 1.1117703914642334, "learning_rate": 1.1482862068344121e-05, "loss": 0.8987, "step": 153 }, { "epoch": 0.02861124013005109, "grad_norm": 1.274944543838501, "learning_rate": 1.1022991808356442e-05, "loss": 0.9714, "step": 154 }, { "epoch": 0.028797027403622853, "grad_norm": 1.413684368133545, "learning_rate": 1.0571043573074737e-05, "loss": 1.0464, "step": 155 }, { "epoch": 0.028982814677194612, "grad_norm": 1.2533186674118042, "learning_rate": 1.0127140920633857e-05, "loss": 0.8553, "step": 156 }, { "epoch": 0.029168601950766374, "grad_norm": 0.9504323601722717, "learning_rate": 9.69140520958662e-06, "loss": 0.6454, "step": 157 }, { "epoch": 0.029354389224338133, "grad_norm": 0.9204007387161255, "learning_rate": 9.263955565725648e-06, "loss": 0.733, "step": 158 }, { "epoch": 0.02954017649790989, "grad_norm": 0.8115749359130859, "learning_rate": 8.844908849515509e-06, "loss": 0.6411, "step": 159 }, { "epoch": 0.029725963771481654, "grad_norm": 0.7680659294128418, "learning_rate": 8.434379624144261e-06, "loss": 0.6213, "step": 160 }, { "epoch": 0.029911751045053413, "grad_norm": 0.7348408699035645, "learning_rate": 8.032480124203013e-06, "loss": 0.608, "step": 161 }, { "epoch": 0.030097538318625175, "grad_norm": 0.690196692943573, "learning_rate": 7.639320225002106e-06, "loss": 0.5074, "step": 162 }, { "epoch": 0.030283325592196934, "grad_norm": 0.7912430167198181, "learning_rate": 7.255007412532307e-06, "loss": 0.6236, "step": 163 }, { "epoch": 0.030469112865768696, "grad_norm": 0.8454386591911316, "learning_rate": 6.8796467540791986e-06, "loss": 0.773, "step": 164 }, { "epoch": 0.030654900139340455, "grad_norm": 0.7565322518348694, "learning_rate": 6.513340869498859e-06, "loss": 0.5278, "step": 165 }, { "epoch": 0.030840687412912217, "grad_norm": 0.7427991032600403, "learning_rate": 6.1561899031625794e-06, "loss": 0.5895, "step": 166 }, { "epoch": 0.031026474686483976, "grad_norm": 0.72712242603302, "learning_rate": 5.808291496578435e-06, "loss": 0.554, "step": 167 }, { "epoch": 0.031212261960055738, "grad_norm": 0.8168418407440186, "learning_rate": 5.469740761697044e-06, "loss": 0.5795, "step": 168 }, { "epoch": 0.0313980492336275, "grad_norm": 0.7900062203407288, "learning_rate": 5.140630254908905e-06, "loss": 0.6155, "step": 169 }, { "epoch": 0.03158383650719926, "grad_norm": 0.7631322741508484, "learning_rate": 4.821049951740442e-06, "loss": 0.6395, "step": 170 }, { "epoch": 0.031769623780771014, "grad_norm": 0.8723105788230896, "learning_rate": 4.511087222255528e-06, "loss": 0.7083, "step": 171 }, { "epoch": 0.031955411054342776, "grad_norm": 0.8694934248924255, "learning_rate": 4.2108268071694616e-06, "loss": 0.6848, "step": 172 }, { "epoch": 0.03214119832791454, "grad_norm": 0.8055874109268188, "learning_rate": 3.9203507946816445e-06, "loss": 0.6301, "step": 173 }, { "epoch": 0.0323269856014863, "grad_norm": 0.782102644443512, "learning_rate": 3.6397385980335e-06, "loss": 0.5799, "step": 174 }, { "epoch": 0.032512772875058056, "grad_norm": 0.9123784303665161, "learning_rate": 3.3690669337977e-06, "loss": 0.6572, "step": 175 }, { "epoch": 0.03269856014862982, "grad_norm": 0.8065102100372314, "learning_rate": 3.1084098009046106e-06, "loss": 0.6309, "step": 176 }, { "epoch": 0.03288434742220158, "grad_norm": 0.7722126841545105, "learning_rate": 2.8578384604117217e-06, "loss": 0.5781, "step": 177 }, { "epoch": 0.03307013469577334, "grad_norm": 0.8441624641418457, "learning_rate": 2.6174214160215704e-06, "loss": 0.625, "step": 178 }, { "epoch": 0.0332559219693451, "grad_norm": 0.7183513045310974, "learning_rate": 2.3872243953535535e-06, "loss": 0.5938, "step": 179 }, { "epoch": 0.03344170924291686, "grad_norm": 0.8442609310150146, "learning_rate": 2.1673103319746146e-06, "loss": 0.6409, "step": 180 }, { "epoch": 0.03362749651648862, "grad_norm": 0.7444936633110046, "learning_rate": 1.957739348193859e-06, "loss": 0.6137, "step": 181 }, { "epoch": 0.03381328379006038, "grad_norm": 0.851841390132904, "learning_rate": 1.7585687386256944e-06, "loss": 0.5644, "step": 182 }, { "epoch": 0.03399907106363214, "grad_norm": 0.7763927578926086, "learning_rate": 1.5698529545260744e-06, "loss": 0.556, "step": 183 }, { "epoch": 0.0341848583372039, "grad_norm": 0.7218007445335388, "learning_rate": 1.3916435889060575e-06, "loss": 0.5211, "step": 184 }, { "epoch": 0.034370645610775664, "grad_norm": 0.8408937454223633, "learning_rate": 1.2239893624267852e-06, "loss": 0.6682, "step": 185 }, { "epoch": 0.03455643288434742, "grad_norm": 0.8628000020980835, "learning_rate": 1.0669361100797704e-06, "loss": 0.6878, "step": 186 }, { "epoch": 0.03474222015791918, "grad_norm": 0.8714439272880554, "learning_rate": 9.205267686560293e-07, "loss": 0.6547, "step": 187 }, { "epoch": 0.034928007431490944, "grad_norm": 0.7619119882583618, "learning_rate": 7.848013650076258e-07, "loss": 0.5769, "step": 188 }, { "epoch": 0.035113794705062706, "grad_norm": 0.9454699158668518, "learning_rate": 6.597970051047053e-07, "loss": 0.6948, "step": 189 }, { "epoch": 0.03529958197863446, "grad_norm": 0.7809498906135559, "learning_rate": 5.455478638911071e-07, "loss": 0.657, "step": 190 }, { "epoch": 0.035485369252206224, "grad_norm": 0.9400784373283386, "learning_rate": 4.420851759412603e-07, "loss": 0.723, "step": 191 }, { "epoch": 0.035671156525777986, "grad_norm": 1.0285460948944092, "learning_rate": 3.4943722692099224e-07, "loss": 0.8145, "step": 192 }, { "epoch": 0.03585694379934974, "grad_norm": 1.0024358034133911, "learning_rate": 2.676293458544743e-07, "loss": 0.8062, "step": 193 }, { "epoch": 0.036042731072921504, "grad_norm": 0.9210175275802612, "learning_rate": 1.9668389819954338e-07, "loss": 0.7595, "step": 194 }, { "epoch": 0.036228518346493266, "grad_norm": 1.0221508741378784, "learning_rate": 1.3662027973320614e-07, "loss": 0.7836, "step": 195 }, { "epoch": 0.03641430562006503, "grad_norm": 0.9825165271759033, "learning_rate": 8.745491124901861e-08, "loss": 0.8122, "step": 196 }, { "epoch": 0.03660009289363678, "grad_norm": 1.2274169921875, "learning_rate": 4.920123406781052e-08, "loss": 0.9055, "step": 197 }, { "epoch": 0.036785880167208546, "grad_norm": 1.0650230646133423, "learning_rate": 2.1869706362958044e-08, "loss": 0.7219, "step": 198 }, { "epoch": 0.03697166744078031, "grad_norm": 1.0528210401535034, "learning_rate": 5.467800301239834e-09, "loss": 0.6992, "step": 199 }, { "epoch": 0.03715745471435207, "grad_norm": 1.143416166305542, "learning_rate": 0.0, "loss": 0.808, "step": 200 }, { "epoch": 0.03715745471435207, "eval_loss": 0.696293830871582, "eval_runtime": 210.9668, "eval_samples_per_second": 42.974, "eval_steps_per_second": 10.746, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.895574321423974e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }