{ "best_metric": 3.3893699645996094, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.2074688796680498, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001037344398340249, "grad_norm": 3.6560280323028564, "learning_rate": 1e-05, "loss": 3.7424, "step": 1 }, { "epoch": 0.001037344398340249, "eval_loss": 5.154352188110352, "eval_runtime": 26.5688, "eval_samples_per_second": 61.124, "eval_steps_per_second": 15.281, "step": 1 }, { "epoch": 0.002074688796680498, "grad_norm": 4.519001007080078, "learning_rate": 2e-05, "loss": 4.0009, "step": 2 }, { "epoch": 0.0031120331950207467, "grad_norm": 4.717950820922852, "learning_rate": 3e-05, "loss": 3.8411, "step": 3 }, { "epoch": 0.004149377593360996, "grad_norm": 4.098848819732666, "learning_rate": 4e-05, "loss": 4.1416, "step": 4 }, { "epoch": 0.005186721991701245, "grad_norm": 3.9933223724365234, "learning_rate": 5e-05, "loss": 4.3214, "step": 5 }, { "epoch": 0.006224066390041493, "grad_norm": 3.811678171157837, "learning_rate": 6e-05, "loss": 4.0918, "step": 6 }, { "epoch": 0.007261410788381743, "grad_norm": 3.8618717193603516, "learning_rate": 7e-05, "loss": 4.1885, "step": 7 }, { "epoch": 0.008298755186721992, "grad_norm": 3.741856813430786, "learning_rate": 8e-05, "loss": 4.0593, "step": 8 }, { "epoch": 0.00933609958506224, "grad_norm": 3.6605193614959717, "learning_rate": 9e-05, "loss": 3.7919, "step": 9 }, { "epoch": 0.01037344398340249, "grad_norm": 4.069720268249512, "learning_rate": 0.0001, "loss": 4.0892, "step": 10 }, { "epoch": 0.011410788381742738, "grad_norm": 4.561852931976318, "learning_rate": 9.999316524962345e-05, "loss": 3.7796, "step": 11 }, { "epoch": 0.012448132780082987, "grad_norm": 4.809189796447754, "learning_rate": 9.997266286704631e-05, "loss": 3.9954, "step": 12 }, { "epoch": 0.013485477178423237, "grad_norm": 3.8352808952331543, "learning_rate": 9.993849845741524e-05, "loss": 3.9553, "step": 13 }, { "epoch": 0.014522821576763486, "grad_norm": 3.582754611968994, "learning_rate": 9.989068136093873e-05, "loss": 3.9918, "step": 14 }, { "epoch": 0.015560165975103735, "grad_norm": 3.632657766342163, "learning_rate": 9.98292246503335e-05, "loss": 3.5872, "step": 15 }, { "epoch": 0.016597510373443983, "grad_norm": 3.7454187870025635, "learning_rate": 9.975414512725057e-05, "loss": 4.036, "step": 16 }, { "epoch": 0.017634854771784232, "grad_norm": 3.546095132827759, "learning_rate": 9.966546331768191e-05, "loss": 3.297, "step": 17 }, { "epoch": 0.01867219917012448, "grad_norm": 3.9690802097320557, "learning_rate": 9.956320346634876e-05, "loss": 3.7145, "step": 18 }, { "epoch": 0.01970954356846473, "grad_norm": 3.569537878036499, "learning_rate": 9.944739353007344e-05, "loss": 4.0681, "step": 19 }, { "epoch": 0.02074688796680498, "grad_norm": 3.1518118381500244, "learning_rate": 9.931806517013612e-05, "loss": 3.6876, "step": 20 }, { "epoch": 0.021784232365145227, "grad_norm": 3.6860296726226807, "learning_rate": 9.917525374361912e-05, "loss": 3.7749, "step": 21 }, { "epoch": 0.022821576763485476, "grad_norm": 3.2175776958465576, "learning_rate": 9.901899829374047e-05, "loss": 3.6225, "step": 22 }, { "epoch": 0.023858921161825725, "grad_norm": 3.4466984272003174, "learning_rate": 9.884934153917997e-05, "loss": 3.7914, "step": 23 }, { "epoch": 0.024896265560165973, "grad_norm": 3.4754061698913574, "learning_rate": 9.86663298624003e-05, "loss": 3.7014, "step": 24 }, { "epoch": 0.025933609958506226, "grad_norm": 3.414550542831421, "learning_rate": 9.847001329696653e-05, "loss": 3.7435, "step": 25 }, { "epoch": 0.026970954356846474, "grad_norm": 3.7674219608306885, "learning_rate": 9.826044551386744e-05, "loss": 4.025, "step": 26 }, { "epoch": 0.028008298755186723, "grad_norm": 3.575585126876831, "learning_rate": 9.803768380684242e-05, "loss": 3.7974, "step": 27 }, { "epoch": 0.029045643153526972, "grad_norm": 3.6689274311065674, "learning_rate": 9.780178907671789e-05, "loss": 3.6685, "step": 28 }, { "epoch": 0.03008298755186722, "grad_norm": 3.6703617572784424, "learning_rate": 9.755282581475769e-05, "loss": 3.9441, "step": 29 }, { "epoch": 0.03112033195020747, "grad_norm": 3.904585123062134, "learning_rate": 9.729086208503174e-05, "loss": 4.0383, "step": 30 }, { "epoch": 0.032157676348547715, "grad_norm": 3.659148693084717, "learning_rate": 9.701596950580806e-05, "loss": 3.9392, "step": 31 }, { "epoch": 0.03319502074688797, "grad_norm": 3.6932144165039062, "learning_rate": 9.672822322997305e-05, "loss": 3.6377, "step": 32 }, { "epoch": 0.03423236514522822, "grad_norm": 3.7890701293945312, "learning_rate": 9.642770192448536e-05, "loss": 3.9012, "step": 33 }, { "epoch": 0.035269709543568464, "grad_norm": 4.021544933319092, "learning_rate": 9.611448774886924e-05, "loss": 3.5325, "step": 34 }, { "epoch": 0.03630705394190872, "grad_norm": 3.8901312351226807, "learning_rate": 9.578866633275288e-05, "loss": 3.8501, "step": 35 }, { "epoch": 0.03734439834024896, "grad_norm": 3.625171184539795, "learning_rate": 9.545032675245813e-05, "loss": 3.59, "step": 36 }, { "epoch": 0.038381742738589214, "grad_norm": 3.8629443645477295, "learning_rate": 9.509956150664796e-05, "loss": 4.0088, "step": 37 }, { "epoch": 0.03941908713692946, "grad_norm": 4.15440559387207, "learning_rate": 9.473646649103818e-05, "loss": 3.7416, "step": 38 }, { "epoch": 0.04045643153526971, "grad_norm": 3.913933515548706, "learning_rate": 9.43611409721806e-05, "loss": 3.9396, "step": 39 }, { "epoch": 0.04149377593360996, "grad_norm": 3.9919865131378174, "learning_rate": 9.397368756032445e-05, "loss": 3.4939, "step": 40 }, { "epoch": 0.04253112033195021, "grad_norm": 4.222021102905273, "learning_rate": 9.357421218136386e-05, "loss": 3.9626, "step": 41 }, { "epoch": 0.043568464730290454, "grad_norm": 3.666334867477417, "learning_rate": 9.316282404787871e-05, "loss": 3.8106, "step": 42 }, { "epoch": 0.044605809128630707, "grad_norm": 3.896925210952759, "learning_rate": 9.273963562927695e-05, "loss": 3.8724, "step": 43 }, { "epoch": 0.04564315352697095, "grad_norm": 4.4622578620910645, "learning_rate": 9.230476262104677e-05, "loss": 3.6229, "step": 44 }, { "epoch": 0.046680497925311204, "grad_norm": 4.379663944244385, "learning_rate": 9.185832391312644e-05, "loss": 3.6872, "step": 45 }, { "epoch": 0.04771784232365145, "grad_norm": 4.342223167419434, "learning_rate": 9.140044155740101e-05, "loss": 3.8005, "step": 46 }, { "epoch": 0.0487551867219917, "grad_norm": 5.052137851715088, "learning_rate": 9.093124073433463e-05, "loss": 3.8598, "step": 47 }, { "epoch": 0.04979253112033195, "grad_norm": 4.811618804931641, "learning_rate": 9.045084971874738e-05, "loss": 3.6151, "step": 48 }, { "epoch": 0.0508298755186722, "grad_norm": 5.3153300285339355, "learning_rate": 8.995939984474624e-05, "loss": 3.9977, "step": 49 }, { "epoch": 0.05186721991701245, "grad_norm": 5.884384632110596, "learning_rate": 8.945702546981969e-05, "loss": 3.5615, "step": 50 }, { "epoch": 0.05186721991701245, "eval_loss": 4.57713508605957, "eval_runtime": 26.5411, "eval_samples_per_second": 61.188, "eval_steps_per_second": 15.297, "step": 50 }, { "epoch": 0.052904564315352696, "grad_norm": 24.076704025268555, "learning_rate": 8.894386393810563e-05, "loss": 5.2672, "step": 51 }, { "epoch": 0.05394190871369295, "grad_norm": 19.754859924316406, "learning_rate": 8.842005554284296e-05, "loss": 4.7904, "step": 52 }, { "epoch": 0.054979253112033194, "grad_norm": 16.51922035217285, "learning_rate": 8.788574348801675e-05, "loss": 4.5515, "step": 53 }, { "epoch": 0.056016597510373446, "grad_norm": 13.558797836303711, "learning_rate": 8.73410738492077e-05, "loss": 4.4899, "step": 54 }, { "epoch": 0.05705394190871369, "grad_norm": 7.6642937660217285, "learning_rate": 8.678619553365659e-05, "loss": 4.2032, "step": 55 }, { "epoch": 0.058091286307053944, "grad_norm": 5.949044227600098, "learning_rate": 8.622126023955446e-05, "loss": 3.695, "step": 56 }, { "epoch": 0.05912863070539419, "grad_norm": 4.905073165893555, "learning_rate": 8.564642241456986e-05, "loss": 3.6473, "step": 57 }, { "epoch": 0.06016597510373444, "grad_norm": 4.315762519836426, "learning_rate": 8.506183921362443e-05, "loss": 3.4442, "step": 58 }, { "epoch": 0.061203319502074686, "grad_norm": 3.31467342376709, "learning_rate": 8.44676704559283e-05, "loss": 3.8328, "step": 59 }, { "epoch": 0.06224066390041494, "grad_norm": 2.8772637844085693, "learning_rate": 8.386407858128706e-05, "loss": 3.6724, "step": 60 }, { "epoch": 0.06327800829875518, "grad_norm": 2.7757973670959473, "learning_rate": 8.32512286056924e-05, "loss": 3.6571, "step": 61 }, { "epoch": 0.06431535269709543, "grad_norm": 2.6480956077575684, "learning_rate": 8.262928807620843e-05, "loss": 3.3738, "step": 62 }, { "epoch": 0.06535269709543569, "grad_norm": 2.5773894786834717, "learning_rate": 8.199842702516583e-05, "loss": 3.4269, "step": 63 }, { "epoch": 0.06639004149377593, "grad_norm": 2.5624706745147705, "learning_rate": 8.135881792367686e-05, "loss": 3.4427, "step": 64 }, { "epoch": 0.06742738589211618, "grad_norm": 2.816394329071045, "learning_rate": 8.07106356344834e-05, "loss": 3.6585, "step": 65 }, { "epoch": 0.06846473029045644, "grad_norm": 2.769735813140869, "learning_rate": 8.005405736415126e-05, "loss": 3.5136, "step": 66 }, { "epoch": 0.06950207468879668, "grad_norm": 2.795097827911377, "learning_rate": 7.938926261462366e-05, "loss": 3.4617, "step": 67 }, { "epoch": 0.07053941908713693, "grad_norm": 2.583852529525757, "learning_rate": 7.871643313414718e-05, "loss": 3.3261, "step": 68 }, { "epoch": 0.07157676348547717, "grad_norm": 2.535404920578003, "learning_rate": 7.803575286758364e-05, "loss": 3.388, "step": 69 }, { "epoch": 0.07261410788381743, "grad_norm": 2.6295790672302246, "learning_rate": 7.734740790612136e-05, "loss": 3.807, "step": 70 }, { "epoch": 0.07365145228215768, "grad_norm": 2.7374913692474365, "learning_rate": 7.66515864363997e-05, "loss": 3.3073, "step": 71 }, { "epoch": 0.07468879668049792, "grad_norm": 2.939117908477783, "learning_rate": 7.594847868906076e-05, "loss": 3.4621, "step": 72 }, { "epoch": 0.07572614107883817, "grad_norm": 2.762972831726074, "learning_rate": 7.52382768867422e-05, "loss": 3.4, "step": 73 }, { "epoch": 0.07676348547717843, "grad_norm": 2.9580655097961426, "learning_rate": 7.452117519152542e-05, "loss": 3.5031, "step": 74 }, { "epoch": 0.07780082987551867, "grad_norm": 2.899890661239624, "learning_rate": 7.379736965185368e-05, "loss": 3.3536, "step": 75 }, { "epoch": 0.07883817427385892, "grad_norm": 2.7998228073120117, "learning_rate": 7.30670581489344e-05, "loss": 3.7074, "step": 76 }, { "epoch": 0.07987551867219916, "grad_norm": 3.0808844566345215, "learning_rate": 7.233044034264034e-05, "loss": 3.3757, "step": 77 }, { "epoch": 0.08091286307053942, "grad_norm": 3.0572876930236816, "learning_rate": 7.158771761692464e-05, "loss": 3.4954, "step": 78 }, { "epoch": 0.08195020746887967, "grad_norm": 2.852540969848633, "learning_rate": 7.083909302476453e-05, "loss": 3.2891, "step": 79 }, { "epoch": 0.08298755186721991, "grad_norm": 3.0258772373199463, "learning_rate": 7.008477123264848e-05, "loss": 3.6323, "step": 80 }, { "epoch": 0.08402489626556017, "grad_norm": 3.0334479808807373, "learning_rate": 6.932495846462261e-05, "loss": 3.6048, "step": 81 }, { "epoch": 0.08506224066390042, "grad_norm": 3.7473435401916504, "learning_rate": 6.855986244591104e-05, "loss": 3.4813, "step": 82 }, { "epoch": 0.08609958506224066, "grad_norm": 3.568814277648926, "learning_rate": 6.778969234612584e-05, "loss": 3.747, "step": 83 }, { "epoch": 0.08713692946058091, "grad_norm": 3.2257232666015625, "learning_rate": 6.701465872208216e-05, "loss": 3.2229, "step": 84 }, { "epoch": 0.08817427385892117, "grad_norm": 3.3203718662261963, "learning_rate": 6.623497346023418e-05, "loss": 3.5216, "step": 85 }, { "epoch": 0.08921161825726141, "grad_norm": 3.708003520965576, "learning_rate": 6.545084971874738e-05, "loss": 3.293, "step": 86 }, { "epoch": 0.09024896265560166, "grad_norm": 3.2470216751098633, "learning_rate": 6.466250186922325e-05, "loss": 3.5909, "step": 87 }, { "epoch": 0.0912863070539419, "grad_norm": 3.2357749938964844, "learning_rate": 6.387014543809223e-05, "loss": 3.3392, "step": 88 }, { "epoch": 0.09232365145228216, "grad_norm": 3.4167912006378174, "learning_rate": 6.307399704769099e-05, "loss": 3.4653, "step": 89 }, { "epoch": 0.09336099585062241, "grad_norm": 3.372326135635376, "learning_rate": 6.227427435703997e-05, "loss": 3.3182, "step": 90 }, { "epoch": 0.09439834024896265, "grad_norm": 3.4928457736968994, "learning_rate": 6.147119600233758e-05, "loss": 3.5979, "step": 91 }, { "epoch": 0.0954356846473029, "grad_norm": 3.7030575275421143, "learning_rate": 6.066498153718735e-05, "loss": 3.8812, "step": 92 }, { "epoch": 0.09647302904564316, "grad_norm": 3.790165662765503, "learning_rate": 5.985585137257401e-05, "loss": 3.8599, "step": 93 }, { "epoch": 0.0975103734439834, "grad_norm": 3.600940465927124, "learning_rate": 5.90440267166055e-05, "loss": 3.5759, "step": 94 }, { "epoch": 0.09854771784232365, "grad_norm": 3.944754123687744, "learning_rate": 5.8229729514036705e-05, "loss": 3.8586, "step": 95 }, { "epoch": 0.0995850622406639, "grad_norm": 3.779947280883789, "learning_rate": 5.74131823855921e-05, "loss": 3.3266, "step": 96 }, { "epoch": 0.10062240663900415, "grad_norm": 4.06245756149292, "learning_rate": 5.6594608567103456e-05, "loss": 3.5335, "step": 97 }, { "epoch": 0.1016597510373444, "grad_norm": 4.446831226348877, "learning_rate": 5.577423184847932e-05, "loss": 3.5554, "step": 98 }, { "epoch": 0.10269709543568464, "grad_norm": 4.253654479980469, "learning_rate": 5.495227651252315e-05, "loss": 3.2417, "step": 99 }, { "epoch": 0.1037344398340249, "grad_norm": 5.696987628936768, "learning_rate": 5.4128967273616625e-05, "loss": 3.9678, "step": 100 }, { "epoch": 0.1037344398340249, "eval_loss": 3.7914316654205322, "eval_runtime": 26.5428, "eval_samples_per_second": 61.184, "eval_steps_per_second": 15.296, "step": 100 }, { "epoch": 0.10477178423236515, "grad_norm": 8.166366577148438, "learning_rate": 5.330452921628497e-05, "loss": 3.8268, "step": 101 }, { "epoch": 0.10580912863070539, "grad_norm": 6.857130527496338, "learning_rate": 5.247918773366112e-05, "loss": 3.7574, "step": 102 }, { "epoch": 0.10684647302904564, "grad_norm": 6.688436985015869, "learning_rate": 5.165316846586541e-05, "loss": 3.7533, "step": 103 }, { "epoch": 0.1078838174273859, "grad_norm": 6.446659564971924, "learning_rate": 5.0826697238317935e-05, "loss": 3.9865, "step": 104 }, { "epoch": 0.10892116182572614, "grad_norm": 5.080942153930664, "learning_rate": 5e-05, "loss": 3.4969, "step": 105 }, { "epoch": 0.10995850622406639, "grad_norm": 4.44970703125, "learning_rate": 4.917330276168208e-05, "loss": 3.5119, "step": 106 }, { "epoch": 0.11099585062240663, "grad_norm": 3.998990058898926, "learning_rate": 4.834683153413459e-05, "loss": 3.6256, "step": 107 }, { "epoch": 0.11203319502074689, "grad_norm": 3.2981390953063965, "learning_rate": 4.7520812266338885e-05, "loss": 3.4402, "step": 108 }, { "epoch": 0.11307053941908714, "grad_norm": 3.044083833694458, "learning_rate": 4.669547078371504e-05, "loss": 3.5525, "step": 109 }, { "epoch": 0.11410788381742738, "grad_norm": 2.891031503677368, "learning_rate": 4.5871032726383386e-05, "loss": 3.1974, "step": 110 }, { "epoch": 0.11514522821576763, "grad_norm": 2.743988037109375, "learning_rate": 4.504772348747687e-05, "loss": 3.2082, "step": 111 }, { "epoch": 0.11618257261410789, "grad_norm": 2.7417633533477783, "learning_rate": 4.4225768151520694e-05, "loss": 3.0046, "step": 112 }, { "epoch": 0.11721991701244813, "grad_norm": 2.6141929626464844, "learning_rate": 4.3405391432896555e-05, "loss": 3.1606, "step": 113 }, { "epoch": 0.11825726141078838, "grad_norm": 2.5122275352478027, "learning_rate": 4.2586817614407895e-05, "loss": 3.4908, "step": 114 }, { "epoch": 0.11929460580912864, "grad_norm": 2.8065576553344727, "learning_rate": 4.17702704859633e-05, "loss": 3.5437, "step": 115 }, { "epoch": 0.12033195020746888, "grad_norm": 2.443136692047119, "learning_rate": 4.095597328339452e-05, "loss": 3.4685, "step": 116 }, { "epoch": 0.12136929460580913, "grad_norm": 2.5843472480773926, "learning_rate": 4.0144148627425993e-05, "loss": 3.319, "step": 117 }, { "epoch": 0.12240663900414937, "grad_norm": 2.6064820289611816, "learning_rate": 3.933501846281267e-05, "loss": 3.3902, "step": 118 }, { "epoch": 0.12344398340248963, "grad_norm": 2.74259352684021, "learning_rate": 3.852880399766243e-05, "loss": 3.81, "step": 119 }, { "epoch": 0.12448132780082988, "grad_norm": 2.7329461574554443, "learning_rate": 3.772572564296005e-05, "loss": 3.4747, "step": 120 }, { "epoch": 0.12551867219917012, "grad_norm": 2.7343878746032715, "learning_rate": 3.6926002952309016e-05, "loss": 3.5092, "step": 121 }, { "epoch": 0.12655601659751037, "grad_norm": 2.749302625656128, "learning_rate": 3.612985456190778e-05, "loss": 3.8106, "step": 122 }, { "epoch": 0.1275933609958506, "grad_norm": 3.006464719772339, "learning_rate": 3.533749813077677e-05, "loss": 3.6465, "step": 123 }, { "epoch": 0.12863070539419086, "grad_norm": 2.8483219146728516, "learning_rate": 3.4549150281252636e-05, "loss": 3.3153, "step": 124 }, { "epoch": 0.12966804979253113, "grad_norm": 3.621854782104492, "learning_rate": 3.3765026539765834e-05, "loss": 3.0212, "step": 125 }, { "epoch": 0.13070539419087138, "grad_norm": 2.9148917198181152, "learning_rate": 3.298534127791785e-05, "loss": 3.2572, "step": 126 }, { "epoch": 0.13174273858921162, "grad_norm": 2.9206199645996094, "learning_rate": 3.221030765387417e-05, "loss": 3.4323, "step": 127 }, { "epoch": 0.13278008298755187, "grad_norm": 3.293531656265259, "learning_rate": 3.144013755408895e-05, "loss": 3.4568, "step": 128 }, { "epoch": 0.1338174273858921, "grad_norm": 2.9713733196258545, "learning_rate": 3.0675041535377405e-05, "loss": 3.4724, "step": 129 }, { "epoch": 0.13485477178423236, "grad_norm": 3.215953826904297, "learning_rate": 2.991522876735154e-05, "loss": 3.6332, "step": 130 }, { "epoch": 0.1358921161825726, "grad_norm": 3.0868947505950928, "learning_rate": 2.916090697523549e-05, "loss": 3.4988, "step": 131 }, { "epoch": 0.13692946058091288, "grad_norm": 3.1920242309570312, "learning_rate": 2.8412282383075363e-05, "loss": 3.3852, "step": 132 }, { "epoch": 0.13796680497925312, "grad_norm": 3.4271225929260254, "learning_rate": 2.766955965735968e-05, "loss": 3.8709, "step": 133 }, { "epoch": 0.13900414937759337, "grad_norm": 3.4510085582733154, "learning_rate": 2.693294185106562e-05, "loss": 3.4216, "step": 134 }, { "epoch": 0.1400414937759336, "grad_norm": 3.2544167041778564, "learning_rate": 2.6202630348146324e-05, "loss": 3.5809, "step": 135 }, { "epoch": 0.14107883817427386, "grad_norm": 3.22052001953125, "learning_rate": 2.547882480847461e-05, "loss": 3.2472, "step": 136 }, { "epoch": 0.1421161825726141, "grad_norm": 3.308317184448242, "learning_rate": 2.476172311325783e-05, "loss": 3.5626, "step": 137 }, { "epoch": 0.14315352697095435, "grad_norm": 3.3772473335266113, "learning_rate": 2.405152131093926e-05, "loss": 3.2607, "step": 138 }, { "epoch": 0.1441908713692946, "grad_norm": 3.1532044410705566, "learning_rate": 2.3348413563600325e-05, "loss": 3.5309, "step": 139 }, { "epoch": 0.14522821576763487, "grad_norm": 3.440068006515503, "learning_rate": 2.2652592093878666e-05, "loss": 3.5348, "step": 140 }, { "epoch": 0.1462655601659751, "grad_norm": 3.2612829208374023, "learning_rate": 2.196424713241637e-05, "loss": 3.5397, "step": 141 }, { "epoch": 0.14730290456431536, "grad_norm": 3.7282817363739014, "learning_rate": 2.128356686585282e-05, "loss": 3.7273, "step": 142 }, { "epoch": 0.1483402489626556, "grad_norm": 3.4528229236602783, "learning_rate": 2.061073738537635e-05, "loss": 3.3793, "step": 143 }, { "epoch": 0.14937759336099585, "grad_norm": 3.423583507537842, "learning_rate": 1.9945942635848748e-05, "loss": 3.2933, "step": 144 }, { "epoch": 0.1504149377593361, "grad_norm": 3.6867237091064453, "learning_rate": 1.928936436551661e-05, "loss": 3.8415, "step": 145 }, { "epoch": 0.15145228215767634, "grad_norm": 4.240562438964844, "learning_rate": 1.8641182076323148e-05, "loss": 3.9054, "step": 146 }, { "epoch": 0.1524896265560166, "grad_norm": 4.737523078918457, "learning_rate": 1.800157297483417e-05, "loss": 3.7616, "step": 147 }, { "epoch": 0.15352697095435686, "grad_norm": 4.262228965759277, "learning_rate": 1.7370711923791567e-05, "loss": 3.6418, "step": 148 }, { "epoch": 0.1545643153526971, "grad_norm": 4.715056896209717, "learning_rate": 1.6748771394307585e-05, "loss": 3.796, "step": 149 }, { "epoch": 0.15560165975103735, "grad_norm": 6.44474983215332, "learning_rate": 1.6135921418712956e-05, "loss": 4.0393, "step": 150 }, { "epoch": 0.15560165975103735, "eval_loss": 3.443917751312256, "eval_runtime": 26.554, "eval_samples_per_second": 61.158, "eval_steps_per_second": 15.29, "step": 150 }, { "epoch": 0.1566390041493776, "grad_norm": 2.9477157592773438, "learning_rate": 1.553232954407171e-05, "loss": 3.157, "step": 151 }, { "epoch": 0.15767634854771784, "grad_norm": 3.1337876319885254, "learning_rate": 1.4938160786375572e-05, "loss": 3.1213, "step": 152 }, { "epoch": 0.15871369294605808, "grad_norm": 3.1577205657958984, "learning_rate": 1.435357758543015e-05, "loss": 3.4105, "step": 153 }, { "epoch": 0.15975103734439833, "grad_norm": 3.00323748588562, "learning_rate": 1.3778739760445552e-05, "loss": 3.2181, "step": 154 }, { "epoch": 0.1607883817427386, "grad_norm": 2.866039514541626, "learning_rate": 1.3213804466343421e-05, "loss": 3.2601, "step": 155 }, { "epoch": 0.16182572614107885, "grad_norm": 3.097904920578003, "learning_rate": 1.2658926150792322e-05, "loss": 3.1206, "step": 156 }, { "epoch": 0.1628630705394191, "grad_norm": 3.0080816745758057, "learning_rate": 1.2114256511983274e-05, "loss": 3.464, "step": 157 }, { "epoch": 0.16390041493775934, "grad_norm": 3.1983859539031982, "learning_rate": 1.157994445715706e-05, "loss": 3.5615, "step": 158 }, { "epoch": 0.16493775933609958, "grad_norm": 2.9502131938934326, "learning_rate": 1.1056136061894384e-05, "loss": 3.3529, "step": 159 }, { "epoch": 0.16597510373443983, "grad_norm": 3.120685577392578, "learning_rate": 1.0542974530180327e-05, "loss": 3.4895, "step": 160 }, { "epoch": 0.16701244813278007, "grad_norm": 2.6355931758880615, "learning_rate": 1.0040600155253765e-05, "loss": 3.1231, "step": 161 }, { "epoch": 0.16804979253112035, "grad_norm": 2.8759355545043945, "learning_rate": 9.549150281252633e-06, "loss": 3.3021, "step": 162 }, { "epoch": 0.1690871369294606, "grad_norm": 2.9746170043945312, "learning_rate": 9.068759265665384e-06, "loss": 3.3881, "step": 163 }, { "epoch": 0.17012448132780084, "grad_norm": 2.8339903354644775, "learning_rate": 8.599558442598998e-06, "loss": 3.2455, "step": 164 }, { "epoch": 0.17116182572614108, "grad_norm": 2.7609705924987793, "learning_rate": 8.141676086873572e-06, "loss": 3.108, "step": 165 }, { "epoch": 0.17219917012448133, "grad_norm": 3.246521472930908, "learning_rate": 7.695237378953223e-06, "loss": 3.6929, "step": 166 }, { "epoch": 0.17323651452282157, "grad_norm": 2.688962459564209, "learning_rate": 7.260364370723044e-06, "loss": 3.138, "step": 167 }, { "epoch": 0.17427385892116182, "grad_norm": 2.9067366123199463, "learning_rate": 6.837175952121306e-06, "loss": 3.718, "step": 168 }, { "epoch": 0.17531120331950206, "grad_norm": 2.7512147426605225, "learning_rate": 6.425787818636131e-06, "loss": 3.4175, "step": 169 }, { "epoch": 0.17634854771784234, "grad_norm": 2.548328399658203, "learning_rate": 6.026312439675552e-06, "loss": 3.1292, "step": 170 }, { "epoch": 0.17738589211618258, "grad_norm": 2.8721537590026855, "learning_rate": 5.6388590278194096e-06, "loss": 3.5308, "step": 171 }, { "epoch": 0.17842323651452283, "grad_norm": 3.4221110343933105, "learning_rate": 5.263533508961827e-06, "loss": 3.8456, "step": 172 }, { "epoch": 0.17946058091286307, "grad_norm": 3.0033349990844727, "learning_rate": 4.900438493352055e-06, "loss": 3.4229, "step": 173 }, { "epoch": 0.18049792531120332, "grad_norm": 2.841123342514038, "learning_rate": 4.549673247541875e-06, "loss": 3.4928, "step": 174 }, { "epoch": 0.18153526970954356, "grad_norm": 2.7495405673980713, "learning_rate": 4.2113336672471245e-06, "loss": 3.2769, "step": 175 }, { "epoch": 0.1825726141078838, "grad_norm": 3.1454813480377197, "learning_rate": 3.885512251130763e-06, "loss": 3.1546, "step": 176 }, { "epoch": 0.18360995850622408, "grad_norm": 2.9140090942382812, "learning_rate": 3.5722980755146517e-06, "loss": 3.2434, "step": 177 }, { "epoch": 0.18464730290456433, "grad_norm": 2.936655044555664, "learning_rate": 3.271776770026963e-06, "loss": 3.7009, "step": 178 }, { "epoch": 0.18568464730290457, "grad_norm": 2.8600358963012695, "learning_rate": 2.9840304941919415e-06, "loss": 2.9847, "step": 179 }, { "epoch": 0.18672199170124482, "grad_norm": 3.1558871269226074, "learning_rate": 2.7091379149682685e-06, "loss": 3.9366, "step": 180 }, { "epoch": 0.18775933609958506, "grad_norm": 2.904458522796631, "learning_rate": 2.4471741852423237e-06, "loss": 2.9716, "step": 181 }, { "epoch": 0.1887966804979253, "grad_norm": 3.277076005935669, "learning_rate": 2.1982109232821178e-06, "loss": 3.3847, "step": 182 }, { "epoch": 0.18983402489626555, "grad_norm": 3.215095281600952, "learning_rate": 1.962316193157593e-06, "loss": 3.4074, "step": 183 }, { "epoch": 0.1908713692946058, "grad_norm": 3.3069441318511963, "learning_rate": 1.7395544861325718e-06, "loss": 3.3479, "step": 184 }, { "epoch": 0.19190871369294607, "grad_norm": 3.70759916305542, "learning_rate": 1.5299867030334814e-06, "loss": 3.5217, "step": 185 }, { "epoch": 0.19294605809128632, "grad_norm": 3.380190849304199, "learning_rate": 1.333670137599713e-06, "loss": 3.4685, "step": 186 }, { "epoch": 0.19398340248962656, "grad_norm": 3.148998498916626, "learning_rate": 1.1506584608200367e-06, "loss": 3.644, "step": 187 }, { "epoch": 0.1950207468879668, "grad_norm": 3.2292802333831787, "learning_rate": 9.810017062595322e-07, "loss": 3.4821, "step": 188 }, { "epoch": 0.19605809128630705, "grad_norm": 3.3793811798095703, "learning_rate": 8.247462563808817e-07, "loss": 3.3928, "step": 189 }, { "epoch": 0.1970954356846473, "grad_norm": 3.4184722900390625, "learning_rate": 6.819348298638839e-07, "loss": 3.7607, "step": 190 }, { "epoch": 0.19813278008298754, "grad_norm": 3.5667853355407715, "learning_rate": 5.526064699265753e-07, "loss": 3.7305, "step": 191 }, { "epoch": 0.1991701244813278, "grad_norm": 3.350822687149048, "learning_rate": 4.367965336512403e-07, "loss": 3.2065, "step": 192 }, { "epoch": 0.20020746887966806, "grad_norm": 3.8035242557525635, "learning_rate": 3.3453668231809286e-07, "loss": 3.5918, "step": 193 }, { "epoch": 0.2012448132780083, "grad_norm": 3.9756436347961426, "learning_rate": 2.458548727494292e-07, "loss": 3.7081, "step": 194 }, { "epoch": 0.20228215767634855, "grad_norm": 4.188309192657471, "learning_rate": 1.7077534966650766e-07, "loss": 3.7841, "step": 195 }, { "epoch": 0.2033195020746888, "grad_norm": 4.394107818603516, "learning_rate": 1.0931863906127327e-07, "loss": 3.6746, "step": 196 }, { "epoch": 0.20435684647302904, "grad_norm": 4.623396396636963, "learning_rate": 6.150154258476315e-08, "loss": 3.8618, "step": 197 }, { "epoch": 0.2053941908713693, "grad_norm": 4.6601762771606445, "learning_rate": 2.7337132953697554e-08, "loss": 3.3425, "step": 198 }, { "epoch": 0.20643153526970953, "grad_norm": 5.034449577331543, "learning_rate": 6.834750376549792e-09, "loss": 3.9177, "step": 199 }, { "epoch": 0.2074688796680498, "grad_norm": 7.078446388244629, "learning_rate": 0.0, "loss": 4.6096, "step": 200 }, { "epoch": 0.2074688796680498, "eval_loss": 3.3893699645996094, "eval_runtime": 26.548, "eval_samples_per_second": 61.172, "eval_steps_per_second": 15.293, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3317552734208e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }