{ "best_metric": 0.8282029628753662, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 2.150537634408602, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010752688172043012, "grad_norm": 6.99516487121582, "learning_rate": 5e-06, "loss": 3.1175, "step": 1 }, { "epoch": 0.010752688172043012, "eval_loss": 4.535027027130127, "eval_runtime": 14.1563, "eval_samples_per_second": 11.09, "eval_steps_per_second": 5.581, "step": 1 }, { "epoch": 0.021505376344086023, "grad_norm": 10.445697784423828, "learning_rate": 1e-05, "loss": 3.5728, "step": 2 }, { "epoch": 0.03225806451612903, "grad_norm": 12.358306884765625, "learning_rate": 1.5e-05, "loss": 3.9348, "step": 3 }, { "epoch": 0.043010752688172046, "grad_norm": 10.28347396850586, "learning_rate": 2e-05, "loss": 3.2841, "step": 4 }, { "epoch": 0.053763440860215055, "grad_norm": 10.178171157836914, "learning_rate": 2.5e-05, "loss": 3.2135, "step": 5 }, { "epoch": 0.06451612903225806, "grad_norm": 7.281465530395508, "learning_rate": 3e-05, "loss": 2.7599, "step": 6 }, { "epoch": 0.07526881720430108, "grad_norm": 9.009861946105957, "learning_rate": 3.5e-05, "loss": 2.4959, "step": 7 }, { "epoch": 0.08602150537634409, "grad_norm": 13.042397499084473, "learning_rate": 4e-05, "loss": 2.2706, "step": 8 }, { "epoch": 0.0967741935483871, "grad_norm": 7.2071638107299805, "learning_rate": 4.5e-05, "loss": 1.94, "step": 9 }, { "epoch": 0.10752688172043011, "grad_norm": 8.721714973449707, "learning_rate": 5e-05, "loss": 1.6717, "step": 10 }, { "epoch": 0.11827956989247312, "grad_norm": 9.722580909729004, "learning_rate": 5.500000000000001e-05, "loss": 1.821, "step": 11 }, { "epoch": 0.12903225806451613, "grad_norm": 8.217576026916504, "learning_rate": 6e-05, "loss": 1.58, "step": 12 }, { "epoch": 0.13978494623655913, "grad_norm": 6.87364387512207, "learning_rate": 6.500000000000001e-05, "loss": 1.3209, "step": 13 }, { "epoch": 0.15053763440860216, "grad_norm": 8.912040710449219, "learning_rate": 7e-05, "loss": 1.7184, "step": 14 }, { "epoch": 0.16129032258064516, "grad_norm": 7.484908580780029, "learning_rate": 7.500000000000001e-05, "loss": 1.0966, "step": 15 }, { "epoch": 0.17204301075268819, "grad_norm": 9.977514266967773, "learning_rate": 8e-05, "loss": 1.86, "step": 16 }, { "epoch": 0.1827956989247312, "grad_norm": 6.212277412414551, "learning_rate": 8.5e-05, "loss": 1.2687, "step": 17 }, { "epoch": 0.1935483870967742, "grad_norm": 6.920554161071777, "learning_rate": 9e-05, "loss": 1.1078, "step": 18 }, { "epoch": 0.20430107526881722, "grad_norm": 6.519954681396484, "learning_rate": 9.5e-05, "loss": 1.0365, "step": 19 }, { "epoch": 0.21505376344086022, "grad_norm": 9.026784896850586, "learning_rate": 0.0001, "loss": 1.2932, "step": 20 }, { "epoch": 0.22580645161290322, "grad_norm": 8.984053611755371, "learning_rate": 9.999238475781957e-05, "loss": 1.0453, "step": 21 }, { "epoch": 0.23655913978494625, "grad_norm": 7.758922100067139, "learning_rate": 9.99695413509548e-05, "loss": 1.2444, "step": 22 }, { "epoch": 0.24731182795698925, "grad_norm": 7.274909496307373, "learning_rate": 9.99314767377287e-05, "loss": 0.9704, "step": 23 }, { "epoch": 0.25806451612903225, "grad_norm": 8.09979248046875, "learning_rate": 9.987820251299122e-05, "loss": 2.5905, "step": 24 }, { "epoch": 0.26881720430107525, "grad_norm": 5.225498676300049, "learning_rate": 9.980973490458728e-05, "loss": 2.3784, "step": 25 }, { "epoch": 0.27956989247311825, "grad_norm": 4.132240295410156, "learning_rate": 9.972609476841367e-05, "loss": 1.9521, "step": 26 }, { "epoch": 0.2903225806451613, "grad_norm": 4.1804656982421875, "learning_rate": 9.962730758206611e-05, "loss": 2.1076, "step": 27 }, { "epoch": 0.3010752688172043, "grad_norm": 3.6776556968688965, "learning_rate": 9.951340343707852e-05, "loss": 1.9513, "step": 28 }, { "epoch": 0.3118279569892473, "grad_norm": 7.695979595184326, "learning_rate": 9.938441702975689e-05, "loss": 1.6991, "step": 29 }, { "epoch": 0.3225806451612903, "grad_norm": 6.833826065063477, "learning_rate": 9.924038765061042e-05, "loss": 1.3824, "step": 30 }, { "epoch": 0.3333333333333333, "grad_norm": 3.845757007598877, "learning_rate": 9.908135917238321e-05, "loss": 1.4698, "step": 31 }, { "epoch": 0.34408602150537637, "grad_norm": 3.1820149421691895, "learning_rate": 9.890738003669029e-05, "loss": 1.1889, "step": 32 }, { "epoch": 0.3548387096774194, "grad_norm": 4.390655517578125, "learning_rate": 9.871850323926177e-05, "loss": 1.1593, "step": 33 }, { "epoch": 0.3655913978494624, "grad_norm": 4.404603958129883, "learning_rate": 9.851478631379982e-05, "loss": 0.9826, "step": 34 }, { "epoch": 0.3763440860215054, "grad_norm": 3.859941005706787, "learning_rate": 9.829629131445342e-05, "loss": 1.0486, "step": 35 }, { "epoch": 0.3870967741935484, "grad_norm": 7.225592613220215, "learning_rate": 9.806308479691595e-05, "loss": 1.0135, "step": 36 }, { "epoch": 0.3978494623655914, "grad_norm": 4.242331504821777, "learning_rate": 9.781523779815179e-05, "loss": 0.8687, "step": 37 }, { "epoch": 0.40860215053763443, "grad_norm": 4.794056415557861, "learning_rate": 9.755282581475769e-05, "loss": 0.9721, "step": 38 }, { "epoch": 0.41935483870967744, "grad_norm": 3.9935474395751953, "learning_rate": 9.727592877996585e-05, "loss": 0.8433, "step": 39 }, { "epoch": 0.43010752688172044, "grad_norm": 3.3857905864715576, "learning_rate": 9.698463103929542e-05, "loss": 0.7633, "step": 40 }, { "epoch": 0.44086021505376344, "grad_norm": 3.6143317222595215, "learning_rate": 9.667902132486009e-05, "loss": 1.0136, "step": 41 }, { "epoch": 0.45161290322580644, "grad_norm": 3.9374451637268066, "learning_rate": 9.635919272833938e-05, "loss": 0.915, "step": 42 }, { "epoch": 0.46236559139784944, "grad_norm": 6.313716411590576, "learning_rate": 9.602524267262203e-05, "loss": 1.191, "step": 43 }, { "epoch": 0.4731182795698925, "grad_norm": 3.7929749488830566, "learning_rate": 9.567727288213005e-05, "loss": 1.0602, "step": 44 }, { "epoch": 0.4838709677419355, "grad_norm": 5.400207996368408, "learning_rate": 9.53153893518325e-05, "loss": 0.8725, "step": 45 }, { "epoch": 0.4946236559139785, "grad_norm": 3.7726528644561768, "learning_rate": 9.493970231495835e-05, "loss": 0.5123, "step": 46 }, { "epoch": 0.5053763440860215, "grad_norm": 3.6227760314941406, "learning_rate": 9.45503262094184e-05, "loss": 2.2086, "step": 47 }, { "epoch": 0.5161290322580645, "grad_norm": 3.246999979019165, "learning_rate": 9.414737964294636e-05, "loss": 2.3014, "step": 48 }, { "epoch": 0.5268817204301075, "grad_norm": 4.355656623840332, "learning_rate": 9.373098535696979e-05, "loss": 2.0727, "step": 49 }, { "epoch": 0.5376344086021505, "grad_norm": 2.746245861053467, "learning_rate": 9.330127018922194e-05, "loss": 1.5312, "step": 50 }, { "epoch": 0.5376344086021505, "eval_loss": 1.3152074813842773, "eval_runtime": 14.4699, "eval_samples_per_second": 10.85, "eval_steps_per_second": 5.46, "step": 50 }, { "epoch": 0.5483870967741935, "grad_norm": 3.6186563968658447, "learning_rate": 9.285836503510562e-05, "loss": 1.7819, "step": 51 }, { "epoch": 0.5591397849462365, "grad_norm": 3.9859840869903564, "learning_rate": 9.24024048078213e-05, "loss": 1.2978, "step": 52 }, { "epoch": 0.5698924731182796, "grad_norm": 3.9932148456573486, "learning_rate": 9.193352839727121e-05, "loss": 1.447, "step": 53 }, { "epoch": 0.5806451612903226, "grad_norm": 3.5936036109924316, "learning_rate": 9.145187862775209e-05, "loss": 1.3967, "step": 54 }, { "epoch": 0.5913978494623656, "grad_norm": 3.375316858291626, "learning_rate": 9.09576022144496e-05, "loss": 1.0998, "step": 55 }, { "epoch": 0.6021505376344086, "grad_norm": 3.807166814804077, "learning_rate": 9.045084971874738e-05, "loss": 1.1404, "step": 56 }, { "epoch": 0.6129032258064516, "grad_norm": 3.084326982498169, "learning_rate": 8.993177550236464e-05, "loss": 0.988, "step": 57 }, { "epoch": 0.6236559139784946, "grad_norm": 3.7563459873199463, "learning_rate": 8.940053768033609e-05, "loss": 1.0191, "step": 58 }, { "epoch": 0.6344086021505376, "grad_norm": 4.25908899307251, "learning_rate": 8.885729807284856e-05, "loss": 0.8702, "step": 59 }, { "epoch": 0.6451612903225806, "grad_norm": 3.750455617904663, "learning_rate": 8.83022221559489e-05, "loss": 0.8746, "step": 60 }, { "epoch": 0.6559139784946236, "grad_norm": 3.6707611083984375, "learning_rate": 8.773547901113862e-05, "loss": 0.9337, "step": 61 }, { "epoch": 0.6666666666666666, "grad_norm": 3.612056016921997, "learning_rate": 8.715724127386972e-05, "loss": 1.1718, "step": 62 }, { "epoch": 0.6774193548387096, "grad_norm": 4.298011779785156, "learning_rate": 8.656768508095853e-05, "loss": 0.628, "step": 63 }, { "epoch": 0.6881720430107527, "grad_norm": 4.248507022857666, "learning_rate": 8.596699001693255e-05, "loss": 0.6906, "step": 64 }, { "epoch": 0.6989247311827957, "grad_norm": 3.890913963317871, "learning_rate": 8.535533905932738e-05, "loss": 1.0229, "step": 65 }, { "epoch": 0.7096774193548387, "grad_norm": 4.77676248550415, "learning_rate": 8.473291852294987e-05, "loss": 0.8174, "step": 66 }, { "epoch": 0.7204301075268817, "grad_norm": 4.285477161407471, "learning_rate": 8.409991800312493e-05, "loss": 0.8005, "step": 67 }, { "epoch": 0.7311827956989247, "grad_norm": 3.8125786781311035, "learning_rate": 8.345653031794292e-05, "loss": 0.5634, "step": 68 }, { "epoch": 0.7419354838709677, "grad_norm": 4.32075309753418, "learning_rate": 8.280295144952536e-05, "loss": 0.6793, "step": 69 }, { "epoch": 0.7526881720430108, "grad_norm": 3.8095755577087402, "learning_rate": 8.213938048432697e-05, "loss": 2.0449, "step": 70 }, { "epoch": 0.7634408602150538, "grad_norm": 3.9818124771118164, "learning_rate": 8.146601955249188e-05, "loss": 1.9486, "step": 71 }, { "epoch": 0.7741935483870968, "grad_norm": 3.4325692653656006, "learning_rate": 8.07830737662829e-05, "loss": 1.8445, "step": 72 }, { "epoch": 0.7849462365591398, "grad_norm": 3.9117205142974854, "learning_rate": 8.009075115760243e-05, "loss": 1.9626, "step": 73 }, { "epoch": 0.7956989247311828, "grad_norm": 3.40067195892334, "learning_rate": 7.938926261462366e-05, "loss": 1.3113, "step": 74 }, { "epoch": 0.8064516129032258, "grad_norm": 3.8196802139282227, "learning_rate": 7.86788218175523e-05, "loss": 1.7274, "step": 75 }, { "epoch": 0.8172043010752689, "grad_norm": 4.589756488800049, "learning_rate": 7.795964517353735e-05, "loss": 1.7595, "step": 76 }, { "epoch": 0.8279569892473119, "grad_norm": 4.156907081604004, "learning_rate": 7.723195175075136e-05, "loss": 1.5407, "step": 77 }, { "epoch": 0.8387096774193549, "grad_norm": 3.101147174835205, "learning_rate": 7.649596321166024e-05, "loss": 0.9117, "step": 78 }, { "epoch": 0.8494623655913979, "grad_norm": 2.96360182762146, "learning_rate": 7.575190374550272e-05, "loss": 1.1026, "step": 79 }, { "epoch": 0.8602150537634409, "grad_norm": 2.9797332286834717, "learning_rate": 7.500000000000001e-05, "loss": 0.8541, "step": 80 }, { "epoch": 0.8709677419354839, "grad_norm": 3.0931079387664795, "learning_rate": 7.424048101231686e-05, "loss": 0.8875, "step": 81 }, { "epoch": 0.8817204301075269, "grad_norm": 2.8962762355804443, "learning_rate": 7.347357813929454e-05, "loss": 0.9001, "step": 82 }, { "epoch": 0.8924731182795699, "grad_norm": 3.5031051635742188, "learning_rate": 7.269952498697734e-05, "loss": 0.8276, "step": 83 }, { "epoch": 0.9032258064516129, "grad_norm": 3.142409086227417, "learning_rate": 7.191855733945387e-05, "loss": 0.823, "step": 84 }, { "epoch": 0.9139784946236559, "grad_norm": 2.962707757949829, "learning_rate": 7.113091308703498e-05, "loss": 0.8578, "step": 85 }, { "epoch": 0.9247311827956989, "grad_norm": 3.3335912227630615, "learning_rate": 7.033683215379002e-05, "loss": 0.8609, "step": 86 }, { "epoch": 0.9354838709677419, "grad_norm": 4.589385509490967, "learning_rate": 6.953655642446368e-05, "loss": 1.0085, "step": 87 }, { "epoch": 0.946236559139785, "grad_norm": 3.5878801345825195, "learning_rate": 6.873032967079561e-05, "loss": 0.5649, "step": 88 }, { "epoch": 0.956989247311828, "grad_norm": 3.286903142929077, "learning_rate": 6.7918397477265e-05, "loss": 0.4883, "step": 89 }, { "epoch": 0.967741935483871, "grad_norm": 3.4730312824249268, "learning_rate": 6.710100716628344e-05, "loss": 0.7835, "step": 90 }, { "epoch": 0.978494623655914, "grad_norm": 3.7954018115997314, "learning_rate": 6.627840772285784e-05, "loss": 0.7178, "step": 91 }, { "epoch": 0.989247311827957, "grad_norm": 3.0298449993133545, "learning_rate": 6.545084971874738e-05, "loss": 0.4597, "step": 92 }, { "epoch": 1.0, "grad_norm": 11.850226402282715, "learning_rate": 6.461858523613684e-05, "loss": 1.5025, "step": 93 }, { "epoch": 1.010752688172043, "grad_norm": 6.41492223739624, "learning_rate": 6.378186779084995e-05, "loss": 1.8111, "step": 94 }, { "epoch": 1.021505376344086, "grad_norm": 3.9378912448883057, "learning_rate": 6.294095225512603e-05, "loss": 1.5149, "step": 95 }, { "epoch": 1.032258064516129, "grad_norm": 3.533281087875366, "learning_rate": 6.209609477998338e-05, "loss": 1.4312, "step": 96 }, { "epoch": 1.043010752688172, "grad_norm": 3.780578136444092, "learning_rate": 6.124755271719325e-05, "loss": 1.2979, "step": 97 }, { "epoch": 1.053763440860215, "grad_norm": 2.7388339042663574, "learning_rate": 6.0395584540887963e-05, "loss": 0.988, "step": 98 }, { "epoch": 1.064516129032258, "grad_norm": 2.726057767868042, "learning_rate": 5.9540449768827246e-05, "loss": 1.0731, "step": 99 }, { "epoch": 1.075268817204301, "grad_norm": 4.266610622406006, "learning_rate": 5.868240888334653e-05, "loss": 1.0392, "step": 100 }, { "epoch": 1.075268817204301, "eval_loss": 1.115802526473999, "eval_runtime": 14.4453, "eval_samples_per_second": 10.869, "eval_steps_per_second": 5.469, "step": 100 }, { "epoch": 1.086021505376344, "grad_norm": 3.880851984024048, "learning_rate": 5.782172325201155e-05, "loss": 0.7542, "step": 101 }, { "epoch": 1.096774193548387, "grad_norm": 2.7799625396728516, "learning_rate": 5.695865504800327e-05, "loss": 0.6733, "step": 102 }, { "epoch": 1.10752688172043, "grad_norm": 2.877377986907959, "learning_rate": 5.6093467170257374e-05, "loss": 0.7712, "step": 103 }, { "epoch": 1.118279569892473, "grad_norm": 2.6312854290008545, "learning_rate": 5.522642316338268e-05, "loss": 0.5234, "step": 104 }, { "epoch": 1.129032258064516, "grad_norm": 2.5732200145721436, "learning_rate": 5.435778713738292e-05, "loss": 0.4727, "step": 105 }, { "epoch": 1.139784946236559, "grad_norm": 2.3443665504455566, "learning_rate": 5.348782368720626e-05, "loss": 0.5915, "step": 106 }, { "epoch": 1.1505376344086022, "grad_norm": 2.900613784790039, "learning_rate": 5.26167978121472e-05, "loss": 0.4433, "step": 107 }, { "epoch": 1.1612903225806452, "grad_norm": 2.779599666595459, "learning_rate": 5.174497483512506e-05, "loss": 0.4696, "step": 108 }, { "epoch": 1.1720430107526882, "grad_norm": 3.2906293869018555, "learning_rate": 5.0872620321864185e-05, "loss": 0.6608, "step": 109 }, { "epoch": 1.1827956989247312, "grad_norm": 3.181580066680908, "learning_rate": 5e-05, "loss": 0.5271, "step": 110 }, { "epoch": 1.1935483870967742, "grad_norm": 3.3558270931243896, "learning_rate": 4.912737967813583e-05, "loss": 0.4357, "step": 111 }, { "epoch": 1.2043010752688172, "grad_norm": 2.8602147102355957, "learning_rate": 4.825502516487497e-05, "loss": 0.4805, "step": 112 }, { "epoch": 1.2150537634408602, "grad_norm": 3.4753870964050293, "learning_rate": 4.738320218785281e-05, "loss": 0.5753, "step": 113 }, { "epoch": 1.2258064516129032, "grad_norm": 2.4663901329040527, "learning_rate": 4.6512176312793736e-05, "loss": 0.319, "step": 114 }, { "epoch": 1.2365591397849462, "grad_norm": 4.225439548492432, "learning_rate": 4.564221286261709e-05, "loss": 0.5583, "step": 115 }, { "epoch": 1.2473118279569892, "grad_norm": 2.3677308559417725, "learning_rate": 4.477357683661734e-05, "loss": 0.3289, "step": 116 }, { "epoch": 1.2580645161290323, "grad_norm": 3.1596832275390625, "learning_rate": 4.390653282974264e-05, "loss": 1.6511, "step": 117 }, { "epoch": 1.2688172043010753, "grad_norm": 3.4493701457977295, "learning_rate": 4.3041344951996746e-05, "loss": 1.6611, "step": 118 }, { "epoch": 1.2795698924731183, "grad_norm": 3.0767276287078857, "learning_rate": 4.2178276747988446e-05, "loss": 1.4595, "step": 119 }, { "epoch": 1.2903225806451613, "grad_norm": 3.6605751514434814, "learning_rate": 4.131759111665349e-05, "loss": 0.9704, "step": 120 }, { "epoch": 1.3010752688172043, "grad_norm": 4.2374420166015625, "learning_rate": 4.045955023117276e-05, "loss": 1.2689, "step": 121 }, { "epoch": 1.3118279569892473, "grad_norm": 3.0131664276123047, "learning_rate": 3.960441545911204e-05, "loss": 0.9044, "step": 122 }, { "epoch": 1.3225806451612903, "grad_norm": 2.836139678955078, "learning_rate": 3.875244728280676e-05, "loss": 0.9114, "step": 123 }, { "epoch": 1.3333333333333333, "grad_norm": 3.379606246948242, "learning_rate": 3.790390522001662e-05, "loss": 1.2156, "step": 124 }, { "epoch": 1.3440860215053765, "grad_norm": 3.377495050430298, "learning_rate": 3.705904774487396e-05, "loss": 0.6492, "step": 125 }, { "epoch": 1.3548387096774195, "grad_norm": 3.2035529613494873, "learning_rate": 3.6218132209150045e-05, "loss": 0.728, "step": 126 }, { "epoch": 1.3655913978494625, "grad_norm": 2.9157145023345947, "learning_rate": 3.5381414763863166e-05, "loss": 0.6313, "step": 127 }, { "epoch": 1.3763440860215055, "grad_norm": 2.234398126602173, "learning_rate": 3.4549150281252636e-05, "loss": 0.6193, "step": 128 }, { "epoch": 1.3870967741935485, "grad_norm": 2.1972103118896484, "learning_rate": 3.372159227714218e-05, "loss": 0.5867, "step": 129 }, { "epoch": 1.3978494623655915, "grad_norm": 3.0726523399353027, "learning_rate": 3.289899283371657e-05, "loss": 0.8497, "step": 130 }, { "epoch": 1.4086021505376345, "grad_norm": 2.2349085807800293, "learning_rate": 3.2081602522734986e-05, "loss": 0.5208, "step": 131 }, { "epoch": 1.4193548387096775, "grad_norm": 2.5332772731781006, "learning_rate": 3.12696703292044e-05, "loss": 0.4452, "step": 132 }, { "epoch": 1.4301075268817205, "grad_norm": 2.002074718475342, "learning_rate": 3.046344357553632e-05, "loss": 0.3896, "step": 133 }, { "epoch": 1.4408602150537635, "grad_norm": 2.316570281982422, "learning_rate": 2.9663167846209998e-05, "loss": 0.4541, "step": 134 }, { "epoch": 1.4516129032258065, "grad_norm": 2.4266834259033203, "learning_rate": 2.886908691296504e-05, "loss": 0.3945, "step": 135 }, { "epoch": 1.4623655913978495, "grad_norm": 2.9699063301086426, "learning_rate": 2.8081442660546125e-05, "loss": 0.6487, "step": 136 }, { "epoch": 1.4731182795698925, "grad_norm": 2.3638315200805664, "learning_rate": 2.7300475013022663e-05, "loss": 0.2829, "step": 137 }, { "epoch": 1.4838709677419355, "grad_norm": 4.946100234985352, "learning_rate": 2.6526421860705473e-05, "loss": 0.4617, "step": 138 }, { "epoch": 1.4946236559139785, "grad_norm": 2.5009000301361084, "learning_rate": 2.575951898768315e-05, "loss": 0.2181, "step": 139 }, { "epoch": 1.5053763440860215, "grad_norm": 2.8839311599731445, "learning_rate": 2.500000000000001e-05, "loss": 1.7407, "step": 140 }, { "epoch": 1.5161290322580645, "grad_norm": 2.9398391246795654, "learning_rate": 2.4248096254497288e-05, "loss": 1.4332, "step": 141 }, { "epoch": 1.5268817204301075, "grad_norm": 2.908121347427368, "learning_rate": 2.350403678833976e-05, "loss": 1.2238, "step": 142 }, { "epoch": 1.5376344086021505, "grad_norm": 3.3087892532348633, "learning_rate": 2.2768048249248648e-05, "loss": 1.2163, "step": 143 }, { "epoch": 1.5483870967741935, "grad_norm": 2.738379716873169, "learning_rate": 2.2040354826462668e-05, "loss": 0.9637, "step": 144 }, { "epoch": 1.5591397849462365, "grad_norm": 3.258429527282715, "learning_rate": 2.132117818244771e-05, "loss": 1.1101, "step": 145 }, { "epoch": 1.5698924731182795, "grad_norm": 2.7381410598754883, "learning_rate": 2.061073738537635e-05, "loss": 0.9134, "step": 146 }, { "epoch": 1.5806451612903225, "grad_norm": 2.7571890354156494, "learning_rate": 1.9909248842397584e-05, "loss": 0.7396, "step": 147 }, { "epoch": 1.5913978494623655, "grad_norm": 2.268134355545044, "learning_rate": 1.9216926233717085e-05, "loss": 0.5299, "step": 148 }, { "epoch": 1.6021505376344085, "grad_norm": 2.5643818378448486, "learning_rate": 1.8533980447508137e-05, "loss": 0.4746, "step": 149 }, { "epoch": 1.6129032258064515, "grad_norm": 2.6224515438079834, "learning_rate": 1.7860619515673033e-05, "loss": 0.5465, "step": 150 }, { "epoch": 1.6129032258064515, "eval_loss": 0.868911862373352, "eval_runtime": 14.4083, "eval_samples_per_second": 10.896, "eval_steps_per_second": 5.483, "step": 150 }, { "epoch": 1.6236559139784945, "grad_norm": 3.0091564655303955, "learning_rate": 1.7197048550474643e-05, "loss": 0.7287, "step": 151 }, { "epoch": 1.6344086021505375, "grad_norm": 1.6763337850570679, "learning_rate": 1.6543469682057106e-05, "loss": 0.2309, "step": 152 }, { "epoch": 1.6451612903225805, "grad_norm": 2.7991411685943604, "learning_rate": 1.5900081996875083e-05, "loss": 0.6103, "step": 153 }, { "epoch": 1.6559139784946235, "grad_norm": 2.1921868324279785, "learning_rate": 1.526708147705013e-05, "loss": 0.4928, "step": 154 }, { "epoch": 1.6666666666666665, "grad_norm": 3.5872364044189453, "learning_rate": 1.4644660940672627e-05, "loss": 0.6464, "step": 155 }, { "epoch": 1.6774193548387095, "grad_norm": 1.8818094730377197, "learning_rate": 1.4033009983067452e-05, "loss": 0.3707, "step": 156 }, { "epoch": 1.6881720430107527, "grad_norm": 2.5614068508148193, "learning_rate": 1.3432314919041478e-05, "loss": 0.428, "step": 157 }, { "epoch": 1.6989247311827957, "grad_norm": 2.164555311203003, "learning_rate": 1.2842758726130283e-05, "loss": 0.408, "step": 158 }, { "epoch": 1.7096774193548387, "grad_norm": 2.022615432739258, "learning_rate": 1.22645209888614e-05, "loss": 0.2232, "step": 159 }, { "epoch": 1.7204301075268817, "grad_norm": 2.447100877761841, "learning_rate": 1.1697777844051105e-05, "loss": 0.2983, "step": 160 }, { "epoch": 1.7311827956989247, "grad_norm": 4.865058898925781, "learning_rate": 1.1142701927151456e-05, "loss": 0.7972, "step": 161 }, { "epoch": 1.7419354838709677, "grad_norm": 2.682640790939331, "learning_rate": 1.0599462319663905e-05, "loss": 0.4589, "step": 162 }, { "epoch": 1.7526881720430108, "grad_norm": 2.2714807987213135, "learning_rate": 1.006822449763537e-05, "loss": 1.3321, "step": 163 }, { "epoch": 1.7634408602150538, "grad_norm": 2.663078784942627, "learning_rate": 9.549150281252633e-06, "loss": 1.4021, "step": 164 }, { "epoch": 1.7741935483870968, "grad_norm": 2.38264799118042, "learning_rate": 9.042397785550405e-06, "loss": 1.2299, "step": 165 }, { "epoch": 1.7849462365591398, "grad_norm": 2.5163092613220215, "learning_rate": 8.548121372247918e-06, "loss": 1.0744, "step": 166 }, { "epoch": 1.7956989247311828, "grad_norm": 3.2605020999908447, "learning_rate": 8.066471602728803e-06, "loss": 1.1439, "step": 167 }, { "epoch": 1.8064516129032258, "grad_norm": 2.8853485584259033, "learning_rate": 7.597595192178702e-06, "loss": 1.0157, "step": 168 }, { "epoch": 1.817204301075269, "grad_norm": 2.8386337757110596, "learning_rate": 7.1416349648943894e-06, "loss": 0.9278, "step": 169 }, { "epoch": 1.827956989247312, "grad_norm": 3.006211996078491, "learning_rate": 6.698729810778065e-06, "loss": 0.863, "step": 170 }, { "epoch": 1.838709677419355, "grad_norm": 3.339857339859009, "learning_rate": 6.269014643030213e-06, "loss": 0.7245, "step": 171 }, { "epoch": 1.849462365591398, "grad_norm": 2.269327402114868, "learning_rate": 5.852620357053651e-06, "loss": 0.3663, "step": 172 }, { "epoch": 1.860215053763441, "grad_norm": 2.1220529079437256, "learning_rate": 5.449673790581611e-06, "loss": 0.3854, "step": 173 }, { "epoch": 1.870967741935484, "grad_norm": 2.488945722579956, "learning_rate": 5.060297685041659e-06, "loss": 0.5545, "step": 174 }, { "epoch": 1.881720430107527, "grad_norm": 2.159728765487671, "learning_rate": 4.684610648167503e-06, "loss": 0.4613, "step": 175 }, { "epoch": 1.89247311827957, "grad_norm": 1.9596354961395264, "learning_rate": 4.322727117869951e-06, "loss": 0.3092, "step": 176 }, { "epoch": 1.903225806451613, "grad_norm": 2.403904914855957, "learning_rate": 3.974757327377981e-06, "loss": 0.4026, "step": 177 }, { "epoch": 1.913978494623656, "grad_norm": 2.790825605392456, "learning_rate": 3.6408072716606346e-06, "loss": 0.5259, "step": 178 }, { "epoch": 1.924731182795699, "grad_norm": 1.891068696975708, "learning_rate": 3.3209786751399187e-06, "loss": 0.2814, "step": 179 }, { "epoch": 1.935483870967742, "grad_norm": 2.383929967880249, "learning_rate": 3.0153689607045845e-06, "loss": 0.4715, "step": 180 }, { "epoch": 1.946236559139785, "grad_norm": 1.4359225034713745, "learning_rate": 2.724071220034158e-06, "loss": 0.1781, "step": 181 }, { "epoch": 1.956989247311828, "grad_norm": 2.4820544719696045, "learning_rate": 2.4471741852423237e-06, "loss": 0.4342, "step": 182 }, { "epoch": 1.967741935483871, "grad_norm": 1.628670573234558, "learning_rate": 2.1847622018482283e-06, "loss": 0.1903, "step": 183 }, { "epoch": 1.978494623655914, "grad_norm": 3.297879934310913, "learning_rate": 1.9369152030840556e-06, "loss": 0.3394, "step": 184 }, { "epoch": 1.989247311827957, "grad_norm": 2.3827061653137207, "learning_rate": 1.70370868554659e-06, "loss": 0.3608, "step": 185 }, { "epoch": 2.0, "grad_norm": 2.618680238723755, "learning_rate": 1.4852136862001764e-06, "loss": 0.8033, "step": 186 }, { "epoch": 2.010752688172043, "grad_norm": 1.7273980379104614, "learning_rate": 1.2814967607382432e-06, "loss": 1.237, "step": 187 }, { "epoch": 2.021505376344086, "grad_norm": 2.0505175590515137, "learning_rate": 1.0926199633097157e-06, "loss": 1.3056, "step": 188 }, { "epoch": 2.032258064516129, "grad_norm": 2.3544833660125732, "learning_rate": 9.186408276168013e-07, "loss": 1.1107, "step": 189 }, { "epoch": 2.043010752688172, "grad_norm": 1.91807222366333, "learning_rate": 7.596123493895991e-07, "loss": 0.7513, "step": 190 }, { "epoch": 2.053763440860215, "grad_norm": 2.319427728652954, "learning_rate": 6.15582970243117e-07, "loss": 0.8994, "step": 191 }, { "epoch": 2.064516129032258, "grad_norm": 2.4607162475585938, "learning_rate": 4.865965629214819e-07, "loss": 0.749, "step": 192 }, { "epoch": 2.075268817204301, "grad_norm": 2.2048518657684326, "learning_rate": 3.7269241793390085e-07, "loss": 0.6078, "step": 193 }, { "epoch": 2.086021505376344, "grad_norm": 2.24308443069458, "learning_rate": 2.7390523158633554e-07, "loss": 0.6862, "step": 194 }, { "epoch": 2.096774193548387, "grad_norm": 1.80559504032135, "learning_rate": 1.9026509541272275e-07, "loss": 0.4276, "step": 195 }, { "epoch": 2.10752688172043, "grad_norm": 1.79929518699646, "learning_rate": 1.2179748700879012e-07, "loss": 0.3818, "step": 196 }, { "epoch": 2.118279569892473, "grad_norm": 1.762046456336975, "learning_rate": 6.852326227130834e-08, "loss": 0.3659, "step": 197 }, { "epoch": 2.129032258064516, "grad_norm": 1.7550525665283203, "learning_rate": 3.04586490452119e-08, "loss": 0.2808, "step": 198 }, { "epoch": 2.139784946236559, "grad_norm": 1.8459057807922363, "learning_rate": 7.615242180436522e-09, "loss": 0.325, "step": 199 }, { "epoch": 2.150537634408602, "grad_norm": 1.6853699684143066, "learning_rate": 0.0, "loss": 0.2918, "step": 200 }, { "epoch": 2.150537634408602, "eval_loss": 0.8282029628753662, "eval_runtime": 14.4284, "eval_samples_per_second": 10.881, "eval_steps_per_second": 5.475, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.124610424184504e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }