{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9870967741935484, "eval_steps": 500, "global_step": 154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012903225806451613, "grad_norm": 11.699292902761476, "learning_rate": 6.25e-07, "loss": 0.3091, "step": 1 }, { "epoch": 0.025806451612903226, "grad_norm": 10.2656902023561, "learning_rate": 1.25e-06, "loss": 0.2644, "step": 2 }, { "epoch": 0.03870967741935484, "grad_norm": 10.961480180598883, "learning_rate": 1.8750000000000003e-06, "loss": 0.3136, "step": 3 }, { "epoch": 0.05161290322580645, "grad_norm": 10.621965020575766, "learning_rate": 2.5e-06, "loss": 0.2649, "step": 4 }, { "epoch": 0.06451612903225806, "grad_norm": 7.63251523866155, "learning_rate": 3.125e-06, "loss": 0.2248, "step": 5 }, { "epoch": 0.07741935483870968, "grad_norm": 4.399939556445864, "learning_rate": 3.7500000000000005e-06, "loss": 0.1872, "step": 6 }, { "epoch": 0.09032258064516129, "grad_norm": 3.6018933396960233, "learning_rate": 4.3750000000000005e-06, "loss": 0.1277, "step": 7 }, { "epoch": 0.1032258064516129, "grad_norm": 2.350482184168289, "learning_rate": 5e-06, "loss": 0.1321, "step": 8 }, { "epoch": 0.11612903225806452, "grad_norm": 3.6577146283118167, "learning_rate": 5.625e-06, "loss": 0.1401, "step": 9 }, { "epoch": 0.12903225806451613, "grad_norm": 3.095671993174613, "learning_rate": 6.25e-06, "loss": 0.1247, "step": 10 }, { "epoch": 0.14193548387096774, "grad_norm": 1.8930944522483173, "learning_rate": 6.875e-06, "loss": 0.1243, "step": 11 }, { "epoch": 0.15483870967741936, "grad_norm": 3.4213993152717372, "learning_rate": 7.500000000000001e-06, "loss": 0.1183, "step": 12 }, { "epoch": 0.16774193548387098, "grad_norm": 3.0014464259254146, "learning_rate": 8.125000000000001e-06, "loss": 0.1319, "step": 13 }, { "epoch": 0.18064516129032257, "grad_norm": 2.846735403976308, "learning_rate": 8.750000000000001e-06, "loss": 0.1158, "step": 14 }, { "epoch": 0.1935483870967742, "grad_norm": 2.7768857827987614, "learning_rate": 9.375000000000001e-06, "loss": 0.1053, "step": 15 }, { "epoch": 0.2064516129032258, "grad_norm": 2.559286174315675, "learning_rate": 1e-05, "loss": 0.1351, "step": 16 }, { "epoch": 0.21935483870967742, "grad_norm": 2.793791051458797, "learning_rate": 9.998704424206747e-06, "loss": 0.1532, "step": 17 }, { "epoch": 0.23225806451612904, "grad_norm": 2.011717818438971, "learning_rate": 9.994818368233639e-06, "loss": 0.1041, "step": 18 }, { "epoch": 0.24516129032258063, "grad_norm": 2.44195378918142, "learning_rate": 9.988343845952697e-06, "loss": 0.1074, "step": 19 }, { "epoch": 0.25806451612903225, "grad_norm": 1.6874842592386545, "learning_rate": 9.979284212657658e-06, "loss": 0.1034, "step": 20 }, { "epoch": 0.2709677419354839, "grad_norm": 1.3503672327570895, "learning_rate": 9.967644163325157e-06, "loss": 0.0971, "step": 21 }, { "epoch": 0.2838709677419355, "grad_norm": 3.534445687027426, "learning_rate": 9.953429730181653e-06, "loss": 0.122, "step": 22 }, { "epoch": 0.2967741935483871, "grad_norm": 2.731723348296494, "learning_rate": 9.93664827957735e-06, "loss": 0.1402, "step": 23 }, { "epoch": 0.3096774193548387, "grad_norm": 2.516223822228138, "learning_rate": 9.917308508168712e-06, "loss": 0.1093, "step": 24 }, { "epoch": 0.3225806451612903, "grad_norm": 1.7606357719981414, "learning_rate": 9.895420438411616e-06, "loss": 0.1231, "step": 25 }, { "epoch": 0.33548387096774196, "grad_norm": 2.4360379684882654, "learning_rate": 9.870995413367397e-06, "loss": 0.1113, "step": 26 }, { "epoch": 0.34838709677419355, "grad_norm": 1.1166024452341257, "learning_rate": 9.844046090824533e-06, "loss": 0.0946, "step": 27 }, { "epoch": 0.36129032258064514, "grad_norm": 3.095834517141857, "learning_rate": 9.814586436738998e-06, "loss": 0.1032, "step": 28 }, { "epoch": 0.3741935483870968, "grad_norm": 3.200925995404148, "learning_rate": 9.782631717996675e-06, "loss": 0.1637, "step": 29 }, { "epoch": 0.3870967741935484, "grad_norm": 2.7067354639721333, "learning_rate": 9.748198494501598e-06, "loss": 0.1326, "step": 30 }, { "epoch": 0.4, "grad_norm": 1.9657792494045345, "learning_rate": 9.711304610594104e-06, "loss": 0.0985, "step": 31 }, { "epoch": 0.4129032258064516, "grad_norm": 1.2418236122465918, "learning_rate": 9.671969185803357e-06, "loss": 0.0866, "step": 32 }, { "epoch": 0.4258064516129032, "grad_norm": 1.4668414290482827, "learning_rate": 9.630212604939026e-06, "loss": 0.091, "step": 33 }, { "epoch": 0.43870967741935485, "grad_norm": 1.5674054362390097, "learning_rate": 9.586056507527266e-06, "loss": 0.1166, "step": 34 }, { "epoch": 0.45161290322580644, "grad_norm": 1.7105084425031547, "learning_rate": 9.539523776596446e-06, "loss": 0.121, "step": 35 }, { "epoch": 0.4645161290322581, "grad_norm": 1.3111865574883335, "learning_rate": 9.490638526818482e-06, "loss": 0.0835, "step": 36 }, { "epoch": 0.4774193548387097, "grad_norm": 1.0229500712234056, "learning_rate": 9.439426092011877e-06, "loss": 0.0947, "step": 37 }, { "epoch": 0.49032258064516127, "grad_norm": 1.2556959616077457, "learning_rate": 9.385913012012972e-06, "loss": 0.0911, "step": 38 }, { "epoch": 0.5032258064516129, "grad_norm": 1.0982611947906955, "learning_rate": 9.330127018922195e-06, "loss": 0.105, "step": 39 }, { "epoch": 0.5161290322580645, "grad_norm": 1.0216693563567487, "learning_rate": 9.272097022732444e-06, "loss": 0.0839, "step": 40 }, { "epoch": 0.5290322580645161, "grad_norm": 0.887997821512209, "learning_rate": 9.211853096347059e-06, "loss": 0.0823, "step": 41 }, { "epoch": 0.5419354838709678, "grad_norm": 1.5086680290000998, "learning_rate": 9.149426459995127e-06, "loss": 0.1165, "step": 42 }, { "epoch": 0.5548387096774193, "grad_norm": 1.1170270141146377, "learning_rate": 9.08484946505221e-06, "loss": 0.0964, "step": 43 }, { "epoch": 0.567741935483871, "grad_norm": 0.9096654753927038, "learning_rate": 9.018155577274891e-06, "loss": 0.1003, "step": 44 }, { "epoch": 0.5806451612903226, "grad_norm": 1.0160863955008763, "learning_rate": 8.949379359457795e-06, "loss": 0.0802, "step": 45 }, { "epoch": 0.5935483870967742, "grad_norm": 1.0265412727158079, "learning_rate": 8.8785564535221e-06, "loss": 0.0884, "step": 46 }, { "epoch": 0.6064516129032258, "grad_norm": 1.5105953615602765, "learning_rate": 8.805723562044825e-06, "loss": 0.0855, "step": 47 }, { "epoch": 0.6193548387096774, "grad_norm": 1.1851230689340553, "learning_rate": 8.730918429238429e-06, "loss": 0.0896, "step": 48 }, { "epoch": 0.632258064516129, "grad_norm": 0.9547624249115597, "learning_rate": 8.65417982139062e-06, "loss": 0.0895, "step": 49 }, { "epoch": 0.6451612903225806, "grad_norm": 0.8828392166882096, "learning_rate": 8.575547506774498e-06, "loss": 0.0703, "step": 50 }, { "epoch": 0.6580645161290323, "grad_norm": 1.5517765054788453, "learning_rate": 8.49506223503941e-06, "loss": 0.0982, "step": 51 }, { "epoch": 0.6709677419354839, "grad_norm": 0.7610530682010276, "learning_rate": 8.412765716093273e-06, "loss": 0.09, "step": 52 }, { "epoch": 0.6838709677419355, "grad_norm": 1.2971615645647234, "learning_rate": 8.328700598487203e-06, "loss": 0.0951, "step": 53 }, { "epoch": 0.6967741935483871, "grad_norm": 0.6667526127324459, "learning_rate": 8.24291044731378e-06, "loss": 0.0624, "step": 54 }, { "epoch": 0.7096774193548387, "grad_norm": 1.1745372262285316, "learning_rate": 8.155439721630265e-06, "loss": 0.0955, "step": 55 }, { "epoch": 0.7225806451612903, "grad_norm": 0.8393263149118976, "learning_rate": 8.066333751418582e-06, "loss": 0.0734, "step": 56 }, { "epoch": 0.7354838709677419, "grad_norm": 0.9883905258578732, "learning_rate": 7.97563871409395e-06, "loss": 0.0753, "step": 57 }, { "epoch": 0.7483870967741936, "grad_norm": 0.8348230527715342, "learning_rate": 7.883401610574338e-06, "loss": 0.0764, "step": 58 }, { "epoch": 0.7612903225806451, "grad_norm": 1.0240572576125775, "learning_rate": 7.789670240923169e-06, "loss": 0.1, "step": 59 }, { "epoch": 0.7741935483870968, "grad_norm": 1.06880647144967, "learning_rate": 7.69449317957788e-06, "loss": 0.0917, "step": 60 }, { "epoch": 0.7870967741935484, "grad_norm": 0.9745921300249953, "learning_rate": 7.597919750177168e-06, "loss": 0.0903, "step": 61 }, { "epoch": 0.8, "grad_norm": 0.7844807247569731, "learning_rate": 7.500000000000001e-06, "loss": 0.0673, "step": 62 }, { "epoch": 0.8129032258064516, "grad_norm": 1.615686383831043, "learning_rate": 7.400784674029579e-06, "loss": 0.1024, "step": 63 }, { "epoch": 0.8258064516129032, "grad_norm": 0.7238588061260165, "learning_rate": 7.300325188655762e-06, "loss": 0.0705, "step": 64 }, { "epoch": 0.8387096774193549, "grad_norm": 1.3928662222572579, "learning_rate": 7.198673605029529e-06, "loss": 0.086, "step": 65 }, { "epoch": 0.8516129032258064, "grad_norm": 0.9405975350441244, "learning_rate": 7.095882602083321e-06, "loss": 0.081, "step": 66 }, { "epoch": 0.864516129032258, "grad_norm": 0.7204214932073081, "learning_rate": 6.9920054492312086e-06, "loss": 0.07, "step": 67 }, { "epoch": 0.8774193548387097, "grad_norm": 1.0257095309963755, "learning_rate": 6.887095978763072e-06, "loss": 0.0633, "step": 68 }, { "epoch": 0.8903225806451613, "grad_norm": 0.6843552801844485, "learning_rate": 6.781208557947085e-06, "loss": 0.0535, "step": 69 }, { "epoch": 0.9032258064516129, "grad_norm": 0.8369182932506963, "learning_rate": 6.674398060854931e-06, "loss": 0.0607, "step": 70 }, { "epoch": 0.9161290322580645, "grad_norm": 0.9450663518863992, "learning_rate": 6.566719839924412e-06, "loss": 0.0817, "step": 71 }, { "epoch": 0.9290322580645162, "grad_norm": 0.9648342453237144, "learning_rate": 6.458229697274125e-06, "loss": 0.0784, "step": 72 }, { "epoch": 0.9419354838709677, "grad_norm": 0.7226047890347407, "learning_rate": 6.348983855785122e-06, "loss": 0.0723, "step": 73 }, { "epoch": 0.9548387096774194, "grad_norm": 0.9883255400489244, "learning_rate": 6.2390389299645e-06, "loss": 0.0884, "step": 74 }, { "epoch": 0.967741935483871, "grad_norm": 0.8546590385544072, "learning_rate": 6.128451896606054e-06, "loss": 0.084, "step": 75 }, { "epoch": 0.9806451612903225, "grad_norm": 0.9429219413240326, "learning_rate": 6.0172800652631706e-06, "loss": 0.0809, "step": 76 }, { "epoch": 0.9935483870967742, "grad_norm": 0.8587345660767025, "learning_rate": 5.905581048549279e-06, "loss": 0.0681, "step": 77 }, { "epoch": 0.9935483870967742, "eval_loss": 0.08105655014514923, "eval_runtime": 39.4157, "eval_samples_per_second": 26.36, "eval_steps_per_second": 0.837, "step": 77 }, { "epoch": 1.0064516129032257, "grad_norm": 1.1624230129889668, "learning_rate": 5.793412732281258e-06, "loss": 0.0808, "step": 78 }, { "epoch": 1.0193548387096774, "grad_norm": 1.0860238876477792, "learning_rate": 5.680833245481234e-06, "loss": 0.0792, "step": 79 }, { "epoch": 1.032258064516129, "grad_norm": 0.8760532041288109, "learning_rate": 5.567900930252375e-06, "loss": 0.0675, "step": 80 }, { "epoch": 1.0451612903225806, "grad_norm": 0.8966395459365959, "learning_rate": 5.454674311544236e-06, "loss": 0.0682, "step": 81 }, { "epoch": 1.0580645161290323, "grad_norm": 1.019077536054118, "learning_rate": 5.341212066823356e-06, "loss": 0.0799, "step": 82 }, { "epoch": 1.070967741935484, "grad_norm": 1.3438302637394202, "learning_rate": 5.227572995664819e-06, "loss": 0.0882, "step": 83 }, { "epoch": 1.0838709677419356, "grad_norm": 0.9235621393269036, "learning_rate": 5.113815989280528e-06, "loss": 0.0763, "step": 84 }, { "epoch": 1.096774193548387, "grad_norm": 0.7779427795576146, "learning_rate": 5e-06, "loss": 0.0773, "step": 85 }, { "epoch": 1.1096774193548387, "grad_norm": 1.0066913546499912, "learning_rate": 4.886184010719472e-06, "loss": 0.0762, "step": 86 }, { "epoch": 1.1225806451612903, "grad_norm": 0.849885708644481, "learning_rate": 4.772427004335183e-06, "loss": 0.085, "step": 87 }, { "epoch": 1.135483870967742, "grad_norm": 0.8282232117216065, "learning_rate": 4.6587879331766465e-06, "loss": 0.0724, "step": 88 }, { "epoch": 1.1483870967741936, "grad_norm": 0.6227012177536829, "learning_rate": 4.545325688455766e-06, "loss": 0.0746, "step": 89 }, { "epoch": 1.1612903225806452, "grad_norm": 0.9694041549131266, "learning_rate": 4.432099069747625e-06, "loss": 0.0876, "step": 90 }, { "epoch": 1.1741935483870969, "grad_norm": 1.2344277757510045, "learning_rate": 4.319166754518768e-06, "loss": 0.0728, "step": 91 }, { "epoch": 1.1870967741935483, "grad_norm": 0.5797726042270337, "learning_rate": 4.206587267718743e-06, "loss": 0.05, "step": 92 }, { "epoch": 1.2, "grad_norm": 0.6275337263216433, "learning_rate": 4.094418951450721e-06, "loss": 0.0458, "step": 93 }, { "epoch": 1.2129032258064516, "grad_norm": 0.991136454485937, "learning_rate": 3.982719934736832e-06, "loss": 0.1011, "step": 94 }, { "epoch": 1.2258064516129032, "grad_norm": 0.5693434030485786, "learning_rate": 3.871548103393947e-06, "loss": 0.0535, "step": 95 }, { "epoch": 1.238709677419355, "grad_norm": 0.8648629215753009, "learning_rate": 3.7609610700355014e-06, "loss": 0.0726, "step": 96 }, { "epoch": 1.2516129032258063, "grad_norm": 0.49590292970379035, "learning_rate": 3.6510161442148783e-06, "loss": 0.0493, "step": 97 }, { "epoch": 1.2645161290322582, "grad_norm": 0.5252751061515991, "learning_rate": 3.5417703027258752e-06, "loss": 0.0499, "step": 98 }, { "epoch": 1.2774193548387096, "grad_norm": 0.5165334726304951, "learning_rate": 3.4332801600755895e-06, "loss": 0.0398, "step": 99 }, { "epoch": 1.2903225806451613, "grad_norm": 0.5588113997669555, "learning_rate": 3.3256019391450696e-06, "loss": 0.0451, "step": 100 }, { "epoch": 1.303225806451613, "grad_norm": 0.8000418887806057, "learning_rate": 3.2187914420529176e-06, "loss": 0.0558, "step": 101 }, { "epoch": 1.3161290322580645, "grad_norm": 0.6209463591754386, "learning_rate": 3.1129040212369286e-06, "loss": 0.0543, "step": 102 }, { "epoch": 1.3290322580645162, "grad_norm": 0.5956338266252976, "learning_rate": 3.007994550768793e-06, "loss": 0.0602, "step": 103 }, { "epoch": 1.3419354838709676, "grad_norm": 0.7601669697082293, "learning_rate": 2.9041173979166813e-06, "loss": 0.061, "step": 104 }, { "epoch": 1.3548387096774195, "grad_norm": 0.5346685587339685, "learning_rate": 2.8013263949704706e-06, "loss": 0.0516, "step": 105 }, { "epoch": 1.367741935483871, "grad_norm": 0.6722589800675358, "learning_rate": 2.6996748113442397e-06, "loss": 0.0667, "step": 106 }, { "epoch": 1.3806451612903226, "grad_norm": 0.7382443963105985, "learning_rate": 2.599215325970423e-06, "loss": 0.0755, "step": 107 }, { "epoch": 1.3935483870967742, "grad_norm": 0.717323507281047, "learning_rate": 2.5000000000000015e-06, "loss": 0.0608, "step": 108 }, { "epoch": 1.4064516129032258, "grad_norm": 0.5934322456540527, "learning_rate": 2.4020802498228333e-06, "loss": 0.0643, "step": 109 }, { "epoch": 1.4193548387096775, "grad_norm": 1.0152254575859148, "learning_rate": 2.3055068204221226e-06, "loss": 0.0926, "step": 110 }, { "epoch": 1.432258064516129, "grad_norm": 0.7338463370837035, "learning_rate": 2.2103297590768334e-06, "loss": 0.0718, "step": 111 }, { "epoch": 1.4451612903225808, "grad_norm": 0.6224898983786531, "learning_rate": 2.1165983894256647e-06, "loss": 0.0722, "step": 112 }, { "epoch": 1.4580645161290322, "grad_norm": 0.7111315812802891, "learning_rate": 2.0243612859060526e-06, "loss": 0.0796, "step": 113 }, { "epoch": 1.4709677419354839, "grad_norm": 0.5880677662618246, "learning_rate": 1.933666248581418e-06, "loss": 0.0612, "step": 114 }, { "epoch": 1.4838709677419355, "grad_norm": 0.5688066714876514, "learning_rate": 1.8445602783697375e-06, "loss": 0.0697, "step": 115 }, { "epoch": 1.4967741935483871, "grad_norm": 0.772697127447158, "learning_rate": 1.7570895526862202e-06, "loss": 0.0863, "step": 116 }, { "epoch": 1.5096774193548388, "grad_norm": 0.5188748378473136, "learning_rate": 1.6712994015127976e-06, "loss": 0.0456, "step": 117 }, { "epoch": 1.5225806451612902, "grad_norm": 0.5787690355073418, "learning_rate": 1.5872342839067305e-06, "loss": 0.0557, "step": 118 }, { "epoch": 1.535483870967742, "grad_norm": 1.1554001099512141, "learning_rate": 1.5049377649605906e-06, "loss": 0.0807, "step": 119 }, { "epoch": 1.5483870967741935, "grad_norm": 0.5280610958351776, "learning_rate": 1.4244524932255026e-06, "loss": 0.0476, "step": 120 }, { "epoch": 1.5612903225806452, "grad_norm": 0.6254239981186747, "learning_rate": 1.3458201786093795e-06, "loss": 0.07, "step": 121 }, { "epoch": 1.5741935483870968, "grad_norm": 0.526881850014464, "learning_rate": 1.2690815707615727e-06, "loss": 0.0412, "step": 122 }, { "epoch": 1.5870967741935482, "grad_norm": 0.5947939093981419, "learning_rate": 1.194276437955177e-06, "loss": 0.0594, "step": 123 }, { "epoch": 1.6, "grad_norm": 0.8234862252522671, "learning_rate": 1.1214435464779006e-06, "loss": 0.0656, "step": 124 }, { "epoch": 1.6129032258064515, "grad_norm": 0.6682337549186153, "learning_rate": 1.050620640542208e-06, "loss": 0.0821, "step": 125 }, { "epoch": 1.6258064516129034, "grad_norm": 0.46843664405997654, "learning_rate": 9.81844422725109e-07, "loss": 0.0538, "step": 126 }, { "epoch": 1.6387096774193548, "grad_norm": 0.5846588665419508, "learning_rate": 9.151505349477901e-07, "loss": 0.0552, "step": 127 }, { "epoch": 1.6516129032258065, "grad_norm": 0.6543914516209731, "learning_rate": 8.505735400048748e-07, "loss": 0.0739, "step": 128 }, { "epoch": 1.664516129032258, "grad_norm": 0.6018097015637573, "learning_rate": 7.881469036529427e-07, "loss": 0.0631, "step": 129 }, { "epoch": 1.6774193548387095, "grad_norm": 0.6248249705045374, "learning_rate": 7.279029772675572e-07, "loss": 0.0678, "step": 130 }, { "epoch": 1.6903225806451614, "grad_norm": 0.5074802404785773, "learning_rate": 6.698729810778065e-07, "loss": 0.0553, "step": 131 }, { "epoch": 1.7032258064516128, "grad_norm": 0.6303808241003845, "learning_rate": 6.140869879870287e-07, "loss": 0.0573, "step": 132 }, { "epoch": 1.7161290322580647, "grad_norm": 0.6548357295879033, "learning_rate": 5.60573907988124e-07, "loss": 0.0718, "step": 133 }, { "epoch": 1.729032258064516, "grad_norm": 1.0474619581709672, "learning_rate": 5.0936147318152e-07, "loss": 0.0684, "step": 134 }, { "epoch": 1.7419354838709677, "grad_norm": 0.4233097389816897, "learning_rate": 4.604762234035548e-07, "loss": 0.0425, "step": 135 }, { "epoch": 1.7548387096774194, "grad_norm": 0.5486123769584077, "learning_rate": 4.139434924727359e-07, "loss": 0.0555, "step": 136 }, { "epoch": 1.7677419354838708, "grad_norm": 0.42132757988454256, "learning_rate": 3.697873950609737e-07, "loss": 0.0454, "step": 137 }, { "epoch": 1.7806451612903227, "grad_norm": 0.48857725592632506, "learning_rate": 3.2803081419664483e-07, "loss": 0.0486, "step": 138 }, { "epoch": 1.793548387096774, "grad_norm": 0.6044138979316588, "learning_rate": 2.88695389405898e-07, "loss": 0.0625, "step": 139 }, { "epoch": 1.8064516129032258, "grad_norm": 0.45648144621900827, "learning_rate": 2.518015054984041e-07, "loss": 0.0485, "step": 140 }, { "epoch": 1.8193548387096774, "grad_norm": 0.6418678164146125, "learning_rate": 2.1736828200332628e-07, "loss": 0.053, "step": 141 }, { "epoch": 1.832258064516129, "grad_norm": 0.7770556575132163, "learning_rate": 1.8541356326100436e-07, "loss": 0.0678, "step": 142 }, { "epoch": 1.8451612903225807, "grad_norm": 0.5056829983103792, "learning_rate": 1.559539091754686e-07, "loss": 0.0513, "step": 143 }, { "epoch": 1.8580645161290321, "grad_norm": 0.6329718102226061, "learning_rate": 1.2900458663260506e-07, "loss": 0.082, "step": 144 }, { "epoch": 1.870967741935484, "grad_norm": 0.6625275685465645, "learning_rate": 1.0457956158838545e-07, "loss": 0.0626, "step": 145 }, { "epoch": 1.8838709677419354, "grad_norm": 0.4523695807369306, "learning_rate": 8.269149183128988e-08, "loss": 0.0457, "step": 146 }, { "epoch": 1.896774193548387, "grad_norm": 0.6684501205664699, "learning_rate": 6.335172042265192e-08, "loss": 0.0811, "step": 147 }, { "epoch": 1.9096774193548387, "grad_norm": 0.7933423479927778, "learning_rate": 4.657026981834623e-08, "loss": 0.0675, "step": 148 }, { "epoch": 1.9225806451612903, "grad_norm": 0.8055874409945138, "learning_rate": 3.235583667484443e-08, "loss": 0.0733, "step": 149 }, { "epoch": 1.935483870967742, "grad_norm": 0.4268126124624864, "learning_rate": 2.0715787342343586e-08, "loss": 0.0386, "step": 150 }, { "epoch": 1.9483870967741934, "grad_norm": 0.6755068312271777, "learning_rate": 1.1656154047303691e-08, "loss": 0.0681, "step": 151 }, { "epoch": 1.9612903225806453, "grad_norm": 0.5936835028163678, "learning_rate": 5.181631766362216e-09, "loss": 0.0675, "step": 152 }, { "epoch": 1.9741935483870967, "grad_norm": 0.45412737353530935, "learning_rate": 1.2955757932542334e-09, "loss": 0.0523, "step": 153 }, { "epoch": 1.9870967741935484, "grad_norm": 0.5652252375803395, "learning_rate": 0.0, "loss": 0.0505, "step": 154 }, { "epoch": 1.9870967741935484, "eval_loss": 0.06934936344623566, "eval_runtime": 38.7654, "eval_samples_per_second": 26.802, "eval_steps_per_second": 0.851, "step": 154 }, { "epoch": 1.9870967741935484, "step": 154, "total_flos": 5.361988883508429e+16, "train_loss": 0.08745080387437498, "train_runtime": 2120.7101, "train_samples_per_second": 9.302, "train_steps_per_second": 0.073 } ], "logging_steps": 1, "max_steps": 154, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.361988883508429e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }