{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8387342737323675, "eval_steps": 500, "global_step": 275, "is_hyper_param_search": false, "is_local_process_zero": false, "is_world_process_zero": false, "log_history": [ { "epoch": 0.0, "grad_norm": 26.258708272339597, "learning_rate": 0.0001, "loss": 6.0469, "step": 1 }, { "epoch": 0.01, "grad_norm": 20.031528084144362, "learning_rate": 0.0002, "loss": 6.0524, "step": 2 }, { "epoch": 0.01, "grad_norm": 9.323112879274246, "learning_rate": 0.0003, "loss": 6.0052, "step": 3 }, { "epoch": 0.01, "grad_norm": 4.147754138762137, "learning_rate": 0.0004, "loss": 5.7581, "step": 4 }, { "epoch": 0.02, "grad_norm": 1.4117904450662409, "learning_rate": 0.0005, "loss": 5.5959, "step": 5 }, { "epoch": 0.02, "grad_norm": 1.450165632053372, "learning_rate": 0.0006, "loss": 5.3926, "step": 6 }, { "epoch": 0.02, "grad_norm": 1.3379034522061206, "learning_rate": 0.0007, "loss": 5.2432, "step": 7 }, { "epoch": 0.02, "grad_norm": 1.3149419664848485, "learning_rate": 0.0008, "loss": 4.9048, "step": 8 }, { "epoch": 0.03, "grad_norm": 1.5265629850853388, "learning_rate": 0.0009000000000000001, "loss": 4.8456, "step": 9 }, { "epoch": 0.03, "grad_norm": 1.2248009557702129, "learning_rate": 0.001, "loss": 4.4037, "step": 10 }, { "epoch": 0.03, "grad_norm": 1.5753342093655753, "learning_rate": 0.0009999754462587395, "loss": 3.7762, "step": 11 }, { "epoch": 0.04, "grad_norm": 1.0959671581261972, "learning_rate": 0.0009999017874465026, "loss": 3.5411, "step": 12 }, { "epoch": 0.04, "grad_norm": 0.8383646382276924, "learning_rate": 0.0009997790307976872, "loss": 3.4374, "step": 13 }, { "epoch": 0.04, "grad_norm": 0.6879126623626883, "learning_rate": 0.0009996071883688333, "loss": 3.3495, "step": 14 }, { "epoch": 0.05, "grad_norm": 0.48057277440632934, "learning_rate": 0.000999386277037439, "loss": 3.3236, "step": 15 }, { "epoch": 0.05, "grad_norm": 0.5193770648047042, "learning_rate": 0.0009991163185003027, "loss": 3.194, "step": 16 }, { "epoch": 0.05, "grad_norm": 0.34046016814570407, "learning_rate": 0.0009987973392713932, "loss": 3.008, "step": 17 }, { "epoch": 0.05, "grad_norm": 0.2969743503479429, "learning_rate": 0.0009984293706792437, "loss": 3.0682, "step": 18 }, { "epoch": 0.06, "grad_norm": 0.22754131045377113, "learning_rate": 0.0009980124488638772, "loss": 2.92, "step": 19 }, { "epoch": 0.06, "grad_norm": 0.15880784203685044, "learning_rate": 0.000997546614773255, "loss": 2.881, "step": 20 }, { "epoch": 0.06, "grad_norm": 0.10804346892890891, "learning_rate": 0.0009970319141592559, "loss": 2.8386, "step": 21 }, { "epoch": 0.07, "grad_norm": 0.0789962600177391, "learning_rate": 0.0009964683975731828, "loss": 2.8704, "step": 22 }, { "epoch": 0.07, "grad_norm": 0.0678766718481922, "learning_rate": 0.0009958561203607973, "loss": 2.8326, "step": 23 }, { "epoch": 0.07, "grad_norm": 0.15129601777209895, "learning_rate": 0.000995195142656885, "loss": 2.8444, "step": 24 }, { "epoch": 0.08, "grad_norm": 0.23876681297628152, "learning_rate": 0.0009944855293793475, "loss": 2.8809, "step": 25 }, { "epoch": 0.08, "grad_norm": 0.2458680275413572, "learning_rate": 0.0009937273502228282, "loss": 2.8308, "step": 26 }, { "epoch": 0.08, "grad_norm": 0.2646415320413548, "learning_rate": 0.0009929206796518663, "loss": 2.8296, "step": 27 }, { "epoch": 0.09, "grad_norm": 0.1855177204894958, "learning_rate": 0.0009920655968935837, "loss": 2.8302, "step": 28 }, { "epoch": 0.09, "grad_norm": 0.1692458314027983, "learning_rate": 0.0009911621859299041, "loss": 2.7798, "step": 29 }, { "epoch": 0.09, "grad_norm": 0.0887107703024984, "learning_rate": 0.000990210535489303, "loss": 2.8246, "step": 30 }, { "epoch": 0.09, "grad_norm": 0.05971709937694414, "learning_rate": 0.0009892107390380958, "loss": 2.777, "step": 31 }, { "epoch": 0.1, "grad_norm": 0.04203098841999937, "learning_rate": 0.0009881628947712555, "loss": 2.7673, "step": 32 }, { "epoch": 0.1, "grad_norm": 0.03708904853213128, "learning_rate": 0.0009870671056027706, "loss": 2.7486, "step": 33 }, { "epoch": 0.1, "grad_norm": 0.03528212659332326, "learning_rate": 0.0009859234791555354, "loss": 2.7886, "step": 34 }, { "epoch": 0.11, "grad_norm": 0.03018922862288316, "learning_rate": 0.000984732127750782, "loss": 2.7327, "step": 35 }, { "epoch": 0.11, "grad_norm": 0.03686815342528168, "learning_rate": 0.0009834931683970467, "loss": 2.7474, "step": 36 }, { "epoch": 0.11, "grad_norm": 0.033172131047386, "learning_rate": 0.0009822067227786794, "loss": 2.7345, "step": 37 }, { "epoch": 0.12, "grad_norm": 0.0373016703958135, "learning_rate": 0.000980872917243891, "loss": 2.7655, "step": 38 }, { "epoch": 0.12, "grad_norm": 0.03688389598963858, "learning_rate": 0.0009794918827923458, "loss": 2.7393, "step": 39 }, { "epoch": 0.12, "grad_norm": 0.038870114526809094, "learning_rate": 0.000978063755062294, "loss": 2.7611, "step": 40 }, { "epoch": 0.13, "grad_norm": 0.04038306990198406, "learning_rate": 0.0009765886743172511, "loss": 2.7386, "step": 41 }, { "epoch": 0.13, "grad_norm": 0.036670767168349196, "learning_rate": 0.0009750667854322206, "loss": 2.6912, "step": 42 }, { "epoch": 0.13, "grad_norm": 0.03291379266422108, "learning_rate": 0.0009734982378794661, "loss": 2.7719, "step": 43 }, { "epoch": 0.13, "grad_norm": 0.03542095069122837, "learning_rate": 0.0009718831857138308, "loss": 2.7095, "step": 44 }, { "epoch": 0.14, "grad_norm": 0.03556844983258325, "learning_rate": 0.0009702217875576068, "loss": 2.7435, "step": 45 }, { "epoch": 0.14, "grad_norm": 0.03705512589107, "learning_rate": 0.0009685142065849555, "loss": 2.713, "step": 46 }, { "epoch": 0.14, "grad_norm": 0.027028911527830556, "learning_rate": 0.0009667606105058828, "loss": 2.6472, "step": 47 }, { "epoch": 0.15, "grad_norm": 0.028600789964649966, "learning_rate": 0.0009649611715497661, "loss": 2.6931, "step": 48 }, { "epoch": 0.15, "grad_norm": 0.029116613327395607, "learning_rate": 0.0009631160664484398, "loss": 2.7082, "step": 49 }, { "epoch": 0.15, "grad_norm": 0.02557746875550266, "learning_rate": 0.0009612254764188368, "loss": 2.6707, "step": 50 }, { "epoch": 0.16, "grad_norm": 0.0256723041758138, "learning_rate": 0.0009592895871451908, "loss": 2.6608, "step": 51 }, { "epoch": 0.16, "grad_norm": 0.02295300029667766, "learning_rate": 0.000957308588760799, "loss": 2.6942, "step": 52 }, { "epoch": 0.16, "grad_norm": 0.02203986352323152, "learning_rate": 0.0009552826758293487, "loss": 2.6441, "step": 53 }, { "epoch": 0.16, "grad_norm": 0.02299784546740539, "learning_rate": 0.0009532120473258075, "loss": 2.6728, "step": 54 }, { "epoch": 0.17, "grad_norm": 0.023913877734993542, "learning_rate": 0.0009510969066168813, "loss": 2.6924, "step": 55 }, { "epoch": 0.17, "grad_norm": 0.020131800174103105, "learning_rate": 0.0009489374614410414, "loss": 2.6151, "step": 56 }, { "epoch": 0.17, "grad_norm": 0.02106884604663375, "learning_rate": 0.0009467339238881198, "loss": 2.6413, "step": 57 }, { "epoch": 0.18, "grad_norm": 0.02440845171243592, "learning_rate": 0.0009444865103784803, "loss": 2.6663, "step": 58 }, { "epoch": 0.18, "grad_norm": 0.019024075041881073, "learning_rate": 0.0009421954416417624, "loss": 2.6063, "step": 59 }, { "epoch": 0.18, "grad_norm": 0.019667612528268406, "learning_rate": 0.0009398609426952018, "loss": 2.6697, "step": 60 }, { "epoch": 0.19, "grad_norm": 0.020324139148677475, "learning_rate": 0.0009374832428215309, "loss": 2.6739, "step": 61 }, { "epoch": 0.19, "grad_norm": 0.02049609873784361, "learning_rate": 0.00093506257554646, "loss": 2.6115, "step": 62 }, { "epoch": 0.19, "grad_norm": 0.022068793706639576, "learning_rate": 0.0009325991786157404, "loss": 2.6537, "step": 63 }, { "epoch": 0.2, "grad_norm": 0.02119215165943166, "learning_rate": 0.0009300932939718159, "loss": 2.6668, "step": 64 }, { "epoch": 0.2, "grad_norm": 0.020470851870879974, "learning_rate": 0.000927545167730059, "loss": 2.6495, "step": 65 }, { "epoch": 0.2, "grad_norm": 0.020239015102427586, "learning_rate": 0.0009249550501545996, "loss": 2.6229, "step": 66 }, { "epoch": 0.2, "grad_norm": 0.02006561036060577, "learning_rate": 0.000922323195633745, "loss": 2.6541, "step": 67 }, { "epoch": 0.21, "grad_norm": 0.020517458730459407, "learning_rate": 0.0009196498626549943, "loss": 2.6263, "step": 68 }, { "epoch": 0.21, "grad_norm": 0.020546751672942867, "learning_rate": 0.0009169353137796533, "loss": 2.6868, "step": 69 }, { "epoch": 0.21, "grad_norm": 0.020795862822058586, "learning_rate": 0.0009141798156170446, "loss": 2.6376, "step": 70 }, { "epoch": 0.22, "grad_norm": 0.023603442107788076, "learning_rate": 0.0009113836387983239, "loss": 2.6679, "step": 71 }, { "epoch": 0.22, "grad_norm": 0.020593236730693215, "learning_rate": 0.0009085470579498995, "loss": 2.628, "step": 72 }, { "epoch": 0.22, "grad_norm": 0.0185506077207348, "learning_rate": 0.0009056703516664606, "loss": 2.6315, "step": 73 }, { "epoch": 0.23, "grad_norm": 0.02046637689680032, "learning_rate": 0.0009027538024836141, "loss": 2.6227, "step": 74 }, { "epoch": 0.23, "grad_norm": 0.018026412977487194, "learning_rate": 0.0008997976968501361, "loss": 2.6062, "step": 75 }, { "epoch": 0.23, "grad_norm": 0.017993838807838707, "learning_rate": 0.000896802325099838, "loss": 2.5869, "step": 76 }, { "epoch": 0.23, "grad_norm": 0.01897876413125471, "learning_rate": 0.0008937679814230517, "loss": 2.6046, "step": 77 }, { "epoch": 0.24, "grad_norm": 0.019221228785349705, "learning_rate": 0.000890694963837735, "loss": 2.6163, "step": 78 }, { "epoch": 0.24, "grad_norm": 0.01999062056670978, "learning_rate": 0.0008875835741602029, "loss": 2.6624, "step": 79 }, { "epoch": 0.24, "grad_norm": 0.01688098771768354, "learning_rate": 0.0008844341179754839, "loss": 2.597, "step": 80 }, { "epoch": 0.25, "grad_norm": 0.021679649830905732, "learning_rate": 0.0008812469046073068, "loss": 2.6252, "step": 81 }, { "epoch": 0.25, "grad_norm": 0.01976703829526433, "learning_rate": 0.0008780222470877213, "loss": 2.6547, "step": 82 }, { "epoch": 0.25, "grad_norm": 0.01721216059701167, "learning_rate": 0.000874760462126353, "loss": 2.5894, "step": 83 }, { "epoch": 0.26, "grad_norm": 0.017796757505650348, "learning_rate": 0.0008714618700792976, "loss": 2.6428, "step": 84 }, { "epoch": 0.26, "grad_norm": 0.017516085193709605, "learning_rate": 0.0008681267949176579, "loss": 2.6267, "step": 85 }, { "epoch": 0.26, "grad_norm": 0.019752354059471497, "learning_rate": 0.0008647555641957244, "loss": 2.6172, "step": 86 }, { "epoch": 0.27, "grad_norm": 0.01942107114850748, "learning_rate": 0.0008613485090188043, "loss": 2.6469, "step": 87 }, { "epoch": 0.27, "grad_norm": 0.021049275004733702, "learning_rate": 0.000857905964010703, "loss": 2.6337, "step": 88 }, { "epoch": 0.27, "grad_norm": 0.017375996512513512, "learning_rate": 0.0008544282672808579, "loss": 2.6066, "step": 89 }, { "epoch": 0.27, "grad_norm": 0.020054221841685173, "learning_rate": 0.0008509157603911319, "loss": 2.6293, "step": 90 }, { "epoch": 0.28, "grad_norm": 0.016754816692027007, "learning_rate": 0.0008473687883222664, "loss": 2.553, "step": 91 }, { "epoch": 0.28, "grad_norm": 0.018264517242335677, "learning_rate": 0.0008437876994399991, "loss": 2.582, "step": 92 }, { "epoch": 0.28, "grad_norm": 0.01738231752273472, "learning_rate": 0.0008401728454608494, "loss": 2.6246, "step": 93 }, { "epoch": 0.29, "grad_norm": 0.017582673986843284, "learning_rate": 0.0008365245814175744, "loss": 2.5377, "step": 94 }, { "epoch": 0.29, "grad_norm": 0.01841568278108112, "learning_rate": 0.0008328432656242997, "loss": 2.6107, "step": 95 }, { "epoch": 0.29, "grad_norm": 0.01806489585852763, "learning_rate": 0.0008291292596413272, "loss": 2.5388, "step": 96 }, { "epoch": 0.3, "grad_norm": 0.01910768409990766, "learning_rate": 0.0008253829282396245, "loss": 2.6252, "step": 97 }, { "epoch": 0.3, "grad_norm": 0.019034711943532324, "learning_rate": 0.0008216046393649996, "loss": 2.6193, "step": 98 }, { "epoch": 0.3, "grad_norm": 0.020284737834854932, "learning_rate": 0.0008177947641019621, "loss": 2.6149, "step": 99 }, { "epoch": 0.3, "grad_norm": 0.05158744985589759, "learning_rate": 0.0008139536766372775, "loss": 2.6072, "step": 100 }, { "epoch": 0.31, "grad_norm": 0.018922120101005344, "learning_rate": 0.0008100817542232173, "loss": 2.6032, "step": 101 }, { "epoch": 0.31, "grad_norm": 0.019044369144979888, "learning_rate": 0.000806179377140506, "loss": 2.5879, "step": 102 }, { "epoch": 0.31, "grad_norm": 0.017582902915047, "learning_rate": 0.000802246928660972, "loss": 2.6069, "step": 103 }, { "epoch": 0.32, "grad_norm": 0.019578808618249007, "learning_rate": 0.0007982847950099055, "loss": 2.5965, "step": 104 }, { "epoch": 0.32, "grad_norm": 0.018236739907291074, "learning_rate": 0.0007942933653281245, "loss": 2.5707, "step": 105 }, { "epoch": 0.32, "grad_norm": 0.018258178535900983, "learning_rate": 0.0007902730316337556, "loss": 2.5382, "step": 106 }, { "epoch": 0.33, "grad_norm": 0.022391092471974905, "learning_rate": 0.0007862241887837322, "loss": 2.6118, "step": 107 }, { "epoch": 0.33, "grad_norm": 0.016821289155497087, "learning_rate": 0.0007821472344350131, "loss": 2.5829, "step": 108 }, { "epoch": 0.33, "grad_norm": 0.019386550472160386, "learning_rate": 0.0007780425690055274, "loss": 2.6255, "step": 109 }, { "epoch": 0.34, "grad_norm": 0.1772374509222317, "learning_rate": 0.0007739105956348464, "loss": 2.5617, "step": 110 }, { "epoch": 0.34, "grad_norm": 0.020123188408530047, "learning_rate": 0.0007697517201445905, "loss": 2.6127, "step": 111 }, { "epoch": 0.34, "grad_norm": 0.0217268258079, "learning_rate": 0.0007655663509985707, "loss": 2.5991, "step": 112 }, { "epoch": 0.34, "grad_norm": 0.018981113976254002, "learning_rate": 0.0007613548992626711, "loss": 2.6047, "step": 113 }, { "epoch": 0.35, "grad_norm": 0.01878803361275613, "learning_rate": 0.0007571177785644766, "loss": 2.5482, "step": 114 }, { "epoch": 0.35, "grad_norm": 0.018908774218863146, "learning_rate": 0.0007528554050526488, "loss": 2.6141, "step": 115 }, { "epoch": 0.35, "grad_norm": 0.019547294966543106, "learning_rate": 0.0007485681973560532, "loss": 2.5407, "step": 116 }, { "epoch": 0.36, "grad_norm": 0.018739519276120584, "learning_rate": 0.0007442565765426436, "loss": 2.6212, "step": 117 }, { "epoch": 0.36, "grad_norm": 0.018979109952080055, "learning_rate": 0.0007399209660781074, "loss": 2.5721, "step": 118 }, { "epoch": 0.36, "grad_norm": 0.0175358024051344, "learning_rate": 0.0007355617917842751, "loss": 2.577, "step": 119 }, { "epoch": 0.37, "grad_norm": 0.018692342003597935, "learning_rate": 0.0007311794817972975, "loss": 2.5944, "step": 120 }, { "epoch": 0.37, "grad_norm": 0.019650294357122313, "learning_rate": 0.0007267744665255965, "loss": 2.608, "step": 121 }, { "epoch": 0.37, "grad_norm": 0.01787007786983172, "learning_rate": 0.0007223471786075934, "loss": 2.5898, "step": 122 }, { "epoch": 0.38, "grad_norm": 0.018327854729029175, "learning_rate": 0.0007178980528692161, "loss": 2.5641, "step": 123 }, { "epoch": 0.38, "grad_norm": 0.01819571331500734, "learning_rate": 0.0007134275262811934, "loss": 2.5724, "step": 124 }, { "epoch": 0.38, "grad_norm": 0.016879527517399177, "learning_rate": 0.0007089360379161381, "loss": 2.6016, "step": 125 }, { "epoch": 0.38, "grad_norm": 0.018893461615821387, "learning_rate": 0.0007044240289054227, "loss": 2.5726, "step": 126 }, { "epoch": 0.39, "grad_norm": 0.018652185039036564, "learning_rate": 0.0006998919423958547, "loss": 2.5559, "step": 127 }, { "epoch": 0.39, "grad_norm": 0.020673958617829544, "learning_rate": 0.0006953402235061519, "loss": 2.634, "step": 128 }, { "epoch": 0.39, "grad_norm": 0.01939109964021446, "learning_rate": 0.0006907693192832263, "loss": 2.568, "step": 129 }, { "epoch": 0.4, "grad_norm": 0.018062900613076246, "learning_rate": 0.0006861796786582761, "loss": 2.554, "step": 130 }, { "epoch": 0.4, "grad_norm": 0.018609687828424634, "learning_rate": 0.0006815717524026949, "loss": 2.5842, "step": 131 }, { "epoch": 0.4, "grad_norm": 0.019554131820221306, "learning_rate": 0.0006769459930837989, "loss": 2.632, "step": 132 }, { "epoch": 0.41, "grad_norm": 0.017713236616086883, "learning_rate": 0.0006723028550203778, "loss": 2.5547, "step": 133 }, { "epoch": 0.41, "grad_norm": 0.019353124709199913, "learning_rate": 0.000667642794238074, "loss": 2.6321, "step": 134 }, { "epoch": 0.41, "grad_norm": 0.01984763650797756, "learning_rate": 0.0006629662684245948, "loss": 2.5521, "step": 135 }, { "epoch": 0.41, "grad_norm": 0.0194768410660429, "learning_rate": 0.0006582737368847592, "loss": 2.564, "step": 136 }, { "epoch": 0.42, "grad_norm": 0.02008541514106773, "learning_rate": 0.0006535656604953884, "loss": 2.6406, "step": 137 }, { "epoch": 0.42, "grad_norm": 0.019023120632534838, "learning_rate": 0.0006488425016600402, "loss": 2.5492, "step": 138 }, { "epoch": 0.42, "grad_norm": 0.019894786782134775, "learning_rate": 0.0006441047242635947, "loss": 2.598, "step": 139 }, { "epoch": 0.43, "grad_norm": 0.02011873025218978, "learning_rate": 0.0006393527936266933, "loss": 2.5729, "step": 140 }, { "epoch": 0.43, "grad_norm": 0.019172173916513022, "learning_rate": 0.0006345871764600374, "loss": 2.5594, "step": 141 }, { "epoch": 0.43, "grad_norm": 0.019721366832981004, "learning_rate": 0.0006298083408185502, "loss": 2.5915, "step": 142 }, { "epoch": 0.44, "grad_norm": 0.020228287976845075, "learning_rate": 0.0006250167560554076, "loss": 2.5898, "step": 143 }, { "epoch": 0.44, "grad_norm": 0.020882340174208855, "learning_rate": 0.0006202128927759391, "loss": 2.6273, "step": 144 }, { "epoch": 0.44, "grad_norm": 0.02081451600660678, "learning_rate": 0.0006153972227914089, "loss": 2.595, "step": 145 }, { "epoch": 0.45, "grad_norm": 0.01941546346347613, "learning_rate": 0.0006105702190726764, "loss": 2.5732, "step": 146 }, { "epoch": 0.45, "grad_norm": 0.02166634263027907, "learning_rate": 0.000605732355703743, "loss": 2.5453, "step": 147 }, { "epoch": 0.45, "grad_norm": 0.019573148109730217, "learning_rate": 0.0006008841078351903, "loss": 2.5429, "step": 148 }, { "epoch": 0.45, "grad_norm": 0.01903540723775177, "learning_rate": 0.0005960259516375134, "loss": 2.5388, "step": 149 }, { "epoch": 0.46, "grad_norm": 0.02051218709860799, "learning_rate": 0.0005911583642543531, "loss": 2.5793, "step": 150 }, { "epoch": 0.46, "grad_norm": 0.02101151521755854, "learning_rate": 0.0005862818237556344, "loss": 2.5677, "step": 151 }, { "epoch": 0.46, "grad_norm": 0.02162618990843192, "learning_rate": 0.0005813968090906116, "loss": 2.5635, "step": 152 }, { "epoch": 0.47, "grad_norm": 0.018935934203135756, "learning_rate": 0.0005765038000408295, "loss": 2.5174, "step": 153 }, { "epoch": 0.47, "grad_norm": 0.021177497387846266, "learning_rate": 0.0005716032771730008, "loss": 2.5266, "step": 154 }, { "epoch": 0.47, "grad_norm": 0.019633074877257486, "learning_rate": 0.0005666957217918076, "loss": 2.5909, "step": 155 }, { "epoch": 0.48, "grad_norm": 0.020321617227441156, "learning_rate": 0.0005617816158926302, "loss": 2.5727, "step": 156 }, { "epoch": 0.48, "grad_norm": 0.02171314000863455, "learning_rate": 0.0005568614421142077, "loss": 2.5728, "step": 157 }, { "epoch": 0.48, "grad_norm": 0.02036197464612551, "learning_rate": 0.0005519356836912357, "loss": 2.5546, "step": 158 }, { "epoch": 0.48, "grad_norm": 0.021214591076630044, "learning_rate": 0.0005470048244069055, "loss": 2.6089, "step": 159 }, { "epoch": 0.49, "grad_norm": 0.021380356809754895, "learning_rate": 0.0005420693485453892, "loss": 2.5701, "step": 160 }, { "epoch": 0.49, "grad_norm": 0.020301144411914027, "learning_rate": 0.0005371297408442765, "loss": 2.5783, "step": 161 }, { "epoch": 0.49, "grad_norm": 0.02317064813036681, "learning_rate": 0.0005321864864469646, "loss": 2.5787, "step": 162 }, { "epoch": 0.5, "grad_norm": 0.020981576388978553, "learning_rate": 0.0005272400708550113, "loss": 2.5657, "step": 163 }, { "epoch": 0.5, "grad_norm": 0.01972062092753295, "learning_rate": 0.0005222909798804515, "loss": 2.5941, "step": 164 }, { "epoch": 0.5, "grad_norm": 0.022818553102453083, "learning_rate": 0.0005173396995980818, "loss": 2.5567, "step": 165 }, { "epoch": 0.51, "grad_norm": 0.021333416992185616, "learning_rate": 0.0005123867162977224, "loss": 2.5552, "step": 166 }, { "epoch": 0.51, "grad_norm": 0.020142143404330078, "learning_rate": 0.0005074325164364548, "loss": 2.5305, "step": 167 }, { "epoch": 0.51, "grad_norm": 0.022025222387284903, "learning_rate": 0.0005024775865908451, "loss": 2.59, "step": 168 }, { "epoch": 0.52, "grad_norm": 0.020907939242901075, "learning_rate": 0.000497522413409155, "loss": 2.5452, "step": 169 }, { "epoch": 0.52, "grad_norm": 0.022927802322477854, "learning_rate": 0.0004925674835635454, "loss": 2.5792, "step": 170 }, { "epoch": 0.52, "grad_norm": 0.020657968617921074, "learning_rate": 0.00048761328370227773, "loss": 2.5099, "step": 171 }, { "epoch": 0.52, "grad_norm": 0.020110997895501886, "learning_rate": 0.0004826603004019182, "loss": 2.5163, "step": 172 }, { "epoch": 0.53, "grad_norm": 0.021049174068166514, "learning_rate": 0.0004777090201195486, "loss": 2.5372, "step": 173 }, { "epoch": 0.53, "grad_norm": 0.02117182813897076, "learning_rate": 0.00047275992914498865, "loss": 2.5022, "step": 174 }, { "epoch": 0.53, "grad_norm": 0.020475434729428973, "learning_rate": 0.0004678135135530355, "loss": 2.5626, "step": 175 }, { "epoch": 0.54, "grad_norm": 0.01948270633034859, "learning_rate": 0.0004628702591557237, "loss": 2.5286, "step": 176 }, { "epoch": 0.54, "grad_norm": 0.020212144683261414, "learning_rate": 0.00045793065145461064, "loss": 2.5305, "step": 177 }, { "epoch": 0.54, "grad_norm": 0.020337213413087113, "learning_rate": 0.00045299517559309457, "loss": 2.5624, "step": 178 }, { "epoch": 0.55, "grad_norm": 0.02101629597235714, "learning_rate": 0.00044806431630876436, "loss": 2.5109, "step": 179 }, { "epoch": 0.55, "grad_norm": 0.02099914834706713, "learning_rate": 0.00044313855788579234, "loss": 2.5702, "step": 180 }, { "epoch": 0.55, "grad_norm": 0.020635070288327804, "learning_rate": 0.0004382183841073698, "loss": 2.5483, "step": 181 }, { "epoch": 0.56, "grad_norm": 0.019941103414359743, "learning_rate": 0.00043330427820819256, "loss": 2.5722, "step": 182 }, { "epoch": 0.56, "grad_norm": 0.032674916215766084, "learning_rate": 0.0004283967228269992, "loss": 2.5775, "step": 183 }, { "epoch": 0.56, "grad_norm": 0.021496384704403824, "learning_rate": 0.00042349619995917057, "loss": 2.5856, "step": 184 }, { "epoch": 0.56, "grad_norm": 0.030321207463772364, "learning_rate": 0.0004186031909093884, "loss": 2.5618, "step": 185 }, { "epoch": 0.57, "grad_norm": 0.02175980169134122, "learning_rate": 0.00041371817624436577, "loss": 2.5437, "step": 186 }, { "epoch": 0.57, "grad_norm": 0.020391275857096873, "learning_rate": 0.0004088416357456471, "loss": 2.5338, "step": 187 }, { "epoch": 0.57, "grad_norm": 0.022437463329837284, "learning_rate": 0.00040397404836248684, "loss": 2.5812, "step": 188 }, { "epoch": 0.58, "grad_norm": 0.02129701651167051, "learning_rate": 0.0003991158921648096, "loss": 2.5453, "step": 189 }, { "epoch": 0.58, "grad_norm": 0.0238633278616985, "learning_rate": 0.00039426764429625693, "loss": 2.5233, "step": 190 }, { "epoch": 0.58, "grad_norm": 0.02998116183042863, "learning_rate": 0.0003894297809273237, "loss": 2.5646, "step": 191 }, { "epoch": 0.59, "grad_norm": 0.022238039030238917, "learning_rate": 0.00038460277720859116, "loss": 2.5242, "step": 192 }, { "epoch": 0.59, "grad_norm": 0.0231977730809198, "learning_rate": 0.00037978710722406115, "loss": 2.5681, "step": 193 }, { "epoch": 0.59, "grad_norm": 1.3831873669660701, "learning_rate": 0.0003749832439445925, "loss": 2.6152, "step": 194 }, { "epoch": 0.59, "grad_norm": 0.024837482288635006, "learning_rate": 0.0003701916591814497, "loss": 2.5305, "step": 195 }, { "epoch": 0.6, "grad_norm": 0.02460819770267967, "learning_rate": 0.00036541282353996275, "loss": 2.5512, "step": 196 }, { "epoch": 0.6, "grad_norm": 0.021382499301954485, "learning_rate": 0.0003606472063733067, "loss": 2.5726, "step": 197 }, { "epoch": 0.6, "grad_norm": 0.022843480204416112, "learning_rate": 0.00035589527573640534, "loss": 2.5911, "step": 198 }, { "epoch": 0.61, "grad_norm": 0.0217433912155617, "learning_rate": 0.0003511574983399599, "loss": 2.5437, "step": 199 }, { "epoch": 0.61, "grad_norm": 0.022341313083280037, "learning_rate": 0.0003464343395046117, "loss": 2.5565, "step": 200 }, { "epoch": 0.61, "grad_norm": 0.023326386453843787, "learning_rate": 0.0003417262631152409, "loss": 2.5862, "step": 201 }, { "epoch": 0.62, "grad_norm": 0.022197350352034496, "learning_rate": 0.0003370337315754052, "loss": 2.514, "step": 202 }, { "epoch": 0.62, "grad_norm": 0.022582353309783542, "learning_rate": 0.000332357205761926, "loss": 2.5331, "step": 203 }, { "epoch": 0.62, "grad_norm": 0.022070023826541343, "learning_rate": 0.00032769714497962233, "loss": 2.5413, "step": 204 }, { "epoch": 0.63, "grad_norm": 0.02402408808891166, "learning_rate": 0.00032305400691620125, "loss": 2.5431, "step": 205 }, { "epoch": 0.63, "grad_norm": 0.022714950946985384, "learning_rate": 0.0003184282475973052, "loss": 2.5615, "step": 206 }, { "epoch": 0.63, "grad_norm": 0.02424470988882824, "learning_rate": 0.00031382032134172393, "loss": 2.5785, "step": 207 }, { "epoch": 0.63, "grad_norm": 0.02175217108726291, "learning_rate": 0.00030923068071677377, "loss": 2.5665, "step": 208 }, { "epoch": 0.64, "grad_norm": 0.022606633086133584, "learning_rate": 0.00030465977649384815, "loss": 2.5634, "step": 209 }, { "epoch": 0.64, "grad_norm": 0.021011727388658842, "learning_rate": 0.00030010805760414546, "loss": 2.5054, "step": 210 }, { "epoch": 0.64, "grad_norm": 0.021639315653958227, "learning_rate": 0.00029557597109457727, "loss": 2.4963, "step": 211 }, { "epoch": 0.65, "grad_norm": 0.022615720963103284, "learning_rate": 0.0002910639620838619, "loss": 2.5688, "step": 212 }, { "epoch": 0.65, "grad_norm": 0.02342914831258654, "learning_rate": 0.00028657247371880666, "loss": 2.5139, "step": 213 }, { "epoch": 0.65, "grad_norm": 0.021794224872284197, "learning_rate": 0.00028210194713078405, "loss": 2.566, "step": 214 }, { "epoch": 0.66, "grad_norm": 0.02372720731132013, "learning_rate": 0.00027765282139240676, "loss": 2.5438, "step": 215 }, { "epoch": 0.66, "grad_norm": 0.02308128114302715, "learning_rate": 0.00027322553347440364, "loss": 2.4794, "step": 216 }, { "epoch": 0.66, "grad_norm": 0.021725513095176348, "learning_rate": 0.0002688205182027026, "loss": 2.5699, "step": 217 }, { "epoch": 0.66, "grad_norm": 0.020065121737088774, "learning_rate": 0.00026443820821572497, "loss": 2.4926, "step": 218 }, { "epoch": 0.67, "grad_norm": 0.022790401677010647, "learning_rate": 0.00026007903392189256, "loss": 2.4887, "step": 219 }, { "epoch": 0.67, "grad_norm": 0.022412544948915192, "learning_rate": 0.00025574342345735654, "loss": 2.4844, "step": 220 }, { "epoch": 0.67, "grad_norm": 0.024482726567851745, "learning_rate": 0.0002514318026439469, "loss": 2.5568, "step": 221 }, { "epoch": 0.68, "grad_norm": 0.0239324452654226, "learning_rate": 0.0002471445949473512, "loss": 2.5034, "step": 222 }, { "epoch": 0.68, "grad_norm": 0.02099692498226099, "learning_rate": 0.00024288222143552346, "loss": 2.5125, "step": 223 }, { "epoch": 0.68, "grad_norm": 0.0235532129320808, "learning_rate": 0.00023864510073732915, "loss": 2.5117, "step": 224 }, { "epoch": 0.69, "grad_norm": 0.02514350938291889, "learning_rate": 0.00023443364900142949, "loss": 2.5651, "step": 225 }, { "epoch": 0.69, "grad_norm": 0.02429410977297701, "learning_rate": 0.00023024827985540958, "loss": 2.5001, "step": 226 }, { "epoch": 0.69, "grad_norm": 0.02368660244630603, "learning_rate": 0.00022608940436515368, "loss": 2.5307, "step": 227 }, { "epoch": 0.7, "grad_norm": 0.024787276626786165, "learning_rate": 0.00022195743099447256, "loss": 2.5478, "step": 228 }, { "epoch": 0.7, "grad_norm": 0.02513065999138821, "learning_rate": 0.0002178527655649868, "loss": 2.5136, "step": 229 }, { "epoch": 0.7, "grad_norm": 0.023180635989223257, "learning_rate": 0.00021377581121626778, "loss": 2.51, "step": 230 }, { "epoch": 0.7, "grad_norm": 0.02696456869362307, "learning_rate": 0.00020972696836624437, "loss": 2.5263, "step": 231 }, { "epoch": 0.71, "grad_norm": 0.022578451161665822, "learning_rate": 0.00020570663467187555, "loss": 2.4869, "step": 232 }, { "epoch": 0.71, "grad_norm": 0.0224881916816771, "learning_rate": 0.00020171520499009455, "loss": 2.4786, "step": 233 }, { "epoch": 0.71, "grad_norm": 0.025839833730156827, "learning_rate": 0.00019775307133902808, "loss": 2.5469, "step": 234 }, { "epoch": 0.72, "grad_norm": 0.02365555522712872, "learning_rate": 0.00019382062285949414, "loss": 2.5435, "step": 235 }, { "epoch": 0.72, "grad_norm": 0.023382640821847607, "learning_rate": 0.00018991824577678268, "loss": 2.4635, "step": 236 }, { "epoch": 0.72, "grad_norm": 0.02191815592839163, "learning_rate": 0.0001860463233627225, "loss": 2.5639, "step": 237 }, { "epoch": 0.73, "grad_norm": 0.02310077430861472, "learning_rate": 0.00018220523589803806, "loss": 2.4839, "step": 238 }, { "epoch": 0.73, "grad_norm": 0.02494213141918476, "learning_rate": 0.00017839536063500052, "loss": 2.5139, "step": 239 }, { "epoch": 0.73, "grad_norm": 0.025239258961502305, "learning_rate": 0.00017461707176037546, "loss": 2.5078, "step": 240 }, { "epoch": 0.74, "grad_norm": 0.02267677070954696, "learning_rate": 0.00017087074035867283, "loss": 2.5254, "step": 241 }, { "epoch": 0.74, "grad_norm": 0.024599189724574425, "learning_rate": 0.00016715673437570034, "loss": 2.5807, "step": 242 }, { "epoch": 0.74, "grad_norm": 0.023216034005314606, "learning_rate": 0.0001634754185824256, "loss": 2.5371, "step": 243 }, { "epoch": 0.74, "grad_norm": 0.02414260114722159, "learning_rate": 0.00015982715453915082, "loss": 2.5373, "step": 244 }, { "epoch": 0.75, "grad_norm": 0.02443242612238601, "learning_rate": 0.0001562123005600009, "loss": 2.5237, "step": 245 }, { "epoch": 0.75, "grad_norm": 0.02324926523813888, "learning_rate": 0.00015263121167773358, "loss": 2.5025, "step": 246 }, { "epoch": 0.75, "grad_norm": 0.02441320875098146, "learning_rate": 0.00014908423960886808, "loss": 2.581, "step": 247 }, { "epoch": 0.76, "grad_norm": 0.025546181244876925, "learning_rate": 0.00014557173271914213, "loss": 2.5928, "step": 248 }, { "epoch": 0.76, "grad_norm": 0.023743543298364814, "learning_rate": 0.0001420940359892971, "loss": 2.5503, "step": 249 }, { "epoch": 0.76, "grad_norm": 0.02346770448694418, "learning_rate": 0.00013865149098119577, "loss": 2.5245, "step": 250 }, { "epoch": 0.77, "grad_norm": 0.023286036875758236, "learning_rate": 0.00013524443580427564, "loss": 2.5717, "step": 251 }, { "epoch": 0.77, "grad_norm": 0.024579865623865476, "learning_rate": 0.00013187320508234208, "loss": 2.5259, "step": 252 }, { "epoch": 0.77, "grad_norm": 0.022907935808609867, "learning_rate": 0.00012853812992070257, "loss": 2.4928, "step": 253 }, { "epoch": 0.77, "grad_norm": 0.022534504070165762, "learning_rate": 0.00012523953787364723, "loss": 2.526, "step": 254 }, { "epoch": 0.78, "grad_norm": 0.02356875451883086, "learning_rate": 0.00012197775291227886, "loss": 2.4895, "step": 255 }, { "epoch": 0.78, "grad_norm": 0.025208951025856796, "learning_rate": 0.0001187530953926933, "loss": 2.5507, "step": 256 }, { "epoch": 0.78, "grad_norm": 0.02218054070579941, "learning_rate": 0.00011556588202451612, "loss": 2.4625, "step": 257 }, { "epoch": 0.79, "grad_norm": 0.02395294608850655, "learning_rate": 0.000112416425839797, "loss": 2.5639, "step": 258 }, { "epoch": 0.79, "grad_norm": 0.021823638739782776, "learning_rate": 0.00010930503616226495, "loss": 2.5266, "step": 259 }, { "epoch": 0.79, "grad_norm": 0.026597868541136594, "learning_rate": 0.00010623201857694837, "loss": 2.4885, "step": 260 }, { "epoch": 0.8, "grad_norm": 0.023249159762519452, "learning_rate": 0.00010319767490016197, "loss": 2.5496, "step": 261 }, { "epoch": 0.8, "grad_norm": 0.023962061282550982, "learning_rate": 0.00010020230314986395, "loss": 2.5455, "step": 262 }, { "epoch": 0.8, "grad_norm": 0.026815554632171338, "learning_rate": 9.724619751638597e-05, "loss": 2.5542, "step": 263 }, { "epoch": 0.81, "grad_norm": 0.02519451505343548, "learning_rate": 9.432964833353947e-05, "loss": 2.5337, "step": 264 }, { "epoch": 0.81, "grad_norm": 0.02636032377682285, "learning_rate": 9.145294205010057e-05, "loss": 2.5442, "step": 265 }, { "epoch": 0.81, "grad_norm": 0.02381095096499113, "learning_rate": 8.861636120167632e-05, "loss": 2.5356, "step": 266 }, { "epoch": 0.81, "grad_norm": 0.025026865092621895, "learning_rate": 8.582018438295552e-05, "loss": 2.518, "step": 267 }, { "epoch": 0.82, "grad_norm": 1.000001217827216, "learning_rate": 8.306468622034664e-05, "loss": 2.5153, "step": 268 }, { "epoch": 0.82, "grad_norm": 1.4142135623730951, "learning_rate": 8.035013734500557e-05, "loss": 0.0, "step": 269 }, { "epoch": 0.82, "grad_norm": 1.4142135623730951, "learning_rate": 7.767680436625513e-05, "loss": 0.0, "step": 270 }, { "epoch": 0.83, "grad_norm": 1.4142135623730951, "learning_rate": 7.504494984540034e-05, "loss": 0.0, "step": 271 }, { "epoch": 0.83, "grad_norm": 1.4142135623730951, "learning_rate": 7.245483226994093e-05, "loss": 0.0, "step": 272 }, { "epoch": 0.83, "grad_norm": 1.4142135623730951, "learning_rate": 6.990670602818411e-05, "loss": 0.0, "step": 273 }, { "epoch": 0.84, "grad_norm": 1.4142135623730951, "learning_rate": 6.740082138425962e-05, "loss": 0.0, "step": 274 }, { "epoch": 0.84, "grad_norm": 1.4142135623730951, "learning_rate": 6.493742445354011e-05, "loss": 0.0, "step": 275 } ], "logging_steps": 1.0, "max_steps": 327, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "total_flos": 4.881335385730318e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }