{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7624857033930614,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": false,
  "is_world_process_zero": false,
  "log_history": [
    {"epoch": 0.0, "grad_norm": 26.258708272339597, "learning_rate": 0.0001, "loss": 6.0469, "step": 1},
    {"epoch": 0.01, "grad_norm": 20.031528084144362, "learning_rate": 0.0002, "loss": 6.0524, "step": 2},
    {"epoch": 0.01, "grad_norm": 9.323112879274246, "learning_rate": 0.0003, "loss": 6.0052, "step": 3},
    {"epoch": 0.01, "grad_norm": 4.147754138762137, "learning_rate": 0.0004, "loss": 5.7581, "step": 4},
    {"epoch": 0.02, "grad_norm": 1.4117904450662409, "learning_rate": 0.0005, "loss": 5.5959, "step": 5},
    {"epoch": 0.02, "grad_norm": 1.450165632053372, "learning_rate": 0.0006, "loss": 5.3926, "step": 6},
    {"epoch": 0.02, "grad_norm": 1.3379034522061206, "learning_rate": 0.0007, "loss": 5.2432, "step": 7},
    {"epoch": 0.02, "grad_norm": 1.3149419664848485, "learning_rate": 0.0008, "loss": 4.9048, "step": 8},
    {"epoch": 0.03, "grad_norm": 1.5265629850853388, "learning_rate": 0.0009000000000000001, "loss": 4.8456, "step": 9},
    {"epoch": 0.03, "grad_norm": 1.2248009557702129, "learning_rate": 0.001, "loss": 4.4037, "step": 10},
    {"epoch": 0.03, "grad_norm": 1.5753342093655753, "learning_rate": 0.0009999754462587395, "loss": 3.7762, "step": 11},
    {"epoch": 0.04, "grad_norm": 1.0959671581261972, "learning_rate": 0.0009999017874465026, "loss": 3.5411, "step": 12},
    {"epoch": 0.04, "grad_norm": 0.8383646382276924, "learning_rate": 0.0009997790307976872, "loss": 3.4374, "step": 13},
    {"epoch": 0.04, "grad_norm": 0.6879126623626883, "learning_rate": 0.0009996071883688333, "loss": 3.3495, "step": 14},
    {"epoch": 0.05, "grad_norm": 0.48057277440632934, "learning_rate": 0.000999386277037439, "loss": 3.3236, "step": 15},
    {"epoch": 0.05, "grad_norm": 0.5193770648047042, "learning_rate": 0.0009991163185003027, "loss": 3.194, "step": 16},
    {"epoch": 0.05, "grad_norm": 0.34046016814570407, "learning_rate": 0.0009987973392713932, "loss": 3.008, "step": 17},
    {"epoch": 0.05, "grad_norm": 0.2969743503479429, "learning_rate": 0.0009984293706792437, "loss": 3.0682, "step": 18},
    {"epoch": 0.06, "grad_norm": 0.22754131045377113, "learning_rate": 0.0009980124488638772, "loss": 2.92, "step": 19},
    {"epoch": 0.06, "grad_norm": 0.15880784203685044, "learning_rate": 0.000997546614773255, "loss": 2.881, "step": 20},
    {"epoch": 0.06, "grad_norm": 0.10804346892890891, "learning_rate": 0.0009970319141592559, "loss": 2.8386, "step": 21},
    {"epoch": 0.07, "grad_norm": 0.0789962600177391, "learning_rate": 0.0009964683975731828, "loss": 2.8704, "step": 22},
    {"epoch": 0.07, "grad_norm": 0.0678766718481922, "learning_rate": 0.0009958561203607973, "loss": 2.8326, "step": 23},
    {"epoch": 0.07, "grad_norm": 0.15129601777209895, "learning_rate": 0.000995195142656885, "loss": 2.8444, "step": 24},
    {"epoch": 0.08, "grad_norm": 0.23876681297628152, "learning_rate": 0.0009944855293793475, "loss": 2.8809, "step": 25},
    {"epoch": 0.08, "grad_norm": 0.2458680275413572, "learning_rate": 0.0009937273502228282, "loss": 2.8308, "step": 26},
    {"epoch": 0.08, "grad_norm": 0.2646415320413548, "learning_rate": 0.0009929206796518663, "loss": 2.8296, "step": 27},
    {"epoch": 0.09, "grad_norm": 0.1855177204894958, "learning_rate": 0.0009920655968935837, "loss": 2.8302, "step": 28},
    {"epoch": 0.09, "grad_norm": 0.1692458314027983, "learning_rate": 0.0009911621859299041, "loss": 2.7798, "step": 29},
    {"epoch": 0.09, "grad_norm": 0.0887107703024984, "learning_rate": 0.000990210535489303, "loss": 2.8246, "step": 30},
    {"epoch": 0.09, "grad_norm": 0.05971709937694414, "learning_rate": 0.0009892107390380958, "loss": 2.777, "step": 31},
    {"epoch": 0.1, "grad_norm": 0.04203098841999937, "learning_rate": 0.0009881628947712555, "loss": 2.7673, "step": 32},
    {"epoch": 0.1, "grad_norm": 0.03708904853213128, "learning_rate": 0.0009870671056027706, "loss": 2.7486, "step": 33},
    {"epoch": 0.1, "grad_norm": 0.03528212659332326, "learning_rate": 0.0009859234791555354, "loss": 2.7886, "step": 34},
    {"epoch": 0.11, "grad_norm": 0.03018922862288316, "learning_rate": 0.000984732127750782, "loss": 2.7327, "step": 35},
    {"epoch": 0.11, "grad_norm": 0.03686815342528168, "learning_rate": 0.0009834931683970467, "loss": 2.7474, "step": 36},
    {"epoch": 0.11, "grad_norm": 0.033172131047386, "learning_rate": 0.0009822067227786794, "loss": 2.7345, "step": 37},
    {"epoch": 0.12, "grad_norm": 0.0373016703958135, "learning_rate": 0.000980872917243891, "loss": 2.7655, "step": 38},
    {"epoch": 0.12, "grad_norm": 0.03688389598963858, "learning_rate": 0.0009794918827923458, "loss": 2.7393, "step": 39},
    {"epoch": 0.12, "grad_norm": 0.038870114526809094, "learning_rate": 0.000978063755062294, "loss": 2.7611, "step": 40},
    {"epoch": 0.13, "grad_norm": 0.04038306990198406, "learning_rate": 0.0009765886743172511, "loss": 2.7386, "step": 41},
    {"epoch": 0.13, "grad_norm": 0.036670767168349196, "learning_rate": 0.0009750667854322206, "loss": 2.6912, "step": 42},
    {"epoch": 0.13, "grad_norm": 0.03291379266422108, "learning_rate": 0.0009734982378794661, "loss": 2.7719, "step": 43},
    {"epoch": 0.13, "grad_norm": 0.03542095069122837, "learning_rate": 0.0009718831857138308, "loss": 2.7095, "step": 44},
    {"epoch": 0.14, "grad_norm": 0.03556844983258325, "learning_rate": 0.0009702217875576068, "loss": 2.7435, "step": 45},
    {"epoch": 0.14, "grad_norm": 0.03705512589107, "learning_rate": 0.0009685142065849555, "loss": 2.713, "step": 46},
    {"epoch": 0.14, "grad_norm": 0.027028911527830556, "learning_rate": 0.0009667606105058828, "loss": 2.6472, "step": 47},
    {"epoch": 0.15, "grad_norm": 0.028600789964649966, "learning_rate": 0.0009649611715497661, "loss": 2.6931, "step": 48},
    {"epoch": 0.15, "grad_norm": 0.029116613327395607, "learning_rate": 0.0009631160664484398, "loss": 2.7082, "step": 49},
    {"epoch": 0.15, "grad_norm": 0.02557746875550266, "learning_rate": 0.0009612254764188368, "loss": 2.6707, "step": 50},
    {"epoch": 0.16, "grad_norm": 0.0256723041758138, "learning_rate": 0.0009592895871451908, "loss": 2.6608, "step": 51},
    {"epoch": 0.16, "grad_norm": 0.02295300029667766, "learning_rate": 0.000957308588760799, "loss": 2.6942, "step": 52},
    {"epoch": 0.16, "grad_norm": 0.02203986352323152, "learning_rate": 0.0009552826758293487, "loss": 2.6441, "step": 53},
    {"epoch": 0.16, "grad_norm": 0.02299784546740539, "learning_rate": 0.0009532120473258075, "loss": 2.6728, "step": 54},
    {"epoch": 0.17, "grad_norm": 0.023913877734993542, "learning_rate": 0.0009510969066168813, "loss": 2.6924, "step": 55},
    {"epoch": 0.17, "grad_norm": 0.020131800174103105, "learning_rate": 0.0009489374614410414, "loss": 2.6151, "step": 56},
    {"epoch": 0.17, "grad_norm": 0.02106884604663375, "learning_rate": 0.0009467339238881198, "loss": 2.6413, "step": 57},
    {"epoch": 0.18, "grad_norm": 0.02440845171243592, "learning_rate": 0.0009444865103784803, "loss": 2.6663, "step": 58},
    {"epoch": 0.18, "grad_norm": 0.019024075041881073, "learning_rate": 0.0009421954416417624, "loss": 2.6063, "step": 59},
    {"epoch": 0.18, "grad_norm": 0.019667612528268406, "learning_rate": 0.0009398609426952018, "loss": 2.6697, "step": 60},
    {"epoch": 0.19, "grad_norm": 0.020324139148677475, "learning_rate": 0.0009374832428215309, "loss": 2.6739, "step": 61},
    {"epoch": 0.19, "grad_norm": 0.02049609873784361, "learning_rate": 0.00093506257554646, "loss": 2.6115, "step": 62},
    {"epoch": 0.19, "grad_norm": 0.022068793706639576, "learning_rate": 0.0009325991786157404, "loss": 2.6537, "step": 63},
    {"epoch": 0.2, "grad_norm": 0.02119215165943166, "learning_rate": 0.0009300932939718159, "loss": 2.6668, "step": 64},
    {"epoch": 0.2, "grad_norm": 0.020470851870879974, "learning_rate": 0.000927545167730059, "loss": 2.6495, "step": 65},
    {"epoch": 0.2, "grad_norm": 0.020239015102427586, "learning_rate": 0.0009249550501545996, "loss": 2.6229, "step": 66},
    {"epoch": 0.2, "grad_norm": 0.02006561036060577, "learning_rate": 0.000922323195633745, "loss": 2.6541, "step": 67},
    {"epoch": 0.21, "grad_norm": 0.020517458730459407, "learning_rate": 0.0009196498626549943, "loss": 2.6263, "step": 68},
    {"epoch": 0.21, "grad_norm": 0.020546751672942867, "learning_rate": 0.0009169353137796533, "loss": 2.6868, "step": 69},
    {"epoch": 0.21, "grad_norm": 0.020795862822058586, "learning_rate": 0.0009141798156170446, "loss": 2.6376, "step": 70},
    {"epoch": 0.22, "grad_norm": 0.023603442107788076, "learning_rate": 0.0009113836387983239, "loss": 2.6679, "step": 71},
    {"epoch": 0.22, "grad_norm": 0.020593236730693215, "learning_rate": 0.0009085470579498995, "loss": 2.628, "step": 72},
    {"epoch": 0.22, "grad_norm": 0.0185506077207348, "learning_rate": 0.0009056703516664606, "loss": 2.6315, "step": 73},
    {"epoch": 0.23, "grad_norm": 0.02046637689680032, "learning_rate": 0.0009027538024836141, "loss": 2.6227, "step": 74},
    {"epoch": 0.23, "grad_norm": 0.018026412977487194, "learning_rate": 0.0008997976968501361, "loss": 2.6062, "step": 75},
    {"epoch": 0.23, "grad_norm": 0.017993838807838707, "learning_rate": 0.000896802325099838, "loss": 2.5869, "step": 76},
    {"epoch": 0.23, "grad_norm": 0.01897876413125471, "learning_rate": 0.0008937679814230517, "loss": 2.6046, "step": 77},
    {"epoch": 0.24, "grad_norm": 0.019221228785349705, "learning_rate": 0.000890694963837735, "loss": 2.6163, "step": 78},
    {"epoch": 0.24, "grad_norm": 0.01999062056670978, "learning_rate": 0.0008875835741602029, "loss": 2.6624, "step": 79},
    {"epoch": 0.24, "grad_norm": 0.01688098771768354, "learning_rate": 0.0008844341179754839, "loss": 2.597, "step": 80},
    {"epoch": 0.25, "grad_norm": 0.021679649830905732, "learning_rate": 0.0008812469046073068, "loss": 2.6252, "step": 81},
    {"epoch": 0.25, "grad_norm": 0.01976703829526433, "learning_rate": 0.0008780222470877213, "loss": 2.6547, "step": 82},
    {"epoch": 0.25, "grad_norm": 0.01721216059701167, "learning_rate": 0.000874760462126353, "loss": 2.5894, "step": 83},
    {"epoch": 0.26, "grad_norm": 0.017796757505650348, "learning_rate": 0.0008714618700792976, "loss": 2.6428, "step": 84},
    {"epoch": 0.26, "grad_norm": 0.017516085193709605, "learning_rate": 0.0008681267949176579, "loss": 2.6267, "step": 85},
    {"epoch": 0.26, "grad_norm": 0.019752354059471497, "learning_rate": 0.0008647555641957244, "loss": 2.6172, "step": 86},
    {"epoch": 0.27, "grad_norm": 0.01942107114850748, "learning_rate": 0.0008613485090188043, "loss": 2.6469, "step": 87},
    {"epoch": 0.27, "grad_norm": 0.021049275004733702, "learning_rate": 0.000857905964010703, "loss": 2.6337, "step": 88},
    {"epoch": 0.27, "grad_norm": 0.017375996512513512, "learning_rate": 0.0008544282672808579, "loss": 2.6066, "step": 89},
    {"epoch": 0.27, "grad_norm": 0.020054221841685173, "learning_rate": 0.0008509157603911319, "loss": 2.6293, "step": 90},
    {"epoch": 0.28, "grad_norm": 0.016754816692027007, "learning_rate": 0.0008473687883222664, "loss": 2.553, "step": 91},
    {"epoch": 0.28, "grad_norm": 0.018264517242335677, "learning_rate": 0.0008437876994399991, "loss": 2.582, "step": 92},
    {"epoch": 0.28, "grad_norm": 0.01738231752273472, "learning_rate": 0.0008401728454608494, "loss": 2.6246, "step": 93},
    {"epoch": 0.29, "grad_norm": 0.017582673986843284, "learning_rate": 0.0008365245814175744, "loss": 2.5377, "step": 94},
    {"epoch": 0.29, "grad_norm": 0.01841568278108112, "learning_rate": 0.0008328432656242997, "loss": 2.6107, "step": 95},
    {"epoch": 0.29, "grad_norm": 0.01806489585852763, "learning_rate": 0.0008291292596413272, "loss": 2.5388, "step": 96},
    {"epoch": 0.3, "grad_norm": 0.01910768409990766, "learning_rate": 0.0008253829282396245, "loss": 2.6252, "step": 97},
    {"epoch": 0.3, "grad_norm": 0.019034711943532324, "learning_rate": 0.0008216046393649996, "loss": 2.6193, "step": 98},
    {"epoch": 0.3, "grad_norm": 0.020284737834854932, "learning_rate": 0.0008177947641019621, "loss": 2.6149, "step": 99},
    {"epoch": 0.3, "grad_norm": 0.05158744985589759, "learning_rate": 0.0008139536766372775, "loss": 2.6072, "step": 100},
    {"epoch": 0.31, "grad_norm": 0.018922120101005344, "learning_rate": 0.0008100817542232173, "loss": 2.6032, "step": 101},
    {"epoch": 0.31, "grad_norm": 0.019044369144979888, "learning_rate": 0.000806179377140506, "loss": 2.5879, "step": 102},
    {"epoch": 0.31, "grad_norm": 0.017582902915047, "learning_rate": 0.000802246928660972, "loss": 2.6069, "step": 103},
    {"epoch": 0.32, "grad_norm": 0.019578808618249007, "learning_rate": 0.0007982847950099055, "loss": 2.5965, "step": 104},
    {"epoch": 0.32, "grad_norm": 0.018236739907291074, "learning_rate": 0.0007942933653281245, "loss": 2.5707, "step": 105},
    {"epoch": 0.32, "grad_norm": 0.018258178535900983, "learning_rate": 0.0007902730316337556, "loss": 2.5382, "step": 106},
    {"epoch": 0.33, "grad_norm": 0.022391092471974905, "learning_rate": 0.0007862241887837322, "loss": 2.6118, "step": 107},
    {"epoch": 0.33, "grad_norm": 0.016821289155497087, "learning_rate": 0.0007821472344350131, "loss": 2.5829, "step": 108},
    {"epoch": 0.33, "grad_norm": 0.019386550472160386, "learning_rate": 0.0007780425690055274, "loss": 2.6255, "step": 109},
    {"epoch": 0.34, "grad_norm": 0.1772374509222317, "learning_rate": 0.0007739105956348464, "loss": 2.5617, "step": 110},
    {"epoch": 0.34, "grad_norm": 0.020123188408530047, "learning_rate": 0.0007697517201445905, "loss": 2.6127, "step": 111},
    {"epoch": 0.34, "grad_norm": 0.0217268258079, "learning_rate": 0.0007655663509985707, "loss": 2.5991, "step": 112},
    {"epoch": 0.34, "grad_norm": 0.018981113976254002, "learning_rate": 0.0007613548992626711, "loss": 2.6047, "step": 113},
    {"epoch": 0.35, "grad_norm": 0.01878803361275613, "learning_rate": 0.0007571177785644766, "loss": 2.5482, "step": 114},
    {"epoch": 0.35, "grad_norm": 0.018908774218863146, "learning_rate": 0.0007528554050526488, "loss": 2.6141, "step": 115},
    {"epoch": 0.35, "grad_norm": 0.019547294966543106, "learning_rate": 0.0007485681973560532, "loss": 2.5407, "step": 116},
    {"epoch": 0.36, "grad_norm": 0.018739519276120584, "learning_rate": 0.0007442565765426436, "loss": 2.6212, "step": 117},
    {"epoch": 0.36, "grad_norm": 0.018979109952080055, "learning_rate": 0.0007399209660781074, "loss": 2.5721, "step": 118},
    {"epoch": 0.36, "grad_norm": 0.0175358024051344, "learning_rate": 0.0007355617917842751, "loss": 2.577, "step": 119},
    {"epoch": 0.37, "grad_norm": 0.018692342003597935, "learning_rate": 0.0007311794817972975, "loss": 2.5944, "step": 120},
    {"epoch": 0.37, "grad_norm": 0.019650294357122313, "learning_rate": 0.0007267744665255965, "loss": 2.608, "step": 121},
    {"epoch": 0.37, "grad_norm": 0.01787007786983172, "learning_rate": 0.0007223471786075934, "loss": 2.5898, "step": 122},
    {"epoch": 0.38, "grad_norm": 0.018327854729029175, "learning_rate": 0.0007178980528692161, "loss": 2.5641, "step": 123},
    {"epoch": 0.38, "grad_norm": 0.01819571331500734, "learning_rate": 0.0007134275262811934, "loss": 2.5724, "step": 124},
    {"epoch": 0.38, "grad_norm": 0.016879527517399177, "learning_rate": 0.0007089360379161381, "loss": 2.6016, "step": 125},
    {"epoch": 0.38, "grad_norm": 0.018893461615821387, "learning_rate": 0.0007044240289054227, "loss": 2.5726, "step": 126},
    {"epoch": 0.39, "grad_norm": 0.018652185039036564, "learning_rate": 0.0006998919423958547, "loss": 2.5559, "step": 127},
    {"epoch": 0.39, "grad_norm": 0.020673958617829544, "learning_rate": 0.0006953402235061519, "loss": 2.634, "step": 128},
    {"epoch": 0.39, "grad_norm": 0.01939109964021446, "learning_rate": 0.0006907693192832263, "loss": 2.568, "step": 129},
    {"epoch": 0.4, "grad_norm": 0.018062900613076246, "learning_rate": 0.0006861796786582761, "loss": 2.554, "step": 130},
    {"epoch": 0.4, "grad_norm": 0.018609687828424634, "learning_rate": 0.0006815717524026949, "loss": 2.5842, "step": 131},
    {"epoch": 0.4, "grad_norm": 0.019554131820221306, "learning_rate": 0.0006769459930837989, "loss": 2.632, "step": 132},
    {"epoch": 0.41, "grad_norm": 0.017713236616086883, "learning_rate": 0.0006723028550203778, "loss": 2.5547, "step": 133},
    {"epoch": 0.41, "grad_norm": 0.019353124709199913, "learning_rate": 0.000667642794238074, "loss": 2.6321, "step": 134},
    {"epoch": 0.41, "grad_norm": 0.01984763650797756, "learning_rate": 0.0006629662684245948, "loss": 2.5521, "step": 135},
    {"epoch": 0.41, "grad_norm": 0.0194768410660429, "learning_rate": 0.0006582737368847592, "loss": 2.564, "step": 136},
    {"epoch": 0.42, "grad_norm": 0.02008541514106773, "learning_rate": 0.0006535656604953884, "loss": 2.6406, "step": 137},
    {"epoch": 0.42, "grad_norm": 0.019023120632534838, "learning_rate": 0.0006488425016600402, "loss": 2.5492, "step": 138},
    {"epoch": 0.42, "grad_norm": 0.019894786782134775, "learning_rate": 0.0006441047242635947, "loss": 2.598, "step": 139},
    {"epoch": 0.43, "grad_norm": 0.02011873025218978, "learning_rate": 0.0006393527936266933, "loss": 2.5729, "step": 140},
    {"epoch": 0.43, "grad_norm": 0.019172173916513022, "learning_rate": 0.0006345871764600374, "loss": 2.5594, "step": 141},
    {"epoch": 0.43, "grad_norm": 0.019721366832981004, "learning_rate": 0.0006298083408185502, "loss": 2.5915, "step": 142},
    {"epoch": 0.44, "grad_norm": 0.020228287976845075, "learning_rate": 0.0006250167560554076, "loss": 2.5898, "step": 143},
    {"epoch": 0.44, "grad_norm": 0.020882340174208855, "learning_rate": 0.0006202128927759391, "loss": 2.6273, "step": 144},
    {"epoch": 0.44, "grad_norm": 0.02081451600660678, "learning_rate": 0.0006153972227914089, "loss": 2.595, "step": 145},
    {"epoch": 0.45, "grad_norm": 0.01941546346347613, "learning_rate": 0.0006105702190726764, "loss": 2.5732, "step": 146},
    {"epoch": 0.45, "grad_norm": 0.02166634263027907, "learning_rate": 0.000605732355703743, "loss": 2.5453, "step": 147},
    {"epoch": 0.45, "grad_norm": 0.019573148109730217, "learning_rate": 0.0006008841078351903, "loss": 2.5429, "step": 148},
    {"epoch": 0.45, "grad_norm": 0.01903540723775177, "learning_rate": 0.0005960259516375134, "loss": 2.5388, "step": 149},
    {"epoch": 0.46, "grad_norm": 0.02051218709860799, "learning_rate": 0.0005911583642543531, "loss": 2.5793, "step": 150},
    {"epoch": 0.46, "grad_norm": 0.02101151521755854, "learning_rate": 0.0005862818237556344, "loss": 2.5677, "step": 151},
    {"epoch": 0.46, "grad_norm": 0.02162618990843192, "learning_rate": 0.0005813968090906116, "loss": 2.5635, "step": 152},
    {"epoch": 0.47, "grad_norm": 0.018935934203135756, "learning_rate": 0.0005765038000408295, "loss": 2.5174, "step": 153},
    {"epoch": 0.47, "grad_norm": 0.021177497387846266, "learning_rate": 0.0005716032771730008, "loss": 2.5266, "step": 154},
    {"epoch": 0.47, "grad_norm": 0.019633074877257486, "learning_rate": 0.0005666957217918076, "loss": 2.5909, "step": 155},
    {"epoch": 0.48, "grad_norm": 0.020321617227441156, "learning_rate": 0.0005617816158926302, "loss": 2.5727, "step": 156},
    {"epoch": 0.48, "grad_norm": 0.02171314000863455, "learning_rate": 0.0005568614421142077, "loss": 2.5728, "step": 157},
    {"epoch": 0.48, "grad_norm": 0.02036197464612551, "learning_rate": 0.0005519356836912357, "loss": 2.5546, "step": 158},
    {"epoch": 0.48, "grad_norm": 0.021214591076630044, "learning_rate": 0.0005470048244069055, "loss": 2.6089, "step": 159},
    {"epoch": 0.49, "grad_norm": 0.021380356809754895, "learning_rate": 0.0005420693485453892, "loss": 2.5701, "step": 160},
    {"epoch": 0.49, "grad_norm": 0.020301144411914027, "learning_rate": 0.0005371297408442765, "loss": 2.5783, "step": 161},
    {"epoch": 0.49, "grad_norm": 0.02317064813036681, "learning_rate": 0.0005321864864469646, "loss": 2.5787, "step": 162},
    {"epoch": 0.5, "grad_norm": 0.020981576388978553, "learning_rate": 0.0005272400708550113, "loss": 2.5657, "step": 163},
    {"epoch": 0.5, "grad_norm": 0.01972062092753295, "learning_rate": 0.0005222909798804515, "loss": 2.5941, "step": 164},
    {"epoch": 0.5, "grad_norm": 0.022818553102453083, "learning_rate": 0.0005173396995980818, "loss": 2.5567, "step": 165},
    {"epoch": 0.51, "grad_norm": 0.021333416992185616, "learning_rate": 0.0005123867162977224, "loss": 2.5552, "step": 166},
    {"epoch": 0.51, "grad_norm": 0.020142143404330078, "learning_rate": 0.0005074325164364548, "loss": 2.5305, "step": 167},
    {"epoch": 0.51, "grad_norm": 0.022025222387284903, "learning_rate": 0.0005024775865908451, "loss": 2.59, "step": 168},
    {"epoch": 0.52, "grad_norm": 0.020907939242901075, "learning_rate": 0.000497522413409155, "loss": 2.5452, "step": 169},
    {"epoch": 0.52, "grad_norm": 0.022927802322477854, "learning_rate": 0.0004925674835635454, "loss": 2.5792, "step": 170},
    {"epoch": 0.52, "grad_norm": 0.020657968617921074, "learning_rate": 0.00048761328370227773, "loss": 2.5099, "step": 171},
    {"epoch": 0.52, "grad_norm": 0.020110997895501886, "learning_rate": 0.0004826603004019182, "loss": 2.5163, "step": 172},
    {"epoch": 0.53, "grad_norm": 0.021049174068166514, "learning_rate": 0.0004777090201195486, "loss": 2.5372, "step": 173},
    {"epoch": 0.53, "grad_norm": 0.02117182813897076, "learning_rate": 0.00047275992914498865, "loss": 2.5022, "step": 174},
    {"epoch": 0.53, "grad_norm": 0.020475434729428973, "learning_rate": 0.0004678135135530355, "loss": 2.5626, "step": 175},
    {"epoch": 0.54, "grad_norm": 0.01948270633034859, "learning_rate": 0.0004628702591557237, "loss": 2.5286, "step": 176},
    {"epoch": 0.54, "grad_norm": 0.020212144683261414, "learning_rate": 0.00045793065145461064, "loss": 2.5305, "step": 177},
    {"epoch": 0.54, "grad_norm": 0.020337213413087113, "learning_rate": 0.00045299517559309457, "loss": 2.5624, "step": 178},
    {"epoch": 0.55, "grad_norm": 0.02101629597235714, "learning_rate": 0.00044806431630876436, "loss": 2.5109, "step": 179},
    {"epoch": 0.55, "grad_norm": 0.02099914834706713, "learning_rate": 0.00044313855788579234, "loss": 2.5702, "step": 180},
    {"epoch": 0.55, "grad_norm": 0.020635070288327804, "learning_rate": 0.0004382183841073698, "loss": 2.5483, "step": 181},
    {"epoch": 0.56, "grad_norm": 0.019941103414359743, "learning_rate": 0.00043330427820819256, "loss": 2.5722, "step": 182},
    {"epoch": 0.56, "grad_norm": 0.032674916215766084, "learning_rate": 0.0004283967228269992, "loss": 2.5775, "step": 183},
    {"epoch": 0.56, "grad_norm": 0.021496384704403824, "learning_rate": 0.00042349619995917057, "loss": 2.5856, "step": 184},
    {"epoch": 0.56, "grad_norm": 0.030321207463772364, "learning_rate": 0.0004186031909093884, "loss": 2.5618, "step": 185},
    {"epoch": 0.57, "grad_norm": 0.02175980169134122, "learning_rate": 0.00041371817624436577, "loss": 2.5437, "step": 186},
    {"epoch": 0.57, "grad_norm": 0.020391275857096873, "learning_rate": 0.0004088416357456471, "loss": 2.5338, "step": 187},
    {"epoch": 0.57, "grad_norm": 0.022437463329837284, "learning_rate": 0.00040397404836248684, "loss": 2.5812, "step": 188},
    {"epoch": 0.58, "grad_norm": 0.02129701651167051, "learning_rate": 0.0003991158921648096, "loss": 2.5453, "step": 189},
    {"epoch": 0.58, "grad_norm": 0.0238633278616985, "learning_rate": 0.00039426764429625693, "loss": 2.5233, "step": 190},
    {"epoch": 0.58, "grad_norm": 0.02998116183042863, "learning_rate": 0.0003894297809273237, "loss": 2.5646, "step": 191},
    {"epoch": 0.59, "grad_norm": 0.022238039030238917, "learning_rate": 0.00038460277720859116, "loss": 2.5242, "step": 192},
    {"epoch": 0.59, "grad_norm": 0.0231977730809198, "learning_rate": 0.00037978710722406115, "loss": 2.5681, "step": 193},
    {"epoch": 0.59, "grad_norm": 1.3831873669660701, "learning_rate": 0.0003749832439445925, "loss": 2.6152, "step": 194},
    {"epoch": 0.59, "grad_norm": 0.024837482288635006, "learning_rate": 0.0003701916591814497, "loss": 2.5305, "step": 195},
    {"epoch": 0.6, "grad_norm": 0.02460819770267967, "learning_rate": 0.00036541282353996275, "loss": 2.5512, "step": 196},
    {"epoch": 0.6, "grad_norm": 0.021382499301954485, "learning_rate": 0.0003606472063733067, "loss": 2.5726, "step": 197},
    {"epoch": 0.6, "grad_norm": 0.022843480204416112, "learning_rate": 0.00035589527573640534, "loss": 2.5911, "step": 198},
    {"epoch": 0.61, "grad_norm": 0.0217433912155617, "learning_rate": 0.0003511574983399599, "loss": 2.5437, "step": 199},
    {"epoch": 0.61, "grad_norm": 0.022341313083280037, "learning_rate": 0.0003464343395046117, "loss": 2.5565, "step": 200},
    {"epoch": 0.61, "grad_norm": 0.023326386453843787, "learning_rate": 0.0003417262631152409, "loss": 2.5862, "step": 201},
    {"epoch": 0.62, "grad_norm": 0.022197350352034496, "learning_rate": 0.0003370337315754052, "loss": 2.514, "step": 202},
    {"epoch": 0.62, "grad_norm": 0.022582353309783542, "learning_rate": 0.000332357205761926, "loss": 2.5331, "step": 203},
    {"epoch": 0.62, "grad_norm": 0.022070023826541343, "learning_rate": 0.00032769714497962233, "loss": 2.5413, "step": 204},
    {"epoch": 0.63, "grad_norm": 0.02402408808891166, "learning_rate": 0.00032305400691620125, "loss": 2.5431, "step": 205},
    {"epoch": 0.63, "grad_norm": 0.022714950946985384, "learning_rate": 0.0003184282475973052, "loss": 2.5615, "step": 206},
    {"epoch": 0.63, "grad_norm": 0.02424470988882824, "learning_rate": 0.00031382032134172393, "loss": 2.5785, "step": 207},
    {"epoch": 0.63, "grad_norm": 0.02175217108726291, "learning_rate": 0.00030923068071677377, "loss": 2.5665, "step": 208},
    {"epoch": 0.64, "grad_norm": 0.022606633086133584, "learning_rate": 0.00030465977649384815, "loss": 2.5634, "step": 209},
    {"epoch": 0.64, "grad_norm": 0.021011727388658842, "learning_rate": 0.00030010805760414546, "loss": 2.5054, "step": 210},
    {"epoch": 0.64, "grad_norm": 0.021639315653958227, "learning_rate": 0.00029557597109457727, "loss": 2.4963, "step": 211},
    {"epoch": 0.65, "grad_norm": 0.022615720963103284, "learning_rate": 0.0002910639620838619, "loss": 2.5688, "step": 212},
    {"epoch": 0.65, "grad_norm": 0.02342914831258654, "learning_rate": 0.00028657247371880666, "loss": 2.5139, "step": 213},
    {"epoch": 0.65, "grad_norm": 0.021794224872284197, "learning_rate": 0.00028210194713078405, "loss": 2.566, "step": 214},
    {"epoch": 0.66, "grad_norm": 0.02372720731132013, "learning_rate": 0.00027765282139240676, "loss": 2.5438, "step": 215},
    {"epoch": 0.66, "grad_norm": 0.02308128114302715, "learning_rate": 0.00027322553347440364, "loss": 2.4794, "step": 216},
    {"epoch": 0.66, "grad_norm": 0.021725513095176348, "learning_rate": 0.0002688205182027026, "loss": 2.5699, "step": 217},
    {"epoch": 0.66, "grad_norm": 0.020065121737088774, "learning_rate": 0.00026443820821572497, "loss": 2.4926, "step": 218},
    {"epoch": 0.67, "grad_norm": 0.022790401677010647, "learning_rate": 0.00026007903392189256, "loss": 2.4887, "step": 219},
    {"epoch": 0.67, "grad_norm": 0.022412544948915192, "learning_rate": 0.00025574342345735654, "loss": 2.4844, "step": 220},
    {"epoch": 0.67, "grad_norm": 0.024482726567851745, "learning_rate": 0.0002514318026439469, "loss": 2.5568, "step": 221},
    {"epoch": 0.68, "grad_norm": 0.0239324452654226, "learning_rate": 0.0002471445949473512, "loss": 2.5034, "step": 222},
    {"epoch": 0.68, "grad_norm": 0.02099692498226099, "learning_rate": 0.00024288222143552346, "loss": 2.5125, "step": 223},
    {"epoch": 0.68, "grad_norm": 0.0235532129320808, "learning_rate": 0.00023864510073732915, "loss": 2.5117, "step": 224},
    {"epoch": 0.69, "grad_norm": 0.02514350938291889, "learning_rate": 0.00023443364900142949, "loss": 2.5651, "step": 225},
    {"epoch": 0.69, "grad_norm": 0.02429410977297701, "learning_rate": 0.00023024827985540958, "loss": 2.5001, "step": 226},
    {"epoch": 0.69, "grad_norm": 0.02368660244630603, "learning_rate": 0.00022608940436515368, "loss": 2.5307, "step": 227},
    {"epoch": 0.7, "grad_norm": 0.024787276626786165, "learning_rate": 0.00022195743099447256, "loss": 2.5478, "step": 228},
    {"epoch": 0.7, "grad_norm": 0.02513065999138821, "learning_rate": 0.0002178527655649868, "loss": 2.5136, "step": 229},
    {"epoch": 0.7, "grad_norm": 0.023180635989223257, "learning_rate": 0.00021377581121626778, "loss": 2.51, "step": 230},
    {"epoch": 0.7, "grad_norm": 0.02696456869362307, "learning_rate": 0.00020972696836624437, "loss": 2.5263, "step": 231},
    {"epoch": 0.71, "grad_norm": 0.022578451161665822, "learning_rate": 0.00020570663467187555, "loss": 2.4869, "step": 232},
    {"epoch": 0.71, "grad_norm": 0.0224881916816771, "learning_rate": 0.00020171520499009455, "loss": 2.4786, "step": 233},
    {"epoch": 0.71, "grad_norm": 0.025839833730156827, "learning_rate": 0.00019775307133902808, "loss": 2.5469, "step": 234},
    {"epoch": 0.72, "grad_norm": 0.02365555522712872, "learning_rate": 0.00019382062285949414, "loss": 2.5435, "step": 235},
    {"epoch": 0.72, "grad_norm": 0.023382640821847607, "learning_rate": 0.00018991824577678268, "loss": 2.4635, "step": 236},
    {"epoch": 0.72, "grad_norm": 0.02191815592839163, "learning_rate": 0.0001860463233627225, "loss": 2.5639, "step": 237},
    {"epoch": 0.73, "grad_norm": 0.02310077430861472, "learning_rate": 0.00018220523589803806, "loss": 2.4839, "step": 238},
    {"epoch": 0.73, "grad_norm": 0.02494213141918476, "learning_rate": 0.00017839536063500052, "loss": 2.5139, "step": 239},
    {"epoch": 0.73, "grad_norm": 0.025239258961502305, "learning_rate": 0.00017461707176037546, "loss": 2.5078, "step": 240},
    {"epoch": 0.74, "grad_norm": 0.02267677070954696, "learning_rate": 0.00017087074035867283, "loss": 2.5254, "step": 241},
    {"epoch": 0.74, "grad_norm": 0.024599189724574425, "learning_rate": 0.00016715673437570034, "loss": 2.5807, "step": 242},
    {"epoch": 0.74, "grad_norm": 0.023216034005314606, "learning_rate": 0.0001634754185824256, "loss": 2.5371, "step": 243},
    {"epoch": 0.74, "grad_norm": 0.02414260114722159, "learning_rate": 0.00015982715453915082, "loss": 2.5373, "step": 244},
    {"epoch": 0.75, "grad_norm": 0.02443242612238601, "learning_rate": 0.0001562123005600009, "loss": 2.5237, "step": 245},
    {"epoch": 0.75, "grad_norm": 0.02324926523813888, "learning_rate": 0.00015263121167773358, "loss": 2.5025, "step": 246},
    {"epoch": 0.75, "grad_norm": 0.02441320875098146, "learning_rate": 0.00014908423960886808, "loss": 2.581, "step": 247},
    {"epoch": 0.76, "grad_norm": 0.025546181244876925, "learning_rate": 0.00014557173271914213, "loss": 2.5928, "step": 248},
    {"epoch": 0.76, "grad_norm": 0.023743543298364814, "learning_rate": 0.0001420940359892971, "loss": 2.5503, "step": 249},
    {"epoch": 0.76, "grad_norm": 0.02346770448694418, "learning_rate": 0.00013865149098119577, "loss": 2.5245, "step": 250}
  ],
  "logging_steps": 1.0,
  "max_steps": 327,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "total_flos": 4.435654409757655e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}