diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7499691218246943, + "epoch": 0.9999588290995924, "eval_steps": 759, - "global_step": 2277, + "global_step": 3036, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15978,6 +15978,5327 @@ "eval_samples_per_second": 3.34, "eval_steps_per_second": 1.671, "step": 2277 + }, + { + "epoch": 0.750298489027955, + "grad_norm": 2.1856529712677, + "learning_rate": 1.5075010885937351e-05, + "loss": 2.0107, + "step": 2278 + }, + { + "epoch": 0.7506278562312158, + "grad_norm": 2.1313235759735107, + "learning_rate": 1.5037385280283556e-05, + "loss": 2.5175, + "step": 2279 + }, + { + "epoch": 0.7509572234344765, + "grad_norm": 2.2917118072509766, + "learning_rate": 1.4999798375791584e-05, + "loss": 2.2043, + "step": 2280 + }, + { + "epoch": 0.7512865906377373, + "grad_norm": 2.0116426944732666, + "learning_rate": 1.4962250214067525e-05, + "loss": 2.001, + "step": 2281 + }, + { + "epoch": 0.751615957840998, + "grad_norm": 2.135652542114258, + "learning_rate": 1.4924740836674523e-05, + "loss": 2.2404, + "step": 2282 + }, + { + "epoch": 0.7519453250442587, + "grad_norm": 2.7450170516967773, + "learning_rate": 1.4887270285132849e-05, + "loss": 2.1799, + "step": 2283 + }, + { + "epoch": 0.7522746922475194, + "grad_norm": 2.9586498737335205, + "learning_rate": 1.484983860091977e-05, + "loss": 2.5526, + "step": 2284 + }, + { + "epoch": 0.7526040594507802, + "grad_norm": 2.5501980781555176, + "learning_rate": 1.4812445825469534e-05, + "loss": 2.2452, + "step": 2285 + }, + { + "epoch": 0.752933426654041, + "grad_norm": 2.3167929649353027, + "learning_rate": 1.4775092000173323e-05, + "loss": 2.1382, + "step": 2286 + }, + { + "epoch": 0.7532627938573017, + "grad_norm": 2.4865753650665283, + "learning_rate": 1.473777716637922e-05, + "loss": 2.2168, + "step": 2287 + }, + { + "epoch": 0.7535921610605624, + "grad_norm": 2.432619333267212, + "learning_rate": 1.4700501365392089e-05, + "loss": 2.033, + "step": 2288 + }, + { + "epoch": 0.7539215282638231, + "grad_norm": 2.7285501956939697, + "learning_rate": 1.4663264638473644e-05, + "loss": 2.0842, + "step": 2289 + }, + { + "epoch": 0.7542508954670839, + "grad_norm": 2.7809548377990723, + "learning_rate": 1.4626067026842317e-05, + "loss": 2.2073, + "step": 2290 + }, + { + "epoch": 0.7545802626703446, + "grad_norm": 2.550999879837036, + "learning_rate": 1.4588908571673287e-05, + "loss": 1.9711, + "step": 2291 + }, + { + "epoch": 0.7549096298736053, + "grad_norm": 2.92107892036438, + "learning_rate": 1.4551789314098342e-05, + "loss": 2.2896, + "step": 2292 + }, + { + "epoch": 0.755238997076866, + "grad_norm": 2.7185511589050293, + "learning_rate": 1.4514709295205908e-05, + "loss": 2.1395, + "step": 2293 + }, + { + "epoch": 0.7555683642801269, + "grad_norm": 3.085444211959839, + "learning_rate": 1.4477668556040936e-05, + "loss": 1.8752, + "step": 2294 + }, + { + "epoch": 0.7558977314833876, + "grad_norm": 2.5131072998046875, + "learning_rate": 1.4440667137604936e-05, + "loss": 2.0242, + "step": 2295 + }, + { + "epoch": 0.7562270986866483, + "grad_norm": 2.6334311962127686, + "learning_rate": 1.440370508085589e-05, + "loss": 2.0597, + "step": 2296 + }, + { + "epoch": 0.756556465889909, + "grad_norm": 2.336954355239868, + "learning_rate": 1.4366782426708197e-05, + "loss": 1.7077, + "step": 2297 + }, + { + "epoch": 0.7568858330931697, + "grad_norm": 2.7365548610687256, + "learning_rate": 1.432989921603265e-05, + "loss": 1.9901, + "step": 2298 + }, + { + "epoch": 0.7572152002964305, + "grad_norm": 3.0975027084350586, + "learning_rate": 1.4293055489656388e-05, + "loss": 1.7814, + "step": 2299 + }, + { + "epoch": 0.7575445674996912, + "grad_norm": 3.5924556255340576, + "learning_rate": 1.4256251288362792e-05, + "loss": 1.5984, + "step": 2300 + }, + { + "epoch": 0.757873934702952, + "grad_norm": 2.4012112617492676, + "learning_rate": 1.4219486652891568e-05, + "loss": 2.4546, + "step": 2301 + }, + { + "epoch": 0.7582033019062127, + "grad_norm": 2.0408058166503906, + "learning_rate": 1.4182761623938568e-05, + "loss": 1.8476, + "step": 2302 + }, + { + "epoch": 0.7585326691094735, + "grad_norm": 2.051454782485962, + "learning_rate": 1.4146076242155831e-05, + "loss": 2.1982, + "step": 2303 + }, + { + "epoch": 0.7588620363127342, + "grad_norm": 2.249690294265747, + "learning_rate": 1.4109430548151498e-05, + "loss": 2.0641, + "step": 2304 + }, + { + "epoch": 0.7591914035159949, + "grad_norm": 2.241851329803467, + "learning_rate": 1.4072824582489785e-05, + "loss": 2.1071, + "step": 2305 + }, + { + "epoch": 0.7595207707192556, + "grad_norm": 2.264829635620117, + "learning_rate": 1.4036258385690948e-05, + "loss": 2.2489, + "step": 2306 + }, + { + "epoch": 0.7598501379225163, + "grad_norm": 2.8766753673553467, + "learning_rate": 1.3999731998231158e-05, + "loss": 2.4871, + "step": 2307 + }, + { + "epoch": 0.7601795051257771, + "grad_norm": 2.8360114097595215, + "learning_rate": 1.396324546054259e-05, + "loss": 2.3806, + "step": 2308 + }, + { + "epoch": 0.7605088723290379, + "grad_norm": 2.4339394569396973, + "learning_rate": 1.3926798813013275e-05, + "loss": 2.1888, + "step": 2309 + }, + { + "epoch": 0.7608382395322986, + "grad_norm": 2.5512523651123047, + "learning_rate": 1.3890392095987098e-05, + "loss": 2.1649, + "step": 2310 + }, + { + "epoch": 0.7611676067355593, + "grad_norm": 2.7157366275787354, + "learning_rate": 1.3854025349763744e-05, + "loss": 2.3705, + "step": 2311 + }, + { + "epoch": 0.76149697393882, + "grad_norm": 2.924532890319824, + "learning_rate": 1.3817698614598663e-05, + "loss": 2.4532, + "step": 2312 + }, + { + "epoch": 0.7618263411420808, + "grad_norm": 2.370842456817627, + "learning_rate": 1.3781411930702965e-05, + "loss": 1.979, + "step": 2313 + }, + { + "epoch": 0.7621557083453415, + "grad_norm": 2.5691492557525635, + "learning_rate": 1.3745165338243488e-05, + "loss": 2.0097, + "step": 2314 + }, + { + "epoch": 0.7624850755486022, + "grad_norm": 2.5397489070892334, + "learning_rate": 1.370895887734266e-05, + "loss": 1.9783, + "step": 2315 + }, + { + "epoch": 0.7628144427518629, + "grad_norm": 2.7631609439849854, + "learning_rate": 1.3672792588078497e-05, + "loss": 2.0406, + "step": 2316 + }, + { + "epoch": 0.7631438099551238, + "grad_norm": 2.517392873764038, + "learning_rate": 1.3636666510484548e-05, + "loss": 1.8141, + "step": 2317 + }, + { + "epoch": 0.7634731771583845, + "grad_norm": 2.7305877208709717, + "learning_rate": 1.3600580684549841e-05, + "loss": 2.1282, + "step": 2318 + }, + { + "epoch": 0.7638025443616452, + "grad_norm": 2.475735902786255, + "learning_rate": 1.3564535150218872e-05, + "loss": 2.1104, + "step": 2319 + }, + { + "epoch": 0.7641319115649059, + "grad_norm": 2.88720703125, + "learning_rate": 1.3528529947391466e-05, + "loss": 1.9543, + "step": 2320 + }, + { + "epoch": 0.7644612787681666, + "grad_norm": 2.685481548309326, + "learning_rate": 1.3492565115922911e-05, + "loss": 2.1007, + "step": 2321 + }, + { + "epoch": 0.7647906459714274, + "grad_norm": 3.0710129737854004, + "learning_rate": 1.3456640695623735e-05, + "loss": 1.9479, + "step": 2322 + }, + { + "epoch": 0.7651200131746881, + "grad_norm": 3.278609037399292, + "learning_rate": 1.3420756726259754e-05, + "loss": 2.3666, + "step": 2323 + }, + { + "epoch": 0.7654493803779489, + "grad_norm": 2.645240068435669, + "learning_rate": 1.3384913247552011e-05, + "loss": 1.8713, + "step": 2324 + }, + { + "epoch": 0.7657787475812096, + "grad_norm": 3.5083346366882324, + "learning_rate": 1.3349110299176704e-05, + "loss": 2.1438, + "step": 2325 + }, + { + "epoch": 0.7661081147844704, + "grad_norm": 2.030167579650879, + "learning_rate": 1.331334792076519e-05, + "loss": 2.4294, + "step": 2326 + }, + { + "epoch": 0.7664374819877311, + "grad_norm": 2.1392946243286133, + "learning_rate": 1.3277626151903917e-05, + "loss": 2.3092, + "step": 2327 + }, + { + "epoch": 0.7667668491909918, + "grad_norm": 2.1700847148895264, + "learning_rate": 1.324194503213438e-05, + "loss": 2.0788, + "step": 2328 + }, + { + "epoch": 0.7670962163942525, + "grad_norm": 2.3623058795928955, + "learning_rate": 1.3206304600953068e-05, + "loss": 2.4574, + "step": 2329 + }, + { + "epoch": 0.7674255835975132, + "grad_norm": 2.407792806625366, + "learning_rate": 1.3170704897811448e-05, + "loss": 2.4819, + "step": 2330 + }, + { + "epoch": 0.767754950800774, + "grad_norm": 2.397343397140503, + "learning_rate": 1.3135145962115902e-05, + "loss": 2.1706, + "step": 2331 + }, + { + "epoch": 0.7680843180040348, + "grad_norm": 2.4522149562835693, + "learning_rate": 1.3099627833227652e-05, + "loss": 2.4389, + "step": 2332 + }, + { + "epoch": 0.7684136852072955, + "grad_norm": 2.521545648574829, + "learning_rate": 1.3064150550462783e-05, + "loss": 2.5801, + "step": 2333 + }, + { + "epoch": 0.7687430524105562, + "grad_norm": 2.5713517665863037, + "learning_rate": 1.3028714153092163e-05, + "loss": 2.1845, + "step": 2334 + }, + { + "epoch": 0.769072419613817, + "grad_norm": 2.2203288078308105, + "learning_rate": 1.2993318680341399e-05, + "loss": 1.7644, + "step": 2335 + }, + { + "epoch": 0.7694017868170777, + "grad_norm": 2.6416571140289307, + "learning_rate": 1.2957964171390796e-05, + "loss": 2.2982, + "step": 2336 + }, + { + "epoch": 0.7697311540203384, + "grad_norm": 2.398996591567993, + "learning_rate": 1.292265066537533e-05, + "loss": 2.0765, + "step": 2337 + }, + { + "epoch": 0.7700605212235991, + "grad_norm": 2.396077871322632, + "learning_rate": 1.2887378201384542e-05, + "loss": 2.2264, + "step": 2338 + }, + { + "epoch": 0.7703898884268598, + "grad_norm": 2.5502443313598633, + "learning_rate": 1.285214681846259e-05, + "loss": 1.9957, + "step": 2339 + }, + { + "epoch": 0.7707192556301207, + "grad_norm": 2.315697431564331, + "learning_rate": 1.281695655560815e-05, + "loss": 1.898, + "step": 2340 + }, + { + "epoch": 0.7710486228333814, + "grad_norm": 2.28928804397583, + "learning_rate": 1.278180745177437e-05, + "loss": 1.9005, + "step": 2341 + }, + { + "epoch": 0.7713779900366421, + "grad_norm": 2.6291017532348633, + "learning_rate": 1.274669954586884e-05, + "loss": 2.1606, + "step": 2342 + }, + { + "epoch": 0.7717073572399028, + "grad_norm": 2.399324417114258, + "learning_rate": 1.2711632876753549e-05, + "loss": 1.884, + "step": 2343 + }, + { + "epoch": 0.7720367244431636, + "grad_norm": 2.780337333679199, + "learning_rate": 1.2676607483244862e-05, + "loss": 2.2213, + "step": 2344 + }, + { + "epoch": 0.7723660916464243, + "grad_norm": 2.4328701496124268, + "learning_rate": 1.2641623404113396e-05, + "loss": 1.7746, + "step": 2345 + }, + { + "epoch": 0.772695458849685, + "grad_norm": 2.7607338428497314, + "learning_rate": 1.2606680678084087e-05, + "loss": 2.1983, + "step": 2346 + }, + { + "epoch": 0.7730248260529458, + "grad_norm": 2.7279250621795654, + "learning_rate": 1.2571779343836082e-05, + "loss": 1.6927, + "step": 2347 + }, + { + "epoch": 0.7733541932562065, + "grad_norm": 2.8370895385742188, + "learning_rate": 1.2536919440002715e-05, + "loss": 1.7246, + "step": 2348 + }, + { + "epoch": 0.7736835604594673, + "grad_norm": 2.732421875, + "learning_rate": 1.2502101005171446e-05, + "loss": 2.0061, + "step": 2349 + }, + { + "epoch": 0.774012927662728, + "grad_norm": 3.144651412963867, + "learning_rate": 1.2467324077883864e-05, + "loss": 1.6209, + "step": 2350 + }, + { + "epoch": 0.7743422948659887, + "grad_norm": 2.1023104190826416, + "learning_rate": 1.2432588696635533e-05, + "loss": 2.5392, + "step": 2351 + }, + { + "epoch": 0.7746716620692494, + "grad_norm": 2.024048328399658, + "learning_rate": 1.2397894899876133e-05, + "loss": 2.1375, + "step": 2352 + }, + { + "epoch": 0.7750010292725102, + "grad_norm": 2.4644556045532227, + "learning_rate": 1.2363242726009256e-05, + "loss": 2.4582, + "step": 2353 + }, + { + "epoch": 0.775330396475771, + "grad_norm": 2.37516450881958, + "learning_rate": 1.232863221339241e-05, + "loss": 1.9603, + "step": 2354 + }, + { + "epoch": 0.7756597636790317, + "grad_norm": 2.3742220401763916, + "learning_rate": 1.2294063400337036e-05, + "loss": 2.2916, + "step": 2355 + }, + { + "epoch": 0.7759891308822924, + "grad_norm": 2.2466628551483154, + "learning_rate": 1.2259536325108345e-05, + "loss": 2.0996, + "step": 2356 + }, + { + "epoch": 0.7763184980855531, + "grad_norm": 2.6852376461029053, + "learning_rate": 1.2225051025925405e-05, + "loss": 2.3539, + "step": 2357 + }, + { + "epoch": 0.7766478652888139, + "grad_norm": 2.818572521209717, + "learning_rate": 1.2190607540961025e-05, + "loss": 2.0653, + "step": 2358 + }, + { + "epoch": 0.7769772324920746, + "grad_norm": 2.7895278930664062, + "learning_rate": 1.2156205908341728e-05, + "loss": 2.3245, + "step": 2359 + }, + { + "epoch": 0.7773065996953353, + "grad_norm": 2.3493518829345703, + "learning_rate": 1.2121846166147699e-05, + "loss": 1.9228, + "step": 2360 + }, + { + "epoch": 0.777635966898596, + "grad_norm": 2.814150810241699, + "learning_rate": 1.2087528352412775e-05, + "loss": 2.0494, + "step": 2361 + }, + { + "epoch": 0.7779653341018568, + "grad_norm": 2.2821226119995117, + "learning_rate": 1.2053252505124368e-05, + "loss": 1.8652, + "step": 2362 + }, + { + "epoch": 0.7782947013051176, + "grad_norm": 2.916224479675293, + "learning_rate": 1.2019018662223425e-05, + "loss": 2.3056, + "step": 2363 + }, + { + "epoch": 0.7786240685083783, + "grad_norm": 2.693669080734253, + "learning_rate": 1.198482686160441e-05, + "loss": 2.1136, + "step": 2364 + }, + { + "epoch": 0.778953435711639, + "grad_norm": 2.8778226375579834, + "learning_rate": 1.1950677141115246e-05, + "loss": 2.3815, + "step": 2365 + }, + { + "epoch": 0.7792828029148997, + "grad_norm": 2.426630735397339, + "learning_rate": 1.1916569538557287e-05, + "loss": 2.1077, + "step": 2366 + }, + { + "epoch": 0.7796121701181605, + "grad_norm": 2.658508539199829, + "learning_rate": 1.1882504091685253e-05, + "loss": 1.8335, + "step": 2367 + }, + { + "epoch": 0.7799415373214212, + "grad_norm": 3.0452957153320312, + "learning_rate": 1.184848083820721e-05, + "loss": 1.9977, + "step": 2368 + }, + { + "epoch": 0.780270904524682, + "grad_norm": 2.9064278602600098, + "learning_rate": 1.1814499815784513e-05, + "loss": 2.1386, + "step": 2369 + }, + { + "epoch": 0.7806002717279427, + "grad_norm": 2.525879144668579, + "learning_rate": 1.1780561062031753e-05, + "loss": 1.5748, + "step": 2370 + }, + { + "epoch": 0.7809296389312034, + "grad_norm": 2.879523277282715, + "learning_rate": 1.1746664614516756e-05, + "loss": 2.3143, + "step": 2371 + }, + { + "epoch": 0.7812590061344642, + "grad_norm": 2.849067211151123, + "learning_rate": 1.171281051076052e-05, + "loss": 1.8855, + "step": 2372 + }, + { + "epoch": 0.7815883733377249, + "grad_norm": 2.7838194370269775, + "learning_rate": 1.1678998788237155e-05, + "loss": 1.7152, + "step": 2373 + }, + { + "epoch": 0.7819177405409856, + "grad_norm": 2.8870933055877686, + "learning_rate": 1.164522948437387e-05, + "loss": 1.693, + "step": 2374 + }, + { + "epoch": 0.7822471077442463, + "grad_norm": 3.9270734786987305, + "learning_rate": 1.1611502636550931e-05, + "loss": 1.7755, + "step": 2375 + }, + { + "epoch": 0.7825764749475072, + "grad_norm": 1.9541338682174683, + "learning_rate": 1.157781828210156e-05, + "loss": 2.5028, + "step": 2376 + }, + { + "epoch": 0.7829058421507679, + "grad_norm": 2.4501519203186035, + "learning_rate": 1.1544176458311996e-05, + "loss": 2.7866, + "step": 2377 + }, + { + "epoch": 0.7832352093540286, + "grad_norm": 2.14868426322937, + "learning_rate": 1.1510577202421369e-05, + "loss": 2.3476, + "step": 2378 + }, + { + "epoch": 0.7835645765572893, + "grad_norm": 2.618394374847412, + "learning_rate": 1.1477020551621698e-05, + "loss": 2.5685, + "step": 2379 + }, + { + "epoch": 0.78389394376055, + "grad_norm": 2.1860313415527344, + "learning_rate": 1.1443506543057852e-05, + "loss": 1.9295, + "step": 2380 + }, + { + "epoch": 0.7842233109638108, + "grad_norm": 2.496148109436035, + "learning_rate": 1.141003521382748e-05, + "loss": 2.3171, + "step": 2381 + }, + { + "epoch": 0.7845526781670715, + "grad_norm": 2.7145514488220215, + "learning_rate": 1.1376606600980993e-05, + "loss": 2.456, + "step": 2382 + }, + { + "epoch": 0.7848820453703322, + "grad_norm": 2.3949971199035645, + "learning_rate": 1.1343220741521526e-05, + "loss": 2.1141, + "step": 2383 + }, + { + "epoch": 0.785211412573593, + "grad_norm": 2.371872901916504, + "learning_rate": 1.1309877672404878e-05, + "loss": 2.0841, + "step": 2384 + }, + { + "epoch": 0.7855407797768538, + "grad_norm": 2.2183234691619873, + "learning_rate": 1.1276577430539492e-05, + "loss": 2.1255, + "step": 2385 + }, + { + "epoch": 0.7858701469801145, + "grad_norm": 2.644172430038452, + "learning_rate": 1.12433200527864e-05, + "loss": 2.5442, + "step": 2386 + }, + { + "epoch": 0.7861995141833752, + "grad_norm": 2.8008968830108643, + "learning_rate": 1.1210105575959195e-05, + "loss": 2.4805, + "step": 2387 + }, + { + "epoch": 0.7865288813866359, + "grad_norm": 3.0009310245513916, + "learning_rate": 1.1176934036823949e-05, + "loss": 2.406, + "step": 2388 + }, + { + "epoch": 0.7868582485898966, + "grad_norm": 2.7110002040863037, + "learning_rate": 1.1143805472099245e-05, + "loss": 2.2543, + "step": 2389 + }, + { + "epoch": 0.7871876157931574, + "grad_norm": 2.5703489780426025, + "learning_rate": 1.1110719918456075e-05, + "loss": 1.7567, + "step": 2390 + }, + { + "epoch": 0.7875169829964181, + "grad_norm": 2.9390273094177246, + "learning_rate": 1.1077677412517828e-05, + "loss": 2.1317, + "step": 2391 + }, + { + "epoch": 0.7878463501996789, + "grad_norm": 2.4334805011749268, + "learning_rate": 1.1044677990860236e-05, + "loss": 2.1525, + "step": 2392 + }, + { + "epoch": 0.7881757174029396, + "grad_norm": 2.6386754512786865, + "learning_rate": 1.1011721690011368e-05, + "loss": 2.2738, + "step": 2393 + }, + { + "epoch": 0.7885050846062004, + "grad_norm": 2.390347957611084, + "learning_rate": 1.0978808546451503e-05, + "loss": 2.0497, + "step": 2394 + }, + { + "epoch": 0.7888344518094611, + "grad_norm": 3.1142003536224365, + "learning_rate": 1.0945938596613193e-05, + "loss": 2.1531, + "step": 2395 + }, + { + "epoch": 0.7891638190127218, + "grad_norm": 3.1742782592773438, + "learning_rate": 1.0913111876881166e-05, + "loss": 2.3082, + "step": 2396 + }, + { + "epoch": 0.7894931862159825, + "grad_norm": 3.1472554206848145, + "learning_rate": 1.0880328423592307e-05, + "loss": 2.0666, + "step": 2397 + }, + { + "epoch": 0.7898225534192432, + "grad_norm": 2.8876795768737793, + "learning_rate": 1.0847588273035592e-05, + "loss": 1.9119, + "step": 2398 + }, + { + "epoch": 0.7901519206225041, + "grad_norm": 3.2102115154266357, + "learning_rate": 1.081489146145207e-05, + "loss": 2.1106, + "step": 2399 + }, + { + "epoch": 0.7904812878257648, + "grad_norm": 3.5183258056640625, + "learning_rate": 1.0782238025034835e-05, + "loss": 2.0614, + "step": 2400 + }, + { + "epoch": 0.7908106550290255, + "grad_norm": 2.160437822341919, + "learning_rate": 1.0749627999928923e-05, + "loss": 2.2165, + "step": 2401 + }, + { + "epoch": 0.7911400222322862, + "grad_norm": 2.1976757049560547, + "learning_rate": 1.0717061422231361e-05, + "loss": 1.9749, + "step": 2402 + }, + { + "epoch": 0.791469389435547, + "grad_norm": 2.3027257919311523, + "learning_rate": 1.0684538327991055e-05, + "loss": 2.4518, + "step": 2403 + }, + { + "epoch": 0.7917987566388077, + "grad_norm": 2.299556255340576, + "learning_rate": 1.06520587532088e-05, + "loss": 2.4594, + "step": 2404 + }, + { + "epoch": 0.7921281238420684, + "grad_norm": 2.538569211959839, + "learning_rate": 1.0619622733837198e-05, + "loss": 2.3555, + "step": 2405 + }, + { + "epoch": 0.7924574910453291, + "grad_norm": 2.4368882179260254, + "learning_rate": 1.0587230305780666e-05, + "loss": 2.3176, + "step": 2406 + }, + { + "epoch": 0.7927868582485899, + "grad_norm": 2.313861131668091, + "learning_rate": 1.0554881504895326e-05, + "loss": 2.4077, + "step": 2407 + }, + { + "epoch": 0.7931162254518507, + "grad_norm": 2.871065139770508, + "learning_rate": 1.0522576366989035e-05, + "loss": 2.0461, + "step": 2408 + }, + { + "epoch": 0.7934455926551114, + "grad_norm": 2.8059747219085693, + "learning_rate": 1.0490314927821315e-05, + "loss": 2.4673, + "step": 2409 + }, + { + "epoch": 0.7937749598583721, + "grad_norm": 2.2953708171844482, + "learning_rate": 1.0458097223103331e-05, + "loss": 2.0045, + "step": 2410 + }, + { + "epoch": 0.7941043270616328, + "grad_norm": 2.41961407661438, + "learning_rate": 1.0425923288497801e-05, + "loss": 2.2286, + "step": 2411 + }, + { + "epoch": 0.7944336942648935, + "grad_norm": 2.5134241580963135, + "learning_rate": 1.039379315961903e-05, + "loss": 2.2415, + "step": 2412 + }, + { + "epoch": 0.7947630614681543, + "grad_norm": 2.3589727878570557, + "learning_rate": 1.0361706872032812e-05, + "loss": 1.9094, + "step": 2413 + }, + { + "epoch": 0.795092428671415, + "grad_norm": 2.9635963439941406, + "learning_rate": 1.0329664461256411e-05, + "loss": 2.3048, + "step": 2414 + }, + { + "epoch": 0.7954217958746758, + "grad_norm": 2.931699275970459, + "learning_rate": 1.0297665962758535e-05, + "loss": 2.3376, + "step": 2415 + }, + { + "epoch": 0.7957511630779365, + "grad_norm": 2.3775904178619385, + "learning_rate": 1.0265711411959272e-05, + "loss": 1.9315, + "step": 2416 + }, + { + "epoch": 0.7960805302811973, + "grad_norm": 3.018141031265259, + "learning_rate": 1.0233800844230073e-05, + "loss": 2.0433, + "step": 2417 + }, + { + "epoch": 0.796409897484458, + "grad_norm": 2.5099568367004395, + "learning_rate": 1.0201934294893705e-05, + "loss": 2.4133, + "step": 2418 + }, + { + "epoch": 0.7967392646877187, + "grad_norm": 2.779721975326538, + "learning_rate": 1.0170111799224186e-05, + "loss": 2.1728, + "step": 2419 + }, + { + "epoch": 0.7970686318909794, + "grad_norm": 2.5381271839141846, + "learning_rate": 1.0138333392446786e-05, + "loss": 1.7606, + "step": 2420 + }, + { + "epoch": 0.7973979990942401, + "grad_norm": 2.808448314666748, + "learning_rate": 1.010659910973798e-05, + "loss": 1.9098, + "step": 2421 + }, + { + "epoch": 0.797727366297501, + "grad_norm": 2.90604829788208, + "learning_rate": 1.0074908986225395e-05, + "loss": 2.0743, + "step": 2422 + }, + { + "epoch": 0.7980567335007617, + "grad_norm": 2.6868839263916016, + "learning_rate": 1.0043263056987773e-05, + "loss": 2.0367, + "step": 2423 + }, + { + "epoch": 0.7983861007040224, + "grad_norm": 3.5458996295928955, + "learning_rate": 1.001166135705493e-05, + "loss": 2.1114, + "step": 2424 + }, + { + "epoch": 0.7987154679072831, + "grad_norm": 3.005037784576416, + "learning_rate": 9.980103921407757e-06, + "loss": 1.6499, + "step": 2425 + }, + { + "epoch": 0.7990448351105439, + "grad_norm": 2.083054542541504, + "learning_rate": 9.948590784978085e-06, + "loss": 2.5122, + "step": 2426 + }, + { + "epoch": 0.7993742023138046, + "grad_norm": 2.21122670173645, + "learning_rate": 9.917121982648764e-06, + "loss": 2.3761, + "step": 2427 + }, + { + "epoch": 0.7997035695170653, + "grad_norm": 1.965364694595337, + "learning_rate": 9.885697549253542e-06, + "loss": 2.0381, + "step": 2428 + }, + { + "epoch": 0.800032936720326, + "grad_norm": 2.5739123821258545, + "learning_rate": 9.854317519577077e-06, + "loss": 2.4915, + "step": 2429 + }, + { + "epoch": 0.8003623039235868, + "grad_norm": 2.479092597961426, + "learning_rate": 9.822981928354846e-06, + "loss": 2.3367, + "step": 2430 + }, + { + "epoch": 0.8006916711268476, + "grad_norm": 2.881108522415161, + "learning_rate": 9.791690810273173e-06, + "loss": 2.369, + "step": 2431 + }, + { + "epoch": 0.8010210383301083, + "grad_norm": 2.5094995498657227, + "learning_rate": 9.760444199969104e-06, + "loss": 2.4964, + "step": 2432 + }, + { + "epoch": 0.801350405533369, + "grad_norm": 2.3330986499786377, + "learning_rate": 9.729242132030452e-06, + "loss": 1.9987, + "step": 2433 + }, + { + "epoch": 0.8016797727366297, + "grad_norm": 2.8625338077545166, + "learning_rate": 9.698084640995719e-06, + "loss": 2.0433, + "step": 2434 + }, + { + "epoch": 0.8020091399398905, + "grad_norm": 2.96150541305542, + "learning_rate": 9.666971761354065e-06, + "loss": 2.4458, + "step": 2435 + }, + { + "epoch": 0.8023385071431512, + "grad_norm": 3.234882354736328, + "learning_rate": 9.635903527545264e-06, + "loss": 2.2081, + "step": 2436 + }, + { + "epoch": 0.802667874346412, + "grad_norm": 2.900803804397583, + "learning_rate": 9.604879973959668e-06, + "loss": 2.3133, + "step": 2437 + }, + { + "epoch": 0.8029972415496727, + "grad_norm": 2.271087646484375, + "learning_rate": 9.57390113493819e-06, + "loss": 2.052, + "step": 2438 + }, + { + "epoch": 0.8033266087529334, + "grad_norm": 2.268280267715454, + "learning_rate": 9.542967044772205e-06, + "loss": 1.9981, + "step": 2439 + }, + { + "epoch": 0.8036559759561942, + "grad_norm": 2.6638338565826416, + "learning_rate": 9.512077737703601e-06, + "loss": 1.9637, + "step": 2440 + }, + { + "epoch": 0.8039853431594549, + "grad_norm": 2.6653592586517334, + "learning_rate": 9.481233247924658e-06, + "loss": 1.9678, + "step": 2441 + }, + { + "epoch": 0.8043147103627156, + "grad_norm": 2.4272871017456055, + "learning_rate": 9.450433609578064e-06, + "loss": 2.0047, + "step": 2442 + }, + { + "epoch": 0.8046440775659763, + "grad_norm": 2.5219974517822266, + "learning_rate": 9.419678856756891e-06, + "loss": 2.3667, + "step": 2443 + }, + { + "epoch": 0.8049734447692372, + "grad_norm": 2.637106418609619, + "learning_rate": 9.388969023504457e-06, + "loss": 2.0492, + "step": 2444 + }, + { + "epoch": 0.8053028119724979, + "grad_norm": 2.4616830348968506, + "learning_rate": 9.358304143814401e-06, + "loss": 2.2313, + "step": 2445 + }, + { + "epoch": 0.8056321791757586, + "grad_norm": 2.7604615688323975, + "learning_rate": 9.327684251630597e-06, + "loss": 2.0396, + "step": 2446 + }, + { + "epoch": 0.8059615463790193, + "grad_norm": 2.6307003498077393, + "learning_rate": 9.297109380847119e-06, + "loss": 1.9813, + "step": 2447 + }, + { + "epoch": 0.80629091358228, + "grad_norm": 3.0449466705322266, + "learning_rate": 9.266579565308198e-06, + "loss": 2.2011, + "step": 2448 + }, + { + "epoch": 0.8066202807855408, + "grad_norm": 3.0230000019073486, + "learning_rate": 9.236094838808206e-06, + "loss": 1.7364, + "step": 2449 + }, + { + "epoch": 0.8069496479888015, + "grad_norm": 3.4453773498535156, + "learning_rate": 9.205655235091603e-06, + "loss": 2.0221, + "step": 2450 + }, + { + "epoch": 0.8072790151920622, + "grad_norm": 1.9125607013702393, + "learning_rate": 9.175260787852873e-06, + "loss": 2.2074, + "step": 2451 + }, + { + "epoch": 0.807608382395323, + "grad_norm": 2.63358736038208, + "learning_rate": 9.14491153073655e-06, + "loss": 2.5471, + "step": 2452 + }, + { + "epoch": 0.8079377495985838, + "grad_norm": 2.265695095062256, + "learning_rate": 9.114607497337135e-06, + "loss": 2.2188, + "step": 2453 + }, + { + "epoch": 0.8082671168018445, + "grad_norm": 2.2626595497131348, + "learning_rate": 9.084348721199059e-06, + "loss": 2.2639, + "step": 2454 + }, + { + "epoch": 0.8085964840051052, + "grad_norm": 2.104132652282715, + "learning_rate": 9.054135235816669e-06, + "loss": 2.286, + "step": 2455 + }, + { + "epoch": 0.8089258512083659, + "grad_norm": 2.395782709121704, + "learning_rate": 9.023967074634187e-06, + "loss": 2.1671, + "step": 2456 + }, + { + "epoch": 0.8092552184116266, + "grad_norm": 2.4479575157165527, + "learning_rate": 8.993844271045626e-06, + "loss": 2.2154, + "step": 2457 + }, + { + "epoch": 0.8095845856148874, + "grad_norm": 2.32531476020813, + "learning_rate": 8.963766858394823e-06, + "loss": 2.0973, + "step": 2458 + }, + { + "epoch": 0.8099139528181482, + "grad_norm": 2.2058169841766357, + "learning_rate": 8.933734869975374e-06, + "loss": 2.2149, + "step": 2459 + }, + { + "epoch": 0.8102433200214089, + "grad_norm": 2.1951229572296143, + "learning_rate": 8.903748339030582e-06, + "loss": 2.1389, + "step": 2460 + }, + { + "epoch": 0.8105726872246696, + "grad_norm": 2.675092935562134, + "learning_rate": 8.873807298753422e-06, + "loss": 2.2773, + "step": 2461 + }, + { + "epoch": 0.8109020544279303, + "grad_norm": 2.5307810306549072, + "learning_rate": 8.84391178228654e-06, + "loss": 2.0614, + "step": 2462 + }, + { + "epoch": 0.8112314216311911, + "grad_norm": 2.554347038269043, + "learning_rate": 8.814061822722174e-06, + "loss": 2.6156, + "step": 2463 + }, + { + "epoch": 0.8115607888344518, + "grad_norm": 2.7959494590759277, + "learning_rate": 8.784257453102124e-06, + "loss": 1.8778, + "step": 2464 + }, + { + "epoch": 0.8118901560377125, + "grad_norm": 3.0916404724121094, + "learning_rate": 8.754498706417741e-06, + "loss": 2.5136, + "step": 2465 + }, + { + "epoch": 0.8122195232409732, + "grad_norm": 2.7095844745635986, + "learning_rate": 8.724785615609865e-06, + "loss": 2.1074, + "step": 2466 + }, + { + "epoch": 0.8125488904442341, + "grad_norm": 2.490854263305664, + "learning_rate": 8.695118213568816e-06, + "loss": 2.132, + "step": 2467 + }, + { + "epoch": 0.8128782576474948, + "grad_norm": 2.495551824569702, + "learning_rate": 8.665496533134315e-06, + "loss": 1.9723, + "step": 2468 + }, + { + "epoch": 0.8132076248507555, + "grad_norm": 2.5706124305725098, + "learning_rate": 8.635920607095504e-06, + "loss": 2.1114, + "step": 2469 + }, + { + "epoch": 0.8135369920540162, + "grad_norm": 3.406837224960327, + "learning_rate": 8.606390468190833e-06, + "loss": 2.4752, + "step": 2470 + }, + { + "epoch": 0.8138663592572769, + "grad_norm": 2.7835068702697754, + "learning_rate": 8.576906149108104e-06, + "loss": 1.7684, + "step": 2471 + }, + { + "epoch": 0.8141957264605377, + "grad_norm": 2.842190742492676, + "learning_rate": 8.547467682484389e-06, + "loss": 1.7345, + "step": 2472 + }, + { + "epoch": 0.8145250936637984, + "grad_norm": 3.7220804691314697, + "learning_rate": 8.518075100905992e-06, + "loss": 2.0355, + "step": 2473 + }, + { + "epoch": 0.8148544608670591, + "grad_norm": 3.133759021759033, + "learning_rate": 8.488728436908483e-06, + "loss": 1.9687, + "step": 2474 + }, + { + "epoch": 0.8151838280703199, + "grad_norm": 3.0628323554992676, + "learning_rate": 8.45942772297652e-06, + "loss": 1.7354, + "step": 2475 + }, + { + "epoch": 0.8155131952735807, + "grad_norm": 2.066558599472046, + "learning_rate": 8.430172991543944e-06, + "loss": 2.533, + "step": 2476 + }, + { + "epoch": 0.8158425624768414, + "grad_norm": 2.2155539989471436, + "learning_rate": 8.4009642749937e-06, + "loss": 2.5765, + "step": 2477 + }, + { + "epoch": 0.8161719296801021, + "grad_norm": 2.202932834625244, + "learning_rate": 8.371801605657781e-06, + "loss": 2.2965, + "step": 2478 + }, + { + "epoch": 0.8165012968833628, + "grad_norm": 2.7432045936584473, + "learning_rate": 8.34268501581722e-06, + "loss": 2.5274, + "step": 2479 + }, + { + "epoch": 0.8168306640866235, + "grad_norm": 2.2428295612335205, + "learning_rate": 8.313614537702042e-06, + "loss": 2.0618, + "step": 2480 + }, + { + "epoch": 0.8171600312898843, + "grad_norm": 2.3830106258392334, + "learning_rate": 8.284590203491232e-06, + "loss": 2.2108, + "step": 2481 + }, + { + "epoch": 0.8174893984931451, + "grad_norm": 2.368865966796875, + "learning_rate": 8.255612045312667e-06, + "loss": 2.1992, + "step": 2482 + }, + { + "epoch": 0.8178187656964058, + "grad_norm": 2.4614439010620117, + "learning_rate": 8.226680095243155e-06, + "loss": 2.1652, + "step": 2483 + }, + { + "epoch": 0.8181481328996665, + "grad_norm": 2.3410730361938477, + "learning_rate": 8.197794385308333e-06, + "loss": 1.9139, + "step": 2484 + }, + { + "epoch": 0.8184775001029273, + "grad_norm": 2.664398431777954, + "learning_rate": 8.168954947482654e-06, + "loss": 2.5085, + "step": 2485 + }, + { + "epoch": 0.818806867306188, + "grad_norm": 2.8934290409088135, + "learning_rate": 8.140161813689352e-06, + "loss": 2.2249, + "step": 2486 + }, + { + "epoch": 0.8191362345094487, + "grad_norm": 2.2794628143310547, + "learning_rate": 8.111415015800406e-06, + "loss": 1.8115, + "step": 2487 + }, + { + "epoch": 0.8194656017127094, + "grad_norm": 2.8373513221740723, + "learning_rate": 8.08271458563652e-06, + "loss": 2.2524, + "step": 2488 + }, + { + "epoch": 0.8197949689159701, + "grad_norm": 3.0686585903167725, + "learning_rate": 8.054060554967024e-06, + "loss": 2.0289, + "step": 2489 + }, + { + "epoch": 0.820124336119231, + "grad_norm": 2.923983097076416, + "learning_rate": 8.025452955509943e-06, + "loss": 2.1504, + "step": 2490 + }, + { + "epoch": 0.8204537033224917, + "grad_norm": 2.634913682937622, + "learning_rate": 7.996891818931879e-06, + "loss": 2.0277, + "step": 2491 + }, + { + "epoch": 0.8207830705257524, + "grad_norm": 2.4638595581054688, + "learning_rate": 7.968377176848003e-06, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.8211124377290131, + "grad_norm": 2.867859363555908, + "learning_rate": 7.939909060822025e-06, + "loss": 2.0976, + "step": 2493 + }, + { + "epoch": 0.8214418049322739, + "grad_norm": 2.6945831775665283, + "learning_rate": 7.911487502366166e-06, + "loss": 2.3166, + "step": 2494 + }, + { + "epoch": 0.8217711721355346, + "grad_norm": 2.430236577987671, + "learning_rate": 7.883112532941072e-06, + "loss": 2.1648, + "step": 2495 + }, + { + "epoch": 0.8221005393387953, + "grad_norm": 2.43375563621521, + "learning_rate": 7.85478418395586e-06, + "loss": 2.2089, + "step": 2496 + }, + { + "epoch": 0.822429906542056, + "grad_norm": 2.8123996257781982, + "learning_rate": 7.826502486768018e-06, + "loss": 2.2756, + "step": 2497 + }, + { + "epoch": 0.8227592737453168, + "grad_norm": 2.7135322093963623, + "learning_rate": 7.798267472683407e-06, + "loss": 2.0788, + "step": 2498 + }, + { + "epoch": 0.8230886409485776, + "grad_norm": 2.929687261581421, + "learning_rate": 7.770079172956201e-06, + "loss": 1.9092, + "step": 2499 + }, + { + "epoch": 0.8234180081518383, + "grad_norm": 3.2018990516662598, + "learning_rate": 7.741937618788875e-06, + "loss": 1.9998, + "step": 2500 + }, + { + "epoch": 0.823747375355099, + "grad_norm": 2.0544869899749756, + "learning_rate": 7.713842841332164e-06, + "loss": 2.3543, + "step": 2501 + }, + { + "epoch": 0.8240767425583597, + "grad_norm": 2.4761202335357666, + "learning_rate": 7.685794871684998e-06, + "loss": 2.6125, + "step": 2502 + }, + { + "epoch": 0.8244061097616204, + "grad_norm": 2.063749074935913, + "learning_rate": 7.657793740894504e-06, + "loss": 2.2784, + "step": 2503 + }, + { + "epoch": 0.8247354769648813, + "grad_norm": 2.2412142753601074, + "learning_rate": 7.629839479955998e-06, + "loss": 2.3776, + "step": 2504 + }, + { + "epoch": 0.825064844168142, + "grad_norm": 2.3341736793518066, + "learning_rate": 7.6019321198128655e-06, + "loss": 2.4814, + "step": 2505 + }, + { + "epoch": 0.8253942113714027, + "grad_norm": 2.342841625213623, + "learning_rate": 7.574071691356621e-06, + "loss": 2.3653, + "step": 2506 + }, + { + "epoch": 0.8257235785746634, + "grad_norm": 2.213564157485962, + "learning_rate": 7.546258225426778e-06, + "loss": 2.2279, + "step": 2507 + }, + { + "epoch": 0.8260529457779242, + "grad_norm": 2.4937024116516113, + "learning_rate": 7.518491752810897e-06, + "loss": 2.4755, + "step": 2508 + }, + { + "epoch": 0.8263823129811849, + "grad_norm": 2.3210904598236084, + "learning_rate": 7.49077230424452e-06, + "loss": 1.9111, + "step": 2509 + }, + { + "epoch": 0.8267116801844456, + "grad_norm": 2.904571771621704, + "learning_rate": 7.463099910411137e-06, + "loss": 2.7039, + "step": 2510 + }, + { + "epoch": 0.8270410473877063, + "grad_norm": 2.4512391090393066, + "learning_rate": 7.435474601942133e-06, + "loss": 2.1503, + "step": 2511 + }, + { + "epoch": 0.827370414590967, + "grad_norm": 2.329566240310669, + "learning_rate": 7.407896409416809e-06, + "loss": 2.0849, + "step": 2512 + }, + { + "epoch": 0.8276997817942279, + "grad_norm": 2.870210886001587, + "learning_rate": 7.380365363362263e-06, + "loss": 1.9724, + "step": 2513 + }, + { + "epoch": 0.8280291489974886, + "grad_norm": 2.983409881591797, + "learning_rate": 7.3528814942534445e-06, + "loss": 2.4121, + "step": 2514 + }, + { + "epoch": 0.8283585162007493, + "grad_norm": 2.7085869312286377, + "learning_rate": 7.3254448325130675e-06, + "loss": 1.9919, + "step": 2515 + }, + { + "epoch": 0.82868788340401, + "grad_norm": 2.715024948120117, + "learning_rate": 7.298055408511595e-06, + "loss": 2.0174, + "step": 2516 + }, + { + "epoch": 0.8290172506072708, + "grad_norm": 2.552612066268921, + "learning_rate": 7.270713252567191e-06, + "loss": 2.3856, + "step": 2517 + }, + { + "epoch": 0.8293466178105315, + "grad_norm": 2.6823573112487793, + "learning_rate": 7.243418394945711e-06, + "loss": 1.9564, + "step": 2518 + }, + { + "epoch": 0.8296759850137922, + "grad_norm": 2.3931643962860107, + "learning_rate": 7.216170865860655e-06, + "loss": 1.6479, + "step": 2519 + }, + { + "epoch": 0.830005352217053, + "grad_norm": 2.824610710144043, + "learning_rate": 7.188970695473107e-06, + "loss": 2.1261, + "step": 2520 + }, + { + "epoch": 0.8303347194203137, + "grad_norm": 2.481971502304077, + "learning_rate": 7.161817913891755e-06, + "loss": 1.9109, + "step": 2521 + }, + { + "epoch": 0.8306640866235745, + "grad_norm": 2.6014606952667236, + "learning_rate": 7.134712551172823e-06, + "loss": 2.1093, + "step": 2522 + }, + { + "epoch": 0.8309934538268352, + "grad_norm": 3.1851959228515625, + "learning_rate": 7.107654637320049e-06, + "loss": 2.0577, + "step": 2523 + }, + { + "epoch": 0.8313228210300959, + "grad_norm": 3.0487592220306396, + "learning_rate": 7.080644202284648e-06, + "loss": 1.911, + "step": 2524 + }, + { + "epoch": 0.8316521882333566, + "grad_norm": 3.059511423110962, + "learning_rate": 7.0536812759652684e-06, + "loss": 2.0921, + "step": 2525 + }, + { + "epoch": 0.8319815554366174, + "grad_norm": 2.1465368270874023, + "learning_rate": 7.0267658882079976e-06, + "loss": 2.5033, + "step": 2526 + }, + { + "epoch": 0.8323109226398782, + "grad_norm": 2.2625226974487305, + "learning_rate": 6.999898068806254e-06, + "loss": 2.1521, + "step": 2527 + }, + { + "epoch": 0.8326402898431389, + "grad_norm": 2.0414772033691406, + "learning_rate": 6.973077847500842e-06, + "loss": 2.1878, + "step": 2528 + }, + { + "epoch": 0.8329696570463996, + "grad_norm": 2.8698439598083496, + "learning_rate": 6.946305253979862e-06, + "loss": 2.1674, + "step": 2529 + }, + { + "epoch": 0.8332990242496603, + "grad_norm": 2.2601876258850098, + "learning_rate": 6.919580317878705e-06, + "loss": 2.2299, + "step": 2530 + }, + { + "epoch": 0.8336283914529211, + "grad_norm": 2.2976324558258057, + "learning_rate": 6.892903068779993e-06, + "loss": 2.0683, + "step": 2531 + }, + { + "epoch": 0.8339577586561818, + "grad_norm": 2.5393729209899902, + "learning_rate": 6.866273536213586e-06, + "loss": 2.0545, + "step": 2532 + }, + { + "epoch": 0.8342871258594425, + "grad_norm": 2.5304923057556152, + "learning_rate": 6.83969174965648e-06, + "loss": 2.1273, + "step": 2533 + }, + { + "epoch": 0.8346164930627032, + "grad_norm": 2.707395076751709, + "learning_rate": 6.813157738532855e-06, + "loss": 2.2007, + "step": 2534 + }, + { + "epoch": 0.8349458602659641, + "grad_norm": 2.2538604736328125, + "learning_rate": 6.7866715322140175e-06, + "loss": 2.1026, + "step": 2535 + }, + { + "epoch": 0.8352752274692248, + "grad_norm": 2.6380133628845215, + "learning_rate": 6.760233160018326e-06, + "loss": 2.3277, + "step": 2536 + }, + { + "epoch": 0.8356045946724855, + "grad_norm": 2.7567970752716064, + "learning_rate": 6.73384265121122e-06, + "loss": 2.1349, + "step": 2537 + }, + { + "epoch": 0.8359339618757462, + "grad_norm": 2.683866024017334, + "learning_rate": 6.707500035005115e-06, + "loss": 2.1828, + "step": 2538 + }, + { + "epoch": 0.8362633290790069, + "grad_norm": 2.5327346324920654, + "learning_rate": 6.68120534055946e-06, + "loss": 2.1023, + "step": 2539 + }, + { + "epoch": 0.8365926962822677, + "grad_norm": 2.509141206741333, + "learning_rate": 6.654958596980626e-06, + "loss": 2.2115, + "step": 2540 + }, + { + "epoch": 0.8369220634855284, + "grad_norm": 2.387683629989624, + "learning_rate": 6.628759833321918e-06, + "loss": 2.0021, + "step": 2541 + }, + { + "epoch": 0.8372514306887892, + "grad_norm": 2.726041078567505, + "learning_rate": 6.602609078583538e-06, + "loss": 2.3552, + "step": 2542 + }, + { + "epoch": 0.8375807978920499, + "grad_norm": 2.4618892669677734, + "learning_rate": 6.576506361712526e-06, + "loss": 1.9296, + "step": 2543 + }, + { + "epoch": 0.8379101650953107, + "grad_norm": 2.628038167953491, + "learning_rate": 6.550451711602778e-06, + "loss": 1.9976, + "step": 2544 + }, + { + "epoch": 0.8382395322985714, + "grad_norm": 2.565180540084839, + "learning_rate": 6.524445157094938e-06, + "loss": 2.2082, + "step": 2545 + }, + { + "epoch": 0.8385688995018321, + "grad_norm": 2.320673942565918, + "learning_rate": 6.4984867269764545e-06, + "loss": 1.7515, + "step": 2546 + }, + { + "epoch": 0.8388982667050928, + "grad_norm": 2.666262626647949, + "learning_rate": 6.472576449981477e-06, + "loss": 1.8173, + "step": 2547 + }, + { + "epoch": 0.8392276339083535, + "grad_norm": 2.7199695110321045, + "learning_rate": 6.446714354790873e-06, + "loss": 1.7931, + "step": 2548 + }, + { + "epoch": 0.8395570011116144, + "grad_norm": 2.7355263233184814, + "learning_rate": 6.420900470032165e-06, + "loss": 1.698, + "step": 2549 + }, + { + "epoch": 0.8398863683148751, + "grad_norm": 3.240067481994629, + "learning_rate": 6.395134824279525e-06, + "loss": 1.9727, + "step": 2550 + }, + { + "epoch": 0.8402157355181358, + "grad_norm": 3.454439640045166, + "learning_rate": 6.369417446053694e-06, + "loss": 2.2517, + "step": 2551 + }, + { + "epoch": 0.8405451027213965, + "grad_norm": 2.324172019958496, + "learning_rate": 6.343748363822011e-06, + "loss": 2.3305, + "step": 2552 + }, + { + "epoch": 0.8408744699246572, + "grad_norm": 2.483888626098633, + "learning_rate": 6.3181276059983586e-06, + "loss": 2.2321, + "step": 2553 + }, + { + "epoch": 0.841203837127918, + "grad_norm": 2.10294508934021, + "learning_rate": 6.292555200943107e-06, + "loss": 2.2044, + "step": 2554 + }, + { + "epoch": 0.8415332043311787, + "grad_norm": 2.234780788421631, + "learning_rate": 6.267031176963123e-06, + "loss": 2.0835, + "step": 2555 + }, + { + "epoch": 0.8418625715344394, + "grad_norm": 2.515031576156616, + "learning_rate": 6.241555562311712e-06, + "loss": 2.3331, + "step": 2556 + }, + { + "epoch": 0.8421919387377002, + "grad_norm": 2.2735471725463867, + "learning_rate": 6.2161283851885966e-06, + "loss": 2.2742, + "step": 2557 + }, + { + "epoch": 0.842521305940961, + "grad_norm": 3.118337869644165, + "learning_rate": 6.1907496737398595e-06, + "loss": 2.5658, + "step": 2558 + }, + { + "epoch": 0.8428506731442217, + "grad_norm": 2.6256306171417236, + "learning_rate": 6.165419456057964e-06, + "loss": 2.1742, + "step": 2559 + }, + { + "epoch": 0.8431800403474824, + "grad_norm": 2.4760029315948486, + "learning_rate": 6.140137760181686e-06, + "loss": 2.1152, + "step": 2560 + }, + { + "epoch": 0.8435094075507431, + "grad_norm": 2.9300849437713623, + "learning_rate": 6.11490461409609e-06, + "loss": 2.5476, + "step": 2561 + }, + { + "epoch": 0.8438387747540038, + "grad_norm": 2.8328449726104736, + "learning_rate": 6.089720045732495e-06, + "loss": 2.1809, + "step": 2562 + }, + { + "epoch": 0.8441681419572646, + "grad_norm": 2.6135897636413574, + "learning_rate": 6.06458408296845e-06, + "loss": 2.5915, + "step": 2563 + }, + { + "epoch": 0.8444975091605254, + "grad_norm": 2.574871301651001, + "learning_rate": 6.039496753627711e-06, + "loss": 1.9764, + "step": 2564 + }, + { + "epoch": 0.8448268763637861, + "grad_norm": 2.5949325561523438, + "learning_rate": 6.014458085480179e-06, + "loss": 2.1941, + "step": 2565 + }, + { + "epoch": 0.8451562435670468, + "grad_norm": 2.195402145385742, + "learning_rate": 5.989468106241919e-06, + "loss": 2.0671, + "step": 2566 + }, + { + "epoch": 0.8454856107703076, + "grad_norm": 2.7210750579833984, + "learning_rate": 5.964526843575069e-06, + "loss": 1.946, + "step": 2567 + }, + { + "epoch": 0.8458149779735683, + "grad_norm": 2.7602195739746094, + "learning_rate": 5.9396343250878676e-06, + "loss": 1.8686, + "step": 2568 + }, + { + "epoch": 0.846144345176829, + "grad_norm": 2.3747804164886475, + "learning_rate": 5.914790578334595e-06, + "loss": 2.1268, + "step": 2569 + }, + { + "epoch": 0.8464737123800897, + "grad_norm": 2.572371006011963, + "learning_rate": 5.889995630815515e-06, + "loss": 1.9882, + "step": 2570 + }, + { + "epoch": 0.8468030795833504, + "grad_norm": 2.7990000247955322, + "learning_rate": 5.8652495099769e-06, + "loss": 2.1318, + "step": 2571 + }, + { + "epoch": 0.8471324467866113, + "grad_norm": 2.5445713996887207, + "learning_rate": 5.840552243210978e-06, + "loss": 1.7079, + "step": 2572 + }, + { + "epoch": 0.847461813989872, + "grad_norm": 2.591435432434082, + "learning_rate": 5.8159038578558866e-06, + "loss": 1.7843, + "step": 2573 + }, + { + "epoch": 0.8477911811931327, + "grad_norm": 2.6687839031219482, + "learning_rate": 5.791304381195662e-06, + "loss": 1.8852, + "step": 2574 + }, + { + "epoch": 0.8481205483963934, + "grad_norm": 3.511777877807617, + "learning_rate": 5.766753840460204e-06, + "loss": 1.7108, + "step": 2575 + }, + { + "epoch": 0.8484499155996542, + "grad_norm": 1.7083003520965576, + "learning_rate": 5.742252262825221e-06, + "loss": 2.2711, + "step": 2576 + }, + { + "epoch": 0.8487792828029149, + "grad_norm": 2.8398375511169434, + "learning_rate": 5.7177996754122496e-06, + "loss": 2.6802, + "step": 2577 + }, + { + "epoch": 0.8491086500061756, + "grad_norm": 2.1514852046966553, + "learning_rate": 5.693396105288595e-06, + "loss": 2.4099, + "step": 2578 + }, + { + "epoch": 0.8494380172094363, + "grad_norm": 2.346045970916748, + "learning_rate": 5.669041579467293e-06, + "loss": 2.4432, + "step": 2579 + }, + { + "epoch": 0.8497673844126971, + "grad_norm": 2.4979259967803955, + "learning_rate": 5.644736124907097e-06, + "loss": 2.3092, + "step": 2580 + }, + { + "epoch": 0.8500967516159579, + "grad_norm": 2.5898611545562744, + "learning_rate": 5.620479768512432e-06, + "loss": 2.5798, + "step": 2581 + }, + { + "epoch": 0.8504261188192186, + "grad_norm": 2.240121364593506, + "learning_rate": 5.596272537133402e-06, + "loss": 2.3491, + "step": 2582 + }, + { + "epoch": 0.8507554860224793, + "grad_norm": 2.3242783546447754, + "learning_rate": 5.572114457565692e-06, + "loss": 2.1093, + "step": 2583 + }, + { + "epoch": 0.85108485322574, + "grad_norm": 2.1993753910064697, + "learning_rate": 5.5480055565506116e-06, + "loss": 2.0687, + "step": 2584 + }, + { + "epoch": 0.8514142204290008, + "grad_norm": 2.7326152324676514, + "learning_rate": 5.52394586077502e-06, + "loss": 2.3334, + "step": 2585 + }, + { + "epoch": 0.8517435876322615, + "grad_norm": 2.5875439643859863, + "learning_rate": 5.499935396871314e-06, + "loss": 2.3656, + "step": 2586 + }, + { + "epoch": 0.8520729548355223, + "grad_norm": 2.634803533554077, + "learning_rate": 5.475974191417399e-06, + "loss": 2.3282, + "step": 2587 + }, + { + "epoch": 0.852402322038783, + "grad_norm": 2.4338207244873047, + "learning_rate": 5.452062270936648e-06, + "loss": 2.0295, + "step": 2588 + }, + { + "epoch": 0.8527316892420437, + "grad_norm": 2.489284038543701, + "learning_rate": 5.4281996618978646e-06, + "loss": 1.9639, + "step": 2589 + }, + { + "epoch": 0.8530610564453045, + "grad_norm": 2.8681483268737793, + "learning_rate": 5.404386390715293e-06, + "loss": 2.1408, + "step": 2590 + }, + { + "epoch": 0.8533904236485652, + "grad_norm": 2.817866086959839, + "learning_rate": 5.3806224837485545e-06, + "loss": 2.255, + "step": 2591 + }, + { + "epoch": 0.8537197908518259, + "grad_norm": 2.494019031524658, + "learning_rate": 5.356907967302627e-06, + "loss": 1.9273, + "step": 2592 + }, + { + "epoch": 0.8540491580550866, + "grad_norm": 2.7966413497924805, + "learning_rate": 5.333242867627814e-06, + "loss": 2.5284, + "step": 2593 + }, + { + "epoch": 0.8543785252583475, + "grad_norm": 2.6782631874084473, + "learning_rate": 5.309627210919721e-06, + "loss": 2.0938, + "step": 2594 + }, + { + "epoch": 0.8547078924616082, + "grad_norm": 2.4979779720306396, + "learning_rate": 5.286061023319228e-06, + "loss": 2.2774, + "step": 2595 + }, + { + "epoch": 0.8550372596648689, + "grad_norm": 2.979830026626587, + "learning_rate": 5.262544330912445e-06, + "loss": 2.1949, + "step": 2596 + }, + { + "epoch": 0.8553666268681296, + "grad_norm": 2.6444127559661865, + "learning_rate": 5.23907715973071e-06, + "loss": 1.8533, + "step": 2597 + }, + { + "epoch": 0.8556959940713903, + "grad_norm": 2.309861183166504, + "learning_rate": 5.215659535750522e-06, + "loss": 1.8244, + "step": 2598 + }, + { + "epoch": 0.8560253612746511, + "grad_norm": 2.973512649536133, + "learning_rate": 5.192291484893563e-06, + "loss": 1.8739, + "step": 2599 + }, + { + "epoch": 0.8563547284779118, + "grad_norm": 3.404737710952759, + "learning_rate": 5.16897303302662e-06, + "loss": 1.8598, + "step": 2600 + }, + { + "epoch": 0.8566840956811725, + "grad_norm": 2.0394158363342285, + "learning_rate": 5.145704205961577e-06, + "loss": 2.415, + "step": 2601 + }, + { + "epoch": 0.8570134628844333, + "grad_norm": 1.9012643098831177, + "learning_rate": 5.122485029455398e-06, + "loss": 2.1571, + "step": 2602 + }, + { + "epoch": 0.857342830087694, + "grad_norm": 2.2303390502929688, + "learning_rate": 5.099315529210081e-06, + "loss": 2.3392, + "step": 2603 + }, + { + "epoch": 0.8576721972909548, + "grad_norm": 2.5212888717651367, + "learning_rate": 5.076195730872641e-06, + "loss": 2.338, + "step": 2604 + }, + { + "epoch": 0.8580015644942155, + "grad_norm": 2.758668899536133, + "learning_rate": 5.053125660035068e-06, + "loss": 2.2201, + "step": 2605 + }, + { + "epoch": 0.8583309316974762, + "grad_norm": 2.390583038330078, + "learning_rate": 5.030105342234314e-06, + "loss": 2.4438, + "step": 2606 + }, + { + "epoch": 0.8586602989007369, + "grad_norm": 2.5289313793182373, + "learning_rate": 5.007134802952274e-06, + "loss": 2.1411, + "step": 2607 + }, + { + "epoch": 0.8589896661039977, + "grad_norm": 2.3819549083709717, + "learning_rate": 4.984214067615689e-06, + "loss": 2.1277, + "step": 2608 + }, + { + "epoch": 0.8593190333072585, + "grad_norm": 2.2808547019958496, + "learning_rate": 4.961343161596227e-06, + "loss": 2.2532, + "step": 2609 + }, + { + "epoch": 0.8596484005105192, + "grad_norm": 2.3810598850250244, + "learning_rate": 4.938522110210369e-06, + "loss": 2.3429, + "step": 2610 + }, + { + "epoch": 0.8599777677137799, + "grad_norm": 2.7011477947235107, + "learning_rate": 4.915750938719421e-06, + "loss": 2.31, + "step": 2611 + }, + { + "epoch": 0.8603071349170406, + "grad_norm": 2.4542653560638428, + "learning_rate": 4.893029672329474e-06, + "loss": 2.3009, + "step": 2612 + }, + { + "epoch": 0.8606365021203014, + "grad_norm": 2.194711923599243, + "learning_rate": 4.870358336191383e-06, + "loss": 2.0109, + "step": 2613 + }, + { + "epoch": 0.8609658693235621, + "grad_norm": 2.693532943725586, + "learning_rate": 4.8477369554007055e-06, + "loss": 2.3555, + "step": 2614 + }, + { + "epoch": 0.8612952365268228, + "grad_norm": 2.6600592136383057, + "learning_rate": 4.825165554997741e-06, + "loss": 2.2698, + "step": 2615 + }, + { + "epoch": 0.8616246037300835, + "grad_norm": 2.6437413692474365, + "learning_rate": 4.802644159967435e-06, + "loss": 2.0872, + "step": 2616 + }, + { + "epoch": 0.8619539709333444, + "grad_norm": 2.502390146255493, + "learning_rate": 4.780172795239401e-06, + "loss": 2.0786, + "step": 2617 + }, + { + "epoch": 0.8622833381366051, + "grad_norm": 2.4297420978546143, + "learning_rate": 4.7577514856878616e-06, + "loss": 2.0086, + "step": 2618 + }, + { + "epoch": 0.8626127053398658, + "grad_norm": 2.603891134262085, + "learning_rate": 4.735380256131627e-06, + "loss": 2.0819, + "step": 2619 + }, + { + "epoch": 0.8629420725431265, + "grad_norm": 2.4713308811187744, + "learning_rate": 4.713059131334102e-06, + "loss": 2.1809, + "step": 2620 + }, + { + "epoch": 0.8632714397463872, + "grad_norm": 2.54862642288208, + "learning_rate": 4.6907881360031715e-06, + "loss": 1.937, + "step": 2621 + }, + { + "epoch": 0.863600806949648, + "grad_norm": 2.82242488861084, + "learning_rate": 4.668567294791282e-06, + "loss": 1.9862, + "step": 2622 + }, + { + "epoch": 0.8639301741529087, + "grad_norm": 3.0480566024780273, + "learning_rate": 4.646396632295347e-06, + "loss": 2.105, + "step": 2623 + }, + { + "epoch": 0.8642595413561694, + "grad_norm": 2.951061248779297, + "learning_rate": 4.624276173056735e-06, + "loss": 1.9754, + "step": 2624 + }, + { + "epoch": 0.8645889085594302, + "grad_norm": 2.960705041885376, + "learning_rate": 4.602205941561238e-06, + "loss": 1.648, + "step": 2625 + }, + { + "epoch": 0.864918275762691, + "grad_norm": 1.7927899360656738, + "learning_rate": 4.5801859622390584e-06, + "loss": 2.4304, + "step": 2626 + }, + { + "epoch": 0.8652476429659517, + "grad_norm": 2.601215124130249, + "learning_rate": 4.558216259464765e-06, + "loss": 2.4331, + "step": 2627 + }, + { + "epoch": 0.8655770101692124, + "grad_norm": 2.0217134952545166, + "learning_rate": 4.536296857557282e-06, + "loss": 2.1572, + "step": 2628 + }, + { + "epoch": 0.8659063773724731, + "grad_norm": 2.2973663806915283, + "learning_rate": 4.514427780779845e-06, + "loss": 2.0926, + "step": 2629 + }, + { + "epoch": 0.8662357445757338, + "grad_norm": 2.2276058197021484, + "learning_rate": 4.492609053339991e-06, + "loss": 2.1517, + "step": 2630 + }, + { + "epoch": 0.8665651117789946, + "grad_norm": 2.29813814163208, + "learning_rate": 4.470840699389528e-06, + "loss": 2.0991, + "step": 2631 + }, + { + "epoch": 0.8668944789822554, + "grad_norm": 2.494429588317871, + "learning_rate": 4.4491227430244804e-06, + "loss": 2.1413, + "step": 2632 + }, + { + "epoch": 0.8672238461855161, + "grad_norm": 2.598921775817871, + "learning_rate": 4.4274552082851084e-06, + "loss": 2.2659, + "step": 2633 + }, + { + "epoch": 0.8675532133887768, + "grad_norm": 2.702181339263916, + "learning_rate": 4.4058381191558525e-06, + "loss": 2.4259, + "step": 2634 + }, + { + "epoch": 0.8678825805920376, + "grad_norm": 2.753028154373169, + "learning_rate": 4.384271499565323e-06, + "loss": 2.4114, + "step": 2635 + }, + { + "epoch": 0.8682119477952983, + "grad_norm": 2.4215359687805176, + "learning_rate": 4.362755373386246e-06, + "loss": 2.3793, + "step": 2636 + }, + { + "epoch": 0.868541314998559, + "grad_norm": 2.8576810359954834, + "learning_rate": 4.341289764435463e-06, + "loss": 2.3855, + "step": 2637 + }, + { + "epoch": 0.8688706822018197, + "grad_norm": 2.6904537677764893, + "learning_rate": 4.319874696473908e-06, + "loss": 2.1941, + "step": 2638 + }, + { + "epoch": 0.8692000494050804, + "grad_norm": 2.96549916267395, + "learning_rate": 4.298510193206545e-06, + "loss": 2.1925, + "step": 2639 + }, + { + "epoch": 0.8695294166083413, + "grad_norm": 2.366469144821167, + "learning_rate": 4.277196278282386e-06, + "loss": 2.0381, + "step": 2640 + }, + { + "epoch": 0.869858783811602, + "grad_norm": 2.5164620876312256, + "learning_rate": 4.255932975294441e-06, + "loss": 2.0369, + "step": 2641 + }, + { + "epoch": 0.8701881510148627, + "grad_norm": 3.4024715423583984, + "learning_rate": 4.234720307779693e-06, + "loss": 2.0745, + "step": 2642 + }, + { + "epoch": 0.8705175182181234, + "grad_norm": 2.9264469146728516, + "learning_rate": 4.213558299219078e-06, + "loss": 2.1802, + "step": 2643 + }, + { + "epoch": 0.8708468854213842, + "grad_norm": 2.655367612838745, + "learning_rate": 4.192446973037451e-06, + "loss": 2.1807, + "step": 2644 + }, + { + "epoch": 0.8711762526246449, + "grad_norm": 2.6134543418884277, + "learning_rate": 4.171386352603585e-06, + "loss": 2.1425, + "step": 2645 + }, + { + "epoch": 0.8715056198279056, + "grad_norm": 2.909407615661621, + "learning_rate": 4.150376461230087e-06, + "loss": 2.1832, + "step": 2646 + }, + { + "epoch": 0.8718349870311664, + "grad_norm": 2.7573821544647217, + "learning_rate": 4.129417322173446e-06, + "loss": 1.8164, + "step": 2647 + }, + { + "epoch": 0.8721643542344271, + "grad_norm": 3.42594051361084, + "learning_rate": 4.108508958633955e-06, + "loss": 2.0909, + "step": 2648 + }, + { + "epoch": 0.8724937214376879, + "grad_norm": 2.7599639892578125, + "learning_rate": 4.087651393755703e-06, + "loss": 1.6129, + "step": 2649 + }, + { + "epoch": 0.8728230886409486, + "grad_norm": 3.7333033084869385, + "learning_rate": 4.066844650626556e-06, + "loss": 2.2567, + "step": 2650 + }, + { + "epoch": 0.8731524558442093, + "grad_norm": 2.1264333724975586, + "learning_rate": 4.046088752278127e-06, + "loss": 2.4013, + "step": 2651 + }, + { + "epoch": 0.87348182304747, + "grad_norm": 2.2733492851257324, + "learning_rate": 4.025383721685722e-06, + "loss": 2.4881, + "step": 2652 + }, + { + "epoch": 0.8738111902507307, + "grad_norm": 2.3039214611053467, + "learning_rate": 4.004729581768363e-06, + "loss": 2.0644, + "step": 2653 + }, + { + "epoch": 0.8741405574539916, + "grad_norm": 2.163909912109375, + "learning_rate": 3.9841263553887395e-06, + "loss": 2.3237, + "step": 2654 + }, + { + "epoch": 0.8744699246572523, + "grad_norm": 2.235494613647461, + "learning_rate": 3.96357406535317e-06, + "loss": 2.2701, + "step": 2655 + }, + { + "epoch": 0.874799291860513, + "grad_norm": 2.8736233711242676, + "learning_rate": 3.943072734411607e-06, + "loss": 2.4804, + "step": 2656 + }, + { + "epoch": 0.8751286590637737, + "grad_norm": 2.4093754291534424, + "learning_rate": 3.922622385257579e-06, + "loss": 2.1099, + "step": 2657 + }, + { + "epoch": 0.8754580262670345, + "grad_norm": 3.4634957313537598, + "learning_rate": 3.902223040528185e-06, + "loss": 2.7003, + "step": 2658 + }, + { + "epoch": 0.8757873934702952, + "grad_norm": 2.40132474899292, + "learning_rate": 3.8818747228040795e-06, + "loss": 2.3205, + "step": 2659 + }, + { + "epoch": 0.8761167606735559, + "grad_norm": 2.1787571907043457, + "learning_rate": 3.861577454609417e-06, + "loss": 1.9683, + "step": 2660 + }, + { + "epoch": 0.8764461278768166, + "grad_norm": 2.4341671466827393, + "learning_rate": 3.841331258411846e-06, + "loss": 1.9743, + "step": 2661 + }, + { + "epoch": 0.8767754950800773, + "grad_norm": 2.984081745147705, + "learning_rate": 3.8211361566224905e-06, + "loss": 2.2883, + "step": 2662 + }, + { + "epoch": 0.8771048622833382, + "grad_norm": 2.602597236633301, + "learning_rate": 3.8009921715959184e-06, + "loss": 2.2132, + "step": 2663 + }, + { + "epoch": 0.8774342294865989, + "grad_norm": 2.761525869369507, + "learning_rate": 3.7808993256300884e-06, + "loss": 1.8199, + "step": 2664 + }, + { + "epoch": 0.8777635966898596, + "grad_norm": 2.7709145545959473, + "learning_rate": 3.7608576409663866e-06, + "loss": 2.265, + "step": 2665 + }, + { + "epoch": 0.8780929638931203, + "grad_norm": 2.5388407707214355, + "learning_rate": 3.7408671397895444e-06, + "loss": 1.9894, + "step": 2666 + }, + { + "epoch": 0.8784223310963811, + "grad_norm": 2.6022965908050537, + "learning_rate": 3.7209278442276507e-06, + "loss": 2.1244, + "step": 2667 + }, + { + "epoch": 0.8787516982996418, + "grad_norm": 2.502260208129883, + "learning_rate": 3.7010397763521e-06, + "loss": 2.1518, + "step": 2668 + }, + { + "epoch": 0.8790810655029025, + "grad_norm": 2.6602087020874023, + "learning_rate": 3.681202958177604e-06, + "loss": 2.2532, + "step": 2669 + }, + { + "epoch": 0.8794104327061633, + "grad_norm": 2.9550154209136963, + "learning_rate": 3.661417411662105e-06, + "loss": 2.0059, + "step": 2670 + }, + { + "epoch": 0.879739799909424, + "grad_norm": 2.7335567474365234, + "learning_rate": 3.641683158706827e-06, + "loss": 2.4175, + "step": 2671 + }, + { + "epoch": 0.8800691671126848, + "grad_norm": 2.868046522140503, + "learning_rate": 3.6220002211562023e-06, + "loss": 1.9707, + "step": 2672 + }, + { + "epoch": 0.8803985343159455, + "grad_norm": 3.0974791049957275, + "learning_rate": 3.602368620797869e-06, + "loss": 1.9962, + "step": 2673 + }, + { + "epoch": 0.8807279015192062, + "grad_norm": 2.911442518234253, + "learning_rate": 3.5827883793626225e-06, + "loss": 1.8045, + "step": 2674 + }, + { + "epoch": 0.8810572687224669, + "grad_norm": 3.6394643783569336, + "learning_rate": 3.5632595185244143e-06, + "loss": 1.8825, + "step": 2675 + }, + { + "epoch": 0.8813866359257277, + "grad_norm": 1.845200538635254, + "learning_rate": 3.543782059900347e-06, + "loss": 2.3288, + "step": 2676 + }, + { + "epoch": 0.8817160031289885, + "grad_norm": 2.175915002822876, + "learning_rate": 3.524356025050568e-06, + "loss": 2.1404, + "step": 2677 + }, + { + "epoch": 0.8820453703322492, + "grad_norm": 2.516981363296509, + "learning_rate": 3.5049814354783496e-06, + "loss": 2.4723, + "step": 2678 + }, + { + "epoch": 0.8823747375355099, + "grad_norm": 2.449856758117676, + "learning_rate": 3.4856583126300078e-06, + "loss": 2.1453, + "step": 2679 + }, + { + "epoch": 0.8827041047387706, + "grad_norm": 2.3987417221069336, + "learning_rate": 3.4663866778948728e-06, + "loss": 2.2724, + "step": 2680 + }, + { + "epoch": 0.8830334719420314, + "grad_norm": 2.567919969558716, + "learning_rate": 3.4471665526053075e-06, + "loss": 2.4146, + "step": 2681 + }, + { + "epoch": 0.8833628391452921, + "grad_norm": 2.4699082374572754, + "learning_rate": 3.4279979580366274e-06, + "loss": 2.422, + "step": 2682 + }, + { + "epoch": 0.8836922063485528, + "grad_norm": 2.8117380142211914, + "learning_rate": 3.408880915407142e-06, + "loss": 2.5623, + "step": 2683 + }, + { + "epoch": 0.8840215735518135, + "grad_norm": 2.675783634185791, + "learning_rate": 3.3898154458780527e-06, + "loss": 2.4708, + "step": 2684 + }, + { + "epoch": 0.8843509407550744, + "grad_norm": 2.3889050483703613, + "learning_rate": 3.3708015705535055e-06, + "loss": 1.9012, + "step": 2685 + }, + { + "epoch": 0.8846803079583351, + "grad_norm": 2.9828548431396484, + "learning_rate": 3.35183931048052e-06, + "loss": 2.7445, + "step": 2686 + }, + { + "epoch": 0.8850096751615958, + "grad_norm": 2.40610933303833, + "learning_rate": 3.3329286866490064e-06, + "loss": 2.0757, + "step": 2687 + }, + { + "epoch": 0.8853390423648565, + "grad_norm": 2.6663005352020264, + "learning_rate": 3.314069719991697e-06, + "loss": 2.161, + "step": 2688 + }, + { + "epoch": 0.8856684095681172, + "grad_norm": 3.467353582382202, + "learning_rate": 3.2952624313841275e-06, + "loss": 2.5355, + "step": 2689 + }, + { + "epoch": 0.885997776771378, + "grad_norm": 2.527111053466797, + "learning_rate": 3.276506841644655e-06, + "loss": 2.1816, + "step": 2690 + }, + { + "epoch": 0.8863271439746387, + "grad_norm": 2.746145009994507, + "learning_rate": 3.257802971534402e-06, + "loss": 2.1686, + "step": 2691 + }, + { + "epoch": 0.8866565111778995, + "grad_norm": 2.5353686809539795, + "learning_rate": 3.2391508417572438e-06, + "loss": 2.2508, + "step": 2692 + }, + { + "epoch": 0.8869858783811602, + "grad_norm": 2.6209967136383057, + "learning_rate": 3.2205504729597714e-06, + "loss": 2.0796, + "step": 2693 + }, + { + "epoch": 0.887315245584421, + "grad_norm": 2.5521199703216553, + "learning_rate": 3.2020018857313096e-06, + "loss": 2.2875, + "step": 2694 + }, + { + "epoch": 0.8876446127876817, + "grad_norm": 2.6525521278381348, + "learning_rate": 3.183505100603812e-06, + "loss": 2.0905, + "step": 2695 + }, + { + "epoch": 0.8879739799909424, + "grad_norm": 3.0146303176879883, + "learning_rate": 3.1650601380519397e-06, + "loss": 2.0727, + "step": 2696 + }, + { + "epoch": 0.8883033471942031, + "grad_norm": 3.2892794609069824, + "learning_rate": 3.1466670184929714e-06, + "loss": 2.0854, + "step": 2697 + }, + { + "epoch": 0.8886327143974638, + "grad_norm": 2.7834742069244385, + "learning_rate": 3.128325762286799e-06, + "loss": 1.9612, + "step": 2698 + }, + { + "epoch": 0.8889620816007247, + "grad_norm": 2.958502769470215, + "learning_rate": 3.110036389735904e-06, + "loss": 1.9484, + "step": 2699 + }, + { + "epoch": 0.8892914488039854, + "grad_norm": 2.957582712173462, + "learning_rate": 3.091798921085343e-06, + "loss": 1.6971, + "step": 2700 + }, + { + "epoch": 0.8896208160072461, + "grad_norm": 2.2949178218841553, + "learning_rate": 3.0736133765227213e-06, + "loss": 2.2748, + "step": 2701 + }, + { + "epoch": 0.8899501832105068, + "grad_norm": 2.3045599460601807, + "learning_rate": 3.055479776178144e-06, + "loss": 2.2503, + "step": 2702 + }, + { + "epoch": 0.8902795504137675, + "grad_norm": 2.671010732650757, + "learning_rate": 3.0373981401242423e-06, + "loss": 2.5059, + "step": 2703 + }, + { + "epoch": 0.8906089176170283, + "grad_norm": 2.3719635009765625, + "learning_rate": 3.0193684883761177e-06, + "loss": 2.3507, + "step": 2704 + }, + { + "epoch": 0.890938284820289, + "grad_norm": 2.4885711669921875, + "learning_rate": 3.0013908408913328e-06, + "loss": 2.4243, + "step": 2705 + }, + { + "epoch": 0.8912676520235497, + "grad_norm": 2.3072006702423096, + "learning_rate": 2.9834652175698806e-06, + "loss": 2.1482, + "step": 2706 + }, + { + "epoch": 0.8915970192268104, + "grad_norm": 2.133578300476074, + "learning_rate": 2.965591638254178e-06, + "loss": 1.98, + "step": 2707 + }, + { + "epoch": 0.8919263864300713, + "grad_norm": 2.2153220176696777, + "learning_rate": 2.9477701227290046e-06, + "loss": 2.1145, + "step": 2708 + }, + { + "epoch": 0.892255753633332, + "grad_norm": 2.178952932357788, + "learning_rate": 2.9300006907215404e-06, + "loss": 2.1486, + "step": 2709 + }, + { + "epoch": 0.8925851208365927, + "grad_norm": 3.1489574909210205, + "learning_rate": 2.912283361901297e-06, + "loss": 2.6655, + "step": 2710 + }, + { + "epoch": 0.8929144880398534, + "grad_norm": 2.3635940551757812, + "learning_rate": 2.8946181558801135e-06, + "loss": 2.1683, + "step": 2711 + }, + { + "epoch": 0.8932438552431141, + "grad_norm": 3.0581488609313965, + "learning_rate": 2.8770050922121318e-06, + "loss": 2.2093, + "step": 2712 + }, + { + "epoch": 0.8935732224463749, + "grad_norm": 2.5770411491394043, + "learning_rate": 2.8594441903937895e-06, + "loss": 2.0562, + "step": 2713 + }, + { + "epoch": 0.8939025896496356, + "grad_norm": 2.952260732650757, + "learning_rate": 2.8419354698637657e-06, + "loss": 2.3356, + "step": 2714 + }, + { + "epoch": 0.8942319568528964, + "grad_norm": 2.414741039276123, + "learning_rate": 2.824478950002979e-06, + "loss": 2.1069, + "step": 2715 + }, + { + "epoch": 0.8945613240561571, + "grad_norm": 2.4834258556365967, + "learning_rate": 2.807074650134578e-06, + "loss": 1.668, + "step": 2716 + }, + { + "epoch": 0.8948906912594179, + "grad_norm": 3.139636516571045, + "learning_rate": 2.7897225895238965e-06, + "loss": 2.1568, + "step": 2717 + }, + { + "epoch": 0.8952200584626786, + "grad_norm": 2.8860831260681152, + "learning_rate": 2.772422787378459e-06, + "loss": 1.9833, + "step": 2718 + }, + { + "epoch": 0.8955494256659393, + "grad_norm": 2.7193524837493896, + "learning_rate": 2.755175262847931e-06, + "loss": 2.0764, + "step": 2719 + }, + { + "epoch": 0.8958787928692, + "grad_norm": 2.3650174140930176, + "learning_rate": 2.737980035024107e-06, + "loss": 1.6969, + "step": 2720 + }, + { + "epoch": 0.8962081600724607, + "grad_norm": 2.5429396629333496, + "learning_rate": 2.720837122940895e-06, + "loss": 1.9325, + "step": 2721 + }, + { + "epoch": 0.8965375272757216, + "grad_norm": 2.7163567543029785, + "learning_rate": 2.703746545574304e-06, + "loss": 2.2487, + "step": 2722 + }, + { + "epoch": 0.8968668944789823, + "grad_norm": 2.875310182571411, + "learning_rate": 2.6867083218424014e-06, + "loss": 1.6736, + "step": 2723 + }, + { + "epoch": 0.897196261682243, + "grad_norm": 3.140059232711792, + "learning_rate": 2.669722470605307e-06, + "loss": 1.938, + "step": 2724 + }, + { + "epoch": 0.8975256288855037, + "grad_norm": 3.804771661758423, + "learning_rate": 2.6527890106651576e-06, + "loss": 2.3505, + "step": 2725 + }, + { + "epoch": 0.8978549960887645, + "grad_norm": 2.1390380859375, + "learning_rate": 2.635907960766126e-06, + "loss": 2.4625, + "step": 2726 + }, + { + "epoch": 0.8981843632920252, + "grad_norm": 2.6339430809020996, + "learning_rate": 2.6190793395943268e-06, + "loss": 2.6437, + "step": 2727 + }, + { + "epoch": 0.8985137304952859, + "grad_norm": 2.0819966793060303, + "learning_rate": 2.6023031657778752e-06, + "loss": 2.1295, + "step": 2728 + }, + { + "epoch": 0.8988430976985466, + "grad_norm": 2.2953975200653076, + "learning_rate": 2.585579457886811e-06, + "loss": 2.131, + "step": 2729 + }, + { + "epoch": 0.8991724649018074, + "grad_norm": 2.132822275161743, + "learning_rate": 2.568908234433115e-06, + "loss": 2.114, + "step": 2730 + }, + { + "epoch": 0.8995018321050682, + "grad_norm": 2.3440139293670654, + "learning_rate": 2.5522895138706593e-06, + "loss": 2.3049, + "step": 2731 + }, + { + "epoch": 0.8998311993083289, + "grad_norm": 2.4961066246032715, + "learning_rate": 2.5357233145952065e-06, + "loss": 2.3662, + "step": 2732 + }, + { + "epoch": 0.9001605665115896, + "grad_norm": 2.0596587657928467, + "learning_rate": 2.5192096549443655e-06, + "loss": 1.7812, + "step": 2733 + }, + { + "epoch": 0.9004899337148503, + "grad_norm": 2.2845394611358643, + "learning_rate": 2.5027485531976035e-06, + "loss": 2.0744, + "step": 2734 + }, + { + "epoch": 0.9008193009181111, + "grad_norm": 2.4581706523895264, + "learning_rate": 2.4863400275762063e-06, + "loss": 2.3042, + "step": 2735 + }, + { + "epoch": 0.9011486681213718, + "grad_norm": 2.356445074081421, + "learning_rate": 2.4699840962432606e-06, + "loss": 2.3077, + "step": 2736 + }, + { + "epoch": 0.9014780353246326, + "grad_norm": 2.9179089069366455, + "learning_rate": 2.453680777303641e-06, + "loss": 2.4067, + "step": 2737 + }, + { + "epoch": 0.9018074025278933, + "grad_norm": 2.447138547897339, + "learning_rate": 2.437430088803966e-06, + "loss": 2.2238, + "step": 2738 + }, + { + "epoch": 0.902136769731154, + "grad_norm": 2.1415274143218994, + "learning_rate": 2.4212320487326246e-06, + "loss": 1.9145, + "step": 2739 + }, + { + "epoch": 0.9024661369344148, + "grad_norm": 2.2996044158935547, + "learning_rate": 2.405086675019691e-06, + "loss": 2.2053, + "step": 2740 + }, + { + "epoch": 0.9027955041376755, + "grad_norm": 2.8865909576416016, + "learning_rate": 2.3889939855369693e-06, + "loss": 2.4935, + "step": 2741 + }, + { + "epoch": 0.9031248713409362, + "grad_norm": 2.9362375736236572, + "learning_rate": 2.372953998097943e-06, + "loss": 2.0295, + "step": 2742 + }, + { + "epoch": 0.9034542385441969, + "grad_norm": 2.538898468017578, + "learning_rate": 2.356966730457749e-06, + "loss": 1.9073, + "step": 2743 + }, + { + "epoch": 0.9037836057474578, + "grad_norm": 2.7134170532226562, + "learning_rate": 2.34103220031317e-06, + "loss": 2.0446, + "step": 2744 + }, + { + "epoch": 0.9041129729507185, + "grad_norm": 2.733494758605957, + "learning_rate": 2.325150425302619e-06, + "loss": 2.2672, + "step": 2745 + }, + { + "epoch": 0.9044423401539792, + "grad_norm": 2.400979995727539, + "learning_rate": 2.3093214230061e-06, + "loss": 2.0618, + "step": 2746 + }, + { + "epoch": 0.9047717073572399, + "grad_norm": 2.6966586112976074, + "learning_rate": 2.293545210945203e-06, + "loss": 1.9108, + "step": 2747 + }, + { + "epoch": 0.9051010745605006, + "grad_norm": 2.9036622047424316, + "learning_rate": 2.2778218065830968e-06, + "loss": 2.2464, + "step": 2748 + }, + { + "epoch": 0.9054304417637614, + "grad_norm": 2.609510898590088, + "learning_rate": 2.262151227324488e-06, + "loss": 1.8424, + "step": 2749 + }, + { + "epoch": 0.9057598089670221, + "grad_norm": 2.7685904502868652, + "learning_rate": 2.2465334905156065e-06, + "loss": 1.7066, + "step": 2750 + }, + { + "epoch": 0.9060891761702828, + "grad_norm": 2.4640228748321533, + "learning_rate": 2.230968613444201e-06, + "loss": 2.6138, + "step": 2751 + }, + { + "epoch": 0.9064185433735436, + "grad_norm": 2.218118190765381, + "learning_rate": 2.2154566133394784e-06, + "loss": 2.2352, + "step": 2752 + }, + { + "epoch": 0.9067479105768043, + "grad_norm": 2.2658257484436035, + "learning_rate": 2.1999975073721423e-06, + "loss": 2.4639, + "step": 2753 + }, + { + "epoch": 0.9070772777800651, + "grad_norm": 2.1403121948242188, + "learning_rate": 2.184591312654344e-06, + "loss": 2.1344, + "step": 2754 + }, + { + "epoch": 0.9074066449833258, + "grad_norm": 2.4829635620117188, + "learning_rate": 2.169238046239658e-06, + "loss": 2.414, + "step": 2755 + }, + { + "epoch": 0.9077360121865865, + "grad_norm": 2.313821315765381, + "learning_rate": 2.1539377251230673e-06, + "loss": 1.9432, + "step": 2756 + }, + { + "epoch": 0.9080653793898472, + "grad_norm": 2.121927261352539, + "learning_rate": 2.13869036624097e-06, + "loss": 2.0372, + "step": 2757 + }, + { + "epoch": 0.908394746593108, + "grad_norm": 2.498476266860962, + "learning_rate": 2.1234959864711025e-06, + "loss": 2.1289, + "step": 2758 + }, + { + "epoch": 0.9087241137963687, + "grad_norm": 2.445488691329956, + "learning_rate": 2.1083546026325895e-06, + "loss": 2.3582, + "step": 2759 + }, + { + "epoch": 0.9090534809996295, + "grad_norm": 2.175457239151001, + "learning_rate": 2.093266231485874e-06, + "loss": 1.9455, + "step": 2760 + }, + { + "epoch": 0.9093828482028902, + "grad_norm": 2.539600372314453, + "learning_rate": 2.0782308897327297e-06, + "loss": 2.4663, + "step": 2761 + }, + { + "epoch": 0.9097122154061509, + "grad_norm": 2.7965781688690186, + "learning_rate": 2.063248594016226e-06, + "loss": 2.4572, + "step": 2762 + }, + { + "epoch": 0.9100415826094117, + "grad_norm": 2.535855293273926, + "learning_rate": 2.0483193609207086e-06, + "loss": 2.1825, + "step": 2763 + }, + { + "epoch": 0.9103709498126724, + "grad_norm": 2.506638526916504, + "learning_rate": 2.0334432069718023e-06, + "loss": 2.2343, + "step": 2764 + }, + { + "epoch": 0.9107003170159331, + "grad_norm": 2.5606038570404053, + "learning_rate": 2.0186201486363465e-06, + "loss": 2.016, + "step": 2765 + }, + { + "epoch": 0.9110296842191938, + "grad_norm": 2.3744375705718994, + "learning_rate": 2.003850202322444e-06, + "loss": 1.8271, + "step": 2766 + }, + { + "epoch": 0.9113590514224547, + "grad_norm": 2.7433090209960938, + "learning_rate": 1.989133384379377e-06, + "loss": 2.1174, + "step": 2767 + }, + { + "epoch": 0.9116884186257154, + "grad_norm": 2.728475570678711, + "learning_rate": 1.9744697110976373e-06, + "loss": 1.9861, + "step": 2768 + }, + { + "epoch": 0.9120177858289761, + "grad_norm": 2.9022865295410156, + "learning_rate": 1.9598591987088853e-06, + "loss": 2.2861, + "step": 2769 + }, + { + "epoch": 0.9123471530322368, + "grad_norm": 2.443715810775757, + "learning_rate": 1.9453018633859344e-06, + "loss": 1.8767, + "step": 2770 + }, + { + "epoch": 0.9126765202354975, + "grad_norm": 3.8203299045562744, + "learning_rate": 1.930797721242722e-06, + "loss": 1.98, + "step": 2771 + }, + { + "epoch": 0.9130058874387583, + "grad_norm": 2.870041608810425, + "learning_rate": 1.9163467883343223e-06, + "loss": 2.0627, + "step": 2772 + }, + { + "epoch": 0.913335254642019, + "grad_norm": 2.7346694469451904, + "learning_rate": 1.9019490806569063e-06, + "loss": 2.3428, + "step": 2773 + }, + { + "epoch": 0.9136646218452797, + "grad_norm": 3.569988489151001, + "learning_rate": 1.88760461414772e-06, + "loss": 2.3849, + "step": 2774 + }, + { + "epoch": 0.9139939890485405, + "grad_norm": 3.2586612701416016, + "learning_rate": 1.8733134046850898e-06, + "loss": 1.825, + "step": 2775 + }, + { + "epoch": 0.9143233562518013, + "grad_norm": 2.39789080619812, + "learning_rate": 1.8590754680883782e-06, + "loss": 2.234, + "step": 2776 + }, + { + "epoch": 0.914652723455062, + "grad_norm": 2.7146317958831787, + "learning_rate": 1.8448908201179892e-06, + "loss": 2.5098, + "step": 2777 + }, + { + "epoch": 0.9149820906583227, + "grad_norm": 2.7325544357299805, + "learning_rate": 1.830759476475319e-06, + "loss": 2.3236, + "step": 2778 + }, + { + "epoch": 0.9153114578615834, + "grad_norm": 2.257678747177124, + "learning_rate": 1.816681452802782e-06, + "loss": 2.1115, + "step": 2779 + }, + { + "epoch": 0.9156408250648441, + "grad_norm": 2.2593493461608887, + "learning_rate": 1.8026567646837633e-06, + "loss": 2.4698, + "step": 2780 + }, + { + "epoch": 0.9159701922681049, + "grad_norm": 2.5541627407073975, + "learning_rate": 1.7886854276426057e-06, + "loss": 2.3059, + "step": 2781 + }, + { + "epoch": 0.9162995594713657, + "grad_norm": 2.6000893115997314, + "learning_rate": 1.774767457144605e-06, + "loss": 2.2396, + "step": 2782 + }, + { + "epoch": 0.9166289266746264, + "grad_norm": 2.629995107650757, + "learning_rate": 1.7609028685959704e-06, + "loss": 2.0903, + "step": 2783 + }, + { + "epoch": 0.9169582938778871, + "grad_norm": 2.4334161281585693, + "learning_rate": 1.7470916773438373e-06, + "loss": 2.1372, + "step": 2784 + }, + { + "epoch": 0.9172876610811479, + "grad_norm": 2.6131606101989746, + "learning_rate": 1.733333898676215e-06, + "loss": 2.5038, + "step": 2785 + }, + { + "epoch": 0.9176170282844086, + "grad_norm": 2.5102593898773193, + "learning_rate": 1.7196295478220048e-06, + "loss": 1.9959, + "step": 2786 + }, + { + "epoch": 0.9179463954876693, + "grad_norm": 2.299788236618042, + "learning_rate": 1.705978639950967e-06, + "loss": 1.9553, + "step": 2787 + }, + { + "epoch": 0.91827576269093, + "grad_norm": 2.4039552211761475, + "learning_rate": 1.692381190173692e-06, + "loss": 2.1673, + "step": 2788 + }, + { + "epoch": 0.9186051298941907, + "grad_norm": 2.8299825191497803, + "learning_rate": 1.6788372135416063e-06, + "loss": 2.0858, + "step": 2789 + }, + { + "epoch": 0.9189344970974516, + "grad_norm": 2.670093059539795, + "learning_rate": 1.6653467250469402e-06, + "loss": 2.2654, + "step": 2790 + }, + { + "epoch": 0.9192638643007123, + "grad_norm": 2.81526255607605, + "learning_rate": 1.65190973962272e-06, + "loss": 1.9366, + "step": 2791 + }, + { + "epoch": 0.919593231503973, + "grad_norm": 2.22592830657959, + "learning_rate": 1.6385262721427374e-06, + "loss": 1.7114, + "step": 2792 + }, + { + "epoch": 0.9199225987072337, + "grad_norm": 2.677290916442871, + "learning_rate": 1.625196337421564e-06, + "loss": 2.3907, + "step": 2793 + }, + { + "epoch": 0.9202519659104944, + "grad_norm": 2.487548828125, + "learning_rate": 1.6119199502144966e-06, + "loss": 1.9388, + "step": 2794 + }, + { + "epoch": 0.9205813331137552, + "grad_norm": 3.054719924926758, + "learning_rate": 1.5986971252175686e-06, + "loss": 2.0144, + "step": 2795 + }, + { + "epoch": 0.9209107003170159, + "grad_norm": 2.6823740005493164, + "learning_rate": 1.5855278770675108e-06, + "loss": 1.9984, + "step": 2796 + }, + { + "epoch": 0.9212400675202767, + "grad_norm": 2.6444766521453857, + "learning_rate": 1.5724122203417679e-06, + "loss": 1.9557, + "step": 2797 + }, + { + "epoch": 0.9215694347235374, + "grad_norm": 3.096118211746216, + "learning_rate": 1.5593501695584433e-06, + "loss": 1.9882, + "step": 2798 + }, + { + "epoch": 0.9218988019267982, + "grad_norm": 3.212019681930542, + "learning_rate": 1.5463417391763101e-06, + "loss": 2.1233, + "step": 2799 + }, + { + "epoch": 0.9222281691300589, + "grad_norm": 2.8375954627990723, + "learning_rate": 1.5333869435948e-06, + "loss": 1.7124, + "step": 2800 + }, + { + "epoch": 0.9225575363333196, + "grad_norm": 2.4758059978485107, + "learning_rate": 1.5204857971539477e-06, + "loss": 2.2537, + "step": 2801 + }, + { + "epoch": 0.9228869035365803, + "grad_norm": 2.129150629043579, + "learning_rate": 1.5076383141344352e-06, + "loss": 2.0797, + "step": 2802 + }, + { + "epoch": 0.923216270739841, + "grad_norm": 2.272265672683716, + "learning_rate": 1.4948445087575091e-06, + "loss": 2.1409, + "step": 2803 + }, + { + "epoch": 0.9235456379431018, + "grad_norm": 2.231919288635254, + "learning_rate": 1.4821043951850133e-06, + "loss": 2.1279, + "step": 2804 + }, + { + "epoch": 0.9238750051463626, + "grad_norm": 2.3199281692504883, + "learning_rate": 1.4694179875193726e-06, + "loss": 2.208, + "step": 2805 + }, + { + "epoch": 0.9242043723496233, + "grad_norm": 2.0582661628723145, + "learning_rate": 1.4567852998035426e-06, + "loss": 2.0938, + "step": 2806 + }, + { + "epoch": 0.924533739552884, + "grad_norm": 2.4887518882751465, + "learning_rate": 1.4442063460210263e-06, + "loss": 2.3684, + "step": 2807 + }, + { + "epoch": 0.9248631067561448, + "grad_norm": 2.5550124645233154, + "learning_rate": 1.4316811400958362e-06, + "loss": 2.0291, + "step": 2808 + }, + { + "epoch": 0.9251924739594055, + "grad_norm": 2.2390754222869873, + "learning_rate": 1.4192096958925038e-06, + "loss": 1.8898, + "step": 2809 + }, + { + "epoch": 0.9255218411626662, + "grad_norm": 2.5573508739471436, + "learning_rate": 1.4067920272160362e-06, + "loss": 2.3696, + "step": 2810 + }, + { + "epoch": 0.9258512083659269, + "grad_norm": 2.908836841583252, + "learning_rate": 1.394428147811927e-06, + "loss": 2.4529, + "step": 2811 + }, + { + "epoch": 0.9261805755691876, + "grad_norm": 2.5992095470428467, + "learning_rate": 1.382118071366123e-06, + "loss": 1.9448, + "step": 2812 + }, + { + "epoch": 0.9265099427724485, + "grad_norm": 2.3026111125946045, + "learning_rate": 1.3698618115050244e-06, + "loss": 2.0529, + "step": 2813 + }, + { + "epoch": 0.9268393099757092, + "grad_norm": 2.8297219276428223, + "learning_rate": 1.3576593817954286e-06, + "loss": 2.1613, + "step": 2814 + }, + { + "epoch": 0.9271686771789699, + "grad_norm": 2.73018479347229, + "learning_rate": 1.345510795744581e-06, + "loss": 2.0114, + "step": 2815 + }, + { + "epoch": 0.9274980443822306, + "grad_norm": 3.0178444385528564, + "learning_rate": 1.3334160668001195e-06, + "loss": 2.2001, + "step": 2816 + }, + { + "epoch": 0.9278274115854914, + "grad_norm": 3.0993990898132324, + "learning_rate": 1.3213752083500508e-06, + "loss": 2.2696, + "step": 2817 + }, + { + "epoch": 0.9281567787887521, + "grad_norm": 2.804314136505127, + "learning_rate": 1.3093882337227693e-06, + "loss": 2.0842, + "step": 2818 + }, + { + "epoch": 0.9284861459920128, + "grad_norm": 2.4682846069335938, + "learning_rate": 1.2974551561870107e-06, + "loss": 1.8982, + "step": 2819 + }, + { + "epoch": 0.9288155131952736, + "grad_norm": 2.5155272483825684, + "learning_rate": 1.2855759889518594e-06, + "loss": 2.1026, + "step": 2820 + }, + { + "epoch": 0.9291448803985343, + "grad_norm": 2.995867967605591, + "learning_rate": 1.2737507451667074e-06, + "loss": 2.0615, + "step": 2821 + }, + { + "epoch": 0.9294742476017951, + "grad_norm": 2.7689971923828125, + "learning_rate": 1.2619794379212845e-06, + "loss": 2.1408, + "step": 2822 + }, + { + "epoch": 0.9298036148050558, + "grad_norm": 2.611830234527588, + "learning_rate": 1.2502620802455955e-06, + "loss": 1.7406, + "step": 2823 + }, + { + "epoch": 0.9301329820083165, + "grad_norm": 2.6598401069641113, + "learning_rate": 1.2385986851099318e-06, + "loss": 1.5781, + "step": 2824 + }, + { + "epoch": 0.9304623492115772, + "grad_norm": 4.383054733276367, + "learning_rate": 1.226989265424855e-06, + "loss": 1.6148, + "step": 2825 + }, + { + "epoch": 0.930791716414838, + "grad_norm": 1.9361492395401, + "learning_rate": 1.2154338340411797e-06, + "loss": 2.2058, + "step": 2826 + }, + { + "epoch": 0.9311210836180988, + "grad_norm": 2.430833101272583, + "learning_rate": 1.2039324037499578e-06, + "loss": 2.2598, + "step": 2827 + }, + { + "epoch": 0.9314504508213595, + "grad_norm": 2.149946928024292, + "learning_rate": 1.192484987282466e-06, + "loss": 2.2, + "step": 2828 + }, + { + "epoch": 0.9317798180246202, + "grad_norm": 2.194275140762329, + "learning_rate": 1.181091597310191e-06, + "loss": 2.4223, + "step": 2829 + }, + { + "epoch": 0.9321091852278809, + "grad_norm": 2.4965078830718994, + "learning_rate": 1.1697522464448107e-06, + "loss": 2.3354, + "step": 2830 + }, + { + "epoch": 0.9324385524311417, + "grad_norm": 2.4370152950286865, + "learning_rate": 1.1584669472382014e-06, + "loss": 2.2227, + "step": 2831 + }, + { + "epoch": 0.9327679196344024, + "grad_norm": 2.3684237003326416, + "learning_rate": 1.1472357121823873e-06, + "loss": 2.0301, + "step": 2832 + }, + { + "epoch": 0.9330972868376631, + "grad_norm": 2.8566548824310303, + "learning_rate": 1.136058553709568e-06, + "loss": 2.6412, + "step": 2833 + }, + { + "epoch": 0.9334266540409238, + "grad_norm": 2.3684935569763184, + "learning_rate": 1.124935484192069e-06, + "loss": 2.0108, + "step": 2834 + }, + { + "epoch": 0.9337560212441847, + "grad_norm": 2.404863119125366, + "learning_rate": 1.1138665159423411e-06, + "loss": 2.2514, + "step": 2835 + }, + { + "epoch": 0.9340853884474454, + "grad_norm": 2.4747257232666016, + "learning_rate": 1.102851661212967e-06, + "loss": 2.1483, + "step": 2836 + }, + { + "epoch": 0.9344147556507061, + "grad_norm": 2.7829651832580566, + "learning_rate": 1.0918909321966097e-06, + "loss": 2.513, + "step": 2837 + }, + { + "epoch": 0.9347441228539668, + "grad_norm": 2.477625608444214, + "learning_rate": 1.0809843410260312e-06, + "loss": 2.0257, + "step": 2838 + }, + { + "epoch": 0.9350734900572275, + "grad_norm": 3.145524024963379, + "learning_rate": 1.0701318997740629e-06, + "loss": 2.3148, + "step": 2839 + }, + { + "epoch": 0.9354028572604883, + "grad_norm": 3.496830701828003, + "learning_rate": 1.0593336204535952e-06, + "loss": 2.4637, + "step": 2840 + }, + { + "epoch": 0.935732224463749, + "grad_norm": 2.429844856262207, + "learning_rate": 1.0485895150175672e-06, + "loss": 1.8108, + "step": 2841 + }, + { + "epoch": 0.9360615916670098, + "grad_norm": 2.853318452835083, + "learning_rate": 1.0378995953589488e-06, + "loss": 2.4304, + "step": 2842 + }, + { + "epoch": 0.9363909588702705, + "grad_norm": 2.5983059406280518, + "learning_rate": 1.0272638733107298e-06, + "loss": 2.0046, + "step": 2843 + }, + { + "epoch": 0.9367203260735312, + "grad_norm": 2.3808035850524902, + "learning_rate": 1.0166823606459097e-06, + "loss": 1.8917, + "step": 2844 + }, + { + "epoch": 0.937049693276792, + "grad_norm": 2.6919662952423096, + "learning_rate": 1.0061550690774857e-06, + "loss": 1.9641, + "step": 2845 + }, + { + "epoch": 0.9373790604800527, + "grad_norm": 2.5325708389282227, + "learning_rate": 9.956820102584253e-07, + "loss": 1.8109, + "step": 2846 + }, + { + "epoch": 0.9377084276833134, + "grad_norm": 3.0093464851379395, + "learning_rate": 9.85263195781666e-07, + "loss": 2.1708, + "step": 2847 + }, + { + "epoch": 0.9380377948865741, + "grad_norm": 2.9539554119110107, + "learning_rate": 9.748986371801106e-07, + "loss": 2.1125, + "step": 2848 + }, + { + "epoch": 0.938367162089835, + "grad_norm": 2.907458543777466, + "learning_rate": 9.645883459265982e-07, + "loss": 1.437, + "step": 2849 + }, + { + "epoch": 0.9386965292930957, + "grad_norm": 3.331385374069214, + "learning_rate": 9.543323334338939e-07, + "loss": 1.8898, + "step": 2850 + }, + { + "epoch": 0.9390258964963564, + "grad_norm": 1.8661000728607178, + "learning_rate": 9.441306110546944e-07, + "loss": 2.2593, + "step": 2851 + }, + { + "epoch": 0.9393552636996171, + "grad_norm": 2.427933692932129, + "learning_rate": 9.339831900815776e-07, + "loss": 2.6372, + "step": 2852 + }, + { + "epoch": 0.9396846309028778, + "grad_norm": 2.3956634998321533, + "learning_rate": 9.238900817470308e-07, + "loss": 2.1686, + "step": 2853 + }, + { + "epoch": 0.9400139981061386, + "grad_norm": 2.860553503036499, + "learning_rate": 9.138512972234225e-07, + "loss": 1.998, + "step": 2854 + }, + { + "epoch": 0.9403433653093993, + "grad_norm": 2.519784927368164, + "learning_rate": 9.03866847622975e-07, + "loss": 2.4247, + "step": 2855 + }, + { + "epoch": 0.94067273251266, + "grad_norm": 2.6663100719451904, + "learning_rate": 8.939367439977808e-07, + "loss": 2.4213, + "step": 2856 + }, + { + "epoch": 0.9410020997159207, + "grad_norm": 2.3024487495422363, + "learning_rate": 8.840609973397585e-07, + "loss": 2.4378, + "step": 2857 + }, + { + "epoch": 0.9413314669191816, + "grad_norm": 2.864820957183838, + "learning_rate": 8.7423961858068e-07, + "loss": 2.4476, + "step": 2858 + }, + { + "epoch": 0.9416608341224423, + "grad_norm": 2.920145034790039, + "learning_rate": 8.644726185921159e-07, + "loss": 2.2515, + "step": 2859 + }, + { + "epoch": 0.941990201325703, + "grad_norm": 2.6233792304992676, + "learning_rate": 8.547600081854456e-07, + "loss": 2.4181, + "step": 2860 + }, + { + "epoch": 0.9423195685289637, + "grad_norm": 2.389942169189453, + "learning_rate": 8.451017981118525e-07, + "loss": 2.3034, + "step": 2861 + }, + { + "epoch": 0.9426489357322244, + "grad_norm": 2.579540252685547, + "learning_rate": 8.354979990622957e-07, + "loss": 2.5177, + "step": 2862 + }, + { + "epoch": 0.9429783029354852, + "grad_norm": 2.6074090003967285, + "learning_rate": 8.259486216675105e-07, + "loss": 1.9105, + "step": 2863 + }, + { + "epoch": 0.943307670138746, + "grad_norm": 2.744189977645874, + "learning_rate": 8.164536764979857e-07, + "loss": 2.1191, + "step": 2864 + }, + { + "epoch": 0.9436370373420067, + "grad_norm": 2.729266405105591, + "learning_rate": 8.070131740639586e-07, + "loss": 2.1855, + "step": 2865 + }, + { + "epoch": 0.9439664045452674, + "grad_norm": 2.6576309204101562, + "learning_rate": 7.976271248154033e-07, + "loss": 2.0916, + "step": 2866 + }, + { + "epoch": 0.9442957717485282, + "grad_norm": 2.8636012077331543, + "learning_rate": 7.882955391420143e-07, + "loss": 2.3881, + "step": 2867 + }, + { + "epoch": 0.9446251389517889, + "grad_norm": 2.3858633041381836, + "learning_rate": 7.790184273732071e-07, + "loss": 1.817, + "step": 2868 + }, + { + "epoch": 0.9449545061550496, + "grad_norm": 2.9734864234924316, + "learning_rate": 7.697957997780947e-07, + "loss": 2.0361, + "step": 2869 + }, + { + "epoch": 0.9452838733583103, + "grad_norm": 2.881843090057373, + "learning_rate": 7.606276665654777e-07, + "loss": 2.2603, + "step": 2870 + }, + { + "epoch": 0.945613240561571, + "grad_norm": 2.7740421295166016, + "learning_rate": 7.515140378838381e-07, + "loss": 2.1547, + "step": 2871 + }, + { + "epoch": 0.9459426077648319, + "grad_norm": 2.4186999797821045, + "learning_rate": 7.424549238213174e-07, + "loss": 1.6122, + "step": 2872 + }, + { + "epoch": 0.9462719749680926, + "grad_norm": 2.858855724334717, + "learning_rate": 7.33450334405722e-07, + "loss": 1.7438, + "step": 2873 + }, + { + "epoch": 0.9466013421713533, + "grad_norm": 2.9262588024139404, + "learning_rate": 7.245002796045008e-07, + "loss": 1.896, + "step": 2874 + }, + { + "epoch": 0.946930709374614, + "grad_norm": 4.089500427246094, + "learning_rate": 7.156047693247403e-07, + "loss": 2.6594, + "step": 2875 + }, + { + "epoch": 0.9472600765778748, + "grad_norm": 2.226041078567505, + "learning_rate": 7.067638134131472e-07, + "loss": 2.405, + "step": 2876 + }, + { + "epoch": 0.9475894437811355, + "grad_norm": 2.344412326812744, + "learning_rate": 6.97977421656032e-07, + "loss": 2.4191, + "step": 2877 + }, + { + "epoch": 0.9479188109843962, + "grad_norm": 2.1592891216278076, + "learning_rate": 6.892456037793204e-07, + "loss": 2.4441, + "step": 2878 + }, + { + "epoch": 0.9482481781876569, + "grad_norm": 2.270063877105713, + "learning_rate": 6.805683694485143e-07, + "loss": 1.9226, + "step": 2879 + }, + { + "epoch": 0.9485775453909177, + "grad_norm": 2.3790276050567627, + "learning_rate": 6.719457282687136e-07, + "loss": 2.3371, + "step": 2880 + }, + { + "epoch": 0.9489069125941785, + "grad_norm": 2.7500150203704834, + "learning_rate": 6.633776897845667e-07, + "loss": 2.4479, + "step": 2881 + }, + { + "epoch": 0.9492362797974392, + "grad_norm": 2.5941100120544434, + "learning_rate": 6.548642634803037e-07, + "loss": 2.2519, + "step": 2882 + }, + { + "epoch": 0.9495656470006999, + "grad_norm": 2.158876895904541, + "learning_rate": 6.464054587796809e-07, + "loss": 2.4172, + "step": 2883 + }, + { + "epoch": 0.9498950142039606, + "grad_norm": 2.5841729640960693, + "learning_rate": 6.380012850460082e-07, + "loss": 2.297, + "step": 2884 + }, + { + "epoch": 0.9502243814072214, + "grad_norm": 2.719069004058838, + "learning_rate": 6.296517515821055e-07, + "loss": 2.2869, + "step": 2885 + }, + { + "epoch": 0.9505537486104821, + "grad_norm": 2.4023020267486572, + "learning_rate": 6.213568676303294e-07, + "loss": 2.2206, + "step": 2886 + }, + { + "epoch": 0.9508831158137429, + "grad_norm": 2.7940988540649414, + "learning_rate": 6.131166423725354e-07, + "loss": 2.0593, + "step": 2887 + }, + { + "epoch": 0.9512124830170036, + "grad_norm": 2.5038106441497803, + "learning_rate": 6.04931084930066e-07, + "loss": 2.0466, + "step": 2888 + }, + { + "epoch": 0.9515418502202643, + "grad_norm": 2.42012619972229, + "learning_rate": 5.968002043637733e-07, + "loss": 2.0527, + "step": 2889 + }, + { + "epoch": 0.9518712174235251, + "grad_norm": 2.5355756282806396, + "learning_rate": 5.887240096739633e-07, + "loss": 2.2696, + "step": 2890 + }, + { + "epoch": 0.9522005846267858, + "grad_norm": 2.5182111263275146, + "learning_rate": 5.807025098004127e-07, + "loss": 1.7523, + "step": 2891 + }, + { + "epoch": 0.9525299518300465, + "grad_norm": 2.754927396774292, + "learning_rate": 5.727357136223633e-07, + "loss": 2.0462, + "step": 2892 + }, + { + "epoch": 0.9528593190333072, + "grad_norm": 2.4149563312530518, + "learning_rate": 5.648236299584997e-07, + "loss": 2.0588, + "step": 2893 + }, + { + "epoch": 0.9531886862365679, + "grad_norm": 2.708524703979492, + "learning_rate": 5.56966267566944e-07, + "loss": 2.0039, + "step": 2894 + }, + { + "epoch": 0.9535180534398288, + "grad_norm": 2.535885810852051, + "learning_rate": 5.491636351452445e-07, + "loss": 2.0345, + "step": 2895 + }, + { + "epoch": 0.9538474206430895, + "grad_norm": 3.0200557708740234, + "learning_rate": 5.414157413303755e-07, + "loss": 1.7782, + "step": 2896 + }, + { + "epoch": 0.9541767878463502, + "grad_norm": 2.9520061016082764, + "learning_rate": 5.33722594698699e-07, + "loss": 2.0286, + "step": 2897 + }, + { + "epoch": 0.9545061550496109, + "grad_norm": 2.945594549179077, + "learning_rate": 5.260842037659919e-07, + "loss": 1.8728, + "step": 2898 + }, + { + "epoch": 0.9548355222528717, + "grad_norm": 3.269399642944336, + "learning_rate": 5.185005769874185e-07, + "loss": 2.2312, + "step": 2899 + }, + { + "epoch": 0.9551648894561324, + "grad_norm": 3.335346221923828, + "learning_rate": 5.10971722757525e-07, + "loss": 2.1155, + "step": 2900 + }, + { + "epoch": 0.9554942566593931, + "grad_norm": 1.8741716146469116, + "learning_rate": 5.034976494102284e-07, + "loss": 2.2083, + "step": 2901 + }, + { + "epoch": 0.9558236238626538, + "grad_norm": 2.057013511657715, + "learning_rate": 4.960783652187939e-07, + "loss": 2.1859, + "step": 2902 + }, + { + "epoch": 0.9561529910659146, + "grad_norm": 2.220473051071167, + "learning_rate": 4.887138783958467e-07, + "loss": 2.3862, + "step": 2903 + }, + { + "epoch": 0.9564823582691754, + "grad_norm": 2.3261795043945312, + "learning_rate": 4.814041970933713e-07, + "loss": 2.317, + "step": 2904 + }, + { + "epoch": 0.9568117254724361, + "grad_norm": 2.2345638275146484, + "learning_rate": 4.7414932940265664e-07, + "loss": 2.5307, + "step": 2905 + }, + { + "epoch": 0.9571410926756968, + "grad_norm": 2.4309141635894775, + "learning_rate": 4.669492833543454e-07, + "loss": 2.2482, + "step": 2906 + }, + { + "epoch": 0.9574704598789575, + "grad_norm": 2.4680659770965576, + "learning_rate": 4.598040669183734e-07, + "loss": 2.6021, + "step": 2907 + }, + { + "epoch": 0.9577998270822183, + "grad_norm": 2.261176347732544, + "learning_rate": 4.5271368800400283e-07, + "loss": 2.277, + "step": 2908 + }, + { + "epoch": 0.958129194285479, + "grad_norm": 2.5138943195343018, + "learning_rate": 4.4567815445977767e-07, + "loss": 2.2765, + "step": 2909 + }, + { + "epoch": 0.9584585614887398, + "grad_norm": 2.699733257293701, + "learning_rate": 4.386974740735461e-07, + "loss": 2.098, + "step": 2910 + }, + { + "epoch": 0.9587879286920005, + "grad_norm": 2.301736831665039, + "learning_rate": 4.317716545724215e-07, + "loss": 2.1506, + "step": 2911 + }, + { + "epoch": 0.9591172958952612, + "grad_norm": 2.6759886741638184, + "learning_rate": 4.2490070362281587e-07, + "loss": 2.0459, + "step": 2912 + }, + { + "epoch": 0.959446663098522, + "grad_norm": 2.393205404281616, + "learning_rate": 4.180846288303786e-07, + "loss": 2.05, + "step": 2913 + }, + { + "epoch": 0.9597760303017827, + "grad_norm": 2.5351758003234863, + "learning_rate": 4.113234377400299e-07, + "loss": 1.8458, + "step": 2914 + }, + { + "epoch": 0.9601053975050434, + "grad_norm": 2.2176060676574707, + "learning_rate": 4.0461713783593313e-07, + "loss": 1.9287, + "step": 2915 + }, + { + "epoch": 0.9604347647083041, + "grad_norm": 3.161897659301758, + "learning_rate": 3.9796573654148905e-07, + "loss": 2.5525, + "step": 2916 + }, + { + "epoch": 0.960764131911565, + "grad_norm": 2.861020803451538, + "learning_rate": 3.9136924121933594e-07, + "loss": 1.9952, + "step": 2917 + }, + { + "epoch": 0.9610934991148257, + "grad_norm": 2.7215144634246826, + "learning_rate": 3.848276591713329e-07, + "loss": 2.1098, + "step": 2918 + }, + { + "epoch": 0.9614228663180864, + "grad_norm": 2.677161455154419, + "learning_rate": 3.783409976385488e-07, + "loss": 2.0806, + "step": 2919 + }, + { + "epoch": 0.9617522335213471, + "grad_norm": 3.068319082260132, + "learning_rate": 3.719092638012622e-07, + "loss": 1.9768, + "step": 2920 + }, + { + "epoch": 0.9620816007246078, + "grad_norm": 3.429572820663452, + "learning_rate": 3.655324647789615e-07, + "loss": 1.8987, + "step": 2921 + }, + { + "epoch": 0.9624109679278686, + "grad_norm": 3.0230207443237305, + "learning_rate": 3.5921060763030033e-07, + "loss": 2.1049, + "step": 2922 + }, + { + "epoch": 0.9627403351311293, + "grad_norm": 3.3592071533203125, + "learning_rate": 3.5294369935314207e-07, + "loss": 2.118, + "step": 2923 + }, + { + "epoch": 0.96306970233439, + "grad_norm": 2.8637266159057617, + "learning_rate": 3.4673174688450994e-07, + "loss": 2.0448, + "step": 2924 + }, + { + "epoch": 0.9633990695376508, + "grad_norm": 3.2536802291870117, + "learning_rate": 3.4057475710059796e-07, + "loss": 1.8276, + "step": 2925 + }, + { + "epoch": 0.9637284367409116, + "grad_norm": 2.2297792434692383, + "learning_rate": 3.3447273681676547e-07, + "loss": 2.4885, + "step": 2926 + }, + { + "epoch": 0.9640578039441723, + "grad_norm": 2.3141520023345947, + "learning_rate": 3.284256927875262e-07, + "loss": 2.5599, + "step": 2927 + }, + { + "epoch": 0.964387171147433, + "grad_norm": 2.275432586669922, + "learning_rate": 3.224336317065202e-07, + "loss": 2.1016, + "step": 2928 + }, + { + "epoch": 0.9647165383506937, + "grad_norm": 2.278019666671753, + "learning_rate": 3.1649656020654726e-07, + "loss": 2.2134, + "step": 2929 + }, + { + "epoch": 0.9650459055539544, + "grad_norm": 2.311898946762085, + "learning_rate": 3.106144848595283e-07, + "loss": 2.386, + "step": 2930 + }, + { + "epoch": 0.9653752727572152, + "grad_norm": 2.683455228805542, + "learning_rate": 3.047874121765049e-07, + "loss": 2.6858, + "step": 2931 + }, + { + "epoch": 0.965704639960476, + "grad_norm": 2.5970003604888916, + "learning_rate": 2.9901534860764524e-07, + "loss": 2.3024, + "step": 2932 + }, + { + "epoch": 0.9660340071637367, + "grad_norm": 2.268040418624878, + "learning_rate": 2.9329830054221076e-07, + "loss": 2.1205, + "step": 2933 + }, + { + "epoch": 0.9663633743669974, + "grad_norm": 2.2917988300323486, + "learning_rate": 2.876362743085781e-07, + "loss": 2.0411, + "step": 2934 + }, + { + "epoch": 0.9666927415702582, + "grad_norm": 2.5120959281921387, + "learning_rate": 2.820292761742116e-07, + "loss": 2.1063, + "step": 2935 + }, + { + "epoch": 0.9670221087735189, + "grad_norm": 2.403350353240967, + "learning_rate": 2.7647731234565767e-07, + "loss": 2.4898, + "step": 2936 + }, + { + "epoch": 0.9673514759767796, + "grad_norm": 2.1802988052368164, + "learning_rate": 2.7098038896856136e-07, + "loss": 2.0262, + "step": 2937 + }, + { + "epoch": 0.9676808431800403, + "grad_norm": 2.6515746116638184, + "learning_rate": 2.655385121276277e-07, + "loss": 2.0875, + "step": 2938 + }, + { + "epoch": 0.968010210383301, + "grad_norm": 2.75669527053833, + "learning_rate": 2.6015168784663255e-07, + "loss": 1.963, + "step": 2939 + }, + { + "epoch": 0.9683395775865619, + "grad_norm": 2.94972562789917, + "learning_rate": 2.5481992208841175e-07, + "loss": 2.2923, + "step": 2940 + }, + { + "epoch": 0.9686689447898226, + "grad_norm": 2.706496477127075, + "learning_rate": 2.4954322075485536e-07, + "loss": 2.4328, + "step": 2941 + }, + { + "epoch": 0.9689983119930833, + "grad_norm": 3.0476579666137695, + "learning_rate": 2.4432158968689666e-07, + "loss": 2.018, + "step": 2942 + }, + { + "epoch": 0.969327679196344, + "grad_norm": 2.800839900970459, + "learning_rate": 2.391550346645288e-07, + "loss": 2.0205, + "step": 2943 + }, + { + "epoch": 0.9696570463996047, + "grad_norm": 2.6107017993927, + "learning_rate": 2.3404356140675488e-07, + "loss": 2.1034, + "step": 2944 + }, + { + "epoch": 0.9699864136028655, + "grad_norm": 2.8112082481384277, + "learning_rate": 2.2898717557161554e-07, + "loss": 2.3968, + "step": 2945 + }, + { + "epoch": 0.9703157808061262, + "grad_norm": 2.9608657360076904, + "learning_rate": 2.2398588275618914e-07, + "loss": 2.1058, + "step": 2946 + }, + { + "epoch": 0.970645148009387, + "grad_norm": 2.786487340927124, + "learning_rate": 2.1903968849653623e-07, + "loss": 1.85, + "step": 2947 + }, + { + "epoch": 0.9709745152126477, + "grad_norm": 2.775702714920044, + "learning_rate": 2.1414859826776045e-07, + "loss": 1.7929, + "step": 2948 + }, + { + "epoch": 0.9713038824159085, + "grad_norm": 2.9629390239715576, + "learning_rate": 2.0931261748395326e-07, + "loss": 2.2975, + "step": 2949 + }, + { + "epoch": 0.9716332496191692, + "grad_norm": 2.814885139465332, + "learning_rate": 2.045317514982048e-07, + "loss": 1.4608, + "step": 2950 + }, + { + "epoch": 0.9719626168224299, + "grad_norm": 1.88387131690979, + "learning_rate": 1.9980600560259854e-07, + "loss": 2.3342, + "step": 2951 + }, + { + "epoch": 0.9722919840256906, + "grad_norm": 2.205899238586426, + "learning_rate": 1.9513538502820562e-07, + "loss": 2.3497, + "step": 2952 + }, + { + "epoch": 0.9726213512289513, + "grad_norm": 2.299445867538452, + "learning_rate": 1.9051989494506817e-07, + "loss": 2.3363, + "step": 2953 + }, + { + "epoch": 0.9729507184322121, + "grad_norm": 2.064168930053711, + "learning_rate": 1.8595954046222165e-07, + "loss": 2.2905, + "step": 2954 + }, + { + "epoch": 0.9732800856354729, + "grad_norm": 2.4672629833221436, + "learning_rate": 1.814543266276447e-07, + "loss": 2.4163, + "step": 2955 + }, + { + "epoch": 0.9736094528387336, + "grad_norm": 2.261767864227295, + "learning_rate": 1.7700425842830382e-07, + "loss": 2.3594, + "step": 2956 + }, + { + "epoch": 0.9739388200419943, + "grad_norm": 2.067182779312134, + "learning_rate": 1.726093407901086e-07, + "loss": 2.0535, + "step": 2957 + }, + { + "epoch": 0.9742681872452551, + "grad_norm": 2.835082769393921, + "learning_rate": 1.682695785779287e-07, + "loss": 2.2098, + "step": 2958 + }, + { + "epoch": 0.9745975544485158, + "grad_norm": 2.3787896633148193, + "learning_rate": 1.639849765955659e-07, + "loss": 2.0753, + "step": 2959 + }, + { + "epoch": 0.9749269216517765, + "grad_norm": 2.4231419563293457, + "learning_rate": 1.5975553958578193e-07, + "loss": 2.0069, + "step": 2960 + }, + { + "epoch": 0.9752562888550372, + "grad_norm": 2.6847152709960938, + "learning_rate": 1.5558127223027075e-07, + "loss": 2.346, + "step": 2961 + }, + { + "epoch": 0.975585656058298, + "grad_norm": 2.533125638961792, + "learning_rate": 1.514621791496418e-07, + "loss": 2.0978, + "step": 2962 + }, + { + "epoch": 0.9759150232615588, + "grad_norm": 2.544414520263672, + "learning_rate": 1.4739826490345332e-07, + "loss": 1.8396, + "step": 2963 + }, + { + "epoch": 0.9762443904648195, + "grad_norm": 2.6801774501800537, + "learning_rate": 1.4338953399016808e-07, + "loss": 2.0108, + "step": 2964 + }, + { + "epoch": 0.9765737576680802, + "grad_norm": 2.523963212966919, + "learning_rate": 1.3943599084716984e-07, + "loss": 2.0665, + "step": 2965 + }, + { + "epoch": 0.9769031248713409, + "grad_norm": 2.6162335872650146, + "learning_rate": 1.3553763985075795e-07, + "loss": 2.0451, + "step": 2966 + }, + { + "epoch": 0.9772324920746017, + "grad_norm": 2.9109017848968506, + "learning_rate": 1.3169448531612506e-07, + "loss": 1.9372, + "step": 2967 + }, + { + "epoch": 0.9775618592778624, + "grad_norm": 2.5702192783355713, + "learning_rate": 1.279065314973793e-07, + "loss": 2.0518, + "step": 2968 + }, + { + "epoch": 0.9778912264811231, + "grad_norm": 2.58215069770813, + "learning_rate": 1.2417378258751665e-07, + "loss": 1.9024, + "step": 2969 + }, + { + "epoch": 0.9782205936843839, + "grad_norm": 2.6025025844573975, + "learning_rate": 1.2049624271843752e-07, + "loss": 1.7492, + "step": 2970 + }, + { + "epoch": 0.9785499608876446, + "grad_norm": 3.015721559524536, + "learning_rate": 1.1687391596090779e-07, + "loss": 2.3415, + "step": 2971 + }, + { + "epoch": 0.9788793280909054, + "grad_norm": 2.538717746734619, + "learning_rate": 1.133068063245979e-07, + "loss": 1.6529, + "step": 2972 + }, + { + "epoch": 0.9792086952941661, + "grad_norm": 3.052213191986084, + "learning_rate": 1.0979491775804373e-07, + "loss": 2.429, + "step": 2973 + }, + { + "epoch": 0.9795380624974268, + "grad_norm": 3.415714740753174, + "learning_rate": 1.0633825414865794e-07, + "loss": 1.8479, + "step": 2974 + }, + { + "epoch": 0.9798674297006875, + "grad_norm": 2.9711825847625732, + "learning_rate": 1.029368193227298e-07, + "loss": 1.6421, + "step": 2975 + }, + { + "epoch": 0.9801967969039483, + "grad_norm": 2.3700506687164307, + "learning_rate": 9.959061704540862e-08, + "loss": 2.5403, + "step": 2976 + }, + { + "epoch": 0.980526164107209, + "grad_norm": 2.2373926639556885, + "learning_rate": 9.629965102070371e-08, + "loss": 2.4728, + "step": 2977 + }, + { + "epoch": 0.9808555313104698, + "grad_norm": 2.5330634117126465, + "learning_rate": 9.306392489147886e-08, + "loss": 2.3086, + "step": 2978 + }, + { + "epoch": 0.9811848985137305, + "grad_norm": 2.203402042388916, + "learning_rate": 8.988344223946343e-08, + "loss": 2.0723, + "step": 2979 + }, + { + "epoch": 0.9815142657169912, + "grad_norm": 2.7258424758911133, + "learning_rate": 8.67582065852246e-08, + "loss": 2.3731, + "step": 2980 + }, + { + "epoch": 0.981843632920252, + "grad_norm": 2.4026784896850586, + "learning_rate": 8.3688221388184e-08, + "loss": 1.9317, + "step": 2981 + }, + { + "epoch": 0.9821730001235127, + "grad_norm": 2.536128044128418, + "learning_rate": 8.067349004658998e-08, + "loss": 2.2797, + "step": 2982 + }, + { + "epoch": 0.9825023673267734, + "grad_norm": 2.5913748741149902, + "learning_rate": 7.77140158975509e-08, + "loss": 2.2726, + "step": 2983 + }, + { + "epoch": 0.9828317345300341, + "grad_norm": 2.1963465213775635, + "learning_rate": 7.480980221699075e-08, + "loss": 1.9628, + "step": 2984 + }, + { + "epoch": 0.983161101733295, + "grad_norm": 2.823356866836548, + "learning_rate": 7.196085221966575e-08, + "loss": 2.0853, + "step": 2985 + }, + { + "epoch": 0.9834904689365557, + "grad_norm": 2.5668678283691406, + "learning_rate": 6.916716905916998e-08, + "loss": 2.3201, + "step": 2986 + }, + { + "epoch": 0.9838198361398164, + "grad_norm": 2.500898599624634, + "learning_rate": 6.642875582791863e-08, + "loss": 2.2272, + "step": 2987 + }, + { + "epoch": 0.9841492033430771, + "grad_norm": 2.5262458324432373, + "learning_rate": 6.374561555713143e-08, + "loss": 2.0321, + "step": 2988 + }, + { + "epoch": 0.9844785705463378, + "grad_norm": 2.6859130859375, + "learning_rate": 6.111775121686037e-08, + "loss": 2.1567, + "step": 2989 + }, + { + "epoch": 0.9848079377495986, + "grad_norm": 2.619617462158203, + "learning_rate": 5.854516571597857e-08, + "loss": 1.9454, + "step": 2990 + }, + { + "epoch": 0.9851373049528593, + "grad_norm": 3.04783034324646, + "learning_rate": 5.602786190214149e-08, + "loss": 2.4391, + "step": 2991 + }, + { + "epoch": 0.98546667215612, + "grad_norm": 2.7410717010498047, + "learning_rate": 5.3565842561831283e-08, + "loss": 2.2959, + "step": 2992 + }, + { + "epoch": 0.9857960393593808, + "grad_norm": 2.6359169483184814, + "learning_rate": 5.115911042034016e-08, + "loss": 1.8954, + "step": 2993 + }, + { + "epoch": 0.9861254065626415, + "grad_norm": 2.6837198734283447, + "learning_rate": 4.880766814174265e-08, + "loss": 2.0915, + "step": 2994 + }, + { + "epoch": 0.9864547737659023, + "grad_norm": 2.4030003547668457, + "learning_rate": 4.651151832892886e-08, + "loss": 1.6885, + "step": 2995 + }, + { + "epoch": 0.986784140969163, + "grad_norm": 2.9259960651397705, + "learning_rate": 4.427066352356568e-08, + "loss": 2.1477, + "step": 2996 + }, + { + "epoch": 0.9871135081724237, + "grad_norm": 3.3638339042663574, + "learning_rate": 4.208510620612449e-08, + "loss": 2.1854, + "step": 2997 + }, + { + "epoch": 0.9874428753756844, + "grad_norm": 3.0315308570861816, + "learning_rate": 3.995484879587008e-08, + "loss": 1.9279, + "step": 2998 + }, + { + "epoch": 0.9877722425789452, + "grad_norm": 3.3093647956848145, + "learning_rate": 3.787989365083844e-08, + "loss": 1.8074, + "step": 2999 + }, + { + "epoch": 0.988101609782206, + "grad_norm": 3.9945030212402344, + "learning_rate": 3.5860243067870056e-08, + "loss": 1.9964, + "step": 3000 + }, + { + "epoch": 0.9884309769854667, + "grad_norm": 1.813816785812378, + "learning_rate": 3.3895899282571083e-08, + "loss": 2.3394, + "step": 3001 + }, + { + "epoch": 0.9887603441887274, + "grad_norm": 1.9609613418579102, + "learning_rate": 3.198686446932997e-08, + "loss": 2.1147, + "step": 3002 + }, + { + "epoch": 0.9890897113919881, + "grad_norm": 2.0877554416656494, + "learning_rate": 3.013314074131746e-08, + "loss": 2.2933, + "step": 3003 + }, + { + "epoch": 0.9894190785952489, + "grad_norm": 2.293480396270752, + "learning_rate": 2.8334730150475498e-08, + "loss": 2.3962, + "step": 3004 + }, + { + "epoch": 0.9897484457985096, + "grad_norm": 2.9149646759033203, + "learning_rate": 2.659163468751724e-08, + "loss": 2.2562, + "step": 3005 + }, + { + "epoch": 0.9900778130017703, + "grad_norm": 2.495413064956665, + "learning_rate": 2.490385628192704e-08, + "loss": 2.2912, + "step": 3006 + }, + { + "epoch": 0.990407180205031, + "grad_norm": 2.356163501739502, + "learning_rate": 2.3271396801960442e-08, + "loss": 2.107, + "step": 3007 + }, + { + "epoch": 0.9907365474082919, + "grad_norm": 2.2497031688690186, + "learning_rate": 2.1694258054633097e-08, + "loss": 2.5133, + "step": 3008 + }, + { + "epoch": 0.9910659146115526, + "grad_norm": 2.5045933723449707, + "learning_rate": 2.0172441785726302e-08, + "loss": 2.1748, + "step": 3009 + }, + { + "epoch": 0.9913952818148133, + "grad_norm": 2.6602587699890137, + "learning_rate": 1.8705949679781454e-08, + "loss": 1.9129, + "step": 3010 + }, + { + "epoch": 0.991724649018074, + "grad_norm": 2.7580525875091553, + "learning_rate": 1.7294783360105593e-08, + "loss": 2.117, + "step": 3011 + }, + { + "epoch": 0.9920540162213347, + "grad_norm": 2.4535562992095947, + "learning_rate": 1.5938944388765865e-08, + "loss": 2.0893, + "step": 3012 + }, + { + "epoch": 0.9923833834245955, + "grad_norm": 2.4854702949523926, + "learning_rate": 1.4638434266572853e-08, + "loss": 1.9715, + "step": 3013 + }, + { + "epoch": 0.9927127506278562, + "grad_norm": 2.8677029609680176, + "learning_rate": 1.3393254433102798e-08, + "loss": 2.3892, + "step": 3014 + }, + { + "epoch": 0.993042117831117, + "grad_norm": 2.844385862350464, + "learning_rate": 1.2203406266680927e-08, + "loss": 2.2286, + "step": 3015 + }, + { + "epoch": 0.9933714850343777, + "grad_norm": 2.4216244220733643, + "learning_rate": 1.1068891084392574e-08, + "loss": 2.004, + "step": 3016 + }, + { + "epoch": 0.9937008522376385, + "grad_norm": 2.4584288597106934, + "learning_rate": 9.989710142055408e-09, + "loss": 2.2331, + "step": 3017 + }, + { + "epoch": 0.9940302194408992, + "grad_norm": 2.5853490829467773, + "learning_rate": 8.965864634252752e-09, + "loss": 1.6233, + "step": 3018 + }, + { + "epoch": 0.9943595866441599, + "grad_norm": 2.8570823669433594, + "learning_rate": 7.997355694311371e-09, + "loss": 2.0152, + "step": 3019 + }, + { + "epoch": 0.9946889538474206, + "grad_norm": 2.8787717819213867, + "learning_rate": 7.084184394307025e-09, + "loss": 1.9575, + "step": 3020 + }, + { + "epoch": 0.9950183210506813, + "grad_norm": 2.658804416656494, + "learning_rate": 6.226351745042269e-09, + "loss": 2.1558, + "step": 3021 + }, + { + "epoch": 0.9953476882539422, + "grad_norm": 2.7765133380889893, + "learning_rate": 5.423858696090855e-09, + "loss": 1.8984, + "step": 3022 + }, + { + "epoch": 0.9956770554572029, + "grad_norm": 2.8934073448181152, + "learning_rate": 4.676706135747777e-09, + "loss": 2.0635, + "step": 3023 + }, + { + "epoch": 0.9960064226604636, + "grad_norm": 2.6557319164276123, + "learning_rate": 3.984894891062574e-09, + "loss": 1.7287, + "step": 3024 + }, + { + "epoch": 0.9963357898637243, + "grad_norm": 3.173056125640869, + "learning_rate": 3.3484257278226795e-09, + "loss": 1.834, + "step": 3025 + }, + { + "epoch": 0.9966651570669851, + "grad_norm": 2.187110424041748, + "learning_rate": 2.7672993505534206e-09, + "loss": 2.694, + "step": 3026 + }, + { + "epoch": 0.9969945242702458, + "grad_norm": 2.553576707839966, + "learning_rate": 2.2415164025235693e-09, + "loss": 2.3846, + "step": 3027 + }, + { + "epoch": 0.9973238914735065, + "grad_norm": 2.6559970378875732, + "learning_rate": 1.7710774657286876e-09, + "loss": 2.021, + "step": 3028 + }, + { + "epoch": 0.9976532586767672, + "grad_norm": 2.1335747241973877, + "learning_rate": 1.3559830609244373e-09, + "loss": 2.1005, + "step": 3029 + }, + { + "epoch": 0.997982625880028, + "grad_norm": 2.4700515270233154, + "learning_rate": 9.962336475821677e-10, + "loss": 2.14, + "step": 3030 + }, + { + "epoch": 0.9983119930832888, + "grad_norm": 2.239428758621216, + "learning_rate": 6.918296239222244e-10, + "loss": 1.8967, + "step": 3031 + }, + { + "epoch": 0.9986413602865495, + "grad_norm": 3.131831169128418, + "learning_rate": 4.427713268972955e-10, + "loss": 1.9217, + "step": 3032 + }, + { + "epoch": 0.9989707274898102, + "grad_norm": 2.660928726196289, + "learning_rate": 2.4905903219796244e-10, + "loss": 2.2264, + "step": 3033 + }, + { + "epoch": 0.9993000946930709, + "grad_norm": 2.3439199924468994, + "learning_rate": 1.1069295425270021e-10, + "loss": 1.7058, + "step": 3034 + }, + { + "epoch": 0.9996294618963317, + "grad_norm": 2.770085573196411, + "learning_rate": 2.7673246222326144e-11, + "loss": 1.6823, + "step": 3035 + }, + { + "epoch": 0.9999588290995924, + "grad_norm": 3.2446770668029785, + "learning_rate": 0.0, + "loss": 2.2193, + "step": 3036 + }, + { + "epoch": 0.9999588290995924, + "eval_loss": 2.1753156185150146, + "eval_runtime": 761.5568, + "eval_samples_per_second": 3.358, + "eval_steps_per_second": 1.679, + "step": 3036 } ], "logging_steps": 1, @@ -15992,12 +21313,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 3.492436769658372e+18, + "total_flos": 4.656582359544496e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null