{ "best_metric": 0.2554427981376648, "best_model_checkpoint": "./output/checkpoint-300", "epoch": 2.459016393442623, "eval_steps": 150, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020491803278688523, "grad_norm": 10.688218116760254, "learning_rate": 4.125e-06, "loss": 0.6384, "step": 10 }, { "epoch": 0.040983606557377046, "grad_norm": 11.446691513061523, "learning_rate": 8.25e-06, "loss": 0.5146, "step": 20 }, { "epoch": 0.06147540983606557, "grad_norm": 8.416762351989746, "learning_rate": 1.2375e-05, "loss": 0.2907, "step": 30 }, { "epoch": 0.08196721311475409, "grad_norm": 6.841181755065918, "learning_rate": 1.65e-05, "loss": 0.2331, "step": 40 }, { "epoch": 0.10245901639344263, "grad_norm": 6.821372032165527, "learning_rate": 2.0625e-05, "loss": 0.2542, "step": 50 }, { "epoch": 0.12295081967213115, "grad_norm": 5.243076801300049, "learning_rate": 2.475e-05, "loss": 0.209, "step": 60 }, { "epoch": 0.14344262295081966, "grad_norm": 6.732413291931152, "learning_rate": 2.8874999999999997e-05, "loss": 0.2261, "step": 70 }, { "epoch": 0.16393442622950818, "grad_norm": 7.677738189697266, "learning_rate": 3.3e-05, "loss": 0.217, "step": 80 }, { "epoch": 0.18442622950819673, "grad_norm": 8.282678604125977, "learning_rate": 3.7125e-05, "loss": 0.2412, "step": 90 }, { "epoch": 0.20491803278688525, "grad_norm": 5.906556129455566, "learning_rate": 4.125e-05, "loss": 0.2219, "step": 100 }, { "epoch": 0.22540983606557377, "grad_norm": 4.646184921264648, "learning_rate": 4.12495760935163e-05, "loss": 0.2674, "step": 110 }, { "epoch": 0.2459016393442623, "grad_norm": 6.774251937866211, "learning_rate": 4.1248304391490334e-05, "loss": 0.2465, "step": 120 }, { "epoch": 0.26639344262295084, "grad_norm": 3.765692949295044, "learning_rate": 4.1246184946196796e-05, "loss": 0.2332, "step": 130 }, { "epoch": 0.28688524590163933, "grad_norm": 4.509705543518066, "learning_rate": 4.124321784475777e-05, "loss": 0.223, "step": 140 }, { "epoch": 0.3073770491803279, "grad_norm": 4.340013027191162, "learning_rate": 4.123940320913919e-05, "loss": 0.2249, "step": 150 }, { "epoch": 0.3073770491803279, "eval_loss": 0.2637089788913727, "eval_runtime": 48.2668, "eval_samples_per_second": 8.992, "eval_steps_per_second": 8.992, "step": 150 }, { "epoch": 0.32786885245901637, "grad_norm": 5.232619285583496, "learning_rate": 4.123474119614577e-05, "loss": 0.2252, "step": 160 }, { "epoch": 0.3483606557377049, "grad_norm": 7.21963357925415, "learning_rate": 4.1229231997414614e-05, "loss": 0.2504, "step": 170 }, { "epoch": 0.36885245901639346, "grad_norm": 4.6124091148376465, "learning_rate": 4.1222875839407306e-05, "loss": 0.241, "step": 180 }, { "epoch": 0.38934426229508196, "grad_norm": 3.9211668968200684, "learning_rate": 4.121567298340059e-05, "loss": 0.2655, "step": 190 }, { "epoch": 0.4098360655737705, "grad_norm": 4.975547790527344, "learning_rate": 4.120762372547569e-05, "loss": 0.245, "step": 200 }, { "epoch": 0.430327868852459, "grad_norm": 5.420774936676025, "learning_rate": 4.119872839650605e-05, "loss": 0.2432, "step": 210 }, { "epoch": 0.45081967213114754, "grad_norm": 4.491248607635498, "learning_rate": 4.118898736214381e-05, "loss": 0.2298, "step": 220 }, { "epoch": 0.4713114754098361, "grad_norm": 4.1634087562561035, "learning_rate": 4.117840102280475e-05, "loss": 0.2498, "step": 230 }, { "epoch": 0.4918032786885246, "grad_norm": 4.817688941955566, "learning_rate": 4.116696981365181e-05, "loss": 0.263, "step": 240 
}, { "epoch": 0.5122950819672131, "grad_norm": 4.0787177085876465, "learning_rate": 4.115469420457721e-05, "loss": 0.2268, "step": 250 }, { "epoch": 0.5327868852459017, "grad_norm": 3.804409980773926, "learning_rate": 4.1141574700183186e-05, "loss": 0.2906, "step": 260 }, { "epoch": 0.5532786885245902, "grad_norm": 2.279557704925537, "learning_rate": 4.1127611839761155e-05, "loss": 0.2568, "step": 270 }, { "epoch": 0.5737704918032787, "grad_norm": 4.525150299072266, "learning_rate": 4.111280619726964e-05, "loss": 0.2316, "step": 280 }, { "epoch": 0.5942622950819673, "grad_norm": 3.5028858184814453, "learning_rate": 4.109715838131059e-05, "loss": 0.2715, "step": 290 }, { "epoch": 0.6147540983606558, "grad_norm": 2.9255504608154297, "learning_rate": 4.108066903510445e-05, "loss": 0.2216, "step": 300 }, { "epoch": 0.6147540983606558, "eval_loss": 0.2554427981376648, "eval_runtime": 47.2419, "eval_samples_per_second": 9.187, "eval_steps_per_second": 9.187, "step": 300 }, { "epoch": 0.6352459016393442, "grad_norm": 4.664366722106934, "learning_rate": 4.106333883646366e-05, "loss": 0.2704, "step": 310 }, { "epoch": 0.6557377049180327, "grad_norm": 6.327053070068359, "learning_rate": 4.104516849776479e-05, "loss": 0.2516, "step": 320 }, { "epoch": 0.6762295081967213, "grad_norm": 4.575782775878906, "learning_rate": 4.1026158765919306e-05, "loss": 0.2214, "step": 330 }, { "epoch": 0.6967213114754098, "grad_norm": 6.128705024719238, "learning_rate": 4.100631042234283e-05, "loss": 0.2463, "step": 340 }, { "epoch": 0.7172131147540983, "grad_norm": 3.766876459121704, "learning_rate": 4.098562428292304e-05, "loss": 0.2264, "step": 350 }, { "epoch": 0.7377049180327869, "grad_norm": 4.255002021789551, "learning_rate": 4.096410119798607e-05, "loss": 0.2186, "step": 360 }, { "epoch": 0.7581967213114754, "grad_norm": 2.6822333335876465, "learning_rate": 4.094174205226167e-05, "loss": 0.2379, "step": 370 }, { "epoch": 0.7786885245901639, "grad_norm": 2.5499961376190186, "learning_rate": 4.0918547764846736e-05, "loss": 0.2482, "step": 380 }, { "epoch": 0.7991803278688525, "grad_norm": 4.9328131675720215, "learning_rate": 4.089451928916758e-05, "loss": 0.2127, "step": 390 }, { "epoch": 0.819672131147541, "grad_norm": 3.1114511489868164, "learning_rate": 4.0869657612940723e-05, "loss": 0.1951, "step": 400 }, { "epoch": 0.8401639344262295, "grad_norm": 2.780898094177246, "learning_rate": 4.08439637581323e-05, "loss": 0.2342, "step": 410 }, { "epoch": 0.860655737704918, "grad_norm": 4.129917144775391, "learning_rate": 4.081743878091604e-05, "loss": 0.2477, "step": 420 }, { "epoch": 0.8811475409836066, "grad_norm": 1.9615871906280518, "learning_rate": 4.079008377162988e-05, "loss": 0.2315, "step": 430 }, { "epoch": 0.9016393442622951, "grad_norm": 4.094489097595215, "learning_rate": 4.0761899854731085e-05, "loss": 0.2491, "step": 440 }, { "epoch": 0.9221311475409836, "grad_norm": 3.1119954586029053, "learning_rate": 4.073288818875011e-05, "loss": 0.2273, "step": 450 }, { "epoch": 0.9221311475409836, "eval_loss": 0.2569008469581604, "eval_runtime": 47.4412, "eval_samples_per_second": 9.148, "eval_steps_per_second": 9.148, "step": 450 }, { "epoch": 0.9426229508196722, "grad_norm": 5.038336277008057, "learning_rate": 4.070304996624291e-05, "loss": 0.2415, "step": 460 }, { "epoch": 0.9631147540983607, "grad_norm": 2.174974203109741, "learning_rate": 4.067238641374194e-05, "loss": 0.2165, "step": 470 }, { "epoch": 0.9836065573770492, "grad_norm": 2.632232427597046, "learning_rate": 4.0640898791705745e-05, "loss": 
0.234, "step": 480 }, { "epoch": 1.0040983606557377, "grad_norm": 2.3789021968841553, "learning_rate": 4.060858839446713e-05, "loss": 0.2315, "step": 490 }, { "epoch": 1.0245901639344261, "grad_norm": 5.131577968597412, "learning_rate": 4.057545655017998e-05, "loss": 0.1357, "step": 500 }, { "epoch": 1.0450819672131149, "grad_norm": 2.3640201091766357, "learning_rate": 4.054150462076465e-05, "loss": 0.1466, "step": 510 }, { "epoch": 1.0655737704918034, "grad_norm": 2.5582334995269775, "learning_rate": 4.0506734001851976e-05, "loss": 0.1301, "step": 520 }, { "epoch": 1.0860655737704918, "grad_norm": 1.3578604459762573, "learning_rate": 4.0471146122725904e-05, "loss": 0.134, "step": 530 }, { "epoch": 1.1065573770491803, "grad_norm": 4.0858635902404785, "learning_rate": 4.043474244626477e-05, "loss": 0.1571, "step": 540 }, { "epoch": 1.1270491803278688, "grad_norm": 3.3123364448547363, "learning_rate": 4.0397524468881125e-05, "loss": 0.1283, "step": 550 }, { "epoch": 1.1475409836065573, "grad_norm": 2.651021957397461, "learning_rate": 4.0359493720460244e-05, "loss": 0.1477, "step": 560 }, { "epoch": 1.1680327868852458, "grad_norm": 3.309422492980957, "learning_rate": 4.032065176429724e-05, "loss": 0.134, "step": 570 }, { "epoch": 1.1885245901639343, "grad_norm": 2.194988489151001, "learning_rate": 4.0281000197032795e-05, "loss": 0.137, "step": 580 }, { "epoch": 1.209016393442623, "grad_norm": 5.226597309112549, "learning_rate": 4.0240540648587546e-05, "loss": 0.1536, "step": 590 }, { "epoch": 1.2295081967213115, "grad_norm": 3.474353075027466, "learning_rate": 4.019927478209504e-05, "loss": 0.129, "step": 600 }, { "epoch": 1.2295081967213115, "eval_loss": 0.2682632803916931, "eval_runtime": 47.3944, "eval_samples_per_second": 9.157, "eval_steps_per_second": 9.157, "step": 600 }, { "epoch": 1.25, "grad_norm": 2.8130834102630615, "learning_rate": 4.015720429383344e-05, "loss": 0.1478, "step": 610 }, { "epoch": 1.2704918032786885, "grad_norm": 4.518388748168945, "learning_rate": 4.0114330913155726e-05, "loss": 0.116, "step": 620 }, { "epoch": 1.290983606557377, "grad_norm": 2.860029458999634, "learning_rate": 4.007065640241867e-05, "loss": 0.1453, "step": 630 }, { "epoch": 1.3114754098360657, "grad_norm": 5.334210395812988, "learning_rate": 4.002618255691033e-05, "loss": 0.1406, "step": 640 }, { "epoch": 1.331967213114754, "grad_norm": 2.467299699783325, "learning_rate": 3.9980911204776306e-05, "loss": 0.1528, "step": 650 }, { "epoch": 1.3524590163934427, "grad_norm": 2.271044969558716, "learning_rate": 3.993484420694458e-05, "loss": 0.132, "step": 660 }, { "epoch": 1.3729508196721312, "grad_norm": 5.004654407501221, "learning_rate": 3.988798345704899e-05, "loss": 0.1454, "step": 670 }, { "epoch": 1.3934426229508197, "grad_norm": 3.49129581451416, "learning_rate": 3.984033088135143e-05, "loss": 0.1606, "step": 680 }, { "epoch": 1.4139344262295082, "grad_norm": 1.928194284439087, "learning_rate": 3.979188843866263e-05, "loss": 0.1575, "step": 690 }, { "epoch": 1.4344262295081966, "grad_norm": 3.2966251373291016, "learning_rate": 3.97426581202617e-05, "loss": 0.1446, "step": 700 }, { "epoch": 1.4549180327868854, "grad_norm": 3.7774240970611572, "learning_rate": 3.969264194981418e-05, "loss": 0.1545, "step": 710 }, { "epoch": 1.4754098360655736, "grad_norm": 2.730626344680786, "learning_rate": 3.9641841983288953e-05, "loss": 0.1356, "step": 720 }, { "epoch": 1.4959016393442623, "grad_norm": 2.9814343452453613, "learning_rate": 3.959026030887367e-05, "loss": 0.1619, "step": 730 }, { "epoch": 
1.5163934426229508, "grad_norm": 2.328427791595459, "learning_rate": 3.953789904688893e-05, "loss": 0.1366, "step": 740 }, { "epoch": 1.5368852459016393, "grad_norm": 2.9205949306488037, "learning_rate": 3.948476034970113e-05, "loss": 0.1342, "step": 750 }, { "epoch": 1.5368852459016393, "eval_loss": 0.2687544822692871, "eval_runtime": 47.2924, "eval_samples_per_second": 9.177, "eval_steps_per_second": 9.177, "step": 750 }, { "epoch": 1.5573770491803278, "grad_norm": 3.180044651031494, "learning_rate": 3.943084640163398e-05, "loss": 0.1529, "step": 760 }, { "epoch": 1.5778688524590163, "grad_norm": 5.1344757080078125, "learning_rate": 3.937615941887873e-05, "loss": 0.1586, "step": 770 }, { "epoch": 1.598360655737705, "grad_norm": 3.872201442718506, "learning_rate": 3.932070164940304e-05, "loss": 0.1325, "step": 780 }, { "epoch": 1.6188524590163933, "grad_norm": 2.4643361568450928, "learning_rate": 3.926447537285859e-05, "loss": 0.154, "step": 790 }, { "epoch": 1.639344262295082, "grad_norm": 2.1674957275390625, "learning_rate": 3.920748290048739e-05, "loss": 0.1467, "step": 800 }, { "epoch": 1.6598360655737705, "grad_norm": 3.749096393585205, "learning_rate": 3.914972657502677e-05, "loss": 0.141, "step": 810 }, { "epoch": 1.680327868852459, "grad_norm": 2.411086082458496, "learning_rate": 3.9091208770613036e-05, "loss": 0.1637, "step": 820 }, { "epoch": 1.7008196721311475, "grad_norm": 3.2505710124969482, "learning_rate": 3.9031931892683937e-05, "loss": 0.1851, "step": 830 }, { "epoch": 1.721311475409836, "grad_norm": 2.310121536254883, "learning_rate": 3.897189837787975e-05, "loss": 0.1316, "step": 840 }, { "epoch": 1.7418032786885247, "grad_norm": 2.3931281566619873, "learning_rate": 3.891111069394313e-05, "loss": 0.1527, "step": 850 }, { "epoch": 1.762295081967213, "grad_norm": 2.8994643688201904, "learning_rate": 3.884957133961768e-05, "loss": 0.1377, "step": 860 }, { "epoch": 1.7827868852459017, "grad_norm": 3.968078374862671, "learning_rate": 3.878728284454522e-05, "loss": 0.1527, "step": 870 }, { "epoch": 1.8032786885245902, "grad_norm": 5.090726375579834, "learning_rate": 3.872424776916183e-05, "loss": 0.1682, "step": 880 }, { "epoch": 1.8237704918032787, "grad_norm": 2.774803876876831, "learning_rate": 3.866046870459253e-05, "loss": 0.1526, "step": 890 }, { "epoch": 1.8442622950819674, "grad_norm": 3.3825931549072266, "learning_rate": 3.8595948272544905e-05, "loss": 0.1696, "step": 900 }, { "epoch": 1.8442622950819674, "eval_loss": 0.27062124013900757, "eval_runtime": 47.0499, "eval_samples_per_second": 9.224, "eval_steps_per_second": 9.224, "step": 900 }, { "epoch": 1.8647540983606556, "grad_norm": 5.232215404510498, "learning_rate": 3.8530689125201184e-05, "loss": 0.1638, "step": 910 }, { "epoch": 1.8852459016393444, "grad_norm": 3.0692834854125977, "learning_rate": 3.8464693945109305e-05, "loss": 0.1743, "step": 920 }, { "epoch": 1.9057377049180326, "grad_norm": 2.1382927894592285, "learning_rate": 3.839796544507265e-05, "loss": 0.1396, "step": 930 }, { "epoch": 1.9262295081967213, "grad_norm": 4.51501989364624, "learning_rate": 3.833050636803849e-05, "loss": 0.1721, "step": 940 }, { "epoch": 1.9467213114754098, "grad_norm": 2.188847541809082, "learning_rate": 3.826231948698527e-05, "loss": 0.1549, "step": 950 }, { "epoch": 1.9672131147540983, "grad_norm": 5.189305305480957, "learning_rate": 3.819340760480859e-05, "loss": 0.1588, "step": 960 }, { "epoch": 1.987704918032787, "grad_norm": 3.801185131072998, "learning_rate": 3.812377355420602e-05, "loss": 0.1559, "step": 970 }, { 
"epoch": 2.0081967213114753, "grad_norm": 2.1336910724639893, "learning_rate": 3.805342019756065e-05, "loss": 0.1184, "step": 980 }, { "epoch": 2.028688524590164, "grad_norm": 2.940256118774414, "learning_rate": 3.7982350426823406e-05, "loss": 0.097, "step": 990 }, { "epoch": 2.0491803278688523, "grad_norm": 1.487196445465088, "learning_rate": 3.791056716339421e-05, "loss": 0.073, "step": 1000 }, { "epoch": 2.069672131147541, "grad_norm": 3.6881015300750732, "learning_rate": 3.783807335800187e-05, "loss": 0.0971, "step": 1010 }, { "epoch": 2.0901639344262297, "grad_norm": 4.102665424346924, "learning_rate": 3.776487199058277e-05, "loss": 0.1018, "step": 1020 }, { "epoch": 2.110655737704918, "grad_norm": 2.7358601093292236, "learning_rate": 3.769096607015843e-05, "loss": 0.0848, "step": 1030 }, { "epoch": 2.1311475409836067, "grad_norm": 2.9479758739471436, "learning_rate": 3.761635863471175e-05, "loss": 0.0999, "step": 1040 }, { "epoch": 2.151639344262295, "grad_norm": 2.3228375911712646, "learning_rate": 3.754105275106222e-05, "loss": 0.0852, "step": 1050 }, { "epoch": 2.151639344262295, "eval_loss": 0.29612693190574646, "eval_runtime": 47.1357, "eval_samples_per_second": 9.207, "eval_steps_per_second": 9.207, "step": 1050 }, { "epoch": 2.1721311475409837, "grad_norm": 1.8034647703170776, "learning_rate": 3.746505151473972e-05, "loss": 0.084, "step": 1060 }, { "epoch": 2.192622950819672, "grad_norm": 2.4312632083892822, "learning_rate": 3.738835804985743e-05, "loss": 0.0808, "step": 1070 }, { "epoch": 2.2131147540983607, "grad_norm": 3.123870611190796, "learning_rate": 3.731097550898329e-05, "loss": 0.0988, "step": 1080 }, { "epoch": 2.2336065573770494, "grad_norm": 1.6723111867904663, "learning_rate": 3.723290707301047e-05, "loss": 0.0867, "step": 1090 }, { "epoch": 2.2540983606557377, "grad_norm": 3.273495674133301, "learning_rate": 3.7154155951026605e-05, "loss": 0.0885, "step": 1100 }, { "epoch": 2.2745901639344264, "grad_norm": 4.717736721038818, "learning_rate": 3.707472538018187e-05, "loss": 0.1093, "step": 1110 }, { "epoch": 2.2950819672131146, "grad_norm": 1.6000672578811646, "learning_rate": 3.6994618625555925e-05, "loss": 0.0945, "step": 1120 }, { "epoch": 2.3155737704918034, "grad_norm": 3.0358238220214844, "learning_rate": 3.691383898002368e-05, "loss": 0.0971, "step": 1130 }, { "epoch": 2.3360655737704916, "grad_norm": 2.1267294883728027, "learning_rate": 3.683238976412e-05, "loss": 0.0867, "step": 1140 }, { "epoch": 2.3565573770491803, "grad_norm": 2.6568408012390137, "learning_rate": 3.675027432590312e-05, "loss": 0.0857, "step": 1150 }, { "epoch": 2.3770491803278686, "grad_norm": 4.555295944213867, "learning_rate": 3.666749604081707e-05, "loss": 0.0827, "step": 1160 }, { "epoch": 2.3975409836065573, "grad_norm": 5.612463474273682, "learning_rate": 3.6584058311552954e-05, "loss": 0.0932, "step": 1170 }, { "epoch": 2.418032786885246, "grad_norm": 2.8655173778533936, "learning_rate": 3.6499964567909e-05, "loss": 0.0966, "step": 1180 }, { "epoch": 2.4385245901639343, "grad_norm": 2.9781882762908936, "learning_rate": 3.641521826664964e-05, "loss": 0.0891, "step": 1190 }, { "epoch": 2.459016393442623, "grad_norm": 2.27463960647583, "learning_rate": 3.63298228913634e-05, "loss": 0.1021, "step": 1200 }, { "epoch": 2.459016393442623, "eval_loss": 0.2947063744068146, "eval_runtime": 47.1827, "eval_samples_per_second": 9.198, "eval_steps_per_second": 9.198, "step": 1200 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 11, "save_steps": 
150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.371689568341197e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }