{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 610, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16393442622950818, "grad_norm": 0.4943847358226776, "learning_rate": 0.00019986740898848306, "loss": 1.1081, "step": 10 }, { "epoch": 0.32786885245901637, "grad_norm": 0.44973108172416687, "learning_rate": 0.0001994699875614589, "loss": 0.9226, "step": 20 }, { "epoch": 0.4918032786885246, "grad_norm": 0.3362947404384613, "learning_rate": 0.00019880878960910772, "loss": 0.887, "step": 30 }, { "epoch": 0.6557377049180327, "grad_norm": 0.36153319478034973, "learning_rate": 0.0001978855685095358, "loss": 0.8615, "step": 40 }, { "epoch": 0.819672131147541, "grad_norm": 0.36022189259529114, "learning_rate": 0.00019670277247913205, "loss": 0.8234, "step": 50 }, { "epoch": 0.9836065573770492, "grad_norm": 0.4029320478439331, "learning_rate": 0.00019526353808033825, "loss": 0.8149, "step": 60 }, { "epoch": 1.1475409836065573, "grad_norm": 0.3638781011104584, "learning_rate": 0.00019357168190404936, "loss": 0.799, "step": 70 }, { "epoch": 1.3114754098360657, "grad_norm": 0.3549768924713135, "learning_rate": 0.0001916316904487005, "loss": 0.785, "step": 80 }, { "epoch": 1.4754098360655736, "grad_norm": 0.3937470018863678, "learning_rate": 0.00018944870822287956, "loss": 0.7776, "step": 90 }, { "epoch": 1.639344262295082, "grad_norm": 0.36256495118141174, "learning_rate": 0.00018702852410301554, "loss": 0.7756, "step": 100 }, { "epoch": 1.8032786885245902, "grad_norm": 0.3645057678222656, "learning_rate": 0.00018437755598231856, "loss": 0.767, "step": 110 }, { "epoch": 1.9672131147540983, "grad_norm": 0.3446103632450104, "learning_rate": 0.00018150283375168114, "loss": 0.7543, "step": 120 }, { "epoch": 2.1311475409836067, "grad_norm": 0.4181569516658783, "learning_rate": 0.00017841198065767107, "loss": 0.7231, "step": 130 }, { "epoch": 2.2950819672131146, "grad_norm": 0.40038439631462097, "learning_rate": 0.00017511319308705198, "loss": 0.7348, "step": 140 }, { "epoch": 2.459016393442623, "grad_norm": 0.3984282612800598, "learning_rate": 0.00017161521883143934, "loss": 0.7255, "step": 150 }, { "epoch": 2.6229508196721314, "grad_norm": 0.4186936318874359, "learning_rate": 0.00016792733388972932, "loss": 0.7263, "step": 160 }, { "epoch": 2.7868852459016393, "grad_norm": 0.4004381597042084, "learning_rate": 0.00016405931786981755, "loss": 0.7271, "step": 170 }, { "epoch": 2.9508196721311473, "grad_norm": 0.4597119390964508, "learning_rate": 0.00016002142805483685, "loss": 0.7125, "step": 180 }, { "epoch": 3.1147540983606556, "grad_norm": 0.4180513322353363, "learning_rate": 0.00015582437220268647, "loss": 0.6925, "step": 190 }, { "epoch": 3.278688524590164, "grad_norm": 0.4401358366012573, "learning_rate": 0.0001514792801509831, "loss": 0.6863, "step": 200 }, { "epoch": 3.442622950819672, "grad_norm": 0.4415852427482605, "learning_rate": 0.000146997674302732, "loss": 0.6857, "step": 210 }, { "epoch": 3.6065573770491803, "grad_norm": 0.4144597053527832, "learning_rate": 0.0001423914390709861, "loss": 0.6807, "step": 220 }, { "epoch": 3.7704918032786887, "grad_norm": 0.4162183403968811, "learning_rate": 0.00013767278936351854, "loss": 0.6846, "step": 230 }, { "epoch": 3.9344262295081966, "grad_norm": 0.42117443680763245, "learning_rate": 0.0001328542381910835, "loss": 0.6887, "step": 240 }, { "epoch": 4.098360655737705, "grad_norm": 0.4591551125049591, "learning_rate": 
0.00012794856348516095, "loss": 0.6543, "step": 250 }, { "epoch": 4.262295081967213, "grad_norm": 0.4592267572879791, "learning_rate": 0.0001229687742131796, "loss": 0.6482, "step": 260 }, { "epoch": 4.426229508196721, "grad_norm": 0.47369667887687683, "learning_rate": 0.00011792807588107357, "loss": 0.6494, "step": 270 }, { "epoch": 4.590163934426229, "grad_norm": 0.45578595995903015, "learning_rate": 0.00011283983551465511, "loss": 0.6559, "step": 280 }, { "epoch": 4.754098360655737, "grad_norm": 0.4742227792739868, "learning_rate": 0.00010771754621266466, "loss": 0.6462, "step": 290 }, { "epoch": 4.918032786885246, "grad_norm": 0.48461949825286865, "learning_rate": 0.00010257479136549889, "loss": 0.6501, "step": 300 }, { "epoch": 5.081967213114754, "grad_norm": 0.4797608554363251, "learning_rate": 9.742520863450115e-05, "loss": 0.6452, "step": 310 }, { "epoch": 5.245901639344262, "grad_norm": 0.505832314491272, "learning_rate": 9.228245378733537e-05, "loss": 0.6178, "step": 320 }, { "epoch": 5.409836065573771, "grad_norm": 0.5265078544616699, "learning_rate": 8.71601644853449e-05, "loss": 0.6177, "step": 330 }, { "epoch": 5.573770491803279, "grad_norm": 0.5109356045722961, "learning_rate": 8.207192411892646e-05, "loss": 0.6225, "step": 340 }, { "epoch": 5.737704918032787, "grad_norm": 0.5139452815055847, "learning_rate": 7.703122578682046e-05, "loss": 0.6229, "step": 350 }, { "epoch": 5.901639344262295, "grad_norm": 0.5581080913543701, "learning_rate": 7.205143651483906e-05, "loss": 0.6189, "step": 360 }, { "epoch": 6.065573770491803, "grad_norm": 0.4794338345527649, "learning_rate": 6.714576180891654e-05, "loss": 0.6087, "step": 370 }, { "epoch": 6.229508196721311, "grad_norm": 0.5142917037010193, "learning_rate": 6.232721063648148e-05, "loss": 0.5977, "step": 380 }, { "epoch": 6.39344262295082, "grad_norm": 0.547099769115448, "learning_rate": 5.7608560929013946e-05, "loss": 0.6058, "step": 390 }, { "epoch": 6.557377049180328, "grad_norm": 0.5487104654312134, "learning_rate": 5.300232569726804e-05, "loss": 0.5939, "step": 400 }, { "epoch": 6.721311475409836, "grad_norm": 0.5189688205718994, "learning_rate": 4.852071984901696e-05, "loss": 0.6015, "step": 410 }, { "epoch": 6.885245901639344, "grad_norm": 0.5185168981552124, "learning_rate": 4.417562779731355e-05, "loss": 0.5949, "step": 420 }, { "epoch": 7.049180327868853, "grad_norm": 0.5486496090888977, "learning_rate": 3.997857194516319e-05, "loss": 0.5957, "step": 430 }, { "epoch": 7.213114754098361, "grad_norm": 0.5592466592788696, "learning_rate": 3.594068213018249e-05, "loss": 0.582, "step": 440 }, { "epoch": 7.377049180327869, "grad_norm": 0.5590682625770569, "learning_rate": 3.207266611027069e-05, "loss": 0.5779, "step": 450 }, { "epoch": 7.540983606557377, "grad_norm": 0.546293318271637, "learning_rate": 2.8384781168560693e-05, "loss": 0.5736, "step": 460 }, { "epoch": 7.704918032786885, "grad_norm": 0.5725670456886292, "learning_rate": 2.4886806912948035e-05, "loss": 0.5825, "step": 470 }, { "epoch": 7.868852459016393, "grad_norm": 0.574937105178833, "learning_rate": 2.1588019342328968e-05, "loss": 0.5843, "step": 480 }, { "epoch": 8.032786885245901, "grad_norm": 0.5568664073944092, "learning_rate": 1.8497166248318876e-05, "loss": 0.5791, "step": 490 }, { "epoch": 8.19672131147541, "grad_norm": 0.568524181842804, "learning_rate": 1.562244401768144e-05, "loss": 0.5673, "step": 500 }, { "epoch": 8.360655737704919, "grad_norm": 0.5631764531135559, "learning_rate": 1.2971475896984475e-05, "loss": 0.5557, "step": 510 }, { "epoch": 
8.524590163934427, "grad_norm": 0.5955241322517395, "learning_rate": 1.0551291777120464e-05, "loss": 0.5741, "step": 520 }, { "epoch": 8.688524590163935, "grad_norm": 0.5839523077011108, "learning_rate": 8.368309551299536e-06, "loss": 0.5674, "step": 530 }, { "epoch": 8.852459016393443, "grad_norm": 0.5877330899238586, "learning_rate": 6.428318095950647e-06, "loss": 0.5819, "step": 540 }, { "epoch": 9.01639344262295, "grad_norm": 0.5768188238143921, "learning_rate": 4.7364619196617495e-06, "loss": 0.5728, "step": 550 }, { "epoch": 9.180327868852459, "grad_norm": 0.5796675682067871, "learning_rate": 3.2972275208679625e-06, "loss": 0.5682, "step": 560 }, { "epoch": 9.344262295081966, "grad_norm": 0.6149064898490906, "learning_rate": 2.1144314904642195e-06, "loss": 0.5699, "step": 570 }, { "epoch": 9.508196721311476, "grad_norm": 0.558674156665802, "learning_rate": 1.1912103908922945e-06, "loss": 0.5663, "step": 580 }, { "epoch": 9.672131147540984, "grad_norm": 0.563441812992096, "learning_rate": 5.300124385410943e-07, "loss": 0.5626, "step": 590 }, { "epoch": 9.836065573770492, "grad_norm": 0.5652551651000977, "learning_rate": 1.3259101151694708e-07, "loss": 0.5579, "step": 600 }, { "epoch": 10.0, "grad_norm": 0.5645861625671387, "learning_rate": 0.0, "loss": 0.5631, "step": 610 }, { "epoch": 10.0, "step": 610, "total_flos": 4.672887793385472e+16, "train_loss": 0.6668467552935491, "train_runtime": 2732.8065, "train_samples_per_second": 2.679, "train_steps_per_second": 0.223 } ], "logging_steps": 10, "max_steps": 610, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 4.672887793385472e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }
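
The file above appears to be a Hugging Face `transformers` Trainer state log (`trainer_state.json`). A minimal sketch of reading it back and plotting the logged training loss and learning-rate schedule follows, assuming the JSON is saved locally as `trainer_state.json` and that `matplotlib` is installed; the filename and the plotting choices are illustrative assumptions, not part of the log itself.

```python
# Sketch: visualize the log_history entries from a saved trainer_state.json.
# Assumes the JSON above is stored as "trainer_state.json" in the working directory.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step logging records; the final summary record has no "loss" key.
entries = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]
lrs = [e["learning_rate"] for e in entries]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```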