{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008150126138483479, "eval_steps": 100000000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.15012613848348e-06, "grad_norm": 43.29102325439453, "learning_rate": 1.0000000000000001e-07, "loss": 81.0533, "step": 1 }, { "epoch": 0.0002445037841545044, "grad_norm": 42.03493881225586, "learning_rate": 3e-06, "loss": 70.3325, "step": 30 }, { "epoch": 0.0004890075683090088, "grad_norm": 10.497812271118164, "learning_rate": 6e-06, "loss": 23.8844, "step": 60 }, { "epoch": 0.0007335113524635132, "grad_norm": 6.692718029022217, "learning_rate": 9e-06, "loss": 11.7198, "step": 90 }, { "epoch": 0.0009780151366180175, "grad_norm": 9.11446475982666, "learning_rate": 1.2e-05, "loss": 10.3199, "step": 120 }, { "epoch": 0.001222518920772522, "grad_norm": 16.92243766784668, "learning_rate": 1.5e-05, "loss": 9.5338, "step": 150 }, { "epoch": 0.0014670227049270264, "grad_norm": 12.929216384887695, "learning_rate": 1.8e-05, "loss": 8.9114, "step": 180 }, { "epoch": 0.0017115264890815308, "grad_norm": 8.830116271972656, "learning_rate": 2.1e-05, "loss": 8.3867, "step": 210 }, { "epoch": 0.001956030273236035, "grad_norm": 7.348124980926514, "learning_rate": 2.4e-05, "loss": 8.0113, "step": 240 }, { "epoch": 0.0022005340573905395, "grad_norm": 12.751787185668945, "learning_rate": 2.7000000000000002e-05, "loss": 7.6093, "step": 270 }, { "epoch": 0.002445037841545044, "grad_norm": 15.10444164276123, "learning_rate": 3e-05, "loss": 7.3484, "step": 300 }, { "epoch": 0.0026895416256995483, "grad_norm": 6.653383731842041, "learning_rate": 3.3e-05, "loss": 7.2105, "step": 330 }, { "epoch": 0.0029340454098540527, "grad_norm": 6.986039161682129, "learning_rate": 3.6e-05, "loss": 7.0228, "step": 360 }, { "epoch": 0.003178549194008557, "grad_norm": 6.230088710784912, "learning_rate": 3.9000000000000006e-05, "loss": 6.8948, "step": 390 }, { "epoch": 0.0034230529781630616, "grad_norm": 8.170981407165527, "learning_rate": 4.2e-05, "loss": 6.6965, "step": 420 }, { "epoch": 0.0036675567623175656, "grad_norm": 5.6345930099487305, "learning_rate": 4.5e-05, "loss": 6.5988, "step": 450 }, { "epoch": 0.00391206054647207, "grad_norm": 5.156513214111328, "learning_rate": 4.8e-05, "loss": 6.4795, "step": 480 }, { "epoch": 0.0041565643306265745, "grad_norm": 5.4964189529418945, "learning_rate": 4.999999990869806e-05, "loss": 6.333, "step": 510 }, { "epoch": 0.004401068114781079, "grad_norm": 3.67378306388855, "learning_rate": 4.999999853916893e-05, "loss": 6.2208, "step": 540 }, { "epoch": 0.004645571898935583, "grad_norm": 8.507222175598145, "learning_rate": 4.9999995526204936e-05, "loss": 6.1097, "step": 570 }, { "epoch": 0.004890075683090088, "grad_norm": 3.756618022918701, "learning_rate": 4.999999086980628e-05, "loss": 5.9886, "step": 600 }, { "epoch": 0.005134579467244592, "grad_norm": 3.8149781227111816, "learning_rate": 4.999998456997326e-05, "loss": 5.8779, "step": 630 }, { "epoch": 0.005379083251399097, "grad_norm": 3.840543270111084, "learning_rate": 4.999997662670628e-05, "loss": 5.805, "step": 660 }, { "epoch": 0.005623587035553601, "grad_norm": 3.4208931922912598, "learning_rate": 4.999996704000589e-05, "loss": 5.6992, "step": 690 }, { "epoch": 0.0058680908197081055, "grad_norm": 3.2975683212280273, "learning_rate": 4.99999558098727e-05, "loss": 5.6531, "step": 720 }, { "epoch": 0.00611259460386261, "grad_norm": 4.05631160736084, "learning_rate": 4.9999942936307445e-05, "loss": 5.554, "step": 750 }, { "epoch": 0.006357098388017114, "grad_norm": 3.1539864540100098, "learning_rate": 4.9999928419310994e-05, "loss": 5.4931, "step": 780 }, { "epoch": 0.006601602172171619, "grad_norm": 4.811732292175293, "learning_rate": 4.999991225888427e-05, "loss": 5.4204, "step": 810 }, { "epoch": 0.006846105956326123, "grad_norm": 2.9593210220336914, "learning_rate": 4.999989445502837e-05, "loss": 5.3687, "step": 840 }, { "epoch": 0.007090609740480627, "grad_norm": 3.942239284515381, "learning_rate": 4.9999875007744436e-05, "loss": 5.3238, "step": 870 }, { "epoch": 0.007335113524635131, "grad_norm": 2.29752254486084, "learning_rate": 4.9999853917033756e-05, "loss": 5.2423, "step": 900 }, { "epoch": 0.007579617308789636, "grad_norm": 2.243770122528076, "learning_rate": 4.999983118289773e-05, "loss": 5.2384, "step": 930 }, { "epoch": 0.00782412109294414, "grad_norm": 2.5572686195373535, "learning_rate": 4.999980680533782e-05, "loss": 5.1761, "step": 960 }, { "epoch": 0.008068624877098645, "grad_norm": 2.4739913940429688, "learning_rate": 4.999978078435567e-05, "loss": 5.1215, "step": 990 } ], "logging_steps": 30, "max_steps": 368091, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 9.730730754048e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }