{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.011701380762930026, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011701380762930026, "eval_loss": 5.071834564208984, "eval_runtime": 261.8333, "eval_samples_per_second": 54.974, "eval_steps_per_second": 6.875, "step": 1 }, { "epoch": 0.00035104142288790077, "grad_norm": 6.64661169052124, "learning_rate": 3e-05, "loss": 4.9241, "step": 3 }, { "epoch": 0.0007020828457758015, "grad_norm": 3.513359785079956, "learning_rate": 6e-05, "loss": 4.6983, "step": 6 }, { "epoch": 0.0010531242686637023, "grad_norm": 3.402698278427124, "learning_rate": 9e-05, "loss": 4.7042, "step": 9 }, { "epoch": 0.0010531242686637023, "eval_loss": 4.4165263175964355, "eval_runtime": 262.1486, "eval_samples_per_second": 54.908, "eval_steps_per_second": 6.866, "step": 9 }, { "epoch": 0.001404165691551603, "grad_norm": 3.1393704414367676, "learning_rate": 9.987820251299122e-05, "loss": 4.1249, "step": 12 }, { "epoch": 0.0017552071144395038, "grad_norm": 2.3505287170410156, "learning_rate": 9.924038765061042e-05, "loss": 3.8557, "step": 15 }, { "epoch": 0.0021062485373274046, "grad_norm": 2.0289347171783447, "learning_rate": 9.806308479691595e-05, "loss": 3.9179, "step": 18 }, { "epoch": 0.0021062485373274046, "eval_loss": 3.685943841934204, "eval_runtime": 261.9757, "eval_samples_per_second": 54.944, "eval_steps_per_second": 6.871, "step": 18 }, { "epoch": 0.0024572899602153054, "grad_norm": 2.3705809116363525, "learning_rate": 9.635919272833938e-05, "loss": 3.6618, "step": 21 }, { "epoch": 0.002808331383103206, "grad_norm": 2.002850294113159, "learning_rate": 9.414737964294636e-05, "loss": 3.4756, "step": 24 }, { "epoch": 0.003159372805991107, "grad_norm": 1.663278579711914, "learning_rate": 9.145187862775209e-05, "loss": 3.3452, "step": 27 }, { "epoch": 0.003159372805991107, "eval_loss": 3.578341245651245, "eval_runtime": 262.0449, "eval_samples_per_second": 54.93, "eval_steps_per_second": 6.869, "step": 27 }, { "epoch": 0.0035104142288790077, "grad_norm": 1.877441167831421, "learning_rate": 8.83022221559489e-05, "loss": 3.4687, "step": 30 }, { "epoch": 0.0038614556517669084, "grad_norm": 1.7343926429748535, "learning_rate": 8.473291852294987e-05, "loss": 3.4485, "step": 33 }, { "epoch": 0.004212497074654809, "grad_norm": 1.607003927230835, "learning_rate": 8.07830737662829e-05, "loss": 3.414, "step": 36 }, { "epoch": 0.004212497074654809, "eval_loss": 3.5100536346435547, "eval_runtime": 261.9844, "eval_samples_per_second": 54.942, "eval_steps_per_second": 6.871, "step": 36 }, { "epoch": 0.00456353849754271, "grad_norm": 2.106229305267334, "learning_rate": 7.649596321166024e-05, "loss": 3.4085, "step": 39 }, { "epoch": 0.004914579920430611, "grad_norm": 1.4526017904281616, "learning_rate": 7.191855733945387e-05, "loss": 3.4435, "step": 42 }, { "epoch": 0.0052656213433185115, "grad_norm": 1.8056683540344238, "learning_rate": 6.710100716628344e-05, "loss": 3.6258, "step": 45 }, { "epoch": 0.0052656213433185115, "eval_loss": 3.472996234893799, "eval_runtime": 262.4211, "eval_samples_per_second": 54.851, "eval_steps_per_second": 6.859, "step": 45 }, { "epoch": 0.005616662766206412, "grad_norm": 2.4865214824676514, "learning_rate": 6.209609477998338e-05, "loss": 3.4346, "step": 48 }, { "epoch": 0.005967704189094313, "grad_norm": 1.5772249698638916, "learning_rate": 5.695865504800327e-05, "loss": 3.4463, "step": 51 }, { "epoch": 0.006318745611982214, "grad_norm": 1.514796257019043, "learning_rate": 5.174497483512506e-05, "loss": 3.4054, "step": 54 }, { "epoch": 0.006318745611982214, "eval_loss": 3.453535318374634, "eval_runtime": 262.1313, "eval_samples_per_second": 54.911, "eval_steps_per_second": 6.867, "step": 54 }, { "epoch": 0.0066697870348701145, "grad_norm": 1.575348973274231, "learning_rate": 4.6512176312793736e-05, "loss": 3.4188, "step": 57 }, { "epoch": 0.007020828457758015, "grad_norm": 1.889758586883545, "learning_rate": 4.131759111665349e-05, "loss": 3.2958, "step": 60 }, { "epoch": 0.007371869880645916, "grad_norm": 1.7029871940612793, "learning_rate": 3.6218132209150045e-05, "loss": 3.1771, "step": 63 }, { "epoch": 0.007371869880645916, "eval_loss": 3.4409759044647217, "eval_runtime": 262.1868, "eval_samples_per_second": 54.9, "eval_steps_per_second": 6.865, "step": 63 }, { "epoch": 0.007722911303533817, "grad_norm": 1.5269502401351929, "learning_rate": 3.12696703292044e-05, "loss": 3.4087, "step": 66 }, { "epoch": 0.008073952726421718, "grad_norm": 2.0026628971099854, "learning_rate": 2.6526421860705473e-05, "loss": 3.4728, "step": 69 }, { "epoch": 0.008424994149309618, "grad_norm": 1.3194637298583984, "learning_rate": 2.2040354826462668e-05, "loss": 3.1512, "step": 72 }, { "epoch": 0.008424994149309618, "eval_loss": 3.4321975708007812, "eval_runtime": 262.2406, "eval_samples_per_second": 54.889, "eval_steps_per_second": 6.864, "step": 72 }, { "epoch": 0.00877603557219752, "grad_norm": 1.8826606273651123, "learning_rate": 1.7860619515673033e-05, "loss": 3.4841, "step": 75 }, { "epoch": 0.00912707699508542, "grad_norm": 2.0330681800842285, "learning_rate": 1.4033009983067452e-05, "loss": 3.3632, "step": 78 }, { "epoch": 0.009478118417973322, "grad_norm": 1.4071763753890991, "learning_rate": 1.0599462319663905e-05, "loss": 3.2883, "step": 81 }, { "epoch": 0.009478118417973322, "eval_loss": 3.425398588180542, "eval_runtime": 262.4725, "eval_samples_per_second": 54.84, "eval_steps_per_second": 6.858, "step": 81 }, { "epoch": 0.009829159840861221, "grad_norm": 1.6387964487075806, "learning_rate": 7.597595192178702e-06, "loss": 3.1512, "step": 84 }, { "epoch": 0.010180201263749123, "grad_norm": 1.5741945505142212, "learning_rate": 5.060297685041659e-06, "loss": 3.2649, "step": 87 }, { "epoch": 0.010531242686637023, "grad_norm": 1.4074112176895142, "learning_rate": 3.0153689607045845e-06, "loss": 3.2711, "step": 90 }, { "epoch": 0.010531242686637023, "eval_loss": 3.4211525917053223, "eval_runtime": 262.4203, "eval_samples_per_second": 54.851, "eval_steps_per_second": 6.859, "step": 90 }, { "epoch": 0.010882284109524925, "grad_norm": 1.1304495334625244, "learning_rate": 1.4852136862001764e-06, "loss": 3.0629, "step": 93 }, { "epoch": 0.011233325532412824, "grad_norm": 1.7654685974121094, "learning_rate": 4.865965629214819e-07, "loss": 3.3242, "step": 96 }, { "epoch": 0.011584366955300726, "grad_norm": 1.1728322505950928, "learning_rate": 3.04586490452119e-08, "loss": 2.992, "step": 99 }, { "epoch": 0.011584366955300726, "eval_loss": 3.4203412532806396, "eval_runtime": 262.2463, "eval_samples_per_second": 54.887, "eval_steps_per_second": 6.864, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.05873754406912e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }