{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0615384615384613, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020512820512820513, "eval_loss": 0.4737852215766907, "eval_runtime": 6.0877, "eval_samples_per_second": 13.47, "eval_steps_per_second": 1.807, "step": 1 }, { "epoch": 0.06153846153846154, "grad_norm": 0.3213890790939331, "learning_rate": 1.5e-05, "loss": 0.4539, "step": 3 }, { "epoch": 0.12307692307692308, "grad_norm": 0.2761625051498413, "learning_rate": 3e-05, "loss": 0.3822, "step": 6 }, { "epoch": 0.18461538461538463, "grad_norm": 0.3234591782093048, "learning_rate": 4.5e-05, "loss": 0.4102, "step": 9 }, { "epoch": 0.18461538461538463, "eval_loss": 0.4136742055416107, "eval_runtime": 6.1577, "eval_samples_per_second": 13.317, "eval_steps_per_second": 1.786, "step": 9 }, { "epoch": 0.24615384615384617, "grad_norm": 0.25304943323135376, "learning_rate": 4.993910125649561e-05, "loss": 0.3169, "step": 12 }, { "epoch": 0.3076923076923077, "grad_norm": 0.24388673901557922, "learning_rate": 4.962019382530521e-05, "loss": 0.2744, "step": 15 }, { "epoch": 0.36923076923076925, "grad_norm": 0.3849531412124634, "learning_rate": 4.9031542398457974e-05, "loss": 0.2113, "step": 18 }, { "epoch": 0.36923076923076925, "eval_loss": 0.2518386244773865, "eval_runtime": 6.171, "eval_samples_per_second": 13.288, "eval_steps_per_second": 1.783, "step": 18 }, { "epoch": 0.4307692307692308, "grad_norm": 0.11890580505132675, "learning_rate": 4.817959636416969e-05, "loss": 0.1947, "step": 21 }, { "epoch": 0.49230769230769234, "grad_norm": 0.08844118565320969, "learning_rate": 4.707368982147318e-05, "loss": 0.1607, "step": 24 }, { "epoch": 0.5538461538461539, "grad_norm": 0.11252105236053467, "learning_rate": 4.572593931387604e-05, "loss": 0.1561, "step": 27 }, { "epoch": 0.5538461538461539, "eval_loss": 0.22388997673988342, "eval_runtime": 6.1962, "eval_samples_per_second": 13.234, "eval_steps_per_second": 1.775, "step": 27 }, { "epoch": 0.6153846153846154, "grad_norm": 0.08897673338651657, "learning_rate": 4.415111107797445e-05, "loss": 0.1666, "step": 30 }, { "epoch": 0.676923076923077, "grad_norm": 0.07681296765804291, "learning_rate": 4.2366459261474933e-05, "loss": 0.143, "step": 33 }, { "epoch": 0.7384615384615385, "grad_norm": 0.06421305239200592, "learning_rate": 4.039153688314145e-05, "loss": 0.1437, "step": 36 }, { "epoch": 0.7384615384615385, "eval_loss": 0.20718072354793549, "eval_runtime": 6.1992, "eval_samples_per_second": 13.228, "eval_steps_per_second": 1.774, "step": 36 }, { "epoch": 0.8, "grad_norm": 0.06632912904024124, "learning_rate": 3.824798160583012e-05, "loss": 0.1493, "step": 39 }, { "epoch": 0.8615384615384616, "grad_norm": 0.07067277282476425, "learning_rate": 3.5959278669726935e-05, "loss": 0.1826, "step": 42 }, { "epoch": 0.9230769230769231, "grad_norm": 0.1275985985994339, "learning_rate": 3.355050358314172e-05, "loss": 0.1586, "step": 45 }, { "epoch": 0.9230769230769231, "eval_loss": 0.20062483847141266, "eval_runtime": 6.1869, "eval_samples_per_second": 13.254, "eval_steps_per_second": 1.778, "step": 45 }, { "epoch": 0.9846153846153847, "grad_norm": 0.09195121377706528, "learning_rate": 3.104804738999169e-05, "loss": 0.1905, "step": 48 }, { "epoch": 1.0512820512820513, "grad_norm": 0.06787902116775513, "learning_rate": 2.8479327524001636e-05, "loss": 0.1806, "step": 51 }, { "epoch": 1.1128205128205129, "grad_norm": 0.09576894342899323, "learning_rate": 2.587248741756253e-05, "loss": 0.1388, "step": 54 }, { "epoch": 1.1128205128205129, "eval_loss": 0.19303785264492035, "eval_runtime": 6.1714, "eval_samples_per_second": 13.287, "eval_steps_per_second": 1.782, "step": 54 }, { "epoch": 1.1743589743589744, "grad_norm": 0.08612611144781113, "learning_rate": 2.3256088156396868e-05, "loss": 0.135, "step": 57 }, { "epoch": 1.235897435897436, "grad_norm": 0.11174845695495605, "learning_rate": 2.0658795558326743e-05, "loss": 0.1637, "step": 60 }, { "epoch": 1.2974358974358975, "grad_norm": 0.11176323890686035, "learning_rate": 1.8109066104575023e-05, "loss": 0.1396, "step": 63 }, { "epoch": 1.2974358974358975, "eval_loss": 0.1862833946943283, "eval_runtime": 6.1736, "eval_samples_per_second": 13.282, "eval_steps_per_second": 1.782, "step": 63 }, { "epoch": 1.358974358974359, "grad_norm": 0.10889393836259842, "learning_rate": 1.56348351646022e-05, "loss": 0.1378, "step": 66 }, { "epoch": 1.4205128205128206, "grad_norm": 0.08364809304475784, "learning_rate": 1.3263210930352737e-05, "loss": 0.146, "step": 69 }, { "epoch": 1.4820512820512821, "grad_norm": 0.09872214496135712, "learning_rate": 1.1020177413231334e-05, "loss": 0.1477, "step": 72 }, { "epoch": 1.4820512820512821, "eval_loss": 0.18112681806087494, "eval_runtime": 6.1696, "eval_samples_per_second": 13.291, "eval_steps_per_second": 1.783, "step": 72 }, { "epoch": 1.5435897435897434, "grad_norm": 0.10146892070770264, "learning_rate": 8.930309757836517e-06, "loss": 0.1505, "step": 75 }, { "epoch": 1.6051282051282052, "grad_norm": 0.10352431237697601, "learning_rate": 7.016504991533726e-06, "loss": 0.1465, "step": 78 }, { "epoch": 1.6666666666666665, "grad_norm": 0.1181349977850914, "learning_rate": 5.299731159831953e-06, "loss": 0.1222, "step": 81 }, { "epoch": 1.6666666666666665, "eval_loss": 0.17734608054161072, "eval_runtime": 6.189, "eval_samples_per_second": 13.249, "eval_steps_per_second": 1.777, "step": 81 }, { "epoch": 1.7282051282051283, "grad_norm": 0.1693313866853714, "learning_rate": 3.798797596089351e-06, "loss": 0.1043, "step": 84 }, { "epoch": 1.7897435897435896, "grad_norm": 0.13682527840137482, "learning_rate": 2.5301488425208296e-06, "loss": 0.1204, "step": 87 }, { "epoch": 1.8512820512820514, "grad_norm": 0.10424583405256271, "learning_rate": 1.5076844803522922e-06, "loss": 0.1331, "step": 90 }, { "epoch": 1.8512820512820514, "eval_loss": 0.1759835034608841, "eval_runtime": 6.1621, "eval_samples_per_second": 13.307, "eval_steps_per_second": 1.785, "step": 90 }, { "epoch": 1.9128205128205127, "grad_norm": 0.11523737013339996, "learning_rate": 7.426068431000882e-07, "loss": 0.1086, "step": 93 }, { "epoch": 1.9743589743589745, "grad_norm": 0.12982092797756195, "learning_rate": 2.4329828146074095e-07, "loss": 0.1222, "step": 96 }, { "epoch": 2.041025641025641, "grad_norm": 0.0937347263097763, "learning_rate": 1.522932452260595e-08, "loss": 0.1387, "step": 99 }, { "epoch": 2.041025641025641, "eval_loss": 0.17561671137809753, "eval_runtime": 6.1813, "eval_samples_per_second": 13.266, "eval_steps_per_second": 1.78, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0718222633612083e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }