{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8181818181818183, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01818181818181818, "eval_loss": 0.8232752680778503, "eval_runtime": 1.7017, "eval_samples_per_second": 54.651, "eval_steps_per_second": 7.052, "step": 1 }, { "epoch": 0.05454545454545454, "grad_norm": 1.0431333780288696, "learning_rate": 3e-05, "loss": 0.9096, "step": 3 }, { "epoch": 0.10909090909090909, "grad_norm": 1.381081461906433, "learning_rate": 6e-05, "loss": 0.7918, "step": 6 }, { "epoch": 0.16363636363636364, "grad_norm": 0.8481101393699646, "learning_rate": 9e-05, "loss": 0.699, "step": 9 }, { "epoch": 0.16363636363636364, "eval_loss": 0.5536388158798218, "eval_runtime": 1.6975, "eval_samples_per_second": 54.787, "eval_steps_per_second": 7.069, "step": 9 }, { "epoch": 0.21818181818181817, "grad_norm": 0.7306303977966309, "learning_rate": 9.987820251299122e-05, "loss": 0.5831, "step": 12 }, { "epoch": 0.2727272727272727, "grad_norm": 0.8772233128547668, "learning_rate": 9.924038765061042e-05, "loss": 0.4517, "step": 15 }, { "epoch": 0.32727272727272727, "grad_norm": 0.8330588936805725, "learning_rate": 9.806308479691595e-05, "loss": 0.463, "step": 18 }, { "epoch": 0.32727272727272727, "eval_loss": 0.37208396196365356, "eval_runtime": 1.7251, "eval_samples_per_second": 53.91, "eval_steps_per_second": 6.956, "step": 18 }, { "epoch": 0.38181818181818183, "grad_norm": 0.6052883863449097, "learning_rate": 9.635919272833938e-05, "loss": 0.4153, "step": 21 }, { "epoch": 0.43636363636363634, "grad_norm": 0.5696281790733337, "learning_rate": 9.414737964294636e-05, "loss": 0.398, "step": 24 }, { "epoch": 0.4909090909090909, "grad_norm": 0.5094695687294006, "learning_rate": 9.145187862775209e-05, "loss": 0.381, "step": 27 }, { "epoch": 0.4909090909090909, "eval_loss": 0.3064245879650116, "eval_runtime": 1.7094, "eval_samples_per_second": 54.404, "eval_steps_per_second": 7.02, "step": 27 }, { "epoch": 0.5454545454545454, "grad_norm": 0.47634822130203247, "learning_rate": 8.83022221559489e-05, "loss": 0.3259, "step": 30 }, { "epoch": 0.6, "grad_norm": 0.6172341704368591, "learning_rate": 8.473291852294987e-05, "loss": 0.3468, "step": 33 }, { "epoch": 0.6545454545454545, "grad_norm": 0.44620198011398315, "learning_rate": 8.07830737662829e-05, "loss": 0.2712, "step": 36 }, { "epoch": 0.6545454545454545, "eval_loss": 0.2684729993343353, "eval_runtime": 1.7078, "eval_samples_per_second": 54.456, "eval_steps_per_second": 7.027, "step": 36 }, { "epoch": 0.7090909090909091, "grad_norm": 0.5440018177032471, "learning_rate": 7.649596321166024e-05, "loss": 0.36, "step": 39 }, { "epoch": 0.7636363636363637, "grad_norm": 0.5840175747871399, "learning_rate": 7.191855733945387e-05, "loss": 0.3477, "step": 42 }, { "epoch": 0.8181818181818182, "grad_norm": 0.4900560677051544, "learning_rate": 6.710100716628344e-05, "loss": 0.3062, "step": 45 }, { "epoch": 0.8181818181818182, "eval_loss": 0.2494433969259262, "eval_runtime": 1.7163, "eval_samples_per_second": 54.187, "eval_steps_per_second": 6.992, "step": 45 }, { "epoch": 0.8727272727272727, "grad_norm": 0.4461391270160675, "learning_rate": 6.209609477998338e-05, "loss": 0.2657, "step": 48 }, { "epoch": 0.9272727272727272, "grad_norm": 0.535493016242981, "learning_rate": 5.695865504800327e-05, "loss": 0.2891, "step": 51 }, { "epoch": 0.9818181818181818, "grad_norm": 0.6309211254119873, "learning_rate": 5.174497483512506e-05, "loss": 0.271, "step": 54 }, { "epoch": 0.9818181818181818, "eval_loss": 0.23513805866241455, "eval_runtime": 1.7162, "eval_samples_per_second": 54.189, "eval_steps_per_second": 6.992, "step": 54 }, { "epoch": 1.0363636363636364, "grad_norm": 0.46374499797821045, "learning_rate": 4.6512176312793736e-05, "loss": 0.264, "step": 57 }, { "epoch": 1.0909090909090908, "grad_norm": 0.43277519941329956, "learning_rate": 4.131759111665349e-05, "loss": 0.262, "step": 60 }, { "epoch": 1.1454545454545455, "grad_norm": 0.4681129455566406, "learning_rate": 3.6218132209150045e-05, "loss": 0.2522, "step": 63 }, { "epoch": 1.1454545454545455, "eval_loss": 0.224752277135849, "eval_runtime": 1.7139, "eval_samples_per_second": 54.261, "eval_steps_per_second": 7.001, "step": 63 }, { "epoch": 1.2, "grad_norm": 0.4968392252922058, "learning_rate": 3.12696703292044e-05, "loss": 0.206, "step": 66 }, { "epoch": 1.2545454545454544, "grad_norm": 0.47857439517974854, "learning_rate": 2.6526421860705473e-05, "loss": 0.2197, "step": 69 }, { "epoch": 1.309090909090909, "grad_norm": 0.4433331787586212, "learning_rate": 2.2040354826462668e-05, "loss": 0.2292, "step": 72 }, { "epoch": 1.309090909090909, "eval_loss": 0.21840831637382507, "eval_runtime": 1.7104, "eval_samples_per_second": 54.372, "eval_steps_per_second": 7.016, "step": 72 }, { "epoch": 1.3636363636363638, "grad_norm": 0.5370104908943176, "learning_rate": 1.7860619515673033e-05, "loss": 0.2634, "step": 75 }, { "epoch": 1.4181818181818182, "grad_norm": 0.5451434254646301, "learning_rate": 1.4033009983067452e-05, "loss": 0.2539, "step": 78 }, { "epoch": 1.4727272727272727, "grad_norm": 0.5778789520263672, "learning_rate": 1.0599462319663905e-05, "loss": 0.2318, "step": 81 }, { "epoch": 1.4727272727272727, "eval_loss": 0.21548418700695038, "eval_runtime": 1.7211, "eval_samples_per_second": 54.034, "eval_steps_per_second": 6.972, "step": 81 }, { "epoch": 1.5272727272727273, "grad_norm": 0.5370736122131348, "learning_rate": 7.597595192178702e-06, "loss": 0.2867, "step": 84 }, { "epoch": 1.5818181818181818, "grad_norm": 0.48193562030792236, "learning_rate": 5.060297685041659e-06, "loss": 0.1957, "step": 87 }, { "epoch": 1.6363636363636362, "grad_norm": 0.4694143533706665, "learning_rate": 3.0153689607045845e-06, "loss": 0.2136, "step": 90 }, { "epoch": 1.6363636363636362, "eval_loss": 0.21406370401382446, "eval_runtime": 1.7178, "eval_samples_per_second": 54.14, "eval_steps_per_second": 6.986, "step": 90 }, { "epoch": 1.690909090909091, "grad_norm": 0.5793366432189941, "learning_rate": 1.4852136862001764e-06, "loss": 0.24, "step": 93 }, { "epoch": 1.7454545454545456, "grad_norm": 0.5438080430030823, "learning_rate": 4.865965629214819e-07, "loss": 0.2427, "step": 96 }, { "epoch": 1.8, "grad_norm": 0.49919649958610535, "learning_rate": 3.04586490452119e-08, "loss": 0.2304, "step": 99 }, { "epoch": 1.8, "eval_loss": 0.21381090581417084, "eval_runtime": 1.7104, "eval_samples_per_second": 54.374, "eval_steps_per_second": 7.016, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.057450833104077e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }