{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03753554502369668, "eval_steps": 9, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003791469194312796, "eval_loss": 1.3354411125183105, "eval_runtime": 561.0446, "eval_samples_per_second": 7.917, "eval_steps_per_second": 0.991, "step": 1 }, { "epoch": 0.001137440758293839, "grad_norm": 8.923999786376953, "learning_rate": 3e-05, "loss": 5.2252, "step": 3 }, { "epoch": 0.002274881516587678, "grad_norm": 9.253809928894043, "learning_rate": 6e-05, "loss": 5.0773, "step": 6 }, { "epoch": 0.0034123222748815166, "grad_norm": 7.367655277252197, "learning_rate": 9e-05, "loss": 4.104, "step": 9 }, { "epoch": 0.0034123222748815166, "eval_loss": 0.9018532037734985, "eval_runtime": 565.3238, "eval_samples_per_second": 7.857, "eval_steps_per_second": 0.984, "step": 9 }, { "epoch": 0.004549763033175356, "grad_norm": 7.7459797859191895, "learning_rate": 9.987820251299122e-05, "loss": 3.5208, "step": 12 }, { "epoch": 0.005687203791469194, "grad_norm": 20.93357276916504, "learning_rate": 9.924038765061042e-05, "loss": 2.6521, "step": 15 }, { "epoch": 0.006824644549763033, "grad_norm": 6.177847862243652, "learning_rate": 9.806308479691595e-05, "loss": 2.6246, "step": 18 }, { "epoch": 0.006824644549763033, "eval_loss": 0.6228094100952148, "eval_runtime": 565.3914, "eval_samples_per_second": 7.857, "eval_steps_per_second": 0.983, "step": 18 }, { "epoch": 0.007962085308056872, "grad_norm": 5.295720100402832, "learning_rate": 9.635919272833938e-05, "loss": 2.5272, "step": 21 }, { "epoch": 0.009099526066350712, "grad_norm": 4.090085983276367, "learning_rate": 9.414737964294636e-05, "loss": 2.3416, "step": 24 }, { "epoch": 0.010236966824644549, "grad_norm": 4.162299156188965, "learning_rate": 9.145187862775209e-05, "loss": 2.0675, "step": 27 }, { "epoch": 0.010236966824644549, "eval_loss": 0.562384307384491, "eval_runtime": 565.6143, "eval_samples_per_second": 7.853, "eval_steps_per_second": 0.983, "step": 27 }, { "epoch": 0.011374407582938388, "grad_norm": 4.30471658706665, "learning_rate": 8.83022221559489e-05, "loss": 2.0407, "step": 30 }, { "epoch": 0.012511848341232227, "grad_norm": 4.804416179656982, "learning_rate": 8.473291852294987e-05, "loss": 2.0456, "step": 33 }, { "epoch": 0.013649289099526066, "grad_norm": 3.96138596534729, "learning_rate": 8.07830737662829e-05, "loss": 2.0244, "step": 36 }, { "epoch": 0.013649289099526066, "eval_loss": 0.5297911167144775, "eval_runtime": 565.4741, "eval_samples_per_second": 7.855, "eval_steps_per_second": 0.983, "step": 36 }, { "epoch": 0.014786729857819906, "grad_norm": 4.675422191619873, "learning_rate": 7.649596321166024e-05, "loss": 2.015, "step": 39 }, { "epoch": 0.015924170616113745, "grad_norm": 4.122074604034424, "learning_rate": 7.191855733945387e-05, "loss": 2.0857, "step": 42 }, { "epoch": 0.017061611374407582, "grad_norm": 4.02851676940918, "learning_rate": 6.710100716628344e-05, "loss": 2.0372, "step": 45 }, { "epoch": 0.017061611374407582, "eval_loss": 0.5146540999412537, "eval_runtime": 565.5761, "eval_samples_per_second": 7.854, "eval_steps_per_second": 0.983, "step": 45 }, { "epoch": 0.018199052132701423, "grad_norm": 4.6452507972717285, "learning_rate": 6.209609477998338e-05, "loss": 1.932, "step": 48 }, { "epoch": 0.01933649289099526, "grad_norm": 4.3586039543151855, "learning_rate": 5.695865504800327e-05, "loss": 2.1188, "step": 51 }, { "epoch": 0.020473933649289098, "grad_norm": 4.268399715423584, "learning_rate": 5.174497483512506e-05, "loss": 1.9989, "step": 54 }, { "epoch": 0.020473933649289098, "eval_loss": 0.5036410689353943, "eval_runtime": 565.5702, "eval_samples_per_second": 7.854, "eval_steps_per_second": 0.983, "step": 54 }, { "epoch": 0.02161137440758294, "grad_norm": 3.966719627380371, "learning_rate": 4.6512176312793736e-05, "loss": 2.1469, "step": 57 }, { "epoch": 0.022748815165876776, "grad_norm": 4.320970058441162, "learning_rate": 4.131759111665349e-05, "loss": 2.1051, "step": 60 }, { "epoch": 0.023886255924170617, "grad_norm": 4.397201061248779, "learning_rate": 3.6218132209150045e-05, "loss": 1.864, "step": 63 }, { "epoch": 0.023886255924170617, "eval_loss": 0.4941074252128601, "eval_runtime": 565.4444, "eval_samples_per_second": 7.856, "eval_steps_per_second": 0.983, "step": 63 }, { "epoch": 0.025023696682464455, "grad_norm": 4.262881278991699, "learning_rate": 3.12696703292044e-05, "loss": 1.9055, "step": 66 }, { "epoch": 0.026161137440758295, "grad_norm": 4.537021636962891, "learning_rate": 2.6526421860705473e-05, "loss": 2.0358, "step": 69 }, { "epoch": 0.027298578199052133, "grad_norm": 3.677988052368164, "learning_rate": 2.2040354826462668e-05, "loss": 1.6868, "step": 72 }, { "epoch": 0.027298578199052133, "eval_loss": 0.48585864901542664, "eval_runtime": 565.2465, "eval_samples_per_second": 7.859, "eval_steps_per_second": 0.984, "step": 72 }, { "epoch": 0.02843601895734597, "grad_norm": 4.369206428527832, "learning_rate": 1.7860619515673033e-05, "loss": 2.1532, "step": 75 }, { "epoch": 0.02957345971563981, "grad_norm": 4.181845664978027, "learning_rate": 1.4033009983067452e-05, "loss": 2.0465, "step": 78 }, { "epoch": 0.03071090047393365, "grad_norm": 4.068225383758545, "learning_rate": 1.0599462319663905e-05, "loss": 1.9805, "step": 81 }, { "epoch": 0.03071090047393365, "eval_loss": 0.48237505555152893, "eval_runtime": 565.3076, "eval_samples_per_second": 7.858, "eval_steps_per_second": 0.984, "step": 81 }, { "epoch": 0.03184834123222749, "grad_norm": 4.095105171203613, "learning_rate": 7.597595192178702e-06, "loss": 2.1349, "step": 84 }, { "epoch": 0.03298578199052133, "grad_norm": 4.034782886505127, "learning_rate": 5.060297685041659e-06, "loss": 1.989, "step": 87 }, { "epoch": 0.034123222748815164, "grad_norm": 4.185176372528076, "learning_rate": 3.0153689607045845e-06, "loss": 1.9752, "step": 90 }, { "epoch": 0.034123222748815164, "eval_loss": 0.48054438829421997, "eval_runtime": 565.9474, "eval_samples_per_second": 7.849, "eval_steps_per_second": 0.982, "step": 90 }, { "epoch": 0.035260663507109005, "grad_norm": 3.7425358295440674, "learning_rate": 1.4852136862001764e-06, "loss": 1.9, "step": 93 }, { "epoch": 0.036398104265402846, "grad_norm": 3.9407403469085693, "learning_rate": 4.865965629214819e-07, "loss": 1.9243, "step": 96 }, { "epoch": 0.03753554502369668, "grad_norm": 3.88885760307312, "learning_rate": 3.04586490452119e-08, "loss": 1.9794, "step": 99 }, { "epoch": 0.03753554502369668, "eval_loss": 0.4799988865852356, "eval_runtime": 565.6344, "eval_samples_per_second": 7.853, "eval_steps_per_second": 0.983, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6599302131141837e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }