{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04697091006138244, "eval_steps": 9, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00047445363698366103, "eval_loss": 1.2206629514694214, "eval_runtime": 2455.084, "eval_samples_per_second": 2.892, "eval_steps_per_second": 1.446, "step": 1 }, { "epoch": 0.001423360910950983, "grad_norm": 42.169185638427734, "learning_rate": 3e-05, "loss": 38.8115, "step": 3 }, { "epoch": 0.002846721821901966, "grad_norm": 30.064516067504883, "learning_rate": 6e-05, "loss": 36.5887, "step": 6 }, { "epoch": 0.004270082732852949, "grad_norm": 24.0466365814209, "learning_rate": 9e-05, "loss": 31.7486, "step": 9 }, { "epoch": 0.004270082732852949, "eval_loss": 0.872442364692688, "eval_runtime": 2457.1575, "eval_samples_per_second": 2.89, "eval_steps_per_second": 1.445, "step": 9 }, { "epoch": 0.005693443643803932, "grad_norm": 26.121597290039062, "learning_rate": 0.00012, "loss": 28.0893, "step": 12 }, { "epoch": 0.007116804554754915, "grad_norm": 22.931711196899414, "learning_rate": 0.00015000000000000001, "loss": 26.3657, "step": 15 }, { "epoch": 0.008540165465705898, "grad_norm": 15.313858032226562, "learning_rate": 0.00018, "loss": 22.876, "step": 18 }, { "epoch": 0.008540165465705898, "eval_loss": 0.7377527952194214, "eval_runtime": 730.2515, "eval_samples_per_second": 9.723, "eval_steps_per_second": 4.861, "step": 18 }, { "epoch": 0.009963526376656881, "grad_norm": 14.844147682189941, "learning_rate": 0.0001999229036240723, "loss": 23.8388, "step": 21 }, { "epoch": 0.011386887287607865, "grad_norm": 13.040925025939941, "learning_rate": 0.00019876883405951377, "loss": 23.2438, "step": 24 }, { "epoch": 0.012810248198558846, "grad_norm": 13.732440948486328, "learning_rate": 0.00019624552364536473, "loss": 23.504, "step": 27 }, { "epoch": 0.012810248198558846, "eval_loss": 0.708083987236023, "eval_runtime": 2490.0558, "eval_samples_per_second": 2.851, "eval_steps_per_second": 1.426, "step": 27 }, { "epoch": 0.01423360910950983, "grad_norm": 15.219266891479492, "learning_rate": 0.0001923879532511287, "loss": 23.009, "step": 30 }, { "epoch": 0.015656970020460813, "grad_norm": 11.847440719604492, "learning_rate": 0.00018724960070727972, "loss": 21.94, "step": 33 }, { "epoch": 0.017080330931411796, "grad_norm": 11.252641677856445, "learning_rate": 0.00018090169943749476, "loss": 22.467, "step": 36 }, { "epoch": 0.017080330931411796, "eval_loss": 0.6968169808387756, "eval_runtime": 1821.4615, "eval_samples_per_second": 3.898, "eval_steps_per_second": 1.949, "step": 36 }, { "epoch": 0.01850369184236278, "grad_norm": 11.163553237915039, "learning_rate": 0.00017343225094356855, "loss": 22.5554, "step": 39 }, { "epoch": 0.019927052753313763, "grad_norm": 13.20541763305664, "learning_rate": 0.00016494480483301836, "loss": 22.9775, "step": 42 }, { "epoch": 0.021350413664264746, "grad_norm": 10.816298484802246, "learning_rate": 0.00015555702330196023, "loss": 22.2379, "step": 45 }, { "epoch": 0.021350413664264746, "eval_loss": 0.6865644454956055, "eval_runtime": 673.7961, "eval_samples_per_second": 10.537, "eval_steps_per_second": 5.269, "step": 45 }, { "epoch": 0.02277377457521573, "grad_norm": 10.934300422668457, "learning_rate": 0.00014539904997395468, "loss": 21.4568, "step": 48 }, { "epoch": 0.024197135486166713, "grad_norm": 10.668505668640137, "learning_rate": 0.0001346117057077493, "loss": 21.2971, "step": 51 }, { "epoch": 0.025620496397117692, "grad_norm": 10.522425651550293, "learning_rate": 0.00012334453638559057, "loss": 22.4085, "step": 54 }, { "epoch": 0.025620496397117692, "eval_loss": 0.6808061003684998, "eval_runtime": 674.5537, "eval_samples_per_second": 10.525, "eval_steps_per_second": 5.263, "step": 54 }, { "epoch": 0.027043857308068676, "grad_norm": 10.834729194641113, "learning_rate": 0.00011175373974578378, "loss": 22.0063, "step": 57 }, { "epoch": 0.02846721821901966, "grad_norm": 10.45215892791748, "learning_rate": 0.0001, "loss": 21.7114, "step": 60 }, { "epoch": 0.029890579129970642, "grad_norm": 10.509422302246094, "learning_rate": 8.824626025421626e-05, "loss": 22.147, "step": 63 }, { "epoch": 0.029890579129970642, "eval_loss": 0.6736528873443604, "eval_runtime": 674.4225, "eval_samples_per_second": 10.528, "eval_steps_per_second": 5.264, "step": 63 }, { "epoch": 0.031313940040921626, "grad_norm": 10.096047401428223, "learning_rate": 7.66554636144095e-05, "loss": 21.0243, "step": 66 }, { "epoch": 0.03273730095187261, "grad_norm": 10.614026069641113, "learning_rate": 6.538829429225069e-05, "loss": 22.6279, "step": 69 }, { "epoch": 0.03416066186282359, "grad_norm": 9.915692329406738, "learning_rate": 5.4600950026045326e-05, "loss": 21.7931, "step": 72 }, { "epoch": 0.03416066186282359, "eval_loss": 0.6665549278259277, "eval_runtime": 673.4954, "eval_samples_per_second": 10.542, "eval_steps_per_second": 5.271, "step": 72 }, { "epoch": 0.03558402277377457, "grad_norm": 10.606143951416016, "learning_rate": 4.444297669803981e-05, "loss": 21.1355, "step": 75 }, { "epoch": 0.03700738368472556, "grad_norm": 9.829690933227539, "learning_rate": 3.5055195166981645e-05, "loss": 20.704, "step": 78 }, { "epoch": 0.03843074459567654, "grad_norm": 9.532461166381836, "learning_rate": 2.6567749056431467e-05, "loss": 20.9497, "step": 81 }, { "epoch": 0.03843074459567654, "eval_loss": 0.6634002327919006, "eval_runtime": 673.9541, "eval_samples_per_second": 10.535, "eval_steps_per_second": 5.267, "step": 81 }, { "epoch": 0.039854105506627525, "grad_norm": 10.550149917602539, "learning_rate": 1.9098300562505266e-05, "loss": 21.4862, "step": 84 }, { "epoch": 0.041277466417578505, "grad_norm": 9.829084396362305, "learning_rate": 1.2750399292720283e-05, "loss": 21.2587, "step": 87 }, { "epoch": 0.04270082732852949, "grad_norm": 10.305970191955566, "learning_rate": 7.612046748871327e-06, "loss": 21.4128, "step": 90 }, { "epoch": 0.04270082732852949, "eval_loss": 0.6616738438606262, "eval_runtime": 674.758, "eval_samples_per_second": 10.522, "eval_steps_per_second": 5.261, "step": 90 }, { "epoch": 0.04412418823948047, "grad_norm": 10.540351867675781, "learning_rate": 3.7544763546352834e-06, "loss": 22.0735, "step": 93 }, { "epoch": 0.04554754915043146, "grad_norm": 10.391834259033203, "learning_rate": 1.231165940486234e-06, "loss": 22.2369, "step": 96 }, { "epoch": 0.04697091006138244, "grad_norm": 10.5779390335083, "learning_rate": 7.709637592770991e-08, "loss": 21.495, "step": 99 }, { "epoch": 0.04697091006138244, "eval_loss": 0.6612978577613831, "eval_runtime": 674.8148, "eval_samples_per_second": 10.521, "eval_steps_per_second": 5.261, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.2196614900128154e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }