{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24040267447975358, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002404026744797536, "eval_loss": 3.228058099746704, "eval_runtime": 156.333, "eval_samples_per_second": 8.968, "eval_steps_per_second": 4.484, "step": 1 }, { "epoch": 0.007212080234392608, "grad_norm": 1.9650676250457764, "learning_rate": 3e-05, "loss": 2.349, "step": 3 }, { "epoch": 0.014424160468785216, "grad_norm": 1.4064522981643677, "learning_rate": 6e-05, "loss": 2.2216, "step": 6 }, { "epoch": 0.021636240703177823, "grad_norm": 1.3426527976989746, "learning_rate": 9e-05, "loss": 1.9405, "step": 9 }, { "epoch": 0.021636240703177823, "eval_loss": 1.9358376264572144, "eval_runtime": 156.5558, "eval_samples_per_second": 8.955, "eval_steps_per_second": 4.478, "step": 9 }, { "epoch": 0.02884832093757043, "grad_norm": 1.1980935335159302, "learning_rate": 0.00012, "loss": 1.7996, "step": 12 }, { "epoch": 0.03606040117196304, "grad_norm": 0.9815410375595093, "learning_rate": 0.00015000000000000001, "loss": 1.6674, "step": 15 }, { "epoch": 0.043272481406355645, "grad_norm": 1.1842740774154663, "learning_rate": 0.00018, "loss": 1.5861, "step": 18 }, { "epoch": 0.043272481406355645, "eval_loss": 1.4581845998764038, "eval_runtime": 156.5288, "eval_samples_per_second": 8.957, "eval_steps_per_second": 4.478, "step": 18 }, { "epoch": 0.05048456164074825, "grad_norm": 0.9527170062065125, "learning_rate": 0.0001999229036240723, "loss": 1.5801, "step": 21 }, { "epoch": 0.05769664187514086, "grad_norm": 0.9741354584693909, "learning_rate": 0.00019876883405951377, "loss": 1.6089, "step": 24 }, { "epoch": 0.06490872210953347, "grad_norm": 0.8610622882843018, "learning_rate": 0.00019624552364536473, "loss": 1.5059, "step": 27 }, { "epoch": 0.06490872210953347, "eval_loss": 1.3450061082839966, "eval_runtime": 156.5047, "eval_samples_per_second": 8.958, "eval_steps_per_second": 4.479, "step": 27 }, { "epoch": 0.07212080234392608, "grad_norm": 0.988355815410614, "learning_rate": 0.0001923879532511287, "loss": 1.4597, "step": 30 }, { "epoch": 0.07933288257831868, "grad_norm": 0.9052969813346863, "learning_rate": 0.00018724960070727972, "loss": 1.3148, "step": 33 }, { "epoch": 0.08654496281271129, "grad_norm": 0.9871351718902588, "learning_rate": 0.00018090169943749476, "loss": 1.4494, "step": 36 }, { "epoch": 0.08654496281271129, "eval_loss": 1.2605727910995483, "eval_runtime": 156.5558, "eval_samples_per_second": 8.955, "eval_steps_per_second": 4.478, "step": 36 }, { "epoch": 0.0937570430471039, "grad_norm": 1.216485619544983, "learning_rate": 0.00017343225094356855, "loss": 1.3575, "step": 39 }, { "epoch": 0.1009691232814965, "grad_norm": 1.0345182418823242, "learning_rate": 0.00016494480483301836, "loss": 1.316, "step": 42 }, { "epoch": 0.10818120351588911, "grad_norm": 1.2272484302520752, "learning_rate": 0.00015555702330196023, "loss": 1.3834, "step": 45 }, { "epoch": 0.10818120351588911, "eval_loss": 1.199688196182251, "eval_runtime": 156.5688, "eval_samples_per_second": 8.955, "eval_steps_per_second": 4.477, "step": 45 }, { "epoch": 0.11539328375028172, "grad_norm": 0.8153639435768127, "learning_rate": 0.00014539904997395468, "loss": 1.3282, "step": 48 }, { "epoch": 0.12260536398467432, "grad_norm": 0.9661998748779297, "learning_rate": 0.0001346117057077493, "loss": 1.2335, "step": 51 }, { "epoch": 0.12981744421906694, "grad_norm": 0.9782885313034058, "learning_rate": 0.00012334453638559057, "loss": 1.1531, "step": 54 }, { "epoch": 0.12981744421906694, "eval_loss": 1.1464760303497314, "eval_runtime": 156.5981, "eval_samples_per_second": 8.953, "eval_steps_per_second": 4.476, "step": 54 }, { "epoch": 0.13702952445345953, "grad_norm": 1.0093810558319092, "learning_rate": 0.00011175373974578378, "loss": 1.1367, "step": 57 }, { "epoch": 0.14424160468785216, "grad_norm": 0.9035248160362244, "learning_rate": 0.0001, "loss": 1.2007, "step": 60 }, { "epoch": 0.15145368492224476, "grad_norm": 1.1476191282272339, "learning_rate": 8.824626025421626e-05, "loss": 1.1702, "step": 63 }, { "epoch": 0.15145368492224476, "eval_loss": 1.1219838857650757, "eval_runtime": 156.5351, "eval_samples_per_second": 8.956, "eval_steps_per_second": 4.478, "step": 63 }, { "epoch": 0.15866576515663736, "grad_norm": 1.004503607749939, "learning_rate": 7.66554636144095e-05, "loss": 1.1231, "step": 66 }, { "epoch": 0.16587784539102998, "grad_norm": 0.8763646483421326, "learning_rate": 6.538829429225069e-05, "loss": 1.1959, "step": 69 }, { "epoch": 0.17308992562542258, "grad_norm": 1.0786497592926025, "learning_rate": 5.4600950026045326e-05, "loss": 1.2195, "step": 72 }, { "epoch": 0.17308992562542258, "eval_loss": 1.0894047021865845, "eval_runtime": 156.5655, "eval_samples_per_second": 8.955, "eval_steps_per_second": 4.477, "step": 72 }, { "epoch": 0.18030200585981518, "grad_norm": 1.005875587463379, "learning_rate": 4.444297669803981e-05, "loss": 1.0467, "step": 75 }, { "epoch": 0.1875140860942078, "grad_norm": 0.9949033260345459, "learning_rate": 3.5055195166981645e-05, "loss": 1.3012, "step": 78 }, { "epoch": 0.1947261663286004, "grad_norm": 0.9470193982124329, "learning_rate": 2.6567749056431467e-05, "loss": 1.237, "step": 81 }, { "epoch": 0.1947261663286004, "eval_loss": 1.070177674293518, "eval_runtime": 156.4864, "eval_samples_per_second": 8.959, "eval_steps_per_second": 4.48, "step": 81 }, { "epoch": 0.201938246562993, "grad_norm": 0.9711623787879944, "learning_rate": 1.9098300562505266e-05, "loss": 0.9719, "step": 84 }, { "epoch": 0.20915032679738563, "grad_norm": 1.0642070770263672, "learning_rate": 1.2750399292720283e-05, "loss": 1.2356, "step": 87 }, { "epoch": 0.21636240703177823, "grad_norm": 0.8378780484199524, "learning_rate": 7.612046748871327e-06, "loss": 1.1906, "step": 90 }, { "epoch": 0.21636240703177823, "eval_loss": 1.0622040033340454, "eval_runtime": 156.5263, "eval_samples_per_second": 8.957, "eval_steps_per_second": 4.478, "step": 90 }, { "epoch": 0.22357448726617082, "grad_norm": 0.9879840016365051, "learning_rate": 3.7544763546352834e-06, "loss": 1.1005, "step": 93 }, { "epoch": 0.23078656750056345, "grad_norm": 0.884789764881134, "learning_rate": 1.231165940486234e-06, "loss": 1.0352, "step": 96 }, { "epoch": 0.23799864773495605, "grad_norm": 1.1920340061187744, "learning_rate": 7.709637592770991e-08, "loss": 1.117, "step": 99 }, { "epoch": 0.23799864773495605, "eval_loss": 1.0599479675292969, "eval_runtime": 156.5531, "eval_samples_per_second": 8.955, "eval_steps_per_second": 4.478, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.969948643447276e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }