{ "best_metric": 3.155805826187134, "best_model_checkpoint": "miner_id_24/checkpoint-20", "epoch": 0.06983240223463687, "eval_steps": 5, "global_step": 25, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002793296089385475, "grad_norm": 1448.4268798828125, "learning_rate": 2e-05, "loss": 12.7662, "step": 1 }, { "epoch": 0.002793296089385475, "eval_loss": 3.3325388431549072, "eval_runtime": 6.0267, "eval_samples_per_second": 25.055, "eval_steps_per_second": 12.611, "step": 1 }, { "epoch": 0.00558659217877095, "grad_norm": 1353.778076171875, "learning_rate": 4e-05, "loss": 12.4396, "step": 2 }, { "epoch": 0.008379888268156424, "grad_norm": 806.8947143554688, "learning_rate": 6e-05, "loss": 13.0645, "step": 3 }, { "epoch": 0.0111731843575419, "grad_norm": 999.5733032226562, "learning_rate": 8e-05, "loss": 12.6938, "step": 4 }, { "epoch": 0.013966480446927373, "grad_norm": 1524.6124267578125, "learning_rate": 0.0001, "loss": 12.2966, "step": 5 }, { "epoch": 0.013966480446927373, "eval_loss": 3.323124885559082, "eval_runtime": 4.753, "eval_samples_per_second": 31.769, "eval_steps_per_second": 15.99, "step": 5 }, { "epoch": 0.01675977653631285, "grad_norm": 1987.1434326171875, "learning_rate": 0.00012, "loss": 13.0857, "step": 6 }, { "epoch": 0.019553072625698324, "grad_norm": 1016.9581909179688, "learning_rate": 0.00014, "loss": 12.7557, "step": 7 }, { "epoch": 0.0223463687150838, "grad_norm": 847.552490234375, "learning_rate": 0.00016, "loss": 13.379, "step": 8 }, { "epoch": 0.025139664804469275, "grad_norm": 1183.8375244140625, "learning_rate": 0.00018, "loss": 12.5125, "step": 9 }, { "epoch": 0.027932960893854747, "grad_norm": 1316.9891357421875, "learning_rate": 0.0002, "loss": 12.9677, "step": 10 }, { "epoch": 0.027932960893854747, "eval_loss": 3.275075674057007, "eval_runtime": 4.8015, "eval_samples_per_second": 31.448, "eval_steps_per_second": 15.828, "step": 10 }, { "epoch": 0.030726256983240222, "grad_norm": 2734.73876953125, "learning_rate": 0.00019781476007338058, "loss": 13.0272, "step": 11 }, { "epoch": 0.0335195530726257, "grad_norm": 1246.3319091796875, "learning_rate": 0.0001913545457642601, "loss": 14.1851, "step": 12 }, { "epoch": 0.036312849162011177, "grad_norm": 959.4293212890625, "learning_rate": 0.00018090169943749476, "loss": 12.7241, "step": 13 }, { "epoch": 0.03910614525139665, "grad_norm": 1641.6357421875, "learning_rate": 0.00016691306063588583, "loss": 13.6186, "step": 14 }, { "epoch": 0.04189944134078212, "grad_norm": 1507.56640625, "learning_rate": 0.00015000000000000001, "loss": 11.9316, "step": 15 }, { "epoch": 0.04189944134078212, "eval_loss": 3.233804702758789, "eval_runtime": 4.8696, "eval_samples_per_second": 31.009, "eval_steps_per_second": 15.607, "step": 15 }, { "epoch": 0.0446927374301676, "grad_norm": 2713.677001953125, "learning_rate": 0.00013090169943749476, "loss": 13.57, "step": 16 }, { "epoch": 0.04748603351955307, "grad_norm": 1984.106689453125, "learning_rate": 0.00011045284632676536, "loss": 11.9724, "step": 17 }, { "epoch": 0.05027932960893855, "grad_norm": 851.5194702148438, "learning_rate": 8.954715367323468e-05, "loss": 12.0016, "step": 18 }, { "epoch": 0.05307262569832402, "grad_norm": 986.9415893554688, "learning_rate": 6.909830056250527e-05, "loss": 12.7948, "step": 19 }, { "epoch": 0.055865921787709494, "grad_norm": 939.6593017578125, "learning_rate": 5.000000000000002e-05, "loss": 13.3214, "step": 20 }, { "epoch": 0.055865921787709494, "eval_loss": 3.155805826187134, "eval_runtime": 5.0184, "eval_samples_per_second": 30.089, "eval_steps_per_second": 15.144, "step": 20 }, { "epoch": 0.05865921787709497, "grad_norm": 665.962646484375, "learning_rate": 3.308693936411421e-05, "loss": 13.8204, "step": 21 }, { "epoch": 0.061452513966480445, "grad_norm": 1509.6649169921875, "learning_rate": 1.9098300562505266e-05, "loss": 13.2364, "step": 22 }, { "epoch": 0.06424581005586592, "grad_norm": 1729.46875, "learning_rate": 8.645454235739903e-06, "loss": 13.1418, "step": 23 }, { "epoch": 0.0670391061452514, "grad_norm": 1822.8065185546875, "learning_rate": 2.1852399266194314e-06, "loss": 11.3855, "step": 24 }, { "epoch": 0.06983240223463687, "grad_norm": 1578.525146484375, "learning_rate": 0.0, "loss": 12.9873, "step": 25 }, { "epoch": 0.06983240223463687, "eval_loss": 3.159228563308716, "eval_runtime": 4.9163, "eval_samples_per_second": 30.714, "eval_steps_per_second": 15.459, "step": 25 } ], "logging_steps": 1, "max_steps": 25, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 76719587328000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }