{ "best_metric": 7.740330219268799, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.186219739292365, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00186219739292365, "grad_norm": 7.960118770599365, "learning_rate": 1.5000000000000002e-07, "loss": 14.0369, "step": 1 }, { "epoch": 0.00186219739292365, "eval_loss": 13.924460411071777, "eval_runtime": 12.7592, "eval_samples_per_second": 70.929, "eval_steps_per_second": 8.935, "step": 1 }, { "epoch": 0.0037243947858473, "grad_norm": 8.014508247375488, "learning_rate": 3.0000000000000004e-07, "loss": 13.7099, "step": 2 }, { "epoch": 0.00558659217877095, "grad_norm": 8.808537483215332, "learning_rate": 4.5e-07, "loss": 13.8903, "step": 3 }, { "epoch": 0.0074487895716946, "grad_norm": 8.004558563232422, "learning_rate": 6.000000000000001e-07, "loss": 14.0816, "step": 4 }, { "epoch": 0.00931098696461825, "grad_norm": 7.907912254333496, "learning_rate": 7.5e-07, "loss": 14.4216, "step": 5 }, { "epoch": 0.0111731843575419, "grad_norm": 7.714488506317139, "learning_rate": 9e-07, "loss": 13.8048, "step": 6 }, { "epoch": 0.01303538175046555, "grad_norm": 8.773198127746582, "learning_rate": 1.0500000000000001e-06, "loss": 13.7771, "step": 7 }, { "epoch": 0.0148975791433892, "grad_norm": 8.656310081481934, "learning_rate": 1.2000000000000002e-06, "loss": 13.9204, "step": 8 }, { "epoch": 0.01675977653631285, "grad_norm": 7.340457439422607, "learning_rate": 1.35e-06, "loss": 14.1696, "step": 9 }, { "epoch": 0.0186219739292365, "grad_norm": 7.3591461181640625, "learning_rate": 1.5e-06, "loss": 13.6311, "step": 10 }, { "epoch": 0.020484171322160148, "grad_norm": 7.934558868408203, "learning_rate": 1.65e-06, "loss": 14.0909, "step": 11 }, { "epoch": 0.0223463687150838, "grad_norm": 8.731240272521973, "learning_rate": 1.8e-06, "loss": 13.8947, "step": 12 }, { "epoch": 0.024208566108007448, "grad_norm": 8.52802562713623, "learning_rate": 1.95e-06, "loss": 13.941, "step": 13 }, { "epoch": 0.0260707635009311, "grad_norm": 8.355961799621582, "learning_rate": 2.1000000000000002e-06, "loss": 13.9408, "step": 14 }, { "epoch": 0.027932960893854747, "grad_norm": 8.493719100952148, "learning_rate": 2.25e-06, "loss": 13.8167, "step": 15 }, { "epoch": 0.0297951582867784, "grad_norm": 7.872385025024414, "learning_rate": 2.4000000000000003e-06, "loss": 13.8842, "step": 16 }, { "epoch": 0.03165735567970205, "grad_norm": 7.678893566131592, "learning_rate": 2.55e-06, "loss": 13.8789, "step": 17 }, { "epoch": 0.0335195530726257, "grad_norm": 8.681936264038086, "learning_rate": 2.7e-06, "loss": 13.9825, "step": 18 }, { "epoch": 0.035381750465549346, "grad_norm": 8.17630672454834, "learning_rate": 2.8500000000000002e-06, "loss": 13.7963, "step": 19 }, { "epoch": 0.037243947858473, "grad_norm": 8.064035415649414, "learning_rate": 3e-06, "loss": 13.941, "step": 20 }, { "epoch": 0.03910614525139665, "grad_norm": 8.132279396057129, "learning_rate": 3.15e-06, "loss": 13.8393, "step": 21 }, { "epoch": 0.040968342644320296, "grad_norm": 8.09195613861084, "learning_rate": 3.3e-06, "loss": 13.9476, "step": 22 }, { "epoch": 0.04283054003724395, "grad_norm": 9.464410781860352, "learning_rate": 3.4500000000000004e-06, "loss": 13.9283, "step": 23 }, { "epoch": 0.0446927374301676, "grad_norm": 8.223386764526367, "learning_rate": 3.6e-06, "loss": 13.8445, "step": 24 }, { "epoch": 0.04655493482309125, "grad_norm": 8.496528625488281, "learning_rate": 3.75e-06, "loss": 13.9996, "step": 25 }, { "epoch": 0.048417132216014895, "grad_norm": 7.405032157897949, "learning_rate": 3.9e-06, "loss": 13.4262, "step": 26 }, { "epoch": 0.05027932960893855, "grad_norm": 7.917985916137695, "learning_rate": 4.05e-06, "loss": 13.7422, "step": 27 }, { "epoch": 0.0521415270018622, "grad_norm": 7.779364585876465, "learning_rate": 4.2000000000000004e-06, "loss": 13.8355, "step": 28 }, { "epoch": 0.054003724394785846, "grad_norm": 7.747602462768555, "learning_rate": 4.35e-06, "loss": 13.6872, "step": 29 }, { "epoch": 0.055865921787709494, "grad_norm": 8.12415599822998, "learning_rate": 4.5e-06, "loss": 13.8575, "step": 30 }, { "epoch": 0.05772811918063315, "grad_norm": 7.393126487731934, "learning_rate": 4.65e-06, "loss": 13.6684, "step": 31 }, { "epoch": 0.0595903165735568, "grad_norm": 7.366344451904297, "learning_rate": 4.800000000000001e-06, "loss": 13.2535, "step": 32 }, { "epoch": 0.061452513966480445, "grad_norm": 7.462878227233887, "learning_rate": 4.95e-06, "loss": 13.6117, "step": 33 }, { "epoch": 0.0633147113594041, "grad_norm": 8.132235527038574, "learning_rate": 5.1e-06, "loss": 14.0737, "step": 34 }, { "epoch": 0.06517690875232775, "grad_norm": 8.085715293884277, "learning_rate": 5.25e-06, "loss": 13.7491, "step": 35 }, { "epoch": 0.0670391061452514, "grad_norm": 7.527085304260254, "learning_rate": 5.4e-06, "loss": 13.4391, "step": 36 }, { "epoch": 0.06890130353817504, "grad_norm": 7.162932872772217, "learning_rate": 5.55e-06, "loss": 13.2994, "step": 37 }, { "epoch": 0.07076350093109869, "grad_norm": 6.486966133117676, "learning_rate": 5.7000000000000005e-06, "loss": 13.5081, "step": 38 }, { "epoch": 0.07262569832402235, "grad_norm": 7.425045967102051, "learning_rate": 5.850000000000001e-06, "loss": 13.1145, "step": 39 }, { "epoch": 0.074487895716946, "grad_norm": 7.909907817840576, "learning_rate": 6e-06, "loss": 13.3482, "step": 40 }, { "epoch": 0.07635009310986965, "grad_norm": 7.439389228820801, "learning_rate": 6.1499999999999996e-06, "loss": 13.4463, "step": 41 }, { "epoch": 0.0782122905027933, "grad_norm": 7.182775974273682, "learning_rate": 6.3e-06, "loss": 13.2005, "step": 42 }, { "epoch": 0.08007448789571694, "grad_norm": 7.072144031524658, "learning_rate": 6.45e-06, "loss": 12.8393, "step": 43 }, { "epoch": 0.08193668528864059, "grad_norm": 7.034002780914307, "learning_rate": 6.6e-06, "loss": 12.8302, "step": 44 }, { "epoch": 0.08379888268156424, "grad_norm": 6.952661037445068, "learning_rate": 6.750000000000001e-06, "loss": 13.1515, "step": 45 }, { "epoch": 0.0856610800744879, "grad_norm": 7.125630855560303, "learning_rate": 6.900000000000001e-06, "loss": 13.0473, "step": 46 }, { "epoch": 0.08752327746741155, "grad_norm": 7.452700138092041, "learning_rate": 7.049999999999999e-06, "loss": 13.0773, "step": 47 }, { "epoch": 0.0893854748603352, "grad_norm": 7.248137474060059, "learning_rate": 7.2e-06, "loss": 13.0051, "step": 48 }, { "epoch": 0.09124767225325885, "grad_norm": 6.84269905090332, "learning_rate": 7.35e-06, "loss": 12.6788, "step": 49 }, { "epoch": 0.0931098696461825, "grad_norm": 6.934720993041992, "learning_rate": 7.5e-06, "loss": 12.7921, "step": 50 }, { "epoch": 0.0931098696461825, "eval_loss": 12.700243949890137, "eval_runtime": 12.0083, "eval_samples_per_second": 75.365, "eval_steps_per_second": 9.493, "step": 50 }, { "epoch": 0.09497206703910614, "grad_norm": 6.651668548583984, "learning_rate": 7.65e-06, "loss": 12.8692, "step": 51 }, { "epoch": 0.09683426443202979, "grad_norm": 6.7815423011779785, "learning_rate": 7.8e-06, "loss": 12.2018, "step": 52 }, { "epoch": 0.09869646182495345, "grad_norm": 6.230504035949707, "learning_rate": 7.95e-06, "loss": 12.4279, "step": 53 }, { "epoch": 0.1005586592178771, "grad_norm": 6.884407043457031, "learning_rate": 8.1e-06, "loss": 12.726, "step": 54 }, { "epoch": 0.10242085661080075, "grad_norm": 6.8271918296813965, "learning_rate": 8.25e-06, "loss": 12.3977, "step": 55 }, { "epoch": 0.1042830540037244, "grad_norm": 6.264899253845215, "learning_rate": 8.400000000000001e-06, "loss": 12.1251, "step": 56 }, { "epoch": 0.10614525139664804, "grad_norm": 6.3193230628967285, "learning_rate": 8.55e-06, "loss": 12.2093, "step": 57 }, { "epoch": 0.10800744878957169, "grad_norm": 6.511574745178223, "learning_rate": 8.7e-06, "loss": 12.2221, "step": 58 }, { "epoch": 0.10986964618249534, "grad_norm": 6.219540119171143, "learning_rate": 8.85e-06, "loss": 12.2011, "step": 59 }, { "epoch": 0.11173184357541899, "grad_norm": 6.10862398147583, "learning_rate": 9e-06, "loss": 12.1536, "step": 60 }, { "epoch": 0.11359404096834265, "grad_norm": 6.370395183563232, "learning_rate": 9.15e-06, "loss": 11.9978, "step": 61 }, { "epoch": 0.1154562383612663, "grad_norm": 5.910678863525391, "learning_rate": 9.3e-06, "loss": 11.6684, "step": 62 }, { "epoch": 0.11731843575418995, "grad_norm": 5.520389080047607, "learning_rate": 9.450000000000001e-06, "loss": 11.4156, "step": 63 }, { "epoch": 0.1191806331471136, "grad_norm": 5.9435343742370605, "learning_rate": 9.600000000000001e-06, "loss": 11.8516, "step": 64 }, { "epoch": 0.12104283054003724, "grad_norm": 6.2594099044799805, "learning_rate": 9.75e-06, "loss": 11.3695, "step": 65 }, { "epoch": 0.12290502793296089, "grad_norm": 5.629462718963623, "learning_rate": 9.9e-06, "loss": 11.487, "step": 66 }, { "epoch": 0.12476722532588454, "grad_norm": 5.849459648132324, "learning_rate": 1.005e-05, "loss": 11.4961, "step": 67 }, { "epoch": 0.1266294227188082, "grad_norm": 5.782181739807129, "learning_rate": 1.02e-05, "loss": 11.1094, "step": 68 }, { "epoch": 0.12849162011173185, "grad_norm": 5.470081806182861, "learning_rate": 1.035e-05, "loss": 11.0299, "step": 69 }, { "epoch": 0.1303538175046555, "grad_norm": 5.412224769592285, "learning_rate": 1.05e-05, "loss": 10.8013, "step": 70 }, { "epoch": 0.13221601489757914, "grad_norm": 5.450751781463623, "learning_rate": 1.065e-05, "loss": 10.7882, "step": 71 }, { "epoch": 0.1340782122905028, "grad_norm": 5.348014831542969, "learning_rate": 1.08e-05, "loss": 10.8854, "step": 72 }, { "epoch": 0.13594040968342644, "grad_norm": 5.378203868865967, "learning_rate": 1.095e-05, "loss": 10.6439, "step": 73 }, { "epoch": 0.1378026070763501, "grad_norm": 5.656105041503906, "learning_rate": 1.11e-05, "loss": 10.9644, "step": 74 }, { "epoch": 0.13966480446927373, "grad_norm": 5.961797714233398, "learning_rate": 1.125e-05, "loss": 10.6811, "step": 75 }, { "epoch": 0.14152700186219738, "grad_norm": 5.887538433074951, "learning_rate": 1.1400000000000001e-05, "loss": 10.4977, "step": 76 }, { "epoch": 0.14338919925512103, "grad_norm": 5.464078426361084, "learning_rate": 1.1550000000000001e-05, "loss": 10.7534, "step": 77 }, { "epoch": 0.1452513966480447, "grad_norm": 5.274420261383057, "learning_rate": 1.1700000000000001e-05, "loss": 10.381, "step": 78 }, { "epoch": 0.14711359404096835, "grad_norm": 5.238426685333252, "learning_rate": 1.185e-05, "loss": 9.896, "step": 79 }, { "epoch": 0.148975791433892, "grad_norm": 5.165684223175049, "learning_rate": 1.2e-05, "loss": 9.9798, "step": 80 }, { "epoch": 0.15083798882681565, "grad_norm": 5.442358016967773, "learning_rate": 1.215e-05, "loss": 10.0177, "step": 81 }, { "epoch": 0.1527001862197393, "grad_norm": 5.217729568481445, "learning_rate": 1.2299999999999999e-05, "loss": 10.0035, "step": 82 }, { "epoch": 0.15456238361266295, "grad_norm": 4.957606792449951, "learning_rate": 1.245e-05, "loss": 9.9044, "step": 83 }, { "epoch": 0.1564245810055866, "grad_norm": 4.860937118530273, "learning_rate": 1.26e-05, "loss": 9.619, "step": 84 }, { "epoch": 0.15828677839851024, "grad_norm": 4.902154445648193, "learning_rate": 1.275e-05, "loss": 9.5559, "step": 85 }, { "epoch": 0.1601489757914339, "grad_norm": 5.0790252685546875, "learning_rate": 1.29e-05, "loss": 9.4307, "step": 86 }, { "epoch": 0.16201117318435754, "grad_norm": 5.074000358581543, "learning_rate": 1.305e-05, "loss": 9.3652, "step": 87 }, { "epoch": 0.16387337057728119, "grad_norm": 4.975531101226807, "learning_rate": 1.32e-05, "loss": 9.0473, "step": 88 }, { "epoch": 0.16573556797020483, "grad_norm": 4.921514987945557, "learning_rate": 1.3350000000000001e-05, "loss": 9.2745, "step": 89 }, { "epoch": 0.16759776536312848, "grad_norm": 4.96907377243042, "learning_rate": 1.3500000000000001e-05, "loss": 9.1972, "step": 90 }, { "epoch": 0.16945996275605213, "grad_norm": 5.297786235809326, "learning_rate": 1.3650000000000001e-05, "loss": 8.8342, "step": 91 }, { "epoch": 0.1713221601489758, "grad_norm": 4.619410037994385, "learning_rate": 1.3800000000000002e-05, "loss": 8.6951, "step": 92 }, { "epoch": 0.17318435754189945, "grad_norm": 5.202768325805664, "learning_rate": 1.395e-05, "loss": 9.0163, "step": 93 }, { "epoch": 0.1750465549348231, "grad_norm": 5.411356449127197, "learning_rate": 1.4099999999999999e-05, "loss": 8.376, "step": 94 }, { "epoch": 0.17690875232774675, "grad_norm": 4.667336463928223, "learning_rate": 1.4249999999999999e-05, "loss": 8.7688, "step": 95 }, { "epoch": 0.1787709497206704, "grad_norm": 4.727542877197266, "learning_rate": 1.44e-05, "loss": 8.6235, "step": 96 }, { "epoch": 0.18063314711359404, "grad_norm": 4.930461406707764, "learning_rate": 1.455e-05, "loss": 8.2712, "step": 97 }, { "epoch": 0.1824953445065177, "grad_norm": 5.125117778778076, "learning_rate": 1.47e-05, "loss": 8.3901, "step": 98 }, { "epoch": 0.18435754189944134, "grad_norm": 4.922242164611816, "learning_rate": 1.485e-05, "loss": 8.0681, "step": 99 }, { "epoch": 0.186219739292365, "grad_norm": 5.340170383453369, "learning_rate": 1.5e-05, "loss": 7.9208, "step": 100 }, { "epoch": 0.186219739292365, "eval_loss": 7.740330219268799, "eval_runtime": 11.9889, "eval_samples_per_second": 75.487, "eval_steps_per_second": 9.509, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6221490731089920.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }