{ "best_metric": 1.1311696767807007, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.3487020534676482, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007748934521503294, "eval_loss": 2.042634963989258, "eval_runtime": 19.3191, "eval_samples_per_second": 28.159, "eval_steps_per_second": 7.04, "step": 1 }, { "epoch": 0.0077489345215032935, "grad_norm": 2.596163511276245, "learning_rate": 4.0600000000000004e-05, "loss": 1.637, "step": 10 }, { "epoch": 0.015497869043006587, "grad_norm": 1.8155604600906372, "learning_rate": 8.120000000000001e-05, "loss": 1.4282, "step": 20 }, { "epoch": 0.02324680356450988, "grad_norm": 2.010477066040039, "learning_rate": 0.00012179999999999999, "loss": 1.3096, "step": 30 }, { "epoch": 0.030995738086013174, "grad_norm": 1.7271562814712524, "learning_rate": 0.00016240000000000002, "loss": 1.2131, "step": 40 }, { "epoch": 0.03874467260751647, "grad_norm": 3.4388530254364014, "learning_rate": 0.000203, "loss": 1.2151, "step": 50 }, { "epoch": 0.03874467260751647, "eval_loss": 1.2415974140167236, "eval_runtime": 19.4264, "eval_samples_per_second": 28.003, "eval_steps_per_second": 7.001, "step": 50 }, { "epoch": 0.04649360712901976, "grad_norm": 1.669398307800293, "learning_rate": 0.00020275275110137215, "loss": 1.1596, "step": 60 }, { "epoch": 0.05424254165052305, "grad_norm": 1.3598785400390625, "learning_rate": 0.00020201220897726938, "loss": 1.2513, "step": 70 }, { "epoch": 0.06199147617202635, "grad_norm": 1.4303092956542969, "learning_rate": 0.00020078198147448128, "loss": 1.1887, "step": 80 }, { "epoch": 0.06974041069352964, "grad_norm": 1.7135754823684692, "learning_rate": 0.00019906806213773937, "loss": 1.1901, "step": 90 }, { "epoch": 0.07748934521503294, "grad_norm": 4.795324325561523, "learning_rate": 0.0001968788010097697, "loss": 1.1915, "step": 100 }, { "epoch": 0.07748934521503294, "eval_loss": 1.2326695919036865, "eval_runtime": 19.5546, "eval_samples_per_second": 27.82, "eval_steps_per_second": 6.955, "step": 100 }, { "epoch": 0.08523827973653622, "grad_norm": 1.4039335250854492, "learning_rate": 0.00019422486395072398, "loss": 1.1134, "step": 110 }, { "epoch": 0.09298721425803952, "grad_norm": 1.290767788887024, "learning_rate": 0.0001911191806751811, "loss": 1.1767, "step": 120 }, { "epoch": 0.10073614877954282, "grad_norm": 1.4741291999816895, "learning_rate": 0.00018757688175987723, "loss": 1.2077, "step": 130 }, { "epoch": 0.1084850833010461, "grad_norm": 1.580878496170044, "learning_rate": 0.00018361522492905716, "loss": 1.1455, "step": 140 }, { "epoch": 0.1162340178225494, "grad_norm": 6.502963066101074, "learning_rate": 0.00017925351097657625, "loss": 1.2151, "step": 150 }, { "epoch": 0.1162340178225494, "eval_loss": 1.1908780336380005, "eval_runtime": 19.4086, "eval_samples_per_second": 28.029, "eval_steps_per_second": 7.007, "step": 150 }, { "epoch": 0.1239829523440527, "grad_norm": 1.1870495080947876, "learning_rate": 0.00017451298973437308, "loss": 0.9819, "step": 160 }, { "epoch": 0.13173188686555598, "grad_norm": 1.2544339895248413, "learning_rate": 0.0001694167565454241, "loss": 1.149, "step": 170 }, { "epoch": 0.13948082138705928, "grad_norm": 1.5018516778945923, "learning_rate": 0.0001639896397455543, "loss": 1.1653, "step": 180 }, { "epoch": 0.14722975590856258, "grad_norm": 1.6409251689910889, "learning_rate": 0.0001582580797022808, "loss": 1.1748, "step": 190 }, { "epoch": 0.15497869043006587, "grad_norm": 3.206493377685547, "learning_rate": 0.00015225, "loss": 1.1795, "step": 200 }, { "epoch": 0.15497869043006587, "eval_loss": 1.1657546758651733, "eval_runtime": 19.4529, "eval_samples_per_second": 27.965, "eval_steps_per_second": 6.991, "step": 200 }, { "epoch": 0.16272762495156917, "grad_norm": 1.2450393438339233, "learning_rate": 0.00014599467139909136, "loss": 1.0438, "step": 210 }, { "epoch": 0.17047655947307244, "grad_norm": 1.2467765808105469, "learning_rate": 0.0001395225692317151, "loss": 1.109, "step": 220 }, { "epoch": 0.17822549399457574, "grad_norm": 1.5617730617523193, "learning_rate": 0.00013286522492905717, "loss": 1.1722, "step": 230 }, { "epoch": 0.18597442851607904, "grad_norm": 1.4881824254989624, "learning_rate": 0.00012605507240336626, "loss": 1.1899, "step": 240 }, { "epoch": 0.19372336303758234, "grad_norm": 3.424774169921875, "learning_rate": 0.00011912529003319345, "loss": 1.2232, "step": 250 }, { "epoch": 0.19372336303758234, "eval_loss": 1.154661774635315, "eval_runtime": 19.4078, "eval_samples_per_second": 28.03, "eval_steps_per_second": 7.007, "step": 250 }, { "epoch": 0.20147229755908563, "grad_norm": 1.3052321672439575, "learning_rate": 0.00011210963902166683, "loss": 1.1019, "step": 260 }, { "epoch": 0.20922123208058893, "grad_norm": 1.3270354270935059, "learning_rate": 0.00010504229891530386, "loss": 1.1695, "step": 270 }, { "epoch": 0.2169701666020922, "grad_norm": 1.625852346420288, "learning_rate": 9.795770108469618e-05, "loss": 1.1769, "step": 280 }, { "epoch": 0.2247191011235955, "grad_norm": 1.5025311708450317, "learning_rate": 9.08903609783332e-05, "loss": 1.1659, "step": 290 }, { "epoch": 0.2324680356450988, "grad_norm": 3.024522066116333, "learning_rate": 8.387470996680658e-05, "loss": 1.1329, "step": 300 }, { "epoch": 0.2324680356450988, "eval_loss": 1.1374027729034424, "eval_runtime": 19.4072, "eval_samples_per_second": 28.031, "eval_steps_per_second": 7.008, "step": 300 }, { "epoch": 0.2402169701666021, "grad_norm": 1.227674961090088, "learning_rate": 7.694492759663374e-05, "loss": 1.1127, "step": 310 }, { "epoch": 0.2479659046881054, "grad_norm": 1.371392011642456, "learning_rate": 7.013477507094284e-05, "loss": 1.2046, "step": 320 }, { "epoch": 0.2557148392096087, "grad_norm": 1.4349315166473389, "learning_rate": 6.347743076828492e-05, "loss": 1.1213, "step": 330 }, { "epoch": 0.26346377373111196, "grad_norm": 1.58314847946167, "learning_rate": 5.700532860090863e-05, "loss": 1.1533, "step": 340 }, { "epoch": 0.2712127082526153, "grad_norm": 3.6435108184814453, "learning_rate": 5.075000000000002e-05, "loss": 1.155, "step": 350 }, { "epoch": 0.2712127082526153, "eval_loss": 1.1427217721939087, "eval_runtime": 19.5093, "eval_samples_per_second": 27.884, "eval_steps_per_second": 6.971, "step": 350 }, { "epoch": 0.27896164277411856, "grad_norm": 1.3242435455322266, "learning_rate": 4.4741920297719214e-05, "loss": 1.0931, "step": 360 }, { "epoch": 0.2867105772956218, "grad_norm": 1.3505114316940308, "learning_rate": 3.901036025444568e-05, "loss": 1.1299, "step": 370 }, { "epoch": 0.29445951181712515, "grad_norm": 1.325255036354065, "learning_rate": 3.358324345457592e-05, "loss": 1.1477, "step": 380 }, { "epoch": 0.3022084463386284, "grad_norm": 1.5533366203308105, "learning_rate": 2.8487010265626928e-05, "loss": 1.148, "step": 390 }, { "epoch": 0.30995738086013175, "grad_norm": 2.8041539192199707, "learning_rate": 2.3746489023423744e-05, "loss": 1.1461, "step": 400 }, { "epoch": 0.30995738086013175, "eval_loss": 1.1326929330825806, "eval_runtime": 19.4266, "eval_samples_per_second": 28.003, "eval_steps_per_second": 7.001, "step": 400 }, { "epoch": 0.317706315381635, "grad_norm": 1.2059953212738037, "learning_rate": 1.9384775070942844e-05, "loss": 0.8901, "step": 410 }, { "epoch": 0.32545524990313834, "grad_norm": 1.4342598915100098, "learning_rate": 1.5423118240122765e-05, "loss": 1.1229, "step": 420 }, { "epoch": 0.3332041844246416, "grad_norm": 1.244903802871704, "learning_rate": 1.188081932481891e-05, "loss": 1.1037, "step": 430 }, { "epoch": 0.3409531189461449, "grad_norm": 1.8167920112609863, "learning_rate": 8.775136049276001e-06, "loss": 1.131, "step": 440 }, { "epoch": 0.3487020534676482, "grad_norm": 2.944375991821289, "learning_rate": 6.121198990230306e-06, "loss": 1.0648, "step": 450 }, { "epoch": 0.3487020534676482, "eval_loss": 1.1311696767807007, "eval_runtime": 19.3838, "eval_samples_per_second": 28.065, "eval_steps_per_second": 7.016, "step": 450 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.344858262647603e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }