{ "best_metric": 10.301432609558105, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.01404908751176611, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.809817502353222e-05, "eval_loss": 10.378629684448242, "eval_runtime": 60.0386, "eval_samples_per_second": 249.589, "eval_steps_per_second": 62.41, "step": 1 }, { "epoch": 0.0002809817502353222, "grad_norm": 0.19126524031162262, "learning_rate": 4.0600000000000004e-05, "loss": 20.7601, "step": 10 }, { "epoch": 0.0005619635004706444, "grad_norm": 0.1812351495027542, "learning_rate": 8.120000000000001e-05, "loss": 20.7579, "step": 20 }, { "epoch": 0.0008429452507059666, "grad_norm": 0.21598900854587555, "learning_rate": 0.00012179999999999999, "loss": 20.7557, "step": 30 }, { "epoch": 0.0011239270009412888, "grad_norm": 0.29655301570892334, "learning_rate": 0.00016240000000000002, "loss": 20.7436, "step": 40 }, { "epoch": 0.001404908751176611, "grad_norm": 0.3212806284427643, "learning_rate": 0.000203, "loss": 20.7365, "step": 50 }, { "epoch": 0.001404908751176611, "eval_loss": 10.362621307373047, "eval_runtime": 60.4081, "eval_samples_per_second": 248.063, "eval_steps_per_second": 62.028, "step": 50 }, { "epoch": 0.0016858905014119332, "grad_norm": 0.4301055669784546, "learning_rate": 0.00020275275110137215, "loss": 20.7188, "step": 60 }, { "epoch": 0.0019668722516472557, "grad_norm": 0.3765835165977478, "learning_rate": 0.00020201220897726938, "loss": 20.6809, "step": 70 }, { "epoch": 0.0022478540018825775, "grad_norm": 0.3050801157951355, "learning_rate": 0.00020078198147448128, "loss": 20.6434, "step": 80 }, { "epoch": 0.0025288357521178998, "grad_norm": 0.31923723220825195, "learning_rate": 0.00019906806213773937, "loss": 20.6134, "step": 90 }, { "epoch": 0.002809817502353222, "grad_norm": 0.3668956160545349, "learning_rate": 0.0001968788010097697, "loss": 20.6138, "step": 100 }, { "epoch": 0.002809817502353222, "eval_loss": 10.308439254760742, "eval_runtime": 59.3542, "eval_samples_per_second": 252.467, "eval_steps_per_second": 63.129, "step": 100 }, { "epoch": 0.0030907992525885442, "grad_norm": 0.20467126369476318, "learning_rate": 0.00019422486395072398, "loss": 20.6304, "step": 110 }, { "epoch": 0.0033717810028238665, "grad_norm": 0.2518371343612671, "learning_rate": 0.0001911191806751811, "loss": 20.6217, "step": 120 }, { "epoch": 0.0036527627530591887, "grad_norm": 0.1735089272260666, "learning_rate": 0.00018757688175987723, "loss": 20.6238, "step": 130 }, { "epoch": 0.003933744503294511, "grad_norm": 0.17584048211574554, "learning_rate": 0.00018361522492905716, "loss": 20.6006, "step": 140 }, { "epoch": 0.004214726253529834, "grad_norm": 0.20633459091186523, "learning_rate": 0.00017925351097657625, "loss": 20.6024, "step": 150 }, { "epoch": 0.004214726253529834, "eval_loss": 10.306268692016602, "eval_runtime": 59.8224, "eval_samples_per_second": 250.492, "eval_steps_per_second": 62.635, "step": 150 }, { "epoch": 0.004495708003765155, "grad_norm": 0.18568071722984314, "learning_rate": 0.00017451298973437308, "loss": 20.6288, "step": 160 }, { "epoch": 0.004776689754000477, "grad_norm": 0.2910362184047699, "learning_rate": 0.0001694167565454241, "loss": 20.611, "step": 170 }, { "epoch": 0.0050576715042357995, "grad_norm": 0.2217985987663269, "learning_rate": 0.0001639896397455543, "loss": 20.618, "step": 180 }, { "epoch": 0.005338653254471122, "grad_norm": 0.19911141693592072, "learning_rate": 0.0001582580797022808, "loss": 20.6091, "step": 190 }, { "epoch": 0.005619635004706444, "grad_norm": 0.28544291853904724, "learning_rate": 0.00015225, "loss": 20.6026, "step": 200 }, { "epoch": 0.005619635004706444, "eval_loss": 10.305092811584473, "eval_runtime": 60.5578, "eval_samples_per_second": 247.45, "eval_steps_per_second": 61.875, "step": 200 }, { "epoch": 0.005900616754941766, "grad_norm": 0.21859851479530334, "learning_rate": 0.00014599467139909136, "loss": 20.6277, "step": 210 }, { "epoch": 0.0061815985051770885, "grad_norm": 0.2270508110523224, "learning_rate": 0.0001395225692317151, "loss": 20.6168, "step": 220 }, { "epoch": 0.006462580255412411, "grad_norm": 0.17180295288562775, "learning_rate": 0.00013286522492905717, "loss": 20.6051, "step": 230 }, { "epoch": 0.006743562005647733, "grad_norm": 0.2308344542980194, "learning_rate": 0.00012605507240336626, "loss": 20.6047, "step": 240 }, { "epoch": 0.007024543755883055, "grad_norm": 0.2163165658712387, "learning_rate": 0.00011912529003319345, "loss": 20.5976, "step": 250 }, { "epoch": 0.007024543755883055, "eval_loss": 10.303328514099121, "eval_runtime": 60.606, "eval_samples_per_second": 247.253, "eval_steps_per_second": 61.826, "step": 250 }, { "epoch": 0.0073055255061183775, "grad_norm": 0.1643211990594864, "learning_rate": 0.00011210963902166683, "loss": 20.6166, "step": 260 }, { "epoch": 0.0075865072563537, "grad_norm": 0.13470244407653809, "learning_rate": 0.00010504229891530386, "loss": 20.6187, "step": 270 }, { "epoch": 0.007867489006589023, "grad_norm": 0.15561600029468536, "learning_rate": 9.795770108469618e-05, "loss": 20.5993, "step": 280 }, { "epoch": 0.008148470756824344, "grad_norm": 0.1637904942035675, "learning_rate": 9.08903609783332e-05, "loss": 20.6033, "step": 290 }, { "epoch": 0.008429452507059667, "grad_norm": 0.2457241415977478, "learning_rate": 8.387470996680658e-05, "loss": 20.581, "step": 300 }, { "epoch": 0.008429452507059667, "eval_loss": 10.302334785461426, "eval_runtime": 59.2759, "eval_samples_per_second": 252.801, "eval_steps_per_second": 63.213, "step": 300 }, { "epoch": 0.008710434257294989, "grad_norm": 0.1403207629919052, "learning_rate": 7.694492759663374e-05, "loss": 20.6175, "step": 310 }, { "epoch": 0.00899141600753031, "grad_norm": 0.1851751208305359, "learning_rate": 7.013477507094284e-05, "loss": 20.6056, "step": 320 }, { "epoch": 0.009272397757765633, "grad_norm": 0.23650233447551727, "learning_rate": 6.347743076828492e-05, "loss": 20.6097, "step": 330 }, { "epoch": 0.009553379508000955, "grad_norm": 0.2193847894668579, "learning_rate": 5.700532860090863e-05, "loss": 20.5948, "step": 340 }, { "epoch": 0.009834361258236278, "grad_norm": 0.2748495638370514, "learning_rate": 5.075000000000002e-05, "loss": 20.5916, "step": 350 }, { "epoch": 0.009834361258236278, "eval_loss": 10.30180835723877, "eval_runtime": 60.3366, "eval_samples_per_second": 248.357, "eval_steps_per_second": 62.102, "step": 350 }, { "epoch": 0.010115343008471599, "grad_norm": 0.2530926764011383, "learning_rate": 4.4741920297719214e-05, "loss": 20.6187, "step": 360 }, { "epoch": 0.010396324758706922, "grad_norm": 0.18327485024929047, "learning_rate": 3.901036025444568e-05, "loss": 20.6102, "step": 370 }, { "epoch": 0.010677306508942244, "grad_norm": 0.1382702887058258, "learning_rate": 3.358324345457592e-05, "loss": 20.6094, "step": 380 }, { "epoch": 0.010958288259177567, "grad_norm": 0.19693830609321594, "learning_rate": 2.8487010265626928e-05, "loss": 20.5886, "step": 390 }, { "epoch": 0.011239270009412888, "grad_norm": 0.26368966698646545, "learning_rate": 2.3746489023423744e-05, "loss": 20.5852, "step": 400 }, { "epoch": 0.011239270009412888, "eval_loss": 10.301532745361328, "eval_runtime": 59.6237, "eval_samples_per_second": 251.326, "eval_steps_per_second": 62.844, "step": 400 }, { "epoch": 0.011520251759648211, "grad_norm": 0.15768560767173767, "learning_rate": 1.9384775070942844e-05, "loss": 20.6082, "step": 410 }, { "epoch": 0.011801233509883532, "grad_norm": 0.19848321378231049, "learning_rate": 1.5423118240122765e-05, "loss": 20.5956, "step": 420 }, { "epoch": 0.012082215260118856, "grad_norm": 0.14942528307437897, "learning_rate": 1.188081932481891e-05, "loss": 20.6074, "step": 430 }, { "epoch": 0.012363197010354177, "grad_norm": 0.21457645297050476, "learning_rate": 8.775136049276001e-06, "loss": 20.5856, "step": 440 }, { "epoch": 0.0126441787605895, "grad_norm": 0.3364667296409607, "learning_rate": 6.121198990230306e-06, "loss": 20.5984, "step": 450 }, { "epoch": 0.0126441787605895, "eval_loss": 10.30145263671875, "eval_runtime": 60.1852, "eval_samples_per_second": 248.982, "eval_steps_per_second": 62.258, "step": 450 }, { "epoch": 0.012925160510824821, "grad_norm": 0.16703709959983826, "learning_rate": 3.931937862260632e-06, "loss": 20.6069, "step": 460 }, { "epoch": 0.013206142261060145, "grad_norm": 0.19127300381660461, "learning_rate": 2.2180185255187225e-06, "loss": 20.6087, "step": 470 }, { "epoch": 0.013487124011295466, "grad_norm": 0.19735532999038696, "learning_rate": 9.877910227306082e-07, "loss": 20.6209, "step": 480 }, { "epoch": 0.013768105761530789, "grad_norm": 0.1791822463274002, "learning_rate": 2.472488986278439e-07, "loss": 20.5952, "step": 490 }, { "epoch": 0.01404908751176611, "grad_norm": 0.20496730506420135, "learning_rate": 0.0, "loss": 20.5889, "step": 500 }, { "epoch": 0.01404908751176611, "eval_loss": 10.301432609558105, "eval_runtime": 60.1498, "eval_samples_per_second": 249.128, "eval_steps_per_second": 62.294, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 21494130278400.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }