{ "best_metric": 1.1320114135742188, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.09911785112498761, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019823570224997521, "eval_loss": 1.3964250087738037, "eval_runtime": 129.3952, "eval_samples_per_second": 16.415, "eval_steps_per_second": 4.104, "step": 1 }, { "epoch": 0.001982357022499752, "grad_norm": 6.275260925292969, "learning_rate": 4.1400000000000003e-05, "loss": 2.0384, "step": 10 }, { "epoch": 0.003964714044999504, "grad_norm": 5.881035327911377, "learning_rate": 8.280000000000001e-05, "loss": 2.1167, "step": 20 }, { "epoch": 0.005947071067499257, "grad_norm": 9.9217529296875, "learning_rate": 0.00012419999999999998, "loss": 2.2128, "step": 30 }, { "epoch": 0.007929428089999009, "grad_norm": 13.89786148071289, "learning_rate": 0.00016560000000000001, "loss": 2.3928, "step": 40 }, { "epoch": 0.009911785112498761, "grad_norm": 49.89253234863281, "learning_rate": 0.000207, "loss": 3.0166, "step": 50 }, { "epoch": 0.009911785112498761, "eval_loss": 2.377807378768921, "eval_runtime": 129.3871, "eval_samples_per_second": 16.416, "eval_steps_per_second": 4.104, "step": 50 }, { "epoch": 0.011894142134998514, "grad_norm": 4.660886287689209, "learning_rate": 0.00020674787920189178, "loss": 2.4654, "step": 60 }, { "epoch": 0.013876499157498265, "grad_norm": 5.226744174957275, "learning_rate": 0.00020599274511475253, "loss": 2.2438, "step": 70 }, { "epoch": 0.015858856179998018, "grad_norm": 9.301616668701172, "learning_rate": 0.00020473827667594888, "loss": 2.4871, "step": 80 }, { "epoch": 0.01784121320249777, "grad_norm": 11.889968872070312, "learning_rate": 0.00020299058552961598, "loss": 2.5391, "step": 90 }, { "epoch": 0.019823570224997523, "grad_norm": 28.756938934326172, "learning_rate": 0.00020075818625134152, "loss": 3.021, "step": 100 }, { "epoch": 0.019823570224997523, "eval_loss": 1.7476463317871094, "eval_runtime": 129.3759, "eval_samples_per_second": 16.417, "eval_steps_per_second": 4.104, "step": 100 }, { "epoch": 0.021805927247497275, "grad_norm": 8.956504821777344, "learning_rate": 0.00019805195486600916, "loss": 2.4593, "step": 110 }, { "epoch": 0.023788284269997028, "grad_norm": 5.725518703460693, "learning_rate": 0.00019488507586089894, "loss": 2.3297, "step": 120 }, { "epoch": 0.025770641292496777, "grad_norm": 8.019457817077637, "learning_rate": 0.00019127297795219008, "loss": 2.5391, "step": 130 }, { "epoch": 0.02775299831499653, "grad_norm": 12.648486137390137, "learning_rate": 0.00018723325891780706, "loss": 2.7968, "step": 140 }, { "epoch": 0.029735355337496282, "grad_norm": 43.71430587768555, "learning_rate": 0.0001827855998628142, "loss": 3.4261, "step": 150 }, { "epoch": 0.029735355337496282, "eval_loss": 1.5400112867355347, "eval_runtime": 129.2867, "eval_samples_per_second": 16.429, "eval_steps_per_second": 4.107, "step": 150 }, { "epoch": 0.031717712359996035, "grad_norm": 4.096808910369873, "learning_rate": 0.0001779516693350504, "loss": 2.293, "step": 160 }, { "epoch": 0.033700069382495784, "grad_norm": 5.682216644287109, "learning_rate": 0.00017275501775814182, "loss": 2.279, "step": 170 }, { "epoch": 0.03568242640499554, "grad_norm": 8.98400592803955, "learning_rate": 0.00016722096269620562, "loss": 2.571, "step": 180 }, { "epoch": 0.03766478342749529, "grad_norm": 13.64550495147705, "learning_rate": 0.00016137646550922228, "loss": 2.9063, "step": 190 }, { "epoch": 0.039647140449995046, "grad_norm": 38.4648551940918, "learning_rate": 0.00015525, "loss": 3.2143, "step": 200 }, { "epoch": 0.039647140449995046, "eval_loss": 1.60282564163208, "eval_runtime": 129.3819, "eval_samples_per_second": 16.417, "eval_steps_per_second": 4.104, "step": 200 }, { "epoch": 0.041629497472494795, "grad_norm": 5.079794406890869, "learning_rate": 0.0001488714136926695, "loss": 2.3781, "step": 210 }, { "epoch": 0.04361185449499455, "grad_norm": 5.307718753814697, "learning_rate": 0.0001422717824185469, "loss": 2.3115, "step": 220 }, { "epoch": 0.0455942115174943, "grad_norm": 9.006290435791016, "learning_rate": 0.00013548325891780705, "loss": 2.5917, "step": 230 }, { "epoch": 0.047576568539994056, "grad_norm": 13.994711875915527, "learning_rate": 0.0001285389161945656, "loss": 2.7895, "step": 240 }, { "epoch": 0.049558925562493805, "grad_norm": 21.790035247802734, "learning_rate": 0.0001214725863885273, "loss": 2.6553, "step": 250 }, { "epoch": 0.049558925562493805, "eval_loss": 1.7899256944656372, "eval_runtime": 129.1236, "eval_samples_per_second": 16.449, "eval_steps_per_second": 4.112, "step": 250 }, { "epoch": 0.051541282584993554, "grad_norm": 5.842795372009277, "learning_rate": 0.00011431869594820213, "loss": 2.4341, "step": 260 }, { "epoch": 0.05352363960749331, "grad_norm": 5.807853698730469, "learning_rate": 0.00010711209790870886, "loss": 2.3098, "step": 270 }, { "epoch": 0.05550599662999306, "grad_norm": 8.860788345336914, "learning_rate": 9.988790209129117e-05, "loss": 2.418, "step": 280 }, { "epoch": 0.057488353652492816, "grad_norm": 11.287921905517578, "learning_rate": 9.268130405179787e-05, "loss": 2.5827, "step": 290 }, { "epoch": 0.059470710674992565, "grad_norm": 23.194669723510742, "learning_rate": 8.55274136114727e-05, "loss": 2.7311, "step": 300 }, { "epoch": 0.059470710674992565, "eval_loss": 1.3962997198104858, "eval_runtime": 129.4572, "eval_samples_per_second": 16.407, "eval_steps_per_second": 4.102, "step": 300 }, { "epoch": 0.06145306769749232, "grad_norm": 3.5669214725494385, "learning_rate": 7.84610838054344e-05, "loss": 2.1451, "step": 310 }, { "epoch": 0.06343542471999207, "grad_norm": 4.6063385009765625, "learning_rate": 7.151674108219295e-05, "loss": 2.2686, "step": 320 }, { "epoch": 0.06541778174249183, "grad_norm": 6.335786819458008, "learning_rate": 6.472821758145309e-05, "loss": 2.3037, "step": 330 }, { "epoch": 0.06740013876499157, "grad_norm": 10.314666748046875, "learning_rate": 5.8128586307330475e-05, "loss": 2.5303, "step": 340 }, { "epoch": 0.06938249578749132, "grad_norm": 22.63821029663086, "learning_rate": 5.175000000000002e-05, "loss": 2.9098, "step": 350 }, { "epoch": 0.06938249578749132, "eval_loss": 1.261065125465393, "eval_runtime": 129.2566, "eval_samples_per_second": 16.432, "eval_steps_per_second": 4.108, "step": 350 }, { "epoch": 0.07136485280999108, "grad_norm": 3.719794988632202, "learning_rate": 4.5623534490777714e-05, "loss": 2.0636, "step": 360 }, { "epoch": 0.07334720983249084, "grad_norm": 4.633784770965576, "learning_rate": 3.9779037303794365e-05, "loss": 2.1919, "step": 370 }, { "epoch": 0.07532956685499058, "grad_norm": 6.985335350036621, "learning_rate": 3.42449822418582e-05, "loss": 2.3824, "step": 380 }, { "epoch": 0.07731192387749034, "grad_norm": 10.074797630310059, "learning_rate": 2.9048330664949622e-05, "loss": 2.5754, "step": 390 }, { "epoch": 0.07929428089999009, "grad_norm": 47.354488372802734, "learning_rate": 2.4214400137185785e-05, "loss": 2.7607, "step": 400 }, { "epoch": 0.07929428089999009, "eval_loss": 1.1924641132354736, "eval_runtime": 129.1322, "eval_samples_per_second": 16.448, "eval_steps_per_second": 4.112, "step": 400 }, { "epoch": 0.08127663792248985, "grad_norm": 3.8752737045288086, "learning_rate": 1.976674108219295e-05, "loss": 2.018, "step": 410 }, { "epoch": 0.08325899494498959, "grad_norm": 5.539064884185791, "learning_rate": 1.572702204780991e-05, "loss": 2.234, "step": 420 }, { "epoch": 0.08524135196748935, "grad_norm": 7.250934600830078, "learning_rate": 1.2114924139101056e-05, "loss": 2.3844, "step": 430 }, { "epoch": 0.0872237089899891, "grad_norm": 11.239484786987305, "learning_rate": 8.948045133990798e-06, "loss": 2.2897, "step": 440 }, { "epoch": 0.08920606601248884, "grad_norm": 31.72355079650879, "learning_rate": 6.241813748658489e-06, "loss": 2.6709, "step": 450 }, { "epoch": 0.08920606601248884, "eval_loss": 1.1373093128204346, "eval_runtime": 129.3098, "eval_samples_per_second": 16.426, "eval_steps_per_second": 4.106, "step": 450 }, { "epoch": 0.0911884230349886, "grad_norm": 4.044310569763184, "learning_rate": 4.009414470383994e-06, "loss": 1.9538, "step": 460 }, { "epoch": 0.09317078005748836, "grad_norm": 5.821104526519775, "learning_rate": 2.261723324051111e-06, "loss": 2.0272, "step": 470 }, { "epoch": 0.09515313707998811, "grad_norm": 9.027103424072266, "learning_rate": 1.0072548852474675e-06, "loss": 2.3058, "step": 480 }, { "epoch": 0.09713549410248785, "grad_norm": 10.941502571105957, "learning_rate": 2.5212079810819554e-07, "loss": 2.3945, "step": 490 }, { "epoch": 0.09911785112498761, "grad_norm": 56.49876022338867, "learning_rate": 0.0, "loss": 2.6685, "step": 500 }, { "epoch": 0.09911785112498761, "eval_loss": 1.1320114135742188, "eval_runtime": 129.2595, "eval_samples_per_second": 16.432, "eval_steps_per_second": 4.108, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.158397565442458e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }