|
{ |
|
"best_metric": 1.1320114135742188, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.09911785112498761, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00019823570224997521, |
|
"eval_loss": 1.3964250087738037, |
|
"eval_runtime": 129.3952, |
|
"eval_samples_per_second": 16.415, |
|
"eval_steps_per_second": 4.104, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.001982357022499752, |
|
"grad_norm": 6.275260925292969, |
|
"learning_rate": 4.1400000000000003e-05, |
|
"loss": 2.0384, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.003964714044999504, |
|
"grad_norm": 5.881035327911377, |
|
"learning_rate": 8.280000000000001e-05, |
|
"loss": 2.1167, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.005947071067499257, |
|
"grad_norm": 9.9217529296875, |
|
"learning_rate": 0.00012419999999999998, |
|
"loss": 2.2128, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.007929428089999009, |
|
"grad_norm": 13.89786148071289, |
|
"learning_rate": 0.00016560000000000001, |
|
"loss": 2.3928, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.009911785112498761, |
|
"grad_norm": 49.89253234863281, |
|
"learning_rate": 0.000207, |
|
"loss": 3.0166, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009911785112498761, |
|
"eval_loss": 2.377807378768921, |
|
"eval_runtime": 129.3871, |
|
"eval_samples_per_second": 16.416, |
|
"eval_steps_per_second": 4.104, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.011894142134998514, |
|
"grad_norm": 4.660886287689209, |
|
"learning_rate": 0.00020674787920189178, |
|
"loss": 2.4654, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.013876499157498265, |
|
"grad_norm": 5.226744174957275, |
|
"learning_rate": 0.00020599274511475253, |
|
"loss": 2.2438, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.015858856179998018, |
|
"grad_norm": 9.301616668701172, |
|
"learning_rate": 0.00020473827667594888, |
|
"loss": 2.4871, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01784121320249777, |
|
"grad_norm": 11.889968872070312, |
|
"learning_rate": 0.00020299058552961598, |
|
"loss": 2.5391, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.019823570224997523, |
|
"grad_norm": 28.756938934326172, |
|
"learning_rate": 0.00020075818625134152, |
|
"loss": 3.021, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.019823570224997523, |
|
"eval_loss": 1.7476463317871094, |
|
"eval_runtime": 129.3759, |
|
"eval_samples_per_second": 16.417, |
|
"eval_steps_per_second": 4.104, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.021805927247497275, |
|
"grad_norm": 8.956504821777344, |
|
"learning_rate": 0.00019805195486600916, |
|
"loss": 2.4593, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.023788284269997028, |
|
"grad_norm": 5.725518703460693, |
|
"learning_rate": 0.00019488507586089894, |
|
"loss": 2.3297, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.025770641292496777, |
|
"grad_norm": 8.019457817077637, |
|
"learning_rate": 0.00019127297795219008, |
|
"loss": 2.5391, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02775299831499653, |
|
"grad_norm": 12.648486137390137, |
|
"learning_rate": 0.00018723325891780706, |
|
"loss": 2.7968, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.029735355337496282, |
|
"grad_norm": 43.71430587768555, |
|
"learning_rate": 0.0001827855998628142, |
|
"loss": 3.4261, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.029735355337496282, |
|
"eval_loss": 1.5400112867355347, |
|
"eval_runtime": 129.2867, |
|
"eval_samples_per_second": 16.429, |
|
"eval_steps_per_second": 4.107, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.031717712359996035, |
|
"grad_norm": 4.096808910369873, |
|
"learning_rate": 0.0001779516693350504, |
|
"loss": 2.293, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.033700069382495784, |
|
"grad_norm": 5.682216644287109, |
|
"learning_rate": 0.00017275501775814182, |
|
"loss": 2.279, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.03568242640499554, |
|
"grad_norm": 8.98400592803955, |
|
"learning_rate": 0.00016722096269620562, |
|
"loss": 2.571, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.03766478342749529, |
|
"grad_norm": 13.64550495147705, |
|
"learning_rate": 0.00016137646550922228, |
|
"loss": 2.9063, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.039647140449995046, |
|
"grad_norm": 38.4648551940918, |
|
"learning_rate": 0.00015525, |
|
"loss": 3.2143, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.039647140449995046, |
|
"eval_loss": 1.60282564163208, |
|
"eval_runtime": 129.3819, |
|
"eval_samples_per_second": 16.417, |
|
"eval_steps_per_second": 4.104, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.041629497472494795, |
|
"grad_norm": 5.079794406890869, |
|
"learning_rate": 0.0001488714136926695, |
|
"loss": 2.3781, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.04361185449499455, |
|
"grad_norm": 5.307718753814697, |
|
"learning_rate": 0.0001422717824185469, |
|
"loss": 2.3115, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0455942115174943, |
|
"grad_norm": 9.006290435791016, |
|
"learning_rate": 0.00013548325891780705, |
|
"loss": 2.5917, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.047576568539994056, |
|
"grad_norm": 13.994711875915527, |
|
"learning_rate": 0.0001285389161945656, |
|
"loss": 2.7895, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.049558925562493805, |
|
"grad_norm": 21.790035247802734, |
|
"learning_rate": 0.0001214725863885273, |
|
"loss": 2.6553, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.049558925562493805, |
|
"eval_loss": 1.7899256944656372, |
|
"eval_runtime": 129.1236, |
|
"eval_samples_per_second": 16.449, |
|
"eval_steps_per_second": 4.112, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.051541282584993554, |
|
"grad_norm": 5.842795372009277, |
|
"learning_rate": 0.00011431869594820213, |
|
"loss": 2.4341, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.05352363960749331, |
|
"grad_norm": 5.807853698730469, |
|
"learning_rate": 0.00010711209790870886, |
|
"loss": 2.3098, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05550599662999306, |
|
"grad_norm": 8.860788345336914, |
|
"learning_rate": 9.988790209129117e-05, |
|
"loss": 2.418, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.057488353652492816, |
|
"grad_norm": 11.287921905517578, |
|
"learning_rate": 9.268130405179787e-05, |
|
"loss": 2.5827, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.059470710674992565, |
|
"grad_norm": 23.194669723510742, |
|
"learning_rate": 8.55274136114727e-05, |
|
"loss": 2.7311, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.059470710674992565, |
|
"eval_loss": 1.3962997198104858, |
|
"eval_runtime": 129.4572, |
|
"eval_samples_per_second": 16.407, |
|
"eval_steps_per_second": 4.102, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06145306769749232, |
|
"grad_norm": 3.5669214725494385, |
|
"learning_rate": 7.84610838054344e-05, |
|
"loss": 2.1451, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.06343542471999207, |
|
"grad_norm": 4.6063385009765625, |
|
"learning_rate": 7.151674108219295e-05, |
|
"loss": 2.2686, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.06541778174249183, |
|
"grad_norm": 6.335786819458008, |
|
"learning_rate": 6.472821758145309e-05, |
|
"loss": 2.3037, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06740013876499157, |
|
"grad_norm": 10.314666748046875, |
|
"learning_rate": 5.8128586307330475e-05, |
|
"loss": 2.5303, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.06938249578749132, |
|
"grad_norm": 22.63821029663086, |
|
"learning_rate": 5.175000000000002e-05, |
|
"loss": 2.9098, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06938249578749132, |
|
"eval_loss": 1.261065125465393, |
|
"eval_runtime": 129.2566, |
|
"eval_samples_per_second": 16.432, |
|
"eval_steps_per_second": 4.108, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07136485280999108, |
|
"grad_norm": 3.719794988632202, |
|
"learning_rate": 4.5623534490777714e-05, |
|
"loss": 2.0636, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.07334720983249084, |
|
"grad_norm": 4.633784770965576, |
|
"learning_rate": 3.9779037303794365e-05, |
|
"loss": 2.1919, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07532956685499058, |
|
"grad_norm": 6.985335350036621, |
|
"learning_rate": 3.42449822418582e-05, |
|
"loss": 2.3824, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.07731192387749034, |
|
"grad_norm": 10.074797630310059, |
|
"learning_rate": 2.9048330664949622e-05, |
|
"loss": 2.5754, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07929428089999009, |
|
"grad_norm": 47.354488372802734, |
|
"learning_rate": 2.4214400137185785e-05, |
|
"loss": 2.7607, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07929428089999009, |
|
"eval_loss": 1.1924641132354736, |
|
"eval_runtime": 129.1322, |
|
"eval_samples_per_second": 16.448, |
|
"eval_steps_per_second": 4.112, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08127663792248985, |
|
"grad_norm": 3.8752737045288086, |
|
"learning_rate": 1.976674108219295e-05, |
|
"loss": 2.018, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.08325899494498959, |
|
"grad_norm": 5.539064884185791, |
|
"learning_rate": 1.572702204780991e-05, |
|
"loss": 2.234, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.08524135196748935, |
|
"grad_norm": 7.250934600830078, |
|
"learning_rate": 1.2114924139101056e-05, |
|
"loss": 2.3844, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0872237089899891, |
|
"grad_norm": 11.239484786987305, |
|
"learning_rate": 8.948045133990798e-06, |
|
"loss": 2.2897, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08920606601248884, |
|
"grad_norm": 31.72355079650879, |
|
"learning_rate": 6.241813748658489e-06, |
|
"loss": 2.6709, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08920606601248884, |
|
"eval_loss": 1.1373093128204346, |
|
"eval_runtime": 129.3098, |
|
"eval_samples_per_second": 16.426, |
|
"eval_steps_per_second": 4.106, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.0911884230349886, |
|
"grad_norm": 4.044310569763184, |
|
"learning_rate": 4.009414470383994e-06, |
|
"loss": 1.9538, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.09317078005748836, |
|
"grad_norm": 5.821104526519775, |
|
"learning_rate": 2.261723324051111e-06, |
|
"loss": 2.0272, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.09515313707998811, |
|
"grad_norm": 9.027103424072266, |
|
"learning_rate": 1.0072548852474675e-06, |
|
"loss": 2.3058, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09713549410248785, |
|
"grad_norm": 10.941502571105957, |
|
"learning_rate": 2.5212079810819554e-07, |
|
"loss": 2.3945, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.09911785112498761, |
|
"grad_norm": 56.49876022338867, |
|
"learning_rate": 0.0, |
|
"loss": 2.6685, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09911785112498761, |
|
"eval_loss": 1.1320114135742188, |
|
"eval_runtime": 129.2595, |
|
"eval_samples_per_second": 16.432, |
|
"eval_steps_per_second": 4.108, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.158397565442458e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|