{ "best_metric": 11.038910865783691, "best_model_checkpoint": "miner_id_24/checkpoint-80", "epoch": 0.0036157374974576844, "eval_steps": 5, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.519671871822106e-05, "grad_norm": 0.741265594959259, "learning_rate": 2e-05, "loss": 44.3865, "step": 1 }, { "epoch": 4.519671871822106e-05, "eval_loss": 11.093368530273438, "eval_runtime": 175.5908, "eval_samples_per_second": 53.061, "eval_steps_per_second": 26.533, "step": 1 }, { "epoch": 9.039343743644212e-05, "grad_norm": 0.6581929326057434, "learning_rate": 4e-05, "loss": 44.3813, "step": 2 }, { "epoch": 0.00013559015615466317, "grad_norm": 0.6729432344436646, "learning_rate": 6e-05, "loss": 44.3393, "step": 3 }, { "epoch": 0.00018078687487288423, "grad_norm": 0.6872175335884094, "learning_rate": 8e-05, "loss": 44.3795, "step": 4 }, { "epoch": 0.00022598359359110527, "grad_norm": 0.704067051410675, "learning_rate": 0.0001, "loss": 44.389, "step": 5 }, { "epoch": 0.00022598359359110527, "eval_loss": 11.093063354492188, "eval_runtime": 176.3334, "eval_samples_per_second": 52.837, "eval_steps_per_second": 26.422, "step": 5 }, { "epoch": 0.00027118031230932634, "grad_norm": 0.6682418584823608, "learning_rate": 0.00012, "loss": 44.3529, "step": 6 }, { "epoch": 0.0003163770310275474, "grad_norm": 0.6353705525398254, "learning_rate": 0.00014, "loss": 44.4355, "step": 7 }, { "epoch": 0.00036157374974576847, "grad_norm": 0.6866922974586487, "learning_rate": 0.00016, "loss": 44.4036, "step": 8 }, { "epoch": 0.00040677046846398953, "grad_norm": 0.7315343618392944, "learning_rate": 0.00018, "loss": 44.3756, "step": 9 }, { "epoch": 0.00045196718718221055, "grad_norm": 0.6867555379867554, "learning_rate": 0.0002, "loss": 44.3678, "step": 10 }, { "epoch": 0.00045196718718221055, "eval_loss": 11.091917037963867, "eval_runtime": 176.1396, "eval_samples_per_second": 52.896, "eval_steps_per_second": 26.451, "step": 10 }, { "epoch": 0.0004971639059004316, "grad_norm": 0.7067858576774597, "learning_rate": 0.0001999979446958366, "loss": 44.3933, "step": 11 }, { "epoch": 0.0005423606246186527, "grad_norm": 0.7694055438041687, "learning_rate": 0.00019999177886783194, "loss": 44.3476, "step": 12 }, { "epoch": 0.0005875573433368737, "grad_norm": 0.6980550289154053, "learning_rate": 0.00019998150276943902, "loss": 44.3621, "step": 13 }, { "epoch": 0.0006327540620550948, "grad_norm": 0.7399426698684692, "learning_rate": 0.000199967116823068, "loss": 44.3727, "step": 14 }, { "epoch": 0.0006779507807733159, "grad_norm": 0.6623771786689758, "learning_rate": 0.0001999486216200688, "loss": 44.3563, "step": 15 }, { "epoch": 0.0006779507807733159, "eval_loss": 11.089905738830566, "eval_runtime": 176.0326, "eval_samples_per_second": 52.928, "eval_steps_per_second": 26.467, "step": 15 }, { "epoch": 0.0007231474994915369, "grad_norm": 0.6647756695747375, "learning_rate": 0.00019992601792070679, "loss": 44.3454, "step": 16 }, { "epoch": 0.000768344218209758, "grad_norm": 0.7416101694107056, "learning_rate": 0.00019989930665413147, "loss": 44.3249, "step": 17 }, { "epoch": 0.0008135409369279791, "grad_norm": 0.6351829171180725, "learning_rate": 0.00019986848891833845, "loss": 44.37, "step": 18 }, { "epoch": 0.0008587376556462001, "grad_norm": 0.6839431524276733, "learning_rate": 0.0001998335659801241, "loss": 44.3472, "step": 19 }, { "epoch": 0.0009039343743644211, "grad_norm": 0.6762228608131409, "learning_rate": 0.00019979453927503364, "loss": 44.3507, "step": 20 }, { "epoch": 0.0009039343743644211, "eval_loss": 11.087591171264648, "eval_runtime": 176.1534, "eval_samples_per_second": 52.891, "eval_steps_per_second": 26.449, "step": 20 }, { "epoch": 0.0009491310930826422, "grad_norm": 0.7993413209915161, "learning_rate": 0.00019975141040730207, "loss": 44.288, "step": 21 }, { "epoch": 0.0009943278118008632, "grad_norm": 0.6926490664482117, "learning_rate": 0.0001997041811497882, "loss": 44.3672, "step": 22 }, { "epoch": 0.0010395245305190844, "grad_norm": 0.7373084425926208, "learning_rate": 0.00019965285344390184, "loss": 44.3927, "step": 23 }, { "epoch": 0.0010847212492373054, "grad_norm": 0.6655643582344055, "learning_rate": 0.00019959742939952392, "loss": 44.3481, "step": 24 }, { "epoch": 0.0011299179679555265, "grad_norm": 0.7115928530693054, "learning_rate": 0.00019953791129491983, "loss": 44.3368, "step": 25 }, { "epoch": 0.0011299179679555265, "eval_loss": 11.085227012634277, "eval_runtime": 175.877, "eval_samples_per_second": 52.975, "eval_steps_per_second": 26.49, "step": 25 }, { "epoch": 0.0011751146866737475, "grad_norm": 0.7096830010414124, "learning_rate": 0.00019947430157664576, "loss": 44.3735, "step": 26 }, { "epoch": 0.0012203114053919684, "grad_norm": 0.6747312545776367, "learning_rate": 0.00019940660285944803, "loss": 44.3323, "step": 27 }, { "epoch": 0.0012655081241101896, "grad_norm": 0.7371957302093506, "learning_rate": 0.00019933481792615583, "loss": 44.2951, "step": 28 }, { "epoch": 0.0013107048428284106, "grad_norm": 0.7316697239875793, "learning_rate": 0.0001992589497275665, "loss": 44.3097, "step": 29 }, { "epoch": 0.0013559015615466317, "grad_norm": 0.6886783838272095, "learning_rate": 0.0001991790013823246, "loss": 44.3137, "step": 30 }, { "epoch": 0.0013559015615466317, "eval_loss": 11.082609176635742, "eval_runtime": 176.2695, "eval_samples_per_second": 52.857, "eval_steps_per_second": 26.431, "step": 30 }, { "epoch": 0.0014010982802648527, "grad_norm": 0.7027749419212341, "learning_rate": 0.00019909497617679348, "loss": 44.3391, "step": 31 }, { "epoch": 0.0014462949989830739, "grad_norm": 0.735598087310791, "learning_rate": 0.0001990068775649202, "loss": 44.3645, "step": 32 }, { "epoch": 0.0014914917177012948, "grad_norm": 0.7152600288391113, "learning_rate": 0.00019891470916809362, "loss": 44.3478, "step": 33 }, { "epoch": 0.001536688436419516, "grad_norm": 0.6983291506767273, "learning_rate": 0.00019881847477499557, "loss": 44.3252, "step": 34 }, { "epoch": 0.001581885155137737, "grad_norm": 0.6892045140266418, "learning_rate": 0.00019871817834144504, "loss": 44.2998, "step": 35 }, { "epoch": 0.001581885155137737, "eval_loss": 11.079712867736816, "eval_runtime": 176.1378, "eval_samples_per_second": 52.896, "eval_steps_per_second": 26.451, "step": 35 }, { "epoch": 0.0016270818738559581, "grad_norm": 0.7166262865066528, "learning_rate": 0.0001986138239902355, "loss": 44.3485, "step": 36 }, { "epoch": 0.001672278592574179, "grad_norm": 0.7545002102851868, "learning_rate": 0.0001985054160109657, "loss": 44.2613, "step": 37 }, { "epoch": 0.0017174753112924003, "grad_norm": 0.7944263219833374, "learning_rate": 0.00019839295885986296, "loss": 44.2665, "step": 38 }, { "epoch": 0.0017626720300106212, "grad_norm": 0.7216903567314148, "learning_rate": 0.0001982764571596004, "loss": 44.3546, "step": 39 }, { "epoch": 0.0018078687487288422, "grad_norm": 0.7492774128913879, "learning_rate": 0.00019815591569910654, "loss": 44.3223, "step": 40 }, { "epoch": 0.0018078687487288422, "eval_loss": 11.076553344726562, "eval_runtime": 176.1866, "eval_samples_per_second": 52.881, "eval_steps_per_second": 26.444, "step": 40 }, { "epoch": 0.0018530654674470634, "grad_norm": 0.8118460774421692, "learning_rate": 0.00019803133943336874, "loss": 44.3122, "step": 41 }, { "epoch": 0.0018982621861652843, "grad_norm": 0.7527559399604797, "learning_rate": 0.0001979027334832293, "loss": 44.3061, "step": 42 }, { "epoch": 0.0019434589048835055, "grad_norm": 0.7425262331962585, "learning_rate": 0.00019777010313517518, "loss": 44.2408, "step": 43 }, { "epoch": 0.0019886556236017264, "grad_norm": 0.753101646900177, "learning_rate": 0.00019763345384112043, "loss": 44.3362, "step": 44 }, { "epoch": 0.0020338523423199476, "grad_norm": 0.767737090587616, "learning_rate": 0.00019749279121818235, "loss": 44.2864, "step": 45 }, { "epoch": 0.0020338523423199476, "eval_loss": 11.072389602661133, "eval_runtime": 175.9667, "eval_samples_per_second": 52.948, "eval_steps_per_second": 26.477, "step": 45 }, { "epoch": 0.002079049061038169, "grad_norm": 0.7275786995887756, "learning_rate": 0.00019734812104845047, "loss": 44.3542, "step": 46 }, { "epoch": 0.0021242457797563895, "grad_norm": 0.6908650994300842, "learning_rate": 0.00019719944927874881, "loss": 44.3377, "step": 47 }, { "epoch": 0.0021694424984746107, "grad_norm": 0.7260599136352539, "learning_rate": 0.0001970467820203915, "loss": 44.2621, "step": 48 }, { "epoch": 0.002214639217192832, "grad_norm": 0.7138715982437134, "learning_rate": 0.00019689012554893154, "loss": 44.2338, "step": 49 }, { "epoch": 0.002259835935911053, "grad_norm": 0.7867954969406128, "learning_rate": 0.00019672948630390294, "loss": 44.3044, "step": 50 }, { "epoch": 0.002259835935911053, "eval_loss": 11.067892074584961, "eval_runtime": 176.5244, "eval_samples_per_second": 52.78, "eval_steps_per_second": 26.393, "step": 50 }, { "epoch": 0.002305032654629274, "grad_norm": 0.7787512540817261, "learning_rate": 0.00019656487088855592, "loss": 44.2918, "step": 51 }, { "epoch": 0.002350229373347495, "grad_norm": 0.7184544801712036, "learning_rate": 0.00019639628606958533, "loss": 44.2751, "step": 52 }, { "epoch": 0.002395426092065716, "grad_norm": 0.7348573803901672, "learning_rate": 0.0001962237387768529, "loss": 44.246, "step": 53 }, { "epoch": 0.002440622810783937, "grad_norm": 0.7713965773582458, "learning_rate": 0.00019604723610310194, "loss": 44.3292, "step": 54 }, { "epoch": 0.002485819529502158, "grad_norm": 0.8040369749069214, "learning_rate": 0.00019586678530366606, "loss": 44.2155, "step": 55 }, { "epoch": 0.002485819529502158, "eval_loss": 11.062650680541992, "eval_runtime": 176.278, "eval_samples_per_second": 52.854, "eval_steps_per_second": 26.43, "step": 55 }, { "epoch": 0.0025310162482203792, "grad_norm": 0.7459877133369446, "learning_rate": 0.00019568239379617088, "loss": 44.2188, "step": 56 }, { "epoch": 0.0025762129669386004, "grad_norm": 0.8008533716201782, "learning_rate": 0.00019549406916022905, "loss": 44.226, "step": 57 }, { "epoch": 0.002621409685656821, "grad_norm": 0.7918010354042053, "learning_rate": 0.00019530181913712872, "loss": 44.287, "step": 58 }, { "epoch": 0.0026666064043750423, "grad_norm": 0.7287217974662781, "learning_rate": 0.00019510565162951537, "loss": 44.2581, "step": 59 }, { "epoch": 0.0027118031230932635, "grad_norm": 0.7925474643707275, "learning_rate": 0.00019490557470106686, "loss": 44.2277, "step": 60 }, { "epoch": 0.0027118031230932635, "eval_loss": 11.05736255645752, "eval_runtime": 176.2465, "eval_samples_per_second": 52.863, "eval_steps_per_second": 26.435, "step": 60 }, { "epoch": 0.0027569998418114847, "grad_norm": 0.8553807735443115, "learning_rate": 0.00019470159657616215, "loss": 44.2439, "step": 61 }, { "epoch": 0.0028021965605297054, "grad_norm": 0.7586395740509033, "learning_rate": 0.00019449372563954293, "loss": 44.1943, "step": 62 }, { "epoch": 0.0028473932792479266, "grad_norm": 0.7628232836723328, "learning_rate": 0.0001942819704359693, "loss": 44.2594, "step": 63 }, { "epoch": 0.0028925899979661478, "grad_norm": 0.718551754951477, "learning_rate": 0.00019406633966986828, "loss": 44.2302, "step": 64 }, { "epoch": 0.002937786716684369, "grad_norm": 0.7625423073768616, "learning_rate": 0.00019384684220497605, "loss": 44.1989, "step": 65 }, { "epoch": 0.002937786716684369, "eval_loss": 11.051901817321777, "eval_runtime": 176.1759, "eval_samples_per_second": 52.885, "eval_steps_per_second": 26.445, "step": 65 }, { "epoch": 0.0029829834354025897, "grad_norm": 0.7891851663589478, "learning_rate": 0.00019362348706397373, "loss": 44.2199, "step": 66 }, { "epoch": 0.003028180154120811, "grad_norm": 0.6770808100700378, "learning_rate": 0.00019339628342811632, "loss": 44.1689, "step": 67 }, { "epoch": 0.003073376872839032, "grad_norm": 0.7498692870140076, "learning_rate": 0.0001931652406368554, "loss": 44.1741, "step": 68 }, { "epoch": 0.0031185735915572528, "grad_norm": 0.7661782503128052, "learning_rate": 0.0001929303681874552, "loss": 44.2123, "step": 69 }, { "epoch": 0.003163770310275474, "grad_norm": 0.6438837647438049, "learning_rate": 0.0001926916757346022, "loss": 44.1718, "step": 70 }, { "epoch": 0.003163770310275474, "eval_loss": 11.046669960021973, "eval_runtime": 176.3634, "eval_samples_per_second": 52.828, "eval_steps_per_second": 26.417, "step": 70 }, { "epoch": 0.003208967028993695, "grad_norm": 0.7522275447845459, "learning_rate": 0.00019244917309000817, "loss": 44.2246, "step": 71 }, { "epoch": 0.0032541637477119163, "grad_norm": 0.7135974168777466, "learning_rate": 0.00019220287022200707, "loss": 44.2111, "step": 72 }, { "epoch": 0.003299360466430137, "grad_norm": 0.7275662422180176, "learning_rate": 0.0001919527772551451, "loss": 44.1464, "step": 73 }, { "epoch": 0.003344557185148358, "grad_norm": 0.6742229461669922, "learning_rate": 0.00019169890446976454, "loss": 44.2105, "step": 74 }, { "epoch": 0.0033897539038665794, "grad_norm": 0.6085646152496338, "learning_rate": 0.00019144126230158127, "loss": 44.0926, "step": 75 }, { "epoch": 0.0033897539038665794, "eval_loss": 11.042237281799316, "eval_runtime": 176.114, "eval_samples_per_second": 52.903, "eval_steps_per_second": 26.454, "step": 75 }, { "epoch": 0.0034349506225848005, "grad_norm": 0.7245734333992004, "learning_rate": 0.0001911798613412557, "loss": 44.2154, "step": 76 }, { "epoch": 0.0034801473413030213, "grad_norm": 0.7311281561851501, "learning_rate": 0.0001909147123339575, "loss": 44.1687, "step": 77 }, { "epoch": 0.0035253440600212425, "grad_norm": 0.6399495601654053, "learning_rate": 0.0001906458261789238, "loss": 44.1596, "step": 78 }, { "epoch": 0.0035705407787394636, "grad_norm": 0.5650178790092468, "learning_rate": 0.00019037321392901136, "loss": 44.1466, "step": 79 }, { "epoch": 0.0036157374974576844, "grad_norm": 0.6039579510688782, "learning_rate": 0.0001900968867902419, "loss": 44.1955, "step": 80 }, { "epoch": 0.0036157374974576844, "eval_loss": 11.038910865783691, "eval_runtime": 176.3853, "eval_samples_per_second": 52.822, "eval_steps_per_second": 26.414, "step": 80 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 840538521600.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }