{ "best_metric": 1.9786142110824585, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 1.3034623217922607, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013034623217922607, "grad_norm": 0.6276217103004456, "learning_rate": 7.5e-05, "loss": 2.9674, "step": 1 }, { "epoch": 0.013034623217922607, "eval_loss": 3.5546200275421143, "eval_runtime": 0.2814, "eval_samples_per_second": 177.672, "eval_steps_per_second": 46.195, "step": 1 }, { "epoch": 0.026069246435845215, "grad_norm": 0.6501481533050537, "learning_rate": 0.00015, "loss": 3.2018, "step": 2 }, { "epoch": 0.039103869653767824, "grad_norm": 0.5839198231697083, "learning_rate": 0.000225, "loss": 3.3157, "step": 3 }, { "epoch": 0.05213849287169043, "grad_norm": 0.5366824865341187, "learning_rate": 0.0003, "loss": 3.3337, "step": 4 }, { "epoch": 0.06517311608961303, "grad_norm": 0.4950920641422272, "learning_rate": 0.0002999703922691041, "loss": 3.3257, "step": 5 }, { "epoch": 0.07820773930753565, "grad_norm": 0.4459711015224457, "learning_rate": 0.00029988158206334587, "loss": 3.2473, "step": 6 }, { "epoch": 0.09124236252545825, "grad_norm": 0.5029334425926208, "learning_rate": 0.00029973360833781664, "loss": 3.251, "step": 7 }, { "epoch": 0.10427698574338086, "grad_norm": 0.5529475808143616, "learning_rate": 0.0002995265359986831, "loss": 3.212, "step": 8 }, { "epoch": 0.11731160896130347, "grad_norm": 0.6158551573753357, "learning_rate": 0.00029926045587471686, "loss": 3.1893, "step": 9 }, { "epoch": 0.13034623217922606, "grad_norm": 0.6573907136917114, "learning_rate": 0.0002989354846774545, "loss": 3.1782, "step": 10 }, { "epoch": 0.14338085539714868, "grad_norm": 0.6919009685516357, "learning_rate": 0.0002985517649500034, "loss": 3.0629, "step": 11 }, { "epoch": 0.1564154786150713, "grad_norm": 0.7953146696090698, "learning_rate": 0.00029810946500451814, "loss": 3.132, "step": 12 }, { "epoch": 0.1694501018329939, "grad_norm": 0.9705007672309875, "learning_rate": 0.00029760877884837294, "loss": 2.7258, "step": 13 }, { "epoch": 0.1824847250509165, "grad_norm": 0.9804863333702087, "learning_rate": 0.0002970499260990637, "loss": 2.7512, "step": 14 }, { "epoch": 0.1955193482688391, "grad_norm": 0.6821589469909668, "learning_rate": 0.0002964331518878766, "loss": 2.6437, "step": 15 }, { "epoch": 0.20855397148676172, "grad_norm": 0.4936249554157257, "learning_rate": 0.0002957587267523652, "loss": 2.6137, "step": 16 }, { "epoch": 0.2215885947046843, "grad_norm": 0.5652141571044922, "learning_rate": 0.00029502694651768383, "loss": 2.6432, "step": 17 }, { "epoch": 0.23462321792260693, "grad_norm": 0.6153628826141357, "learning_rate": 0.000294238132166829, "loss": 2.6485, "step": 18 }, { "epoch": 0.24765784114052952, "grad_norm": 0.4337889552116394, "learning_rate": 0.0002933926296998457, "loss": 2.5919, "step": 19 }, { "epoch": 0.2606924643584521, "grad_norm": 0.5410802364349365, "learning_rate": 0.0002924908099820599, "loss": 2.6103, "step": 20 }, { "epoch": 0.27372708757637476, "grad_norm": 0.5030240416526794, "learning_rate": 0.00029153306858140533, "loss": 2.6104, "step": 21 }, { "epoch": 0.28676171079429735, "grad_norm": 0.38985803723335266, "learning_rate": 0.00029051982559491393, "loss": 2.5794, "step": 22 }, { "epoch": 0.29979633401221994, "grad_norm": 0.5979215502738953, "learning_rate": 0.00028945152546444754, "loss": 2.6287, "step": 23 }, { "epoch": 0.3128309572301426, "grad_norm": 0.5737847089767456, "learning_rate": 0.0002883286367817511, "loss": 2.6541, "step": 24 }, { "epoch": 0.3258655804480652, "grad_norm": 0.572758138179779, "learning_rate": 0.00028715165208291265, "loss": 2.6881, "step": 25 }, { "epoch": 0.3258655804480652, "eval_loss": 2.5153815746307373, "eval_runtime": 0.2797, "eval_samples_per_second": 178.771, "eval_steps_per_second": 46.481, "step": 25 }, { "epoch": 0.3389002036659878, "grad_norm": 1.4206351041793823, "learning_rate": 0.0002859210876323207, "loss": 2.4658, "step": 26 }, { "epoch": 0.35193482688391037, "grad_norm": 1.5302618741989136, "learning_rate": 0.00028463748319621396, "loss": 2.5304, "step": 27 }, { "epoch": 0.364969450101833, "grad_norm": 0.9723348021507263, "learning_rate": 0.00028330140180592156, "loss": 2.4213, "step": 28 }, { "epoch": 0.3780040733197556, "grad_norm": 0.4521373212337494, "learning_rate": 0.0002819134295108992, "loss": 2.4048, "step": 29 }, { "epoch": 0.3910386965376782, "grad_norm": 0.6521058678627014, "learning_rate": 0.00028047417512166837, "loss": 2.4019, "step": 30 }, { "epoch": 0.4040733197556008, "grad_norm": 0.7668533325195312, "learning_rate": 0.00027898426994277204, "loss": 2.4534, "step": 31 }, { "epoch": 0.41710794297352344, "grad_norm": 0.791877269744873, "learning_rate": 0.0002774443674958634, "loss": 2.4232, "step": 32 }, { "epoch": 0.43014256619144603, "grad_norm": 0.814365565776825, "learning_rate": 0.00027585514323305, "loss": 2.4323, "step": 33 }, { "epoch": 0.4431771894093686, "grad_norm": 0.6510496735572815, "learning_rate": 0.00027421729424061787, "loss": 2.3904, "step": 34 }, { "epoch": 0.45621181262729127, "grad_norm": 0.42264309525489807, "learning_rate": 0.00027253153893326646, "loss": 2.4016, "step": 35 }, { "epoch": 0.46924643584521386, "grad_norm": 0.5813013911247253, "learning_rate": 0.0002707986167389884, "loss": 2.4511, "step": 36 }, { "epoch": 0.48228105906313645, "grad_norm": 0.8300145864486694, "learning_rate": 0.0002690192877747315, "loss": 2.4128, "step": 37 }, { "epoch": 0.49531568228105904, "grad_norm": 0.7689920663833618, "learning_rate": 0.0002671943325129871, "loss": 2.3281, "step": 38 }, { "epoch": 0.5083503054989816, "grad_norm": 1.024122953414917, "learning_rate": 0.0002653245514394482, "loss": 2.3381, "step": 39 }, { "epoch": 0.5213849287169042, "grad_norm": 0.6759045720100403, "learning_rate": 0.0002634107647018905, "loss": 2.3008, "step": 40 }, { "epoch": 0.5344195519348269, "grad_norm": 0.3922083377838135, "learning_rate": 0.0002614538117504284, "loss": 2.2719, "step": 41 }, { "epoch": 0.5474541751527495, "grad_norm": 0.5142776966094971, "learning_rate": 0.0002594545509693043, "loss": 2.276, "step": 42 }, { "epoch": 0.5604887983706721, "grad_norm": 0.6373997926712036, "learning_rate": 0.00025741385930037295, "loss": 2.2602, "step": 43 }, { "epoch": 0.5735234215885947, "grad_norm": 0.6760398149490356, "learning_rate": 0.00025533263185844587, "loss": 2.3169, "step": 44 }, { "epoch": 0.5865580448065173, "grad_norm": 0.7617467045783997, "learning_rate": 0.00025321178153866423, "loss": 2.2778, "step": 45 }, { "epoch": 0.5995926680244399, "grad_norm": 0.7649106979370117, "learning_rate": 0.00025105223861607306, "loss": 2.2692, "step": 46 }, { "epoch": 0.6126272912423625, "grad_norm": 0.6090757250785828, "learning_rate": 0.0002488549503375719, "loss": 2.2845, "step": 47 }, { "epoch": 0.6256619144602852, "grad_norm": 0.45042526721954346, "learning_rate": 0.0002466208805064206, "loss": 2.2431, "step": 48 }, { "epoch": 0.6386965376782078, "grad_norm": 0.46896523237228394, "learning_rate": 0.00024435100905948387, "loss": 2.2985, "step": 49 }, { "epoch": 0.6517311608961304, "grad_norm": 0.5649595260620117, "learning_rate": 0.00024204633163739828, "loss": 2.4211, "step": 50 }, { "epoch": 0.6517311608961304, "eval_loss": 2.2425832748413086, "eval_runtime": 0.2798, "eval_samples_per_second": 178.69, "eval_steps_per_second": 46.46, "step": 50 }, { "epoch": 0.664765784114053, "grad_norm": 1.1472358703613281, "learning_rate": 0.00023970785914785144, "loss": 2.2791, "step": 51 }, { "epoch": 0.6778004073319756, "grad_norm": 1.3499984741210938, "learning_rate": 0.00023733661732216452, "loss": 2.3292, "step": 52 }, { "epoch": 0.6908350305498981, "grad_norm": 1.2178796529769897, "learning_rate": 0.00023493364626537257, "loss": 2.2938, "step": 53 }, { "epoch": 0.7038696537678207, "grad_norm": 0.9463868737220764, "learning_rate": 0.00023249999999999999, "loss": 2.2173, "step": 54 }, { "epoch": 0.7169042769857433, "grad_norm": 0.5750055909156799, "learning_rate": 0.00023003674600373153, "loss": 2.2036, "step": 55 }, { "epoch": 0.729938900203666, "grad_norm": 0.3371862769126892, "learning_rate": 0.00022754496474118133, "loss": 2.167, "step": 56 }, { "epoch": 0.7429735234215886, "grad_norm": 0.6258431673049927, "learning_rate": 0.00022502574918996517, "loss": 2.2211, "step": 57 }, { "epoch": 0.7560081466395112, "grad_norm": 0.7280526161193848, "learning_rate": 0.00022248020436128478, "loss": 2.2122, "step": 58 }, { "epoch": 0.7690427698574338, "grad_norm": 0.8275133371353149, "learning_rate": 0.00021990944681523302, "loss": 2.2137, "step": 59 }, { "epoch": 0.7820773930753564, "grad_norm": 0.8840050101280212, "learning_rate": 0.0002173146041710339, "loss": 2.2311, "step": 60 }, { "epoch": 0.795112016293279, "grad_norm": 0.957916796207428, "learning_rate": 0.00021469681461243153, "loss": 2.2478, "step": 61 }, { "epoch": 0.8081466395112016, "grad_norm": 1.0393691062927246, "learning_rate": 0.00021205722638844505, "loss": 2.2579, "step": 62 }, { "epoch": 0.8211812627291243, "grad_norm": 0.3389835059642792, "learning_rate": 0.00020939699730970873, "loss": 2.1799, "step": 63 }, { "epoch": 0.8342158859470469, "grad_norm": 0.5756048560142517, "learning_rate": 0.00020671729424061788, "loss": 2.1684, "step": 64 }, { "epoch": 0.8472505091649695, "grad_norm": 0.7830222249031067, "learning_rate": 0.00020401929258750365, "loss": 2.1438, "step": 65 }, { "epoch": 0.8602851323828921, "grad_norm": 0.8117411732673645, "learning_rate": 0.00020130417578306082, "loss": 2.1536, "step": 66 }, { "epoch": 0.8733197556008147, "grad_norm": 0.7848119735717773, "learning_rate": 0.0001985731347672554, "loss": 2.129, "step": 67 }, { "epoch": 0.8863543788187372, "grad_norm": 0.6703603863716125, "learning_rate": 0.00019582736746493853, "loss": 2.152, "step": 68 }, { "epoch": 0.8993890020366598, "grad_norm": 0.4845719635486603, "learning_rate": 0.00019306807826039747, "loss": 2.1237, "step": 69 }, { "epoch": 0.9124236252545825, "grad_norm": 0.3355918228626251, "learning_rate": 0.00019029647746907283, "loss": 2.0711, "step": 70 }, { "epoch": 0.9254582484725051, "grad_norm": 0.33707523345947266, "learning_rate": 0.00018751378080667378, "loss": 2.1033, "step": 71 }, { "epoch": 0.9384928716904277, "grad_norm": 0.5474686622619629, "learning_rate": 0.00018472120885592555, "loss": 2.0707, "step": 72 }, { "epoch": 0.9515274949083503, "grad_norm": 0.71000075340271, "learning_rate": 0.00018191998653118108, "loss": 2.1169, "step": 73 }, { "epoch": 0.9645621181262729, "grad_norm": 0.8560431003570557, "learning_rate": 0.0001791113425411332, "loss": 2.176, "step": 74 }, { "epoch": 0.9775967413441955, "grad_norm": 1.2276349067687988, "learning_rate": 0.0001762965088498626, "loss": 2.2966, "step": 75 }, { "epoch": 0.9775967413441955, "eval_loss": 2.044175148010254, "eval_runtime": 0.2806, "eval_samples_per_second": 178.197, "eval_steps_per_second": 46.331, "step": 75 }, { "epoch": 0.9906313645621181, "grad_norm": 0.28324252367019653, "learning_rate": 0.0001734767201364573, "loss": 2.0594, "step": 76 }, { "epoch": 1.0036659877800407, "grad_norm": 0.4252139627933502, "learning_rate": 0.00017065321325344194, "loss": 2.9023, "step": 77 }, { "epoch": 1.0167006109979633, "grad_norm": 0.532595157623291, "learning_rate": 0.00016782722668425316, "loss": 1.9476, "step": 78 }, { "epoch": 1.0297352342158859, "grad_norm": 0.696269690990448, "learning_rate": 0.000165, "loss": 2.089, "step": 79 }, { "epoch": 1.0427698574338085, "grad_norm": 0.6571292281150818, "learning_rate": 0.00016217277331574678, "loss": 2.0514, "step": 80 }, { "epoch": 1.055804480651731, "grad_norm": 0.5735260248184204, "learning_rate": 0.00015934678674655805, "loss": 2.0645, "step": 81 }, { "epoch": 1.0688391038696539, "grad_norm": 0.418550968170166, "learning_rate": 0.0001565232798635427, "loss": 2.0639, "step": 82 }, { "epoch": 1.0818737270875765, "grad_norm": 0.32165512442588806, "learning_rate": 0.00015370349115013742, "loss": 2.0412, "step": 83 }, { "epoch": 1.094908350305499, "grad_norm": 0.3411344289779663, "learning_rate": 0.0001508886574588668, "loss": 2.0738, "step": 84 }, { "epoch": 1.1079429735234216, "grad_norm": 0.4526989459991455, "learning_rate": 0.0001480800134688189, "loss": 2.0482, "step": 85 }, { "epoch": 1.1209775967413442, "grad_norm": 0.5264050960540771, "learning_rate": 0.00014527879114407445, "loss": 2.0155, "step": 86 }, { "epoch": 1.1340122199592668, "grad_norm": 0.6333541870117188, "learning_rate": 0.0001424862191933262, "loss": 2.029, "step": 87 }, { "epoch": 1.1470468431771894, "grad_norm": 0.6475998759269714, "learning_rate": 0.00013970352253092714, "loss": 2.0732, "step": 88 }, { "epoch": 1.160081466395112, "grad_norm": 0.5682183504104614, "learning_rate": 0.00013693192173960253, "loss": 1.6717, "step": 89 }, { "epoch": 1.1731160896130346, "grad_norm": 0.48593777418136597, "learning_rate": 0.00013417263253506147, "loss": 2.5498, "step": 90 }, { "epoch": 1.1861507128309572, "grad_norm": 0.517917811870575, "learning_rate": 0.00013142686523274463, "loss": 2.0097, "step": 91 }, { "epoch": 1.1991853360488798, "grad_norm": 0.5828862190246582, "learning_rate": 0.00012869582421693912, "loss": 1.9987, "step": 92 }, { "epoch": 1.2122199592668024, "grad_norm": 0.5273678302764893, "learning_rate": 0.00012598070741249632, "loss": 2.0205, "step": 93 }, { "epoch": 1.225254582484725, "grad_norm": 0.49020346999168396, "learning_rate": 0.00012328270575938212, "loss": 1.9981, "step": 94 }, { "epoch": 1.2382892057026478, "grad_norm": 0.35303086042404175, "learning_rate": 0.00012060300269029128, "loss": 1.9699, "step": 95 }, { "epoch": 1.2513238289205701, "grad_norm": 0.27531367540359497, "learning_rate": 0.00011794277361155495, "loss": 2.0035, "step": 96 }, { "epoch": 1.264358452138493, "grad_norm": 0.31903597712516785, "learning_rate": 0.00011530318538756846, "loss": 1.9783, "step": 97 }, { "epoch": 1.2773930753564156, "grad_norm": 0.43081673979759216, "learning_rate": 0.0001126853958289661, "loss": 2.0053, "step": 98 }, { "epoch": 1.2904276985743381, "grad_norm": 0.5114902257919312, "learning_rate": 0.00011009055318476698, "loss": 2.0032, "step": 99 }, { "epoch": 1.3034623217922607, "grad_norm": 0.6454872488975525, "learning_rate": 0.00010751979563871518, "loss": 2.0436, "step": 100 }, { "epoch": 1.3034623217922607, "eval_loss": 1.9786142110824585, "eval_runtime": 0.2787, "eval_samples_per_second": 179.386, "eval_steps_per_second": 46.64, "step": 100 } ], "logging_steps": 1, "max_steps": 154, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.44280275468288e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }