|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999836966268321, |
|
"eval_steps": 500, |
|
"global_step": 46002, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0025, |
|
"loss": 6.1489, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.005, |
|
"loss": 4.849, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.004944446913470513, |
|
"loss": 4.3185, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.004888893826941025, |
|
"loss": 4.0835, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.004833340740411538, |
|
"loss": 3.9459, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0047777876538820496, |
|
"loss": 3.8551, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.004722234567352562, |
|
"loss": 3.7909, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.004666681480823075, |
|
"loss": 3.7385, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.004611128394293587, |
|
"loss": 3.6979, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.004555575307764099, |
|
"loss": 3.6587, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0045000222212346125, |
|
"loss": 3.6335, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.004444469134705124, |
|
"loss": 3.6055, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.004388916048175637, |
|
"loss": 3.582, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.004333362961646149, |
|
"loss": 3.5627, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.004277809875116662, |
|
"loss": 3.5455, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.004222256788587174, |
|
"loss": 3.5307, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.004166703702057686, |
|
"loss": 3.5155, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.004111150615528199, |
|
"loss": 3.5015, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.004055597528998711, |
|
"loss": 3.4861, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.004000044442469224, |
|
"loss": 3.4746, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.003944491355939736, |
|
"loss": 3.4666, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 0.0038889382694102487, |
|
"loss": 3.4558, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.003833385182880761, |
|
"loss": 3.4457, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.0037778320963512734, |
|
"loss": 3.4388, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.0037222790098217855, |
|
"loss": 3.4297, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.0036667259232922985, |
|
"loss": 3.4141, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.0036111728367628107, |
|
"loss": 3.4104, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.0035556197502333233, |
|
"loss": 3.4001, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.0035000666637038354, |
|
"loss": 3.3922, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.003444513577174348, |
|
"loss": 3.3842, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.00338896049064486, |
|
"loss": 3.3775, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.0033334074041153727, |
|
"loss": 3.3712, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.003277854317585885, |
|
"loss": 3.3677, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 0.0032223012310563974, |
|
"loss": 3.3633, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.0031667481445269096, |
|
"loss": 3.3547, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.0031111950579974226, |
|
"loss": 3.3504, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.0030556419714679347, |
|
"loss": 3.3417, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.0030000888849384473, |
|
"loss": 3.3381, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.00294453579840896, |
|
"loss": 3.3331, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.002888982711879472, |
|
"loss": 3.3252, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.0028334296253499846, |
|
"loss": 3.3194, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.0027778765388204968, |
|
"loss": 3.3189, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.0027223234522910093, |
|
"loss": 3.3097, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.0026667703657615215, |
|
"loss": 3.3074, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.0026112172792320345, |
|
"loss": 3.3042, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.0025556641927025467, |
|
"loss": 3.2946, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.0025001111061730592, |
|
"loss": 3.2915, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 0.0024445580196435714, |
|
"loss": 3.2863, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 0.002389004933114084, |
|
"loss": 3.2838, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.0023334518465845965, |
|
"loss": 3.2779, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.0022778987600551087, |
|
"loss": 3.2719, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 0.0022223456735256213, |
|
"loss": 3.2683, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.0021667925869961334, |
|
"loss": 3.2655, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.002111239500466646, |
|
"loss": 3.2607, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 0.0020556864139371586, |
|
"loss": 3.2536, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 0.0020001333274076707, |
|
"loss": 3.2529, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 0.0019445802408781833, |
|
"loss": 3.2472, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 0.0018890271543486957, |
|
"loss": 3.2371, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.001833474067819208, |
|
"loss": 3.2356, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.0017779209812897204, |
|
"loss": 3.235, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 0.0017223678947602327, |
|
"loss": 3.2278, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 0.0016668148082307453, |
|
"loss": 3.2242, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 0.0016112617217012577, |
|
"loss": 3.2187, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.0015557086351717703, |
|
"loss": 3.2166, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 0.0015001555486422826, |
|
"loss": 3.2124, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 0.0014446024621127952, |
|
"loss": 3.2056, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 0.0013890493755833076, |
|
"loss": 3.2004, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 0.00133349628905382, |
|
"loss": 3.1971, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.0012779432025243323, |
|
"loss": 3.1939, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 0.0012223901159948447, |
|
"loss": 3.1899, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 0.0011668370294653572, |
|
"loss": 3.1844, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 0.0011112839429358696, |
|
"loss": 3.1825, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 0.001055730856406382, |
|
"loss": 3.1777, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 0.0010001777698768943, |
|
"loss": 3.1737, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 0.0009446246833474068, |
|
"loss": 3.1702, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 0.0008890715968179192, |
|
"loss": 3.1652, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 0.0008335185102884317, |
|
"loss": 3.1591, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 0.0007779654237589441, |
|
"loss": 3.156, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 0.0007224123372294565, |
|
"loss": 3.1518, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 0.0006668592506999689, |
|
"loss": 3.1452, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 0.0006113061641704813, |
|
"loss": 3.1406, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 0.0005557530776409938, |
|
"loss": 3.1359, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 0.0005001999911115062, |
|
"loss": 3.135, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 0.0004446469045820186, |
|
"loss": 3.1247, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 0.000389093818052531, |
|
"loss": 3.1266, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 0.00033354073152304343, |
|
"loss": 3.1242, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 0.00027798764499355585, |
|
"loss": 3.12, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 0.00022243455846406826, |
|
"loss": 3.119, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.0001668814719345807, |
|
"loss": 3.1099, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 0.0001113283854050931, |
|
"loss": 3.1032, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 5.577529887560553e-05, |
|
"loss": 3.1045, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 2.222123461179503e-07, |
|
"loss": 3.1027, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 46002, |
|
"total_flos": 1.538554399872254e+18, |
|
"train_loss": 3.391347985035243, |
|
"train_runtime": 175663.9283, |
|
"train_samples_per_second": 16.76, |
|
"train_steps_per_second": 0.262 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 46002, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5000, |
|
"total_flos": 1.538554399872254e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|