{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9328575759695888,
  "eval_steps": 500,
  "global_step": 20000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.3762490749359131,
      "learning_rate": 0.00046641791044776124,
      "loss": 8.4353,
      "step": 500
    },
    {
      "epoch": 0.02,
      "eval_loss": 6.534626483917236,
      "eval_runtime": 213.0459,
      "eval_samples_per_second": 264.877,
      "eval_steps_per_second": 8.28,
      "step": 500
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.518302321434021,
      "learning_rate": 0.0009328358208955225,
      "loss": 5.5553,
      "step": 1000
    },
    {
      "epoch": 0.05,
      "eval_loss": 4.976442813873291,
      "eval_runtime": 215.6331,
      "eval_samples_per_second": 261.699,
      "eval_steps_per_second": 8.181,
      "step": 1000
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.8457584381103516,
      "learning_rate": 0.0009789877154220063,
      "loss": 4.5996,
      "step": 1500
    },
    {
      "epoch": 0.07,
      "eval_loss": 4.408080101013184,
      "eval_runtime": 215.4944,
      "eval_samples_per_second": 261.868,
      "eval_steps_per_second": 8.186,
      "step": 1500
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.3413056135177612,
      "learning_rate": 0.0009544406539991162,
      "loss": 4.1961,
      "step": 2000
    },
    {
      "epoch": 0.09,
      "eval_loss": 4.121068954467773,
      "eval_runtime": 216.0979,
      "eval_samples_per_second": 261.136,
      "eval_steps_per_second": 8.163,
      "step": 2000
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.2387056350708008,
      "learning_rate": 0.0009298935925762261,
      "loss": 3.9584,
      "step": 2500
    },
    {
      "epoch": 0.12,
      "eval_loss": 3.929608106613159,
      "eval_runtime": 216.438,
      "eval_samples_per_second": 260.726,
      "eval_steps_per_second": 8.15,
      "step": 2500
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.3727014064788818,
      "learning_rate": 0.0009053465311533362,
      "loss": 3.7833,
      "step": 3000
    },
    {
      "epoch": 0.14,
      "eval_loss": 3.787693738937378,
      "eval_runtime": 218.2815,
      "eval_samples_per_second": 258.524,
      "eval_steps_per_second": 8.081,
      "step": 3000
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.5133860111236572,
      "learning_rate": 0.0008807994697304462,
      "loss": 3.6616,
      "step": 3500
    },
    {
      "epoch": 0.16,
      "eval_loss": 3.683837652206421,
      "eval_runtime": 217.4429,
      "eval_samples_per_second": 259.521,
      "eval_steps_per_second": 8.112,
      "step": 3500
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.3134872913360596,
      "learning_rate": 0.0008562524083075562,
      "loss": 3.5679,
      "step": 4000
    },
    {
      "epoch": 0.19,
      "eval_loss": 3.6060750484466553,
      "eval_runtime": 216.1087,
      "eval_samples_per_second": 261.123,
      "eval_steps_per_second": 8.163,
      "step": 4000
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.3464657068252563,
      "learning_rate": 0.0008317053468846664,
      "loss": 3.4964,
      "step": 4500
    },
    {
      "epoch": 0.21,
      "eval_loss": 3.54757022857666,
      "eval_runtime": 216.7125,
      "eval_samples_per_second": 260.396,
      "eval_steps_per_second": 8.14,
      "step": 4500
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.3200907707214355,
      "learning_rate": 0.0008071582854617764,
      "loss": 3.4488,
      "step": 5000
    },
    {
      "epoch": 0.23,
      "eval_loss": 3.4986045360565186,
      "eval_runtime": 217.9581,
      "eval_samples_per_second": 258.908,
      "eval_steps_per_second": 8.093,
      "step": 5000
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.514103651046753,
      "learning_rate": 0.0007826112240388863,
      "loss": 3.3973,
      "step": 5500
    },
    {
      "epoch": 0.26,
      "eval_loss": 3.4664642810821533,
      "eval_runtime": 218.2073,
      "eval_samples_per_second": 258.612,
      "eval_steps_per_second": 8.084,
      "step": 5500
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.4389874935150146,
      "learning_rate": 0.0007580641626159963,
      "loss": 3.3587,
      "step": 6000
    },
    {
      "epoch": 0.28,
      "eval_loss": 3.4231605529785156,
      "eval_runtime": 218.4087,
      "eval_samples_per_second": 258.373,
      "eval_steps_per_second": 8.077,
      "step": 6000
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.4937726259231567,
      "learning_rate": 0.0007335171011931065,
      "loss": 3.3231,
      "step": 6500
    },
    {
      "epoch": 0.3,
      "eval_loss": 3.390798330307007,
      "eval_runtime": 216.9327,
      "eval_samples_per_second": 260.131,
      "eval_steps_per_second": 8.132,
      "step": 6500
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.5942374467849731,
      "learning_rate": 0.0007089700397702165,
      "loss": 3.2971,
      "step": 7000
    },
    {
      "epoch": 0.33,
      "eval_loss": 3.361341714859009,
      "eval_runtime": 217.461,
      "eval_samples_per_second": 259.499,
      "eval_steps_per_second": 8.112,
      "step": 7000
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.6310980319976807,
      "learning_rate": 0.0006844229783473265,
      "loss": 3.2679,
      "step": 7500
    },
    {
      "epoch": 0.35,
      "eval_loss": 3.33766770362854,
      "eval_runtime": 217.3952,
      "eval_samples_per_second": 259.578,
      "eval_steps_per_second": 8.114,
      "step": 7500
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.6001648902893066,
      "learning_rate": 0.0006598759169244364,
      "loss": 3.2436,
      "step": 8000
    },
    {
      "epoch": 0.37,
      "eval_loss": 3.3167307376861572,
      "eval_runtime": 217.5617,
      "eval_samples_per_second": 259.379,
      "eval_steps_per_second": 8.108,
      "step": 8000
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.590154767036438,
      "learning_rate": 0.0006353288555015467,
      "loss": 3.2322,
      "step": 8500
    },
    {
      "epoch": 0.4,
      "eval_loss": 3.2952842712402344,
      "eval_runtime": 217.0219,
      "eval_samples_per_second": 260.024,
      "eval_steps_per_second": 8.128,
      "step": 8500
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.7063907384872437,
      "learning_rate": 0.0006107817940786566,
      "loss": 3.208,
      "step": 9000
    },
    {
      "epoch": 0.42,
      "eval_loss": 3.2771995067596436,
      "eval_runtime": 218.9112,
      "eval_samples_per_second": 257.78,
      "eval_steps_per_second": 8.058,
      "step": 9000
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.544815182685852,
      "learning_rate": 0.0005862347326557666,
      "loss": 3.1923,
      "step": 9500
    },
    {
      "epoch": 0.44,
      "eval_loss": 3.263777017593384,
      "eval_runtime": 218.0114,
      "eval_samples_per_second": 258.844,
      "eval_steps_per_second": 8.091,
      "step": 9500
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.615902304649353,
      "learning_rate": 0.0005616876712328766,
      "loss": 3.1683,
      "step": 10000
    },
    {
      "epoch": 0.47,
      "eval_loss": 3.2488255500793457,
      "eval_runtime": 217.5444,
      "eval_samples_per_second": 259.4,
      "eval_steps_per_second": 8.109,
      "step": 10000
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.533378005027771,
      "learning_rate": 0.0005371897039328324,
      "loss": 3.158,
      "step": 10500
    },
    {
      "epoch": 0.49,
      "eval_loss": 3.2329940795898438,
      "eval_runtime": 214.5885,
      "eval_samples_per_second": 262.973,
      "eval_steps_per_second": 8.22,
      "step": 10500
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.9465535879135132,
      "learning_rate": 0.0005126426425099425,
      "loss": 3.1439,
      "step": 11000
    },
    {
      "epoch": 0.51,
      "eval_loss": 3.2245969772338867,
      "eval_runtime": 216.8559,
      "eval_samples_per_second": 260.224,
      "eval_steps_per_second": 8.134,
      "step": 11000
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.7405271530151367,
      "learning_rate": 0.0004880955810870526,
      "loss": 3.1374,
      "step": 11500
    },
    {
      "epoch": 0.54,
      "eval_loss": 3.208115339279175,
      "eval_runtime": 217.1863,
      "eval_samples_per_second": 259.828,
      "eval_steps_per_second": 8.122,
      "step": 11500
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.6891497373580933,
      "learning_rate": 0.0004635485196641626,
      "loss": 3.1213,
      "step": 12000
    },
    {
      "epoch": 0.56,
      "eval_loss": 3.200620651245117,
      "eval_runtime": 218.4169,
      "eval_samples_per_second": 258.364,
      "eval_steps_per_second": 8.076,
      "step": 12000
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.73305082321167,
      "learning_rate": 0.00043909964648696415,
      "loss": 3.1144,
      "step": 12500
    },
    {
      "epoch": 0.58,
      "eval_loss": 3.188239812850952,
      "eval_runtime": 215.556,
      "eval_samples_per_second": 261.793,
      "eval_steps_per_second": 8.183,
      "step": 12500
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.7891405820846558,
      "learning_rate": 0.0004145525850640742,
      "loss": 3.0963,
      "step": 13000
    },
    {
      "epoch": 0.61,
      "eval_loss": 3.174497365951538,
      "eval_runtime": 214.2396,
      "eval_samples_per_second": 263.401,
      "eval_steps_per_second": 8.234,
      "step": 13000
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.6677337884902954,
      "learning_rate": 0.00039000552364118425,
      "loss": 3.097,
      "step": 13500
    },
    {
      "epoch": 0.63,
      "eval_loss": 3.165832042694092,
      "eval_runtime": 214.1769,
      "eval_samples_per_second": 263.479,
      "eval_steps_per_second": 8.236,
      "step": 13500
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.7241294384002686,
      "learning_rate": 0.00036545846221829433,
      "loss": 3.076,
      "step": 14000
    },
    {
      "epoch": 0.65,
      "eval_loss": 3.1584606170654297,
      "eval_runtime": 264.545,
      "eval_samples_per_second": 213.313,
      "eval_steps_per_second": 6.668,
      "step": 14000
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.835271954536438,
      "learning_rate": 0.00034091140079540435,
      "loss": 3.0666,
      "step": 14500
    },
    {
      "epoch": 0.68,
      "eval_loss": 3.151036262512207,
      "eval_runtime": 218.4608,
      "eval_samples_per_second": 258.312,
      "eval_steps_per_second": 8.075,
      "step": 14500
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.7558091878890991,
      "learning_rate": 0.00031636433937251427,
      "loss": 3.0709,
      "step": 15000
    },
    {
      "epoch": 0.7,
      "eval_loss": 3.139883279800415,
      "eval_runtime": 11111.1382,
      "eval_samples_per_second": 5.079,
      "eval_steps_per_second": 0.159,
      "step": 15000
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.6270012855529785,
      "learning_rate": 0.00029186637207247017,
      "loss": 3.0609,
      "step": 15500
    },
    {
      "epoch": 0.72,
      "eval_loss": 3.1347908973693848,
      "eval_runtime": 258.6315,
      "eval_samples_per_second": 218.191,
      "eval_steps_per_second": 6.821,
      "step": 15500
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.971229910850525,
      "learning_rate": 0.0002673193106495802,
      "loss": 3.0527,
      "step": 16000
    },
    {
      "epoch": 0.75,
      "eval_loss": 3.127486228942871,
      "eval_runtime": 243.3689,
      "eval_samples_per_second": 231.874,
      "eval_steps_per_second": 7.248,
      "step": 16000
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.7010846138000488,
      "learning_rate": 0.00024277224922669027,
      "loss": 3.0449,
      "step": 16500
    },
    {
      "epoch": 0.77,
      "eval_loss": 3.122706890106201,
      "eval_runtime": 259.8928,
      "eval_samples_per_second": 217.132,
      "eval_steps_per_second": 6.787,
      "step": 16500
    },
    {
      "epoch": 0.79,
      "grad_norm": 2.0105388164520264,
      "learning_rate": 0.00021822518780380032,
      "loss": 3.0407,
      "step": 17000
    },
    {
      "epoch": 0.79,
      "eval_loss": 3.1165504455566406,
      "eval_runtime": 264.3004,
      "eval_samples_per_second": 213.511,
      "eval_steps_per_second": 6.674,
      "step": 17000
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.7834789752960205,
      "learning_rate": 0.00019372722050375605,
      "loss": 3.0376,
      "step": 17500
    },
    {
      "epoch": 0.82,
      "eval_loss": 3.10659122467041,
      "eval_runtime": 265.7435,
      "eval_samples_per_second": 212.351,
      "eval_steps_per_second": 6.638,
      "step": 17500
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.8829165697097778,
      "learning_rate": 0.0001691801590808661,
      "loss": 3.0251,
      "step": 18000
    },
    {
      "epoch": 0.84,
      "eval_loss": 3.1006741523742676,
      "eval_runtime": 267.8646,
      "eval_samples_per_second": 210.67,
      "eval_steps_per_second": 6.585,
      "step": 18000
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.7196826934814453,
      "learning_rate": 0.00014463309765797616,
      "loss": 3.0226,
      "step": 18500
    },
    {
      "epoch": 0.86,
      "eval_loss": 3.0954883098602295,
      "eval_runtime": 263.6733,
      "eval_samples_per_second": 214.019,
      "eval_steps_per_second": 6.69,
      "step": 18500
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.7544931173324585,
      "learning_rate": 0.00012008603623508621,
      "loss": 3.0203,
      "step": 19000
    },
    {
      "epoch": 0.89,
      "eval_loss": 3.0885541439056396,
      "eval_runtime": 225.0692,
      "eval_samples_per_second": 250.727,
      "eval_steps_per_second": 7.838,
      "step": 19000
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.659536361694336,
      "learning_rate": 9.558806893504194e-05,
      "loss": 3.0151,
      "step": 19500
    },
    {
      "epoch": 0.91,
      "eval_loss": 3.083906888961792,
      "eval_runtime": 257.4684,
      "eval_samples_per_second": 219.176,
      "eval_steps_per_second": 6.851,
      "step": 19500
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.6146643161773682,
      "learning_rate": 7.1041007512152e-05,
      "loss": 3.0018,
      "step": 20000
    },
    {
      "epoch": 0.93,
      "eval_loss": 3.0793216228485107,
      "eval_runtime": 231.3706,
      "eval_samples_per_second": 243.899,
      "eval_steps_per_second": 7.624,
      "step": 20000
    }
  ],
  "logging_steps": 500,
  "max_steps": 21439,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10000,
  "total_flos": 196340613120000.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}