|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 610,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16393442622950818,
      "grad_norm": 0.4943847358226776,
      "learning_rate": 0.00019986740898848306,
      "loss": 1.1081,
      "step": 10
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 0.44973108172416687,
      "learning_rate": 0.0001994699875614589,
      "loss": 0.9226,
      "step": 20
    },
    {
      "epoch": 0.4918032786885246,
      "grad_norm": 0.3362947404384613,
      "learning_rate": 0.00019880878960910772,
      "loss": 0.887,
      "step": 30
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 0.36153319478034973,
      "learning_rate": 0.0001978855685095358,
      "loss": 0.8615,
      "step": 40
    },
    {
      "epoch": 0.819672131147541,
      "grad_norm": 0.36022189259529114,
      "learning_rate": 0.00019670277247913205,
      "loss": 0.8234,
      "step": 50
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 0.4029320478439331,
      "learning_rate": 0.00019526353808033825,
      "loss": 0.8149,
      "step": 60
    },
    {
      "epoch": 1.1475409836065573,
      "grad_norm": 0.3638781011104584,
      "learning_rate": 0.00019357168190404936,
      "loss": 0.799,
      "step": 70
    },
    {
      "epoch": 1.3114754098360657,
      "grad_norm": 0.3549768924713135,
      "learning_rate": 0.0001916316904487005,
      "loss": 0.785,
      "step": 80
    },
    {
      "epoch": 1.4754098360655736,
      "grad_norm": 0.3937470018863678,
      "learning_rate": 0.00018944870822287956,
      "loss": 0.7776,
      "step": 90
    },
    {
      "epoch": 1.639344262295082,
      "grad_norm": 0.36256495118141174,
      "learning_rate": 0.00018702852410301554,
      "loss": 0.7756,
      "step": 100
    },
    {
      "epoch": 1.8032786885245902,
      "grad_norm": 0.3645057678222656,
      "learning_rate": 0.00018437755598231856,
      "loss": 0.767,
      "step": 110
    },
    {
      "epoch": 1.9672131147540983,
      "grad_norm": 0.3446103632450104,
      "learning_rate": 0.00018150283375168114,
      "loss": 0.7543,
      "step": 120
    },
    {
      "epoch": 2.1311475409836067,
      "grad_norm": 0.4181569516658783,
      "learning_rate": 0.00017841198065767107,
      "loss": 0.7231,
      "step": 130
    },
    {
      "epoch": 2.2950819672131146,
      "grad_norm": 0.40038439631462097,
      "learning_rate": 0.00017511319308705198,
      "loss": 0.7348,
      "step": 140
    },
    {
      "epoch": 2.459016393442623,
      "grad_norm": 0.3984282612800598,
      "learning_rate": 0.00017161521883143934,
      "loss": 0.7255,
      "step": 150
    },
    {
      "epoch": 2.6229508196721314,
      "grad_norm": 0.4186936318874359,
      "learning_rate": 0.00016792733388972932,
      "loss": 0.7263,
      "step": 160
    },
    {
      "epoch": 2.7868852459016393,
      "grad_norm": 0.4004381597042084,
      "learning_rate": 0.00016405931786981755,
      "loss": 0.7271,
      "step": 170
    },
    {
      "epoch": 2.9508196721311473,
      "grad_norm": 0.4597119390964508,
      "learning_rate": 0.00016002142805483685,
      "loss": 0.7125,
      "step": 180
    },
    {
      "epoch": 3.1147540983606556,
      "grad_norm": 0.4180513322353363,
      "learning_rate": 0.00015582437220268647,
      "loss": 0.6925,
      "step": 190
    },
    {
      "epoch": 3.278688524590164,
      "grad_norm": 0.4401358366012573,
      "learning_rate": 0.0001514792801509831,
      "loss": 0.6863,
      "step": 200
    },
    {
      "epoch": 3.442622950819672,
      "grad_norm": 0.4415852427482605,
      "learning_rate": 0.000146997674302732,
      "loss": 0.6857,
      "step": 210
    },
    {
      "epoch": 3.6065573770491803,
      "grad_norm": 0.4144597053527832,
      "learning_rate": 0.0001423914390709861,
      "loss": 0.6807,
      "step": 220
    },
    {
      "epoch": 3.7704918032786887,
      "grad_norm": 0.4162183403968811,
      "learning_rate": 0.00013767278936351854,
      "loss": 0.6846,
      "step": 230
    },
    {
      "epoch": 3.9344262295081966,
      "grad_norm": 0.42117443680763245,
      "learning_rate": 0.0001328542381910835,
      "loss": 0.6887,
      "step": 240
    },
    {
      "epoch": 4.098360655737705,
      "grad_norm": 0.4591551125049591,
      "learning_rate": 0.00012794856348516095,
      "loss": 0.6543,
      "step": 250
    },
    {
      "epoch": 4.262295081967213,
      "grad_norm": 0.4592267572879791,
      "learning_rate": 0.0001229687742131796,
      "loss": 0.6482,
      "step": 260
    },
    {
      "epoch": 4.426229508196721,
      "grad_norm": 0.47369667887687683,
      "learning_rate": 0.00011792807588107357,
      "loss": 0.6494,
      "step": 270
    },
    {
      "epoch": 4.590163934426229,
      "grad_norm": 0.45578595995903015,
      "learning_rate": 0.00011283983551465511,
      "loss": 0.6559,
      "step": 280
    },
    {
      "epoch": 4.754098360655737,
      "grad_norm": 0.4742227792739868,
      "learning_rate": 0.00010771754621266466,
      "loss": 0.6462,
      "step": 290
    },
    {
      "epoch": 4.918032786885246,
      "grad_norm": 0.48461949825286865,
      "learning_rate": 0.00010257479136549889,
      "loss": 0.6501,
      "step": 300
    },
    {
      "epoch": 5.081967213114754,
      "grad_norm": 0.4797608554363251,
      "learning_rate": 9.742520863450115e-05,
      "loss": 0.6452,
      "step": 310
    },
    {
      "epoch": 5.245901639344262,
      "grad_norm": 0.505832314491272,
      "learning_rate": 9.228245378733537e-05,
      "loss": 0.6178,
      "step": 320
    },
    {
      "epoch": 5.409836065573771,
      "grad_norm": 0.5265078544616699,
      "learning_rate": 8.71601644853449e-05,
      "loss": 0.6177,
      "step": 330
    },
    {
      "epoch": 5.573770491803279,
      "grad_norm": 0.5109356045722961,
      "learning_rate": 8.207192411892646e-05,
      "loss": 0.6225,
      "step": 340
    },
    {
      "epoch": 5.737704918032787,
      "grad_norm": 0.5139452815055847,
      "learning_rate": 7.703122578682046e-05,
      "loss": 0.6229,
      "step": 350
    },
    {
      "epoch": 5.901639344262295,
      "grad_norm": 0.5581080913543701,
      "learning_rate": 7.205143651483906e-05,
      "loss": 0.6189,
      "step": 360
    },
    {
      "epoch": 6.065573770491803,
      "grad_norm": 0.4794338345527649,
      "learning_rate": 6.714576180891654e-05,
      "loss": 0.6087,
      "step": 370
    },
    {
      "epoch": 6.229508196721311,
      "grad_norm": 0.5142917037010193,
      "learning_rate": 6.232721063648148e-05,
      "loss": 0.5977,
      "step": 380
    },
    {
      "epoch": 6.39344262295082,
      "grad_norm": 0.547099769115448,
      "learning_rate": 5.7608560929013946e-05,
      "loss": 0.6058,
      "step": 390
    },
    {
      "epoch": 6.557377049180328,
      "grad_norm": 0.5487104654312134,
      "learning_rate": 5.300232569726804e-05,
      "loss": 0.5939,
      "step": 400
    },
    {
      "epoch": 6.721311475409836,
      "grad_norm": 0.5189688205718994,
      "learning_rate": 4.852071984901696e-05,
      "loss": 0.6015,
      "step": 410
    },
    {
      "epoch": 6.885245901639344,
      "grad_norm": 0.5185168981552124,
      "learning_rate": 4.417562779731355e-05,
      "loss": 0.5949,
      "step": 420
    },
    {
      "epoch": 7.049180327868853,
      "grad_norm": 0.5486496090888977,
      "learning_rate": 3.997857194516319e-05,
      "loss": 0.5957,
      "step": 430
    },
    {
      "epoch": 7.213114754098361,
      "grad_norm": 0.5592466592788696,
      "learning_rate": 3.594068213018249e-05,
      "loss": 0.582,
      "step": 440
    },
    {
      "epoch": 7.377049180327869,
      "grad_norm": 0.5590682625770569,
      "learning_rate": 3.207266611027069e-05,
      "loss": 0.5779,
      "step": 450
    },
    {
      "epoch": 7.540983606557377,
      "grad_norm": 0.546293318271637,
      "learning_rate": 2.8384781168560693e-05,
      "loss": 0.5736,
      "step": 460
    },
    {
      "epoch": 7.704918032786885,
      "grad_norm": 0.5725670456886292,
      "learning_rate": 2.4886806912948035e-05,
      "loss": 0.5825,
      "step": 470
    },
    {
      "epoch": 7.868852459016393,
      "grad_norm": 0.574937105178833,
      "learning_rate": 2.1588019342328968e-05,
      "loss": 0.5843,
      "step": 480
    },
    {
      "epoch": 8.032786885245901,
      "grad_norm": 0.5568664073944092,
      "learning_rate": 1.8497166248318876e-05,
      "loss": 0.5791,
      "step": 490
    },
    {
      "epoch": 8.19672131147541,
      "grad_norm": 0.568524181842804,
      "learning_rate": 1.562244401768144e-05,
      "loss": 0.5673,
      "step": 500
    },
    {
      "epoch": 8.360655737704919,
      "grad_norm": 0.5631764531135559,
      "learning_rate": 1.2971475896984475e-05,
      "loss": 0.5557,
      "step": 510
    },
    {
      "epoch": 8.524590163934427,
      "grad_norm": 0.5955241322517395,
      "learning_rate": 1.0551291777120464e-05,
      "loss": 0.5741,
      "step": 520
    },
    {
      "epoch": 8.688524590163935,
      "grad_norm": 0.5839523077011108,
      "learning_rate": 8.368309551299536e-06,
      "loss": 0.5674,
      "step": 530
    },
    {
      "epoch": 8.852459016393443,
      "grad_norm": 0.5877330899238586,
      "learning_rate": 6.428318095950647e-06,
      "loss": 0.5819,
      "step": 540
    },
    {
      "epoch": 9.01639344262295,
      "grad_norm": 0.5768188238143921,
      "learning_rate": 4.7364619196617495e-06,
      "loss": 0.5728,
      "step": 550
    },
    {
      "epoch": 9.180327868852459,
      "grad_norm": 0.5796675682067871,
      "learning_rate": 3.2972275208679625e-06,
      "loss": 0.5682,
      "step": 560
    },
    {
      "epoch": 9.344262295081966,
      "grad_norm": 0.6149064898490906,
      "learning_rate": 2.1144314904642195e-06,
      "loss": 0.5699,
      "step": 570
    },
    {
      "epoch": 9.508196721311476,
      "grad_norm": 0.558674156665802,
      "learning_rate": 1.1912103908922945e-06,
      "loss": 0.5663,
      "step": 580
    },
    {
      "epoch": 9.672131147540984,
      "grad_norm": 0.563441812992096,
      "learning_rate": 5.300124385410943e-07,
      "loss": 0.5626,
      "step": 590
    },
    {
      "epoch": 9.836065573770492,
      "grad_norm": 0.5652551651000977,
      "learning_rate": 1.3259101151694708e-07,
      "loss": 0.5579,
      "step": 600
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.5645861625671387,
      "learning_rate": 0.0,
      "loss": 0.5631,
      "step": 610
    },
    {
      "epoch": 10.0,
      "step": 610,
      "total_flos": 4.672887793385472e+16,
      "train_loss": 0.6668467552935491,
      "train_runtime": 2732.8065,
      "train_samples_per_second": 2.679,
      "train_steps_per_second": 0.223
    }
  ],
  "logging_steps": 10,
  "max_steps": 610,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 4.672887793385472e+16,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}
|
|