|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 17.185821697099893, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6707318425178528, |
|
"learning_rate": 1.97816091954023e-05, |
|
"loss": 2.2888, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9574712643678162e-05, |
|
"loss": 2.0848, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.7721680402755737, |
|
"learning_rate": 1.9344827586206897e-05, |
|
"loss": 2.0044, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.1140433549880981, |
|
"learning_rate": 1.9126436781609195e-05, |
|
"loss": 1.8016, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.7205075621604919, |
|
"learning_rate": 1.8896551724137934e-05, |
|
"loss": 1.7217, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.8933233618736267, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 1.5705, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.7114273905754089, |
|
"learning_rate": 1.8436781609195404e-05, |
|
"loss": 1.4006, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.7229479551315308, |
|
"learning_rate": 1.820689655172414e-05, |
|
"loss": 1.3137, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.9370490908622742, |
|
"learning_rate": 1.7977011494252874e-05, |
|
"loss": 1.1898, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.6051978468894958, |
|
"learning_rate": 1.774712643678161e-05, |
|
"loss": 1.1229, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.6857028007507324, |
|
"learning_rate": 1.7517241379310347e-05, |
|
"loss": 1.051, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.6715748310089111, |
|
"learning_rate": 1.7287356321839082e-05, |
|
"loss": 0.9894, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.5918118953704834, |
|
"learning_rate": 1.7057471264367817e-05, |
|
"loss": 0.9687, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.6621690392494202, |
|
"learning_rate": 1.6827586206896552e-05, |
|
"loss": 0.9199, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 0.6697206497192383, |
|
"learning_rate": 1.659770114942529e-05, |
|
"loss": 0.9303, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.8184316158294678, |
|
"learning_rate": 1.6367816091954025e-05, |
|
"loss": 0.8898, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 0.6429987549781799, |
|
"learning_rate": 1.613793103448276e-05, |
|
"loss": 0.8623, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 0.7518043518066406, |
|
"learning_rate": 1.5908045977011495e-05, |
|
"loss": 0.8239, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 6.53, |
|
"grad_norm": 0.6667824983596802, |
|
"learning_rate": 1.567816091954023e-05, |
|
"loss": 0.8119, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"grad_norm": 0.8569457530975342, |
|
"learning_rate": 1.5448275862068965e-05, |
|
"loss": 0.8139, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.22, |
|
"grad_norm": 0.7754850387573242, |
|
"learning_rate": 1.5218390804597702e-05, |
|
"loss": 0.7835, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"grad_norm": 1.159196138381958, |
|
"learning_rate": 1.4988505747126439e-05, |
|
"loss": 0.7546, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 1.119764804840088, |
|
"learning_rate": 1.4758620689655174e-05, |
|
"loss": 0.7571, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 1.3600786924362183, |
|
"learning_rate": 1.452873563218391e-05, |
|
"loss": 0.7451, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 8.59, |
|
"grad_norm": 0.7608994245529175, |
|
"learning_rate": 1.4298850574712644e-05, |
|
"loss": 0.7109, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 8.94, |
|
"grad_norm": 1.0172290802001953, |
|
"learning_rate": 1.406896551724138e-05, |
|
"loss": 0.7228, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 1.042607069015503, |
|
"learning_rate": 1.3839080459770115e-05, |
|
"loss": 0.6939, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 0.8913071751594543, |
|
"learning_rate": 1.3609195402298852e-05, |
|
"loss": 0.6721, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 9.97, |
|
"grad_norm": 1.4283536672592163, |
|
"learning_rate": 1.3379310344827587e-05, |
|
"loss": 0.681, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 10.31, |
|
"grad_norm": 1.1445728540420532, |
|
"learning_rate": 1.3149425287356324e-05, |
|
"loss": 0.6484, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 10.66, |
|
"grad_norm": 1.425697684288025, |
|
"learning_rate": 1.2919540229885059e-05, |
|
"loss": 0.6558, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.8931305408477783, |
|
"learning_rate": 1.2689655172413795e-05, |
|
"loss": 0.6642, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 11.34, |
|
"grad_norm": 1.0374151468276978, |
|
"learning_rate": 1.2459770114942529e-05, |
|
"loss": 0.6202, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 11.69, |
|
"grad_norm": 1.628758430480957, |
|
"learning_rate": 1.2229885057471265e-05, |
|
"loss": 0.6163, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 12.03, |
|
"grad_norm": 1.3881452083587646, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.6364, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"grad_norm": 1.0961302518844604, |
|
"learning_rate": 1.1770114942528737e-05, |
|
"loss": 0.5963, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 12.72, |
|
"grad_norm": 1.1812736988067627, |
|
"learning_rate": 1.1540229885057472e-05, |
|
"loss": 0.6102, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 13.06, |
|
"grad_norm": 1.103151559829712, |
|
"learning_rate": 1.1310344827586209e-05, |
|
"loss": 0.5965, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"grad_norm": 1.108560562133789, |
|
"learning_rate": 1.1080459770114944e-05, |
|
"loss": 0.58, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 13.75, |
|
"grad_norm": 1.322364091873169, |
|
"learning_rate": 1.085057471264368e-05, |
|
"loss": 0.5707, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 14.09, |
|
"grad_norm": 1.2036404609680176, |
|
"learning_rate": 1.0620689655172414e-05, |
|
"loss": 0.5781, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 14.44, |
|
"grad_norm": 1.46902596950531, |
|
"learning_rate": 1.039080459770115e-05, |
|
"loss": 0.5413, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 14.78, |
|
"grad_norm": 0.9223589301109314, |
|
"learning_rate": 1.0160919540229885e-05, |
|
"loss": 0.5686, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 15.12, |
|
"grad_norm": 1.7452529668807983, |
|
"learning_rate": 9.931034482758622e-06, |
|
"loss": 0.5538, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 15.47, |
|
"grad_norm": 1.0680702924728394, |
|
"learning_rate": 9.701149425287357e-06, |
|
"loss": 0.5402, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 15.81, |
|
"grad_norm": 1.4106242656707764, |
|
"learning_rate": 9.471264367816094e-06, |
|
"loss": 0.5629, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 16.15, |
|
"grad_norm": 1.7341551780700684, |
|
"learning_rate": 9.241379310344829e-06, |
|
"loss": 0.5538, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 2.115643262863159, |
|
"learning_rate": 9.011494252873564e-06, |
|
"loss": 0.5481, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 16.84, |
|
"grad_norm": 1.1589787006378174, |
|
"learning_rate": 8.7816091954023e-06, |
|
"loss": 0.4981, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 17.19, |
|
"grad_norm": 1.0696042776107788, |
|
"learning_rate": 8.551724137931035e-06, |
|
"loss": 0.5041, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"total_flos": 2.5991277871104e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|