|
{ |
|
"best_metric": 1.4920570850372314, |
|
"best_model_checkpoint": "./models_trained/sft_full_df_2/checkpoint-6696", |
|
"epoch": 2.9989922741014445, |
|
"eval_steps": 2232, |
|
"global_step": 6696, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08017019370731161, |
|
"grad_norm": 9.433825492858887, |
|
"learning_rate": 1.0022396416573349e-06, |
|
"loss": 2.5707, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.16034038741462323, |
|
"grad_norm": 5.339526176452637, |
|
"learning_rate": 2.0044792833146697e-06, |
|
"loss": 1.9288, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.24051058112193482, |
|
"grad_norm": 6.231087684631348, |
|
"learning_rate": 3.0067189249720046e-06, |
|
"loss": 1.6422, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.32068077482924645, |
|
"grad_norm": 6.053775787353516, |
|
"learning_rate": 4.0089585666293395e-06, |
|
"loss": 1.6095, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.40085096853655805, |
|
"grad_norm": 6.309902667999268, |
|
"learning_rate": 4.999999235639976e-06, |
|
"loss": 1.5876, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.48102116224386965, |
|
"grad_norm": 6.86934757232666, |
|
"learning_rate": 4.993742312315323e-06, |
|
"loss": 1.5839, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.5611913559511813, |
|
"grad_norm": 5.88812255859375, |
|
"learning_rate": 4.975275594860625e-06, |
|
"loss": 1.5634, |
|
"step": 1253 |
|
}, |
|
{ |
|
"epoch": 0.6413615496584929, |
|
"grad_norm": 6.350925445556641, |
|
"learning_rate": 4.944689499521886e-06, |
|
"loss": 1.5406, |
|
"step": 1432 |
|
}, |
|
{ |
|
"epoch": 0.7215317433658045, |
|
"grad_norm": 6.42539119720459, |
|
"learning_rate": 4.902133781117591e-06, |
|
"loss": 1.5544, |
|
"step": 1611 |
|
}, |
|
{ |
|
"epoch": 0.8017019370731161, |
|
"grad_norm": 6.217959880828857, |
|
"learning_rate": 4.847816799813184e-06, |
|
"loss": 1.5301, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8818721307804277, |
|
"grad_norm": 6.820368766784668, |
|
"learning_rate": 4.782004500953626e-06, |
|
"loss": 1.5179, |
|
"step": 1969 |
|
}, |
|
{ |
|
"epoch": 0.9620423244877393, |
|
"grad_norm": 7.019912242889404, |
|
"learning_rate": 4.705019112948941e-06, |
|
"loss": 1.5084, |
|
"step": 2148 |
|
}, |
|
{ |
|
"epoch": 0.9996640913671482, |
|
"eval_loss": 1.5233653783798218, |
|
"eval_runtime": 72.6117, |
|
"eval_samples_per_second": 27.337, |
|
"eval_steps_per_second": 13.675, |
|
"step": 2232 |
|
}, |
|
{ |
|
"epoch": 1.042212518195051, |
|
"grad_norm": 6.613691806793213, |
|
"learning_rate": 4.617237569588121e-06, |
|
"loss": 1.5067, |
|
"step": 2327 |
|
}, |
|
{ |
|
"epoch": 1.1223827119023626, |
|
"grad_norm": 7.442265510559082, |
|
"learning_rate": 4.519089664506044e-06, |
|
"loss": 1.4983, |
|
"step": 2506 |
|
}, |
|
{ |
|
"epoch": 1.2025529056096742, |
|
"grad_norm": 9.082566261291504, |
|
"learning_rate": 4.411055946839413e-06, |
|
"loss": 1.4935, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 1.2827230993169858, |
|
"grad_norm": 7.871016979217529, |
|
"learning_rate": 4.293665368374987e-06, |
|
"loss": 1.4923, |
|
"step": 2864 |
|
}, |
|
{ |
|
"epoch": 1.3628932930242974, |
|
"grad_norm": 7.989074230194092, |
|
"learning_rate": 4.167492693710046e-06, |
|
"loss": 1.4794, |
|
"step": 3043 |
|
}, |
|
{ |
|
"epoch": 1.443063486731609, |
|
"grad_norm": 7.531464576721191, |
|
"learning_rate": 4.033155686105407e-06, |
|
"loss": 1.4967, |
|
"step": 3222 |
|
}, |
|
{ |
|
"epoch": 1.5232336804389206, |
|
"grad_norm": 7.842589378356934, |
|
"learning_rate": 3.8913120828095415e-06, |
|
"loss": 1.4583, |
|
"step": 3401 |
|
}, |
|
{ |
|
"epoch": 1.6034038741462322, |
|
"grad_norm": 8.500748634338379, |
|
"learning_rate": 3.7426563746631257e-06, |
|
"loss": 1.4787, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.6835740678535438, |
|
"grad_norm": 9.060755729675293, |
|
"learning_rate": 3.587916405751636e-06, |
|
"loss": 1.4766, |
|
"step": 3759 |
|
}, |
|
{ |
|
"epoch": 1.7637442615608554, |
|
"grad_norm": 8.827229499816895, |
|
"learning_rate": 3.4278498097546904e-06, |
|
"loss": 1.472, |
|
"step": 3938 |
|
}, |
|
{ |
|
"epoch": 1.843914455268167, |
|
"grad_norm": 8.073822975158691, |
|
"learning_rate": 3.2632403004403746e-06, |
|
"loss": 1.4988, |
|
"step": 4117 |
|
}, |
|
{ |
|
"epoch": 1.9240846489754788, |
|
"grad_norm": 9.162446022033691, |
|
"learning_rate": 3.0948938344669414e-06, |
|
"loss": 1.4583, |
|
"step": 4296 |
|
}, |
|
{ |
|
"epoch": 1.9993281827342964, |
|
"eval_loss": 1.496500849723816, |
|
"eval_runtime": 72.6334, |
|
"eval_samples_per_second": 27.329, |
|
"eval_steps_per_second": 13.671, |
|
"step": 4464 |
|
}, |
|
{ |
|
"epoch": 2.00425484268279, |
|
"grad_norm": 9.242565155029297, |
|
"learning_rate": 2.9236346652794664e-06, |
|
"loss": 1.4732, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 2.084425036390102, |
|
"grad_norm": 9.86392593383789, |
|
"learning_rate": 2.750301307422268e-06, |
|
"loss": 1.4263, |
|
"step": 4654 |
|
}, |
|
{ |
|
"epoch": 2.1645952300974134, |
|
"grad_norm": 10.472150802612305, |
|
"learning_rate": 2.575742431026521e-06, |
|
"loss": 1.4263, |
|
"step": 4833 |
|
}, |
|
{ |
|
"epoch": 2.244765423804725, |
|
"grad_norm": 11.642035484313965, |
|
"learning_rate": 2.40081270657435e-06, |
|
"loss": 1.4318, |
|
"step": 5012 |
|
}, |
|
{ |
|
"epoch": 2.3249356175120366, |
|
"grad_norm": 11.944063186645508, |
|
"learning_rate": 2.226368620284175e-06, |
|
"loss": 1.4432, |
|
"step": 5191 |
|
}, |
|
{ |
|
"epoch": 2.4051058112193484, |
|
"grad_norm": 10.567192077636719, |
|
"learning_rate": 2.0532642806058894e-06, |
|
"loss": 1.4353, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 2.48527600492666, |
|
"grad_norm": 13.025165557861328, |
|
"learning_rate": 1.8823472363580105e-06, |
|
"loss": 1.4327, |
|
"step": 5549 |
|
}, |
|
{ |
|
"epoch": 2.5654461986339716, |
|
"grad_norm": 9.754578590393066, |
|
"learning_rate": 1.714454326981919e-06, |
|
"loss": 1.4214, |
|
"step": 5728 |
|
}, |
|
{ |
|
"epoch": 2.6456163923412834, |
|
"grad_norm": 9.988022804260254, |
|
"learning_rate": 1.5504075852310582e-06, |
|
"loss": 1.4299, |
|
"step": 5907 |
|
}, |
|
{ |
|
"epoch": 2.725786586048595, |
|
"grad_norm": 9.87160587310791, |
|
"learning_rate": 1.3910102123562535e-06, |
|
"loss": 1.4144, |
|
"step": 6086 |
|
}, |
|
{ |
|
"epoch": 2.805956779755906, |
|
"grad_norm": 9.950671195983887, |
|
"learning_rate": 1.2370426454933122e-06, |
|
"loss": 1.4077, |
|
"step": 6265 |
|
}, |
|
{ |
|
"epoch": 2.886126973463218, |
|
"grad_norm": 11.326949119567871, |
|
"learning_rate": 1.0892587365076916e-06, |
|
"loss": 1.4311, |
|
"step": 6444 |
|
}, |
|
{ |
|
"epoch": 2.96629716717053, |
|
"grad_norm": 11.63015365600586, |
|
"learning_rate": 9.483820610052311e-07, |
|
"loss": 1.4271, |
|
"step": 6623 |
|
}, |
|
{ |
|
"epoch": 2.9989922741014445, |
|
"eval_loss": 1.4920570850372314, |
|
"eval_runtime": 72.6497, |
|
"eval_samples_per_second": 27.323, |
|
"eval_steps_per_second": 13.668, |
|
"step": 6696 |
|
} |
|
], |
|
"logging_steps": 179, |
|
"max_steps": 8928, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 2232, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.021394676596736e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|