Training in progress, step 99, checkpoint (commit ec152a3, verified)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.04697091006138244,
"eval_steps": 9,
"global_step": 99,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00047445363698366103,
"eval_loss": 1.2206629514694214,
"eval_runtime": 2455.084,
"eval_samples_per_second": 2.892,
"eval_steps_per_second": 1.446,
"step": 1
},
{
"epoch": 0.001423360910950983,
"grad_norm": 42.169185638427734,
"learning_rate": 3e-05,
"loss": 38.8115,
"step": 3
},
{
"epoch": 0.002846721821901966,
"grad_norm": 30.064516067504883,
"learning_rate": 6e-05,
"loss": 36.5887,
"step": 6
},
{
"epoch": 0.004270082732852949,
"grad_norm": 24.0466365814209,
"learning_rate": 9e-05,
"loss": 31.7486,
"step": 9
},
{
"epoch": 0.004270082732852949,
"eval_loss": 0.872442364692688,
"eval_runtime": 2457.1575,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 1.445,
"step": 9
},
{
"epoch": 0.005693443643803932,
"grad_norm": 26.121597290039062,
"learning_rate": 0.00012,
"loss": 28.0893,
"step": 12
},
{
"epoch": 0.007116804554754915,
"grad_norm": 22.931711196899414,
"learning_rate": 0.00015000000000000001,
"loss": 26.3657,
"step": 15
},
{
"epoch": 0.008540165465705898,
"grad_norm": 15.313858032226562,
"learning_rate": 0.00018,
"loss": 22.876,
"step": 18
},
{
"epoch": 0.008540165465705898,
"eval_loss": 0.7377527952194214,
"eval_runtime": 730.2515,
"eval_samples_per_second": 9.723,
"eval_steps_per_second": 4.861,
"step": 18
},
{
"epoch": 0.009963526376656881,
"grad_norm": 14.844147682189941,
"learning_rate": 0.0001999229036240723,
"loss": 23.8388,
"step": 21
},
{
"epoch": 0.011386887287607865,
"grad_norm": 13.040925025939941,
"learning_rate": 0.00019876883405951377,
"loss": 23.2438,
"step": 24
},
{
"epoch": 0.012810248198558846,
"grad_norm": 13.732440948486328,
"learning_rate": 0.00019624552364536473,
"loss": 23.504,
"step": 27
},
{
"epoch": 0.012810248198558846,
"eval_loss": 0.708083987236023,
"eval_runtime": 2490.0558,
"eval_samples_per_second": 2.851,
"eval_steps_per_second": 1.426,
"step": 27
},
{
"epoch": 0.01423360910950983,
"grad_norm": 15.219266891479492,
"learning_rate": 0.0001923879532511287,
"loss": 23.009,
"step": 30
},
{
"epoch": 0.015656970020460813,
"grad_norm": 11.847440719604492,
"learning_rate": 0.00018724960070727972,
"loss": 21.94,
"step": 33
},
{
"epoch": 0.017080330931411796,
"grad_norm": 11.252641677856445,
"learning_rate": 0.00018090169943749476,
"loss": 22.467,
"step": 36
},
{
"epoch": 0.017080330931411796,
"eval_loss": 0.6968169808387756,
"eval_runtime": 1821.4615,
"eval_samples_per_second": 3.898,
"eval_steps_per_second": 1.949,
"step": 36
},
{
"epoch": 0.01850369184236278,
"grad_norm": 11.163553237915039,
"learning_rate": 0.00017343225094356855,
"loss": 22.5554,
"step": 39
},
{
"epoch": 0.019927052753313763,
"grad_norm": 13.20541763305664,
"learning_rate": 0.00016494480483301836,
"loss": 22.9775,
"step": 42
},
{
"epoch": 0.021350413664264746,
"grad_norm": 10.816298484802246,
"learning_rate": 0.00015555702330196023,
"loss": 22.2379,
"step": 45
},
{
"epoch": 0.021350413664264746,
"eval_loss": 0.6865644454956055,
"eval_runtime": 673.7961,
"eval_samples_per_second": 10.537,
"eval_steps_per_second": 5.269,
"step": 45
},
{
"epoch": 0.02277377457521573,
"grad_norm": 10.934300422668457,
"learning_rate": 0.00014539904997395468,
"loss": 21.4568,
"step": 48
},
{
"epoch": 0.024197135486166713,
"grad_norm": 10.668505668640137,
"learning_rate": 0.0001346117057077493,
"loss": 21.2971,
"step": 51
},
{
"epoch": 0.025620496397117692,
"grad_norm": 10.522425651550293,
"learning_rate": 0.00012334453638559057,
"loss": 22.4085,
"step": 54
},
{
"epoch": 0.025620496397117692,
"eval_loss": 0.6808061003684998,
"eval_runtime": 674.5537,
"eval_samples_per_second": 10.525,
"eval_steps_per_second": 5.263,
"step": 54
},
{
"epoch": 0.027043857308068676,
"grad_norm": 10.834729194641113,
"learning_rate": 0.00011175373974578378,
"loss": 22.0063,
"step": 57
},
{
"epoch": 0.02846721821901966,
"grad_norm": 10.45215892791748,
"learning_rate": 0.0001,
"loss": 21.7114,
"step": 60
},
{
"epoch": 0.029890579129970642,
"grad_norm": 10.509422302246094,
"learning_rate": 8.824626025421626e-05,
"loss": 22.147,
"step": 63
},
{
"epoch": 0.029890579129970642,
"eval_loss": 0.6736528873443604,
"eval_runtime": 674.4225,
"eval_samples_per_second": 10.528,
"eval_steps_per_second": 5.264,
"step": 63
},
{
"epoch": 0.031313940040921626,
"grad_norm": 10.096047401428223,
"learning_rate": 7.66554636144095e-05,
"loss": 21.0243,
"step": 66
},
{
"epoch": 0.03273730095187261,
"grad_norm": 10.614026069641113,
"learning_rate": 6.538829429225069e-05,
"loss": 22.6279,
"step": 69
},
{
"epoch": 0.03416066186282359,
"grad_norm": 9.915692329406738,
"learning_rate": 5.4600950026045326e-05,
"loss": 21.7931,
"step": 72
},
{
"epoch": 0.03416066186282359,
"eval_loss": 0.6665549278259277,
"eval_runtime": 673.4954,
"eval_samples_per_second": 10.542,
"eval_steps_per_second": 5.271,
"step": 72
},
{
"epoch": 0.03558402277377457,
"grad_norm": 10.606143951416016,
"learning_rate": 4.444297669803981e-05,
"loss": 21.1355,
"step": 75
},
{
"epoch": 0.03700738368472556,
"grad_norm": 9.829690933227539,
"learning_rate": 3.5055195166981645e-05,
"loss": 20.704,
"step": 78
},
{
"epoch": 0.03843074459567654,
"grad_norm": 9.532461166381836,
"learning_rate": 2.6567749056431467e-05,
"loss": 20.9497,
"step": 81
},
{
"epoch": 0.03843074459567654,
"eval_loss": 0.6634002327919006,
"eval_runtime": 673.9541,
"eval_samples_per_second": 10.535,
"eval_steps_per_second": 5.267,
"step": 81
},
{
"epoch": 0.039854105506627525,
"grad_norm": 10.550149917602539,
"learning_rate": 1.9098300562505266e-05,
"loss": 21.4862,
"step": 84
},
{
"epoch": 0.041277466417578505,
"grad_norm": 9.829084396362305,
"learning_rate": 1.2750399292720283e-05,
"loss": 21.2587,
"step": 87
},
{
"epoch": 0.04270082732852949,
"grad_norm": 10.305970191955566,
"learning_rate": 7.612046748871327e-06,
"loss": 21.4128,
"step": 90
},
{
"epoch": 0.04270082732852949,
"eval_loss": 0.6616738438606262,
"eval_runtime": 674.758,
"eval_samples_per_second": 10.522,
"eval_steps_per_second": 5.261,
"step": 90
},
{
"epoch": 0.04412418823948047,
"grad_norm": 10.540351867675781,
"learning_rate": 3.7544763546352834e-06,
"loss": 22.0735,
"step": 93
},
{
"epoch": 0.04554754915043146,
"grad_norm": 10.391834259033203,
"learning_rate": 1.231165940486234e-06,
"loss": 22.2369,
"step": 96
},
{
"epoch": 0.04697091006138244,
"grad_norm": 10.5779390335083,
"learning_rate": 7.709637592770991e-08,
"loss": 21.495,
"step": 99
},
{
"epoch": 0.04697091006138244,
"eval_loss": 0.6612978577613831,
"eval_runtime": 674.8148,
"eval_samples_per_second": 10.521,
"eval_steps_per_second": 5.261,
"step": 99
}
],
"logging_steps": 3,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.2196614900128154e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
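
The file above is the trainer_state.json that the Transformers Trainer writes into each checkpoint directory; log_history interleaves training entries (loss, grad_norm, learning_rate) every 3 steps with evaluation entries (eval_loss, eval_runtime) every 9 steps. A minimal sketch for summarizing it, assuming the file sits in a hypothetical checkpoint-99/ directory:

    import json

    # Load the checkpoint state written by the Trainer (path is an assumption).
    with open("checkpoint-99/trainer_state.json") as f:
        state = json.load(f)

    # Training entries carry "loss"; evaluation entries carry "eval_loss".
    train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
    eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

    print(f"epoch reached: {state['epoch']:.5f} of {state['num_train_epochs']}")
    print(f"last train loss: {train_points[-1][1]:.4f} at step {train_points[-1][0]}")
    print(f"best eval loss:  {min(v for _, v in eval_points):.4f}")

For this checkpoint such a script would report an eval_loss that falls from about 1.22 at step 1 to about 0.661 at step 99.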