sheepy928's picture
Training in progress, step 500, checkpoint
effa56a verified
raw
history blame
8.17 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1208313194780087,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": null,
"learning_rate": 2.0000000000000002e-07,
"loss": 2.0414,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 53.515953063964844,
"learning_rate": 6.5e-07,
"loss": 1.8355,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 27.920907974243164,
"learning_rate": 1.15e-06,
"loss": 1.5172,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 20.47172737121582,
"learning_rate": 1.65e-06,
"loss": 1.2654,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 8.903529167175293,
"learning_rate": 2.1499999999999997e-06,
"loss": 1.1095,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 5.902841567993164,
"learning_rate": 2.65e-06,
"loss": 1.0017,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 2.9340908527374268,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.8846,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 8.701367378234863,
"learning_rate": 3.6499999999999998e-06,
"loss": 0.8572,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 1.3205362558364868,
"learning_rate": 4.15e-06,
"loss": 0.839,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 3.0334558486938477,
"learning_rate": 4.65e-06,
"loss": 0.7838,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 1.8086520433425903,
"learning_rate": 5.15e-06,
"loss": 0.7782,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 29.50135040283203,
"learning_rate": 5.65e-06,
"loss": 0.7883,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 3.2708683013916016,
"learning_rate": 6.15e-06,
"loss": 0.7961,
"step": 130
},
{
"epoch": 0.03,
"grad_norm": 33.43790817260742,
"learning_rate": 6.650000000000001e-06,
"loss": 0.7626,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 1.2334959506988525,
"learning_rate": 7.15e-06,
"loss": 0.7732,
"step": 150
},
{
"epoch": 0.04,
"grad_norm": 1.7489686012268066,
"learning_rate": 7.65e-06,
"loss": 0.7715,
"step": 160
},
{
"epoch": 0.04,
"grad_norm": 3.358823776245117,
"learning_rate": 8.15e-06,
"loss": 0.7778,
"step": 170
},
{
"epoch": 0.04,
"grad_norm": 4.348245620727539,
"learning_rate": 8.65e-06,
"loss": 0.7619,
"step": 180
},
{
"epoch": 0.05,
"grad_norm": 0.9023125767707825,
"learning_rate": 9.15e-06,
"loss": 0.7746,
"step": 190
},
{
"epoch": 0.05,
"grad_norm": 0.9158114790916443,
"learning_rate": 9.65e-06,
"loss": 0.7592,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 0.6256226301193237,
"learning_rate": 1.0150000000000001e-05,
"loss": 0.7787,
"step": 210
},
{
"epoch": 0.05,
"grad_norm": 0.3805778920650482,
"learning_rate": 1.065e-05,
"loss": 0.7811,
"step": 220
},
{
"epoch": 0.06,
"grad_norm": 0.6968041062355042,
"learning_rate": 1.115e-05,
"loss": 0.73,
"step": 230
},
{
"epoch": 0.06,
"grad_norm": 0.9035410284996033,
"learning_rate": 1.1650000000000002e-05,
"loss": 0.7518,
"step": 240
},
{
"epoch": 0.06,
"grad_norm": 1.6727488040924072,
"learning_rate": 1.215e-05,
"loss": 0.7805,
"step": 250
},
{
"epoch": 0.06,
"grad_norm": 0.4153461158275604,
"learning_rate": 1.2650000000000001e-05,
"loss": 0.7706,
"step": 260
},
{
"epoch": 0.07,
"grad_norm": 1.4846651554107666,
"learning_rate": 1.3150000000000001e-05,
"loss": 0.7492,
"step": 270
},
{
"epoch": 0.07,
"grad_norm": 2.6631388664245605,
"learning_rate": 1.3650000000000001e-05,
"loss": 0.7681,
"step": 280
},
{
"epoch": 0.07,
"grad_norm": 0.6325013041496277,
"learning_rate": 1.415e-05,
"loss": 0.7753,
"step": 290
},
{
"epoch": 0.07,
"grad_norm": 0.877907395362854,
"learning_rate": 1.465e-05,
"loss": 0.7188,
"step": 300
},
{
"epoch": 0.07,
"grad_norm": 0.37142279744148254,
"learning_rate": 1.515e-05,
"loss": 0.7204,
"step": 310
},
{
"epoch": 0.08,
"grad_norm": 0.7337246537208557,
"learning_rate": 1.565e-05,
"loss": 0.778,
"step": 320
},
{
"epoch": 0.08,
"grad_norm": 0.5847220420837402,
"learning_rate": 1.6150000000000003e-05,
"loss": 0.7288,
"step": 330
},
{
"epoch": 0.08,
"grad_norm": 3.745180606842041,
"learning_rate": 1.665e-05,
"loss": 0.7531,
"step": 340
},
{
"epoch": 0.08,
"grad_norm": 0.357301265001297,
"learning_rate": 1.7150000000000004e-05,
"loss": 0.7448,
"step": 350
},
{
"epoch": 0.09,
"grad_norm": 0.9032486081123352,
"learning_rate": 1.765e-05,
"loss": 0.7651,
"step": 360
},
{
"epoch": 0.09,
"grad_norm": 0.2864232361316681,
"learning_rate": 1.815e-05,
"loss": 0.7193,
"step": 370
},
{
"epoch": 0.09,
"grad_norm": 1.8560261726379395,
"learning_rate": 1.865e-05,
"loss": 0.7421,
"step": 380
},
{
"epoch": 0.09,
"grad_norm": 0.2971792221069336,
"learning_rate": 1.915e-05,
"loss": 0.7171,
"step": 390
},
{
"epoch": 0.1,
"grad_norm": 0.40850627422332764,
"learning_rate": 1.9650000000000003e-05,
"loss": 0.7459,
"step": 400
},
{
"epoch": 0.1,
"grad_norm": 0.3934139311313629,
"learning_rate": 2.0150000000000002e-05,
"loss": 0.7205,
"step": 410
},
{
"epoch": 0.1,
"grad_norm": 1.8674131631851196,
"learning_rate": 2.065e-05,
"loss": 0.752,
"step": 420
},
{
"epoch": 0.1,
"grad_norm": 0.36707818508148193,
"learning_rate": 2.115e-05,
"loss": 0.7494,
"step": 430
},
{
"epoch": 0.11,
"grad_norm": 0.5044310092926025,
"learning_rate": 2.165e-05,
"loss": 0.7595,
"step": 440
},
{
"epoch": 0.11,
"grad_norm": 1.9025150537490845,
"learning_rate": 2.215e-05,
"loss": 0.7379,
"step": 450
},
{
"epoch": 0.11,
"grad_norm": 0.5004140138626099,
"learning_rate": 2.265e-05,
"loss": 0.7886,
"step": 460
},
{
"epoch": 0.11,
"grad_norm": 3.544482707977295,
"learning_rate": 2.3150000000000004e-05,
"loss": 0.7259,
"step": 470
},
{
"epoch": 0.12,
"grad_norm": 0.33023321628570557,
"learning_rate": 2.365e-05,
"loss": 0.7333,
"step": 480
},
{
"epoch": 0.12,
"grad_norm": 0.3548080325126648,
"learning_rate": 2.415e-05,
"loss": 0.7527,
"step": 490
},
{
"epoch": 0.12,
"grad_norm": 0.42450150847435,
"learning_rate": 2.465e-05,
"loss": 0.7443,
"step": 500
},
{
"epoch": 0.12,
"eval_loss": 0.7428915500640869,
"eval_runtime": 68.3143,
"eval_samples_per_second": 29.276,
"eval_steps_per_second": 0.922,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 20690,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 1.0807388003190702e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}