{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 610,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.16393442622950818,
"grad_norm": 0.4943847358226776,
"learning_rate": 0.00019986740898848306,
"loss": 1.1081,
"step": 10
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.44973108172416687,
"learning_rate": 0.0001994699875614589,
"loss": 0.9226,
"step": 20
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.3362947404384613,
"learning_rate": 0.00019880878960910772,
"loss": 0.887,
"step": 30
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.36153319478034973,
"learning_rate": 0.0001978855685095358,
"loss": 0.8615,
"step": 40
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.36022189259529114,
"learning_rate": 0.00019670277247913205,
"loss": 0.8234,
"step": 50
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.4029320478439331,
"learning_rate": 0.00019526353808033825,
"loss": 0.8149,
"step": 60
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.3638781011104584,
"learning_rate": 0.00019357168190404936,
"loss": 0.799,
"step": 70
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.3549768924713135,
"learning_rate": 0.0001916316904487005,
"loss": 0.785,
"step": 80
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.3937470018863678,
"learning_rate": 0.00018944870822287956,
"loss": 0.7776,
"step": 90
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.36256495118141174,
"learning_rate": 0.00018702852410301554,
"loss": 0.7756,
"step": 100
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.3645057678222656,
"learning_rate": 0.00018437755598231856,
"loss": 0.767,
"step": 110
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.3446103632450104,
"learning_rate": 0.00018150283375168114,
"loss": 0.7543,
"step": 120
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.4181569516658783,
"learning_rate": 0.00017841198065767107,
"loss": 0.7231,
"step": 130
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.40038439631462097,
"learning_rate": 0.00017511319308705198,
"loss": 0.7348,
"step": 140
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.3984282612800598,
"learning_rate": 0.00017161521883143934,
"loss": 0.7255,
"step": 150
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.4186936318874359,
"learning_rate": 0.00016792733388972932,
"loss": 0.7263,
"step": 160
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.4004381597042084,
"learning_rate": 0.00016405931786981755,
"loss": 0.7271,
"step": 170
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.4597119390964508,
"learning_rate": 0.00016002142805483685,
"loss": 0.7125,
"step": 180
},
{
"epoch": 3.1147540983606556,
"grad_norm": 0.4180513322353363,
"learning_rate": 0.00015582437220268647,
"loss": 0.6925,
"step": 190
},
{
"epoch": 3.278688524590164,
"grad_norm": 0.4401358366012573,
"learning_rate": 0.0001514792801509831,
"loss": 0.6863,
"step": 200
},
{
"epoch": 3.442622950819672,
"grad_norm": 0.4415852427482605,
"learning_rate": 0.000146997674302732,
"loss": 0.6857,
"step": 210
},
{
"epoch": 3.6065573770491803,
"grad_norm": 0.4144597053527832,
"learning_rate": 0.0001423914390709861,
"loss": 0.6807,
"step": 220
},
{
"epoch": 3.7704918032786887,
"grad_norm": 0.4162183403968811,
"learning_rate": 0.00013767278936351854,
"loss": 0.6846,
"step": 230
},
{
"epoch": 3.9344262295081966,
"grad_norm": 0.42117443680763245,
"learning_rate": 0.0001328542381910835,
"loss": 0.6887,
"step": 240
},
{
"epoch": 4.098360655737705,
"grad_norm": 0.4591551125049591,
"learning_rate": 0.00012794856348516095,
"loss": 0.6543,
"step": 250
},
{
"epoch": 4.262295081967213,
"grad_norm": 0.4592267572879791,
"learning_rate": 0.0001229687742131796,
"loss": 0.6482,
"step": 260
},
{
"epoch": 4.426229508196721,
"grad_norm": 0.47369667887687683,
"learning_rate": 0.00011792807588107357,
"loss": 0.6494,
"step": 270
},
{
"epoch": 4.590163934426229,
"grad_norm": 0.45578595995903015,
"learning_rate": 0.00011283983551465511,
"loss": 0.6559,
"step": 280
},
{
"epoch": 4.754098360655737,
"grad_norm": 0.4742227792739868,
"learning_rate": 0.00010771754621266466,
"loss": 0.6462,
"step": 290
},
{
"epoch": 4.918032786885246,
"grad_norm": 0.48461949825286865,
"learning_rate": 0.00010257479136549889,
"loss": 0.6501,
"step": 300
},
{
"epoch": 5.081967213114754,
"grad_norm": 0.4797608554363251,
"learning_rate": 9.742520863450115e-05,
"loss": 0.6452,
"step": 310
},
{
"epoch": 5.245901639344262,
"grad_norm": 0.505832314491272,
"learning_rate": 9.228245378733537e-05,
"loss": 0.6178,
"step": 320
},
{
"epoch": 5.409836065573771,
"grad_norm": 0.5265078544616699,
"learning_rate": 8.71601644853449e-05,
"loss": 0.6177,
"step": 330
},
{
"epoch": 5.573770491803279,
"grad_norm": 0.5109356045722961,
"learning_rate": 8.207192411892646e-05,
"loss": 0.6225,
"step": 340
},
{
"epoch": 5.737704918032787,
"grad_norm": 0.5139452815055847,
"learning_rate": 7.703122578682046e-05,
"loss": 0.6229,
"step": 350
},
{
"epoch": 5.901639344262295,
"grad_norm": 0.5581080913543701,
"learning_rate": 7.205143651483906e-05,
"loss": 0.6189,
"step": 360
},
{
"epoch": 6.065573770491803,
"grad_norm": 0.4794338345527649,
"learning_rate": 6.714576180891654e-05,
"loss": 0.6087,
"step": 370
},
{
"epoch": 6.229508196721311,
"grad_norm": 0.5142917037010193,
"learning_rate": 6.232721063648148e-05,
"loss": 0.5977,
"step": 380
},
{
"epoch": 6.39344262295082,
"grad_norm": 0.547099769115448,
"learning_rate": 5.7608560929013946e-05,
"loss": 0.6058,
"step": 390
},
{
"epoch": 6.557377049180328,
"grad_norm": 0.5487104654312134,
"learning_rate": 5.300232569726804e-05,
"loss": 0.5939,
"step": 400
},
{
"epoch": 6.721311475409836,
"grad_norm": 0.5189688205718994,
"learning_rate": 4.852071984901696e-05,
"loss": 0.6015,
"step": 410
},
{
"epoch": 6.885245901639344,
"grad_norm": 0.5185168981552124,
"learning_rate": 4.417562779731355e-05,
"loss": 0.5949,
"step": 420
},
{
"epoch": 7.049180327868853,
"grad_norm": 0.5486496090888977,
"learning_rate": 3.997857194516319e-05,
"loss": 0.5957,
"step": 430
},
{
"epoch": 7.213114754098361,
"grad_norm": 0.5592466592788696,
"learning_rate": 3.594068213018249e-05,
"loss": 0.582,
"step": 440
},
{
"epoch": 7.377049180327869,
"grad_norm": 0.5590682625770569,
"learning_rate": 3.207266611027069e-05,
"loss": 0.5779,
"step": 450
},
{
"epoch": 7.540983606557377,
"grad_norm": 0.546293318271637,
"learning_rate": 2.8384781168560693e-05,
"loss": 0.5736,
"step": 460
},
{
"epoch": 7.704918032786885,
"grad_norm": 0.5725670456886292,
"learning_rate": 2.4886806912948035e-05,
"loss": 0.5825,
"step": 470
},
{
"epoch": 7.868852459016393,
"grad_norm": 0.574937105178833,
"learning_rate": 2.1588019342328968e-05,
"loss": 0.5843,
"step": 480
},
{
"epoch": 8.032786885245901,
"grad_norm": 0.5568664073944092,
"learning_rate": 1.8497166248318876e-05,
"loss": 0.5791,
"step": 490
},
{
"epoch": 8.19672131147541,
"grad_norm": 0.568524181842804,
"learning_rate": 1.562244401768144e-05,
"loss": 0.5673,
"step": 500
},
{
"epoch": 8.360655737704919,
"grad_norm": 0.5631764531135559,
"learning_rate": 1.2971475896984475e-05,
"loss": 0.5557,
"step": 510
},
{
"epoch": 8.524590163934427,
"grad_norm": 0.5955241322517395,
"learning_rate": 1.0551291777120464e-05,
"loss": 0.5741,
"step": 520
},
{
"epoch": 8.688524590163935,
"grad_norm": 0.5839523077011108,
"learning_rate": 8.368309551299536e-06,
"loss": 0.5674,
"step": 530
},
{
"epoch": 8.852459016393443,
"grad_norm": 0.5877330899238586,
"learning_rate": 6.428318095950647e-06,
"loss": 0.5819,
"step": 540
},
{
"epoch": 9.01639344262295,
"grad_norm": 0.5768188238143921,
"learning_rate": 4.7364619196617495e-06,
"loss": 0.5728,
"step": 550
},
{
"epoch": 9.180327868852459,
"grad_norm": 0.5796675682067871,
"learning_rate": 3.2972275208679625e-06,
"loss": 0.5682,
"step": 560
},
{
"epoch": 9.344262295081966,
"grad_norm": 0.6149064898490906,
"learning_rate": 2.1144314904642195e-06,
"loss": 0.5699,
"step": 570
},
{
"epoch": 9.508196721311476,
"grad_norm": 0.558674156665802,
"learning_rate": 1.1912103908922945e-06,
"loss": 0.5663,
"step": 580
},
{
"epoch": 9.672131147540984,
"grad_norm": 0.563441812992096,
"learning_rate": 5.300124385410943e-07,
"loss": 0.5626,
"step": 590
},
{
"epoch": 9.836065573770492,
"grad_norm": 0.5652551651000977,
"learning_rate": 1.3259101151694708e-07,
"loss": 0.5579,
"step": 600
},
{
"epoch": 10.0,
"grad_norm": 0.5645861625671387,
"learning_rate": 0.0,
"loss": 0.5631,
"step": 610
},
{
"epoch": 10.0,
"step": 610,
"total_flos": 4.672887793385472e+16,
"train_loss": 0.6668467552935491,
"train_runtime": 2732.8065,
"train_samples_per_second": 2.679,
"train_steps_per_second": 0.223
}
],
"logging_steps": 10,
"max_steps": 610,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 4.672887793385472e+16,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}