|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 15.0,
  "eval_steps": 500,
  "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.2,
      "grad_norm": 19.942209243774414,
      "learning_rate": 6.666666666666667e-05,
      "loss": 7.7458,
      "step": 10
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.6758410930633545,
      "learning_rate": 9.896907216494846e-05,
      "loss": 5.6812,
      "step": 20
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.124385356903076,
      "learning_rate": 9.690721649484537e-05,
      "loss": 5.1671,
      "step": 30
    },
    {
      "epoch": 0.8,
      "grad_norm": 3.7686610221862793,
      "learning_rate": 9.484536082474227e-05,
      "loss": 4.7855,
      "step": 40
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.7892873287200928,
      "learning_rate": 9.278350515463918e-05,
      "loss": 4.6247,
      "step": 50
    },
    {
      "epoch": 1.2,
      "grad_norm": 5.682472229003906,
      "learning_rate": 9.072164948453609e-05,
      "loss": 4.4554,
      "step": 60
    },
    {
      "epoch": 1.4,
      "grad_norm": 3.119821786880493,
      "learning_rate": 8.865979381443299e-05,
      "loss": 4.4782,
      "step": 70
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.42207932472229,
      "learning_rate": 8.65979381443299e-05,
      "loss": 4.4169,
      "step": 80
    },
    {
      "epoch": 1.8,
      "grad_norm": 3.197399139404297,
      "learning_rate": 8.453608247422681e-05,
      "loss": 4.248,
      "step": 90
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.766460418701172,
      "learning_rate": 8.247422680412371e-05,
      "loss": 4.3296,
      "step": 100
    },
    {
      "epoch": 2.2,
      "grad_norm": 3.0895087718963623,
      "learning_rate": 8.041237113402063e-05,
      "loss": 4.2014,
      "step": 110
    },
    {
      "epoch": 2.4,
      "grad_norm": 3.011801242828369,
      "learning_rate": 7.835051546391753e-05,
      "loss": 4.1966,
      "step": 120
    },
    {
      "epoch": 2.6,
      "grad_norm": 2.4347891807556152,
      "learning_rate": 7.628865979381443e-05,
      "loss": 4.2682,
      "step": 130
    },
    {
      "epoch": 2.8,
      "grad_norm": 3.443373441696167,
      "learning_rate": 7.422680412371135e-05,
      "loss": 4.2879,
      "step": 140
    },
    {
      "epoch": 3.0,
      "grad_norm": 3.0630757808685303,
      "learning_rate": 7.216494845360825e-05,
      "loss": 4.2395,
      "step": 150
    },
    {
      "epoch": 3.2,
      "grad_norm": 4.507055282592773,
      "learning_rate": 7.010309278350515e-05,
      "loss": 4.2041,
      "step": 160
    },
    {
      "epoch": 3.4,
      "grad_norm": 3.149142265319824,
      "learning_rate": 6.804123711340207e-05,
      "loss": 4.1754,
      "step": 170
    },
    {
      "epoch": 3.6,
      "grad_norm": 3.8777101039886475,
      "learning_rate": 6.597938144329897e-05,
      "loss": 4.1773,
      "step": 180
    },
    {
      "epoch": 3.8,
      "grad_norm": 3.2908365726470947,
      "learning_rate": 6.391752577319587e-05,
      "loss": 4.1004,
      "step": 190
    },
    {
      "epoch": 4.0,
      "grad_norm": 3.825634241104126,
      "learning_rate": 6.185567010309279e-05,
      "loss": 4.1731,
      "step": 200
    },
    {
      "epoch": 4.2,
      "grad_norm": 2.8585126399993896,
      "learning_rate": 5.979381443298969e-05,
      "loss": 4.1421,
      "step": 210
    },
    {
      "epoch": 4.4,
      "grad_norm": 3.541111946105957,
      "learning_rate": 5.7731958762886594e-05,
      "loss": 4.0777,
      "step": 220
    },
    {
      "epoch": 4.6,
      "grad_norm": 4.357095718383789,
      "learning_rate": 5.567010309278351e-05,
      "loss": 4.0769,
      "step": 230
    },
    {
      "epoch": 4.8,
      "grad_norm": 3.1230344772338867,
      "learning_rate": 5.360824742268041e-05,
      "loss": 4.1582,
      "step": 240
    },
    {
      "epoch": 5.0,
      "grad_norm": 3.7142417430877686,
      "learning_rate": 5.1546391752577315e-05,
      "loss": 4.0817,
      "step": 250
    },
    {
      "epoch": 5.2,
      "grad_norm": 3.0471880435943604,
      "learning_rate": 4.948453608247423e-05,
      "loss": 4.118,
      "step": 260
    },
    {
      "epoch": 5.4,
      "grad_norm": 2.6254706382751465,
      "learning_rate": 4.7422680412371134e-05,
      "loss": 4.0025,
      "step": 270
    },
    {
      "epoch": 5.6,
      "grad_norm": 3.170884132385254,
      "learning_rate": 4.536082474226804e-05,
      "loss": 4.0524,
      "step": 280
    },
    {
      "epoch": 5.8,
      "grad_norm": 2.865476369857788,
      "learning_rate": 4.329896907216495e-05,
      "loss": 4.0805,
      "step": 290
    },
    {
      "epoch": 6.0,
      "grad_norm": 3.1281850337982178,
      "learning_rate": 4.1237113402061855e-05,
      "loss": 4.1221,
      "step": 300
    },
    {
      "epoch": 6.2,
      "grad_norm": 3.0519344806671143,
      "learning_rate": 3.9175257731958764e-05,
      "loss": 4.1095,
      "step": 310
    },
    {
      "epoch": 6.4,
      "grad_norm": 3.115818977355957,
      "learning_rate": 3.7113402061855674e-05,
      "loss": 4.1023,
      "step": 320
    },
    {
      "epoch": 6.6,
      "grad_norm": 3.150238275527954,
      "learning_rate": 3.5051546391752576e-05,
      "loss": 4.0486,
      "step": 330
    },
    {
      "epoch": 6.8,
      "grad_norm": 3.5786564350128174,
      "learning_rate": 3.2989690721649485e-05,
      "loss": 3.9712,
      "step": 340
    },
    {
      "epoch": 7.0,
      "grad_norm": 2.9977972507476807,
      "learning_rate": 3.0927835051546395e-05,
      "loss": 4.0619,
      "step": 350
    },
    {
      "epoch": 7.2,
      "grad_norm": 3.5539848804473877,
      "learning_rate": 2.8865979381443297e-05,
      "loss": 4.0779,
      "step": 360
    },
    {
      "epoch": 7.4,
      "grad_norm": 3.2001516819000244,
      "learning_rate": 2.6804123711340206e-05,
      "loss": 3.9964,
      "step": 370
    },
    {
      "epoch": 7.6,
      "grad_norm": 4.163519859313965,
      "learning_rate": 2.4742268041237116e-05,
      "loss": 4.0303,
      "step": 380
    },
    {
      "epoch": 7.8,
      "grad_norm": 2.706822395324707,
      "learning_rate": 2.268041237113402e-05,
      "loss": 4.0643,
      "step": 390
    },
    {
      "epoch": 8.0,
      "grad_norm": 3.3247110843658447,
      "learning_rate": 2.0618556701030927e-05,
      "loss": 3.9944,
      "step": 400
    },
    {
      "epoch": 8.2,
      "grad_norm": 3.692864418029785,
      "learning_rate": 4.676753782668501e-05,
      "loss": 4.0656,
      "step": 410
    },
    {
      "epoch": 8.4,
      "grad_norm": 4.354611873626709,
      "learning_rate": 4.53920220082531e-05,
      "loss": 3.9715,
      "step": 420
    },
    {
      "epoch": 8.6,
      "grad_norm": 5.532437324523926,
      "learning_rate": 4.4016506189821186e-05,
      "loss": 4.0498,
      "step": 430
    },
    {
      "epoch": 8.8,
      "grad_norm": 4.6008758544921875,
      "learning_rate": 4.264099037138927e-05,
      "loss": 4.0479,
      "step": 440
    },
    {
      "epoch": 9.0,
      "grad_norm": 3.6320340633392334,
      "learning_rate": 4.126547455295736e-05,
      "loss": 4.014,
      "step": 450
    },
    {
      "epoch": 9.2,
      "grad_norm": 3.8125405311584473,
      "learning_rate": 3.988995873452545e-05,
      "loss": 4.0399,
      "step": 460
    },
    {
      "epoch": 9.4,
      "grad_norm": 4.067931652069092,
      "learning_rate": 3.8514442916093536e-05,
      "loss": 3.9899,
      "step": 470
    },
    {
      "epoch": 9.6,
      "grad_norm": 4.198885917663574,
      "learning_rate": 3.713892709766162e-05,
      "loss": 4.0051,
      "step": 480
    },
    {
      "epoch": 9.8,
      "grad_norm": 4.0433173179626465,
      "learning_rate": 3.576341127922971e-05,
      "loss": 4.0214,
      "step": 490
    },
    {
      "epoch": 10.0,
      "grad_norm": 3.475057363510132,
      "learning_rate": 3.4387895460797805e-05,
      "loss": 4.0241,
      "step": 500
    },
    {
      "epoch": 10.2,
      "grad_norm": 5.039187431335449,
      "learning_rate": 3.3012379642365886e-05,
      "loss": 3.9914,
      "step": 510
    },
    {
      "epoch": 10.4,
      "grad_norm": 3.2291958332061768,
      "learning_rate": 3.163686382393397e-05,
      "loss": 4.0007,
      "step": 520
    },
    {
      "epoch": 10.6,
      "grad_norm": 4.718263626098633,
      "learning_rate": 3.0261348005502068e-05,
      "loss": 3.99,
      "step": 530
    },
    {
      "epoch": 10.8,
      "grad_norm": 3.466670513153076,
      "learning_rate": 2.8885832187070155e-05,
      "loss": 3.9816,
      "step": 540
    },
    {
      "epoch": 11.0,
      "grad_norm": 4.72723388671875,
      "learning_rate": 2.751031636863824e-05,
      "loss": 4.0284,
      "step": 550
    },
    {
      "epoch": 11.2,
      "grad_norm": 4.0743088722229,
      "learning_rate": 2.613480055020633e-05,
      "loss": 3.9387,
      "step": 560
    },
    {
      "epoch": 11.4,
      "grad_norm": 3.0914433002471924,
      "learning_rate": 2.4759284731774414e-05,
      "loss": 3.9369,
      "step": 570
    },
    {
      "epoch": 11.6,
      "grad_norm": 4.583773136138916,
      "learning_rate": 2.3383768913342505e-05,
      "loss": 4.0302,
      "step": 580
    },
    {
      "epoch": 11.8,
      "grad_norm": 4.720504283905029,
      "learning_rate": 2.2008253094910593e-05,
      "loss": 3.9831,
      "step": 590
    },
    {
      "epoch": 12.0,
      "grad_norm": 5.4584879875183105,
      "learning_rate": 2.063273727647868e-05,
      "loss": 4.0414,
      "step": 600
    },
    {
      "epoch": 12.2,
      "grad_norm": 4.3133015632629395,
      "learning_rate": 1.9257221458046768e-05,
      "loss": 3.9594,
      "step": 610
    },
    {
      "epoch": 12.4,
      "grad_norm": 4.162217617034912,
      "learning_rate": 1.7881705639614855e-05,
      "loss": 3.8796,
      "step": 620
    },
    {
      "epoch": 12.6,
      "grad_norm": 3.863389730453491,
      "learning_rate": 1.6506189821182943e-05,
      "loss": 4.0042,
      "step": 630
    },
    {
      "epoch": 12.8,
      "grad_norm": 4.166849613189697,
      "learning_rate": 1.5130674002751034e-05,
      "loss": 4.0042,
      "step": 640
    },
    {
      "epoch": 13.0,
      "grad_norm": 4.148003101348877,
      "learning_rate": 1.375515818431912e-05,
      "loss": 4.0276,
      "step": 650
    },
    {
      "epoch": 13.2,
      "grad_norm": 4.149908542633057,
      "learning_rate": 1.2379642365887207e-05,
      "loss": 3.9092,
      "step": 660
    },
    {
      "epoch": 13.4,
      "grad_norm": 3.810363292694092,
      "learning_rate": 1.1004126547455296e-05,
      "loss": 3.9163,
      "step": 670
    },
    {
      "epoch": 13.6,
      "grad_norm": 3.7628567218780518,
      "learning_rate": 9.628610729023384e-06,
      "loss": 3.9669,
      "step": 680
    },
    {
      "epoch": 13.8,
      "grad_norm": 4.967076301574707,
      "learning_rate": 8.253094910591471e-06,
      "loss": 4.0309,
      "step": 690
    },
    {
      "epoch": 14.0,
      "grad_norm": 3.6391351222991943,
      "learning_rate": 6.87757909215956e-06,
      "loss": 3.983,
      "step": 700
    },
    {
      "epoch": 14.2,
      "grad_norm": 4.78062105178833,
      "learning_rate": 5.502063273727648e-06,
      "loss": 3.923,
      "step": 710
    },
    {
      "epoch": 14.4,
      "grad_norm": 4.412433624267578,
      "learning_rate": 4.126547455295736e-06,
      "loss": 3.9983,
      "step": 720
    },
    {
      "epoch": 14.6,
      "grad_norm": 3.6161208152770996,
      "learning_rate": 2.751031636863824e-06,
      "loss": 4.0139,
      "step": 730
    },
    {
      "epoch": 14.8,
      "grad_norm": 4.792978286743164,
      "learning_rate": 1.375515818431912e-06,
      "loss": 3.9172,
      "step": 740
    },
    {
      "epoch": 15.0,
      "grad_norm": 3.42587947845459,
      "learning_rate": 0.0,
      "loss": 3.9408,
      "step": 750
    },
    {
      "epoch": 15.0,
      "step": 750,
      "total_flos": 2445756517957632.0,
      "train_loss": 0.26390928649902345,
      "train_runtime": 1983.5688,
      "train_samples_per_second": 9.075,
      "train_steps_per_second": 0.378
    }
  ],
  "logging_steps": 10,
  "max_steps": 750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2445756517957632.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|