{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.994764397905759,
  "eval_steps": 500,
  "global_step": 858,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.034904013961605584,
      "grad_norm": 1.636154953792276,
      "learning_rate": 5e-06,
      "loss": 0.9609,
      "step": 10
    },
    {
      "epoch": 0.06980802792321117,
      "grad_norm": 1.1847809904244873,
      "learning_rate": 5e-06,
      "loss": 0.8537,
      "step": 20
    },
    {
      "epoch": 0.10471204188481675,
      "grad_norm": 1.1055793026372138,
      "learning_rate": 5e-06,
      "loss": 0.8208,
      "step": 30
    },
    {
      "epoch": 0.13961605584642234,
      "grad_norm": 1.1454147166881017,
      "learning_rate": 5e-06,
      "loss": 0.8046,
      "step": 40
    },
    {
      "epoch": 0.17452006980802792,
      "grad_norm": 0.8149143192535275,
      "learning_rate": 5e-06,
      "loss": 0.7882,
      "step": 50
    },
    {
      "epoch": 0.2094240837696335,
      "grad_norm": 0.8125713341369675,
      "learning_rate": 5e-06,
      "loss": 0.7822,
      "step": 60
    },
    {
      "epoch": 0.2443280977312391,
      "grad_norm": 0.6705477513854557,
      "learning_rate": 5e-06,
      "loss": 0.7746,
      "step": 70
    },
    {
      "epoch": 0.2792321116928447,
      "grad_norm": 0.742108317973775,
      "learning_rate": 5e-06,
      "loss": 0.7674,
      "step": 80
    },
    {
      "epoch": 0.31413612565445026,
      "grad_norm": 0.5830122580252405,
      "learning_rate": 5e-06,
      "loss": 0.7613,
      "step": 90
    },
    {
      "epoch": 0.34904013961605584,
      "grad_norm": 0.6867621444893013,
      "learning_rate": 5e-06,
      "loss": 0.7581,
      "step": 100
    },
    {
      "epoch": 0.38394415357766143,
      "grad_norm": 0.5706894443800514,
      "learning_rate": 5e-06,
      "loss": 0.7561,
      "step": 110
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 0.5933615440045283,
      "learning_rate": 5e-06,
      "loss": 0.7555,
      "step": 120
    },
    {
      "epoch": 0.4537521815008726,
      "grad_norm": 0.5976459344111194,
      "learning_rate": 5e-06,
      "loss": 0.7479,
      "step": 130
    },
    {
      "epoch": 0.4886561954624782,
      "grad_norm": 0.7595901956158283,
      "learning_rate": 5e-06,
      "loss": 0.7445,
      "step": 140
    },
    {
      "epoch": 0.5235602094240838,
      "grad_norm": 0.5243859212111798,
      "learning_rate": 5e-06,
      "loss": 0.7453,
      "step": 150
    },
    {
      "epoch": 0.5584642233856894,
      "grad_norm": 0.7053972260403277,
      "learning_rate": 5e-06,
      "loss": 0.7459,
      "step": 160
    },
    {
      "epoch": 0.5933682373472949,
      "grad_norm": 0.7356815513429203,
      "learning_rate": 5e-06,
      "loss": 0.7406,
      "step": 170
    },
    {
      "epoch": 0.6282722513089005,
      "grad_norm": 0.5406768959780917,
      "learning_rate": 5e-06,
      "loss": 0.7403,
      "step": 180
    },
    {
      "epoch": 0.6631762652705061,
      "grad_norm": 0.5731257742921576,
      "learning_rate": 5e-06,
      "loss": 0.7414,
      "step": 190
    },
    {
      "epoch": 0.6980802792321117,
      "grad_norm": 0.5893545114403889,
      "learning_rate": 5e-06,
      "loss": 0.7354,
      "step": 200
    },
    {
      "epoch": 0.7329842931937173,
      "grad_norm": 0.6666154404813628,
      "learning_rate": 5e-06,
      "loss": 0.7378,
      "step": 210
    },
    {
      "epoch": 0.7678883071553229,
      "grad_norm": 0.6379810550334492,
      "learning_rate": 5e-06,
      "loss": 0.736,
      "step": 220
    },
    {
      "epoch": 0.8027923211169284,
      "grad_norm": 0.5761611687799336,
      "learning_rate": 5e-06,
      "loss": 0.7365,
      "step": 230
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 0.5490954549201844,
      "learning_rate": 5e-06,
      "loss": 0.7351,
      "step": 240
    },
    {
      "epoch": 0.8726003490401396,
      "grad_norm": 0.5577745326979847,
      "learning_rate": 5e-06,
      "loss": 0.7305,
      "step": 250
    },
    {
      "epoch": 0.9075043630017452,
      "grad_norm": 0.5309350088615197,
      "learning_rate": 5e-06,
      "loss": 0.7344,
      "step": 260
    },
    {
      "epoch": 0.9424083769633508,
      "grad_norm": 0.5171633944749564,
      "learning_rate": 5e-06,
      "loss": 0.7312,
      "step": 270
    },
    {
      "epoch": 0.9773123909249564,
      "grad_norm": 0.6439135188078838,
      "learning_rate": 5e-06,
      "loss": 0.729,
      "step": 280
    },
    {
      "epoch": 1.012216404886562,
      "grad_norm": 0.7944207887971882,
      "learning_rate": 5e-06,
      "loss": 0.752,
      "step": 290
    },
    {
      "epoch": 1.0471204188481675,
      "grad_norm": 0.6463556134515147,
      "learning_rate": 5e-06,
      "loss": 0.6937,
      "step": 300
    },
    {
      "epoch": 1.082024432809773,
      "grad_norm": 0.9747756816715487,
      "learning_rate": 5e-06,
      "loss": 0.6922,
      "step": 310
    },
    {
      "epoch": 1.1169284467713787,
      "grad_norm": 0.6041467541568463,
      "learning_rate": 5e-06,
      "loss": 0.6946,
      "step": 320
    },
    {
      "epoch": 1.1518324607329844,
      "grad_norm": 0.6555191903371557,
      "learning_rate": 5e-06,
      "loss": 0.6905,
      "step": 330
    },
    {
      "epoch": 1.1867364746945899,
      "grad_norm": 0.7808834156906888,
      "learning_rate": 5e-06,
      "loss": 0.6905,
      "step": 340
    },
    {
      "epoch": 1.2216404886561953,
      "grad_norm": 0.689883618215288,
      "learning_rate": 5e-06,
      "loss": 0.6886,
      "step": 350
    },
    {
      "epoch": 1.256544502617801,
      "grad_norm": 0.5458990467442779,
      "learning_rate": 5e-06,
      "loss": 0.6958,
      "step": 360
    },
    {
      "epoch": 1.2914485165794067,
      "grad_norm": 0.49774723989961944,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 370
    },
    {
      "epoch": 1.3263525305410122,
      "grad_norm": 0.615067023750174,
      "learning_rate": 5e-06,
      "loss": 0.6913,
      "step": 380
    },
    {
      "epoch": 1.3612565445026177,
      "grad_norm": 0.5120804275981703,
      "learning_rate": 5e-06,
      "loss": 0.6943,
      "step": 390
    },
    {
      "epoch": 1.3961605584642234,
      "grad_norm": 0.6511934985434475,
      "learning_rate": 5e-06,
      "loss": 0.6922,
      "step": 400
    },
    {
      "epoch": 1.431064572425829,
      "grad_norm": 0.6015850091580557,
      "learning_rate": 5e-06,
      "loss": 0.691,
      "step": 410
    },
    {
      "epoch": 1.4659685863874345,
      "grad_norm": 0.5548178493075747,
      "learning_rate": 5e-06,
      "loss": 0.6876,
      "step": 420
    },
    {
      "epoch": 1.50087260034904,
      "grad_norm": 0.6084101340671536,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 430
    },
    {
      "epoch": 1.5357766143106457,
      "grad_norm": 0.5785174417745115,
      "learning_rate": 5e-06,
      "loss": 0.6893,
      "step": 440
    },
    {
      "epoch": 1.5706806282722514,
      "grad_norm": 0.6387925686406533,
      "learning_rate": 5e-06,
      "loss": 0.6889,
      "step": 450
    },
    {
      "epoch": 1.6055846422338569,
      "grad_norm": 0.511039789752418,
      "learning_rate": 5e-06,
      "loss": 0.6882,
      "step": 460
    },
    {
      "epoch": 1.6404886561954624,
      "grad_norm": 0.6303156357824996,
      "learning_rate": 5e-06,
      "loss": 0.6874,
      "step": 470
    },
    {
      "epoch": 1.675392670157068,
      "grad_norm": 0.5463553041688999,
      "learning_rate": 5e-06,
      "loss": 0.6826,
      "step": 480
    },
    {
      "epoch": 1.7102966841186737,
      "grad_norm": 0.6680053603003989,
      "learning_rate": 5e-06,
      "loss": 0.6887,
      "step": 490
    },
    {
      "epoch": 1.7452006980802792,
      "grad_norm": 0.5861342009392054,
      "learning_rate": 5e-06,
      "loss": 0.6877,
      "step": 500
    },
    {
      "epoch": 1.7801047120418847,
      "grad_norm": 0.5441609154940179,
      "learning_rate": 5e-06,
      "loss": 0.687,
      "step": 510
    },
    {
      "epoch": 1.8150087260034904,
      "grad_norm": 0.493678213169674,
      "learning_rate": 5e-06,
      "loss": 0.6865,
      "step": 520
    },
    {
      "epoch": 1.849912739965096,
      "grad_norm": 0.7325770535166638,
      "learning_rate": 5e-06,
      "loss": 0.6889,
      "step": 530
    },
    {
      "epoch": 1.8848167539267016,
      "grad_norm": 0.6627130061862745,
      "learning_rate": 5e-06,
      "loss": 0.683,
      "step": 540
    },
    {
      "epoch": 1.919720767888307,
      "grad_norm": 0.5392435344182795,
      "learning_rate": 5e-06,
      "loss": 0.6869,
      "step": 550
    },
    {
      "epoch": 1.9546247818499127,
      "grad_norm": 0.5374264329462486,
      "learning_rate": 5e-06,
      "loss": 0.6874,
      "step": 560
    },
    {
      "epoch": 1.9895287958115184,
      "grad_norm": 0.4520265683973087,
      "learning_rate": 5e-06,
      "loss": 0.686,
      "step": 570
    },
    {
      "epoch": 2.024432809773124,
      "grad_norm": 0.7616966695399988,
      "learning_rate": 5e-06,
      "loss": 0.6906,
      "step": 580
    },
    {
      "epoch": 2.0593368237347294,
      "grad_norm": 0.8040603166806708,
      "learning_rate": 5e-06,
      "loss": 0.6442,
      "step": 590
    },
    {
      "epoch": 2.094240837696335,
      "grad_norm": 0.5161547369323151,
      "learning_rate": 5e-06,
      "loss": 0.6462,
      "step": 600
    },
    {
      "epoch": 2.1291448516579408,
      "grad_norm": 0.5566643583686863,
      "learning_rate": 5e-06,
      "loss": 0.6465,
      "step": 610
    },
    {
      "epoch": 2.164048865619546,
      "grad_norm": 0.7404248587220047,
      "learning_rate": 5e-06,
      "loss": 0.6501,
      "step": 620
    },
    {
      "epoch": 2.1989528795811517,
      "grad_norm": 0.602474854437427,
      "learning_rate": 5e-06,
      "loss": 0.6451,
      "step": 630
    },
    {
      "epoch": 2.2338568935427574,
      "grad_norm": 0.663987316506295,
      "learning_rate": 5e-06,
      "loss": 0.6454,
      "step": 640
    },
    {
      "epoch": 2.268760907504363,
      "grad_norm": 0.513580773343669,
      "learning_rate": 5e-06,
      "loss": 0.6464,
      "step": 650
    },
    {
      "epoch": 2.303664921465969,
      "grad_norm": 0.5572279672626476,
      "learning_rate": 5e-06,
      "loss": 0.6495,
      "step": 660
    },
    {
      "epoch": 2.338568935427574,
      "grad_norm": 0.6124769197735679,
      "learning_rate": 5e-06,
      "loss": 0.6466,
      "step": 670
    },
    {
      "epoch": 2.3734729493891797,
      "grad_norm": 0.6689033731877824,
      "learning_rate": 5e-06,
      "loss": 0.6487,
      "step": 680
    },
    {
      "epoch": 2.4083769633507854,
      "grad_norm": 0.6242198004638967,
      "learning_rate": 5e-06,
      "loss": 0.6479,
      "step": 690
    },
    {
      "epoch": 2.4432809773123907,
      "grad_norm": 0.5816264133167447,
      "learning_rate": 5e-06,
      "loss": 0.6473,
      "step": 700
    },
    {
      "epoch": 2.4781849912739964,
      "grad_norm": 0.6182232441775428,
      "learning_rate": 5e-06,
      "loss": 0.6486,
      "step": 710
    },
    {
      "epoch": 2.513089005235602,
      "grad_norm": 0.6438939308222409,
      "learning_rate": 5e-06,
      "loss": 0.6523,
      "step": 720
    },
    {
      "epoch": 2.5479930191972078,
      "grad_norm": 0.6270289995094971,
      "learning_rate": 5e-06,
      "loss": 0.6467,
      "step": 730
    },
    {
      "epoch": 2.5828970331588135,
      "grad_norm": 0.5230942315565851,
      "learning_rate": 5e-06,
      "loss": 0.6491,
      "step": 740
    },
    {
      "epoch": 2.6178010471204187,
      "grad_norm": 0.5004924007290114,
      "learning_rate": 5e-06,
      "loss": 0.6519,
      "step": 750
    },
    {
      "epoch": 2.6527050610820244,
      "grad_norm": 0.5669482852337735,
      "learning_rate": 5e-06,
      "loss": 0.6501,
      "step": 760
    },
    {
      "epoch": 2.68760907504363,
      "grad_norm": 0.6432274167649737,
      "learning_rate": 5e-06,
      "loss": 0.6487,
      "step": 770
    },
    {
      "epoch": 2.7225130890052354,
      "grad_norm": 0.5996384982145978,
      "learning_rate": 5e-06,
      "loss": 0.6521,
      "step": 780
    },
    {
      "epoch": 2.757417102966841,
      "grad_norm": 0.5437537271625021,
      "learning_rate": 5e-06,
      "loss": 0.6502,
      "step": 790
    },
    {
      "epoch": 2.7923211169284468,
      "grad_norm": 0.5375883439387915,
      "learning_rate": 5e-06,
      "loss": 0.6494,
      "step": 800
    },
    {
      "epoch": 2.8272251308900525,
      "grad_norm": 0.9130751404600511,
      "learning_rate": 5e-06,
      "loss": 0.6521,
      "step": 810
    },
    {
      "epoch": 2.862129144851658,
      "grad_norm": 0.7285776360956339,
      "learning_rate": 5e-06,
      "loss": 0.6497,
      "step": 820
    },
    {
      "epoch": 2.8970331588132634,
      "grad_norm": 0.6108923967332701,
      "learning_rate": 5e-06,
      "loss": 0.6491,
      "step": 830
    },
    {
      "epoch": 2.931937172774869,
      "grad_norm": 0.6234985887070743,
      "learning_rate": 5e-06,
      "loss": 0.6471,
      "step": 840
    },
    {
      "epoch": 2.966841186736475,
      "grad_norm": 0.6458996378236107,
      "learning_rate": 5e-06,
      "loss": 0.648,
      "step": 850
    },
    {
      "epoch": 2.994764397905759,
      "step": 858,
      "total_flos": 1436763197276160.0,
      "train_loss": 0.7012652700597589,
      "train_runtime": 12908.7913,
      "train_samples_per_second": 34.079,
      "train_steps_per_second": 0.066
    }
  ],
  "logging_steps": 10,
  "max_steps": 858,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1436763197276160.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}