|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9934123847167324, |
|
"eval_steps": 500, |
|
"global_step": 852, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03513394817742644, |
|
"grad_norm": 0.43717896938323975, |
|
"learning_rate": 0.00019230769230769233, |
|
"loss": 1.994, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07026789635485288, |
|
"grad_norm": 0.4105484187602997, |
|
"learning_rate": 0.00038461538461538467, |
|
"loss": 1.7122, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.10540184453227931, |
|
"grad_norm": 0.39099714159965515, |
|
"learning_rate": 0.0004999710691449165, |
|
"loss": 1.7017, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.14053579270970576, |
|
"grad_norm": 0.418653279542923, |
|
"learning_rate": 0.0004996456739191905, |
|
"loss": 1.7533, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1756697408871322, |
|
"grad_norm": 0.402630478143692, |
|
"learning_rate": 0.0004989591921187147, |
|
"loss": 1.6842, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21080368906455862, |
|
"grad_norm": 0.4017012119293213, |
|
"learning_rate": 0.0004979126166682133, |
|
"loss": 1.6915, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24593763724198506, |
|
"grad_norm": 0.5224213004112244, |
|
"learning_rate": 0.0004965074613305277, |
|
"loss": 1.7208, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2810715854194115, |
|
"grad_norm": 0.5111001133918762, |
|
"learning_rate": 0.0004947457585171148, |
|
"loss": 1.6386, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.31620553359683795, |
|
"grad_norm": 0.6671903133392334, |
|
"learning_rate": 0.000492630056348375, |
|
"loss": 1.6592, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3513394817742644, |
|
"grad_norm": 0.4950112998485565, |
|
"learning_rate": 0.0004901634149680608, |
|
"loss": 1.6741, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3864734299516908, |
|
"grad_norm": 0.5058407783508301, |
|
"learning_rate": 0.0004873494021170953, |
|
"loss": 1.7568, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.42160737812911725, |
|
"grad_norm": 0.8665825724601746, |
|
"learning_rate": 0.00048419208797320564, |
|
"loss": 1.7356, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4567413263065437, |
|
"grad_norm": 0.5124475955963135, |
|
"learning_rate": 0.00048069603926383277, |
|
"loss": 1.7199, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4918752744839701, |
|
"grad_norm": 0.505696177482605, |
|
"learning_rate": 0.0004768663126608342, |
|
"loss": 1.6813, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5270092226613966, |
|
"grad_norm": 0.509445071220398, |
|
"learning_rate": 0.0004727084474665322, |
|
"loss": 1.7074, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.562143170838823, |
|
"grad_norm": 0.5511056184768677, |
|
"learning_rate": 0.00046822845760168783, |
|
"loss": 1.6766, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5972771190162495, |
|
"grad_norm": 0.46946030855178833, |
|
"learning_rate": 0.0004634328229069881, |
|
"loss": 1.7018, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6324110671936759, |
|
"grad_norm": 0.49886614084243774, |
|
"learning_rate": 0.00045832847977062875, |
|
"loss": 1.7293, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6675450153711023, |
|
"grad_norm": 0.49424228072166443, |
|
"learning_rate": 0.0004529228110955478, |
|
"loss": 1.7306, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7026789635485288, |
|
"grad_norm": 0.5640605092048645, |
|
"learning_rate": 0.00044722363562082237, |
|
"loss": 1.7369, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7378129117259552, |
|
"grad_norm": 0.5802103281021118, |
|
"learning_rate": 0.0004412391966126735, |
|
"loss": 1.7463, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7729468599033816, |
|
"grad_norm": 0.5365092158317566, |
|
"learning_rate": 0.0004349781499414369, |
|
"loss": 1.7198, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"grad_norm": 0.5591799020767212, |
|
"learning_rate": 0.00042844955156174345, |
|
"loss": 1.7298, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8432147562582345, |
|
"grad_norm": 0.5256664156913757, |
|
"learning_rate": 0.000421662844414021, |
|
"loss": 1.6863, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8783487044356609, |
|
"grad_norm": 0.6382088661193848, |
|
"learning_rate": 0.0004146278447662597, |
|
"loss": 1.7195, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9134826526130874, |
|
"grad_norm": 0.5260637402534485, |
|
"learning_rate": 0.00040735472801579887, |
|
"loss": 1.7135, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9486166007905138, |
|
"grad_norm": 0.5244185328483582, |
|
"learning_rate": 0.0003998540139716701, |
|
"loss": 1.6944, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9837505489679402, |
|
"grad_norm": 0.5998988747596741, |
|
"learning_rate": 0.00039213655163878436, |
|
"loss": 1.6982, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.0188844971453668, |
|
"grad_norm": 0.6315314173698425, |
|
"learning_rate": 0.00038421350352597195, |
|
"loss": 1.5473, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.0540184453227932, |
|
"grad_norm": 0.6495606899261475, |
|
"learning_rate": 0.00037609632950057095, |
|
"loss": 1.3535, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0891523935002196, |
|
"grad_norm": 0.6094574928283691, |
|
"learning_rate": 0.0003677967702129177, |
|
"loss": 1.3452, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.124286341677646, |
|
"grad_norm": 0.5786643028259277, |
|
"learning_rate": 0.0003593268301147139, |
|
"loss": 1.3433, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.1594202898550725, |
|
"grad_norm": 0.6357799172401428, |
|
"learning_rate": 0.00035069876009583234, |
|
"loss": 1.4166, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.194554238032499, |
|
"grad_norm": 0.6568606495857239, |
|
"learning_rate": 0.00034192503976467525, |
|
"loss": 1.323, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.2296881862099254, |
|
"grad_norm": 0.6073617339134216, |
|
"learning_rate": 0.0003330183593977152, |
|
"loss": 1.389, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.2648221343873518, |
|
"grad_norm": 0.5338005423545837, |
|
"learning_rate": 0.00032399160158432606, |
|
"loss": 1.3739, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2999560825647782, |
|
"grad_norm": 0.6769421100616455, |
|
"learning_rate": 0.00031485782259345406, |
|
"loss": 1.4024, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.3350900307422047, |
|
"grad_norm": 0.6495899558067322, |
|
"learning_rate": 0.0003056302334890786, |
|
"loss": 1.3615, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.370223978919631, |
|
"grad_norm": 0.6488791108131409, |
|
"learning_rate": 0.0002963221810217786, |
|
"loss": 1.3548, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.4053579270970575, |
|
"grad_norm": 0.746868908405304, |
|
"learning_rate": 0.00028694712832404195, |
|
"loss": 1.3749, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.440491875274484, |
|
"grad_norm": 0.6373124122619629, |
|
"learning_rate": 0.0002775186354372408, |
|
"loss": 1.3555, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.4756258234519104, |
|
"grad_norm": 0.6247098445892334, |
|
"learning_rate": 0.0002680503396984382, |
|
"loss": 1.3977, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.5107597716293368, |
|
"grad_norm": 0.6373061537742615, |
|
"learning_rate": 0.00025855593601539415, |
|
"loss": 1.3637, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.5458937198067633, |
|
"grad_norm": 0.7269704341888428, |
|
"learning_rate": 0.00024904915705830234, |
|
"loss": 1.4263, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.5810276679841897, |
|
"grad_norm": 0.6946704983711243, |
|
"learning_rate": 0.0002395437533969069, |
|
"loss": 1.3822, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.6161616161616161, |
|
"grad_norm": 0.6399605870246887, |
|
"learning_rate": 0.0002300534736117292, |
|
"loss": 1.4348, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.6512955643390426, |
|
"grad_norm": 0.6449073553085327, |
|
"learning_rate": 0.00022059204440817246, |
|
"loss": 1.3793, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.686429512516469, |
|
"grad_norm": 0.616384744644165, |
|
"learning_rate": 0.00021117315076226557, |
|
"loss": 1.3917, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.7215634606938954, |
|
"grad_norm": 0.6383651494979858, |
|
"learning_rate": 0.0002018104161267652, |
|
"loss": 1.4097, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.7566974088713219, |
|
"grad_norm": 0.720451295375824, |
|
"learning_rate": 0.00019251738272624416, |
|
"loss": 1.3997, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.7918313570487485, |
|
"grad_norm": 0.6487829685211182, |
|
"learning_rate": 0.00018330749196966806, |
|
"loss": 1.4366, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.826965305226175, |
|
"grad_norm": 0.6398463249206543, |
|
"learning_rate": 0.00017419406500879115, |
|
"loss": 1.3536, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.8620992534036014, |
|
"grad_norm": 0.7006503939628601, |
|
"learning_rate": 0.00016519028347049242, |
|
"loss": 1.3934, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.8972332015810278, |
|
"grad_norm": 0.6170542240142822, |
|
"learning_rate": 0.00015630917039091919, |
|
"loss": 1.4171, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.9323671497584543, |
|
"grad_norm": 0.6998418569564819, |
|
"learning_rate": 0.00014756357137901604, |
|
"loss": 1.3809, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.9675010979358807, |
|
"grad_norm": 0.6567032933235168, |
|
"learning_rate": 0.00013896613603668365, |
|
"loss": 1.3223, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.002635046113307, |
|
"grad_norm": 0.6512529253959656, |
|
"learning_rate": 0.00013052929966244216, |
|
"loss": 1.3693, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.0377689942907335, |
|
"grad_norm": 0.8671336770057678, |
|
"learning_rate": 0.00012226526526506093, |
|
"loss": 1.0046, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.07290294246816, |
|
"grad_norm": 0.7728586792945862, |
|
"learning_rate": 0.00011418598591317242, |
|
"loss": 1.0138, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.1080368906455864, |
|
"grad_norm": 0.8196272253990173, |
|
"learning_rate": 0.0001063031474463983, |
|
"loss": 0.9985, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.143170838823013, |
|
"grad_norm": 0.899488091468811, |
|
"learning_rate": 9.862815157299391e-05, |
|
"loss": 0.9397, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.1783047870004393, |
|
"grad_norm": 0.7770416736602783, |
|
"learning_rate": 9.117209937846053e-05, |
|
"loss": 0.9307, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.2134387351778657, |
|
"grad_norm": 0.8353050947189331, |
|
"learning_rate": 8.394577526897565e-05, |
|
"loss": 0.9334, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.248572683355292, |
|
"grad_norm": 0.7152834534645081, |
|
"learning_rate": 7.69596313728691e-05, |
|
"loss": 0.9483, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.2837066315327186, |
|
"grad_norm": 1.0104659795761108, |
|
"learning_rate": 7.022377242270251e-05, |
|
"loss": 0.942, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.318840579710145, |
|
"grad_norm": 0.7319175004959106, |
|
"learning_rate": 6.374794113982232e-05, |
|
"loss": 0.9242, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.3539745278875714, |
|
"grad_norm": 0.7895592451095581, |
|
"learning_rate": 5.7541504142523406e-05, |
|
"loss": 0.99, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.389108476064998, |
|
"grad_norm": 0.9462332725524902, |
|
"learning_rate": 5.161343839820762e-05, |
|
"loss": 0.9733, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"grad_norm": 0.9174964427947998, |
|
"learning_rate": 4.597231823913112e-05, |
|
"loss": 0.9478, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.4593763724198507, |
|
"grad_norm": 0.7435886859893799, |
|
"learning_rate": 4.062630296052222e-05, |
|
"loss": 0.9487, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.494510320597277, |
|
"grad_norm": 0.908430278301239, |
|
"learning_rate": 3.558312501900718e-05, |
|
"loss": 0.9517, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.5296442687747036, |
|
"grad_norm": 0.8427926301956177, |
|
"learning_rate": 3.0850078848413704e-05, |
|
"loss": 0.93, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.56477821695213, |
|
"grad_norm": 0.8396946787834167, |
|
"learning_rate": 2.643401030912876e-05, |
|
"loss": 0.9528, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.5999121651295565, |
|
"grad_norm": 0.8085779547691345, |
|
"learning_rate": 2.234130678627169e-05, |
|
"loss": 0.9257, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.635046113306983, |
|
"grad_norm": 0.872416079044342, |
|
"learning_rate": 1.8577887951004264e-05, |
|
"loss": 0.9294, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.6701800614844093, |
|
"grad_norm": 0.8148744702339172, |
|
"learning_rate": 1.5149197198340014e-05, |
|
"loss": 0.9166, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.7053140096618358, |
|
"grad_norm": 0.8573042750358582, |
|
"learning_rate": 1.206019377383813e-05, |
|
"loss": 0.9481, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.740447957839262, |
|
"grad_norm": 0.7604958415031433, |
|
"learning_rate": 9.315345600569069e-06, |
|
"loss": 0.9425, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.7755819060166886, |
|
"grad_norm": 0.870971143245697, |
|
"learning_rate": 6.918622816727255e-06, |
|
"loss": 0.9087, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.810715854194115, |
|
"grad_norm": 0.9157975316047668, |
|
"learning_rate": 4.873492033237864e-06, |
|
"loss": 0.9547, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.8458498023715415, |
|
"grad_norm": 0.9207829236984253, |
|
"learning_rate": 3.1829113196638614e-06, |
|
"loss": 0.9275, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.880983750548968, |
|
"grad_norm": 0.9513605237007141, |
|
"learning_rate": 1.8493259256649187e-06, |
|
"loss": 0.981, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.9161176987263944, |
|
"grad_norm": 0.854158878326416, |
|
"learning_rate": 8.746647441975619e-07, |
|
"loss": 0.9124, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.951251646903821, |
|
"grad_norm": 0.8659394383430481, |
|
"learning_rate": 2.603375215716186e-07, |
|
"loss": 0.9643, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.9863855950812472, |
|
"grad_norm": 0.94295734167099, |
|
"learning_rate": 7.23281839820622e-09, |
|
"loss": 0.9282, |
|
"step": 850 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 852, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0935759953200589e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|