{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1191663819610522,
"eval_steps": 100,
"global_step": 1024,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01093269559275709,
"grad_norm": 165.28294372558594,
"learning_rate": 0.0002,
"loss": 624.4996,
"step": 10
},
{
"epoch": 0.02186539118551418,
"grad_norm": null,
"learning_rate": 0.00019996112676182827,
"loss": 546.1801,
"step": 20
},
{
"epoch": 0.03279808677827127,
"grad_norm": 205.6680450439453,
"learning_rate": 0.0001998267889368103,
"loss": 510.2819,
"step": 30
},
{
"epoch": 0.04373078237102836,
"grad_norm": 128.06765747070312,
"learning_rate": 0.00019959663551763642,
"loss": 497.8397,
"step": 40
},
{
"epoch": 0.05466347796378545,
"grad_norm": 146.53973388671875,
"learning_rate": 0.0001992708874098054,
"loss": 495.9984,
"step": 50
},
{
"epoch": 0.06559617355654254,
"grad_norm": 99.703857421875,
"learning_rate": 0.0001988498572723623,
"loss": 494.1846,
"step": 60
},
{
"epoch": 0.07652886914929963,
"grad_norm": 151.4725341796875,
"learning_rate": 0.00019833394921780245,
"loss": 490.978,
"step": 70
},
{
"epoch": 0.08746156474205671,
"grad_norm": 112.11941528320312,
"learning_rate": 0.00019772365842419677,
"loss": 480.1951,
"step": 80
},
{
"epoch": 0.0983942603348138,
"grad_norm": 155.3424530029297,
"learning_rate": 0.0001970195706599109,
"loss": 471.4008,
"step": 90
},
{
"epoch": 0.1093269559275709,
"grad_norm": 175.84173583984375,
"learning_rate": 0.00019622236172137374,
"loss": 468.4172,
"step": 100
},
{
"epoch": 0.1093269559275709,
"eval_loss": 7.324180603027344,
"eval_runtime": 58.3519,
"eval_samples_per_second": 160.509,
"eval_steps_per_second": 10.043,
"step": 100
},
{
"epoch": 0.12025965152032798,
"grad_norm": 149.00534057617188,
"learning_rate": 0.0001953327967844356,
"loss": 465.3731,
"step": 110
},
{
"epoch": 0.13119234711308508,
"grad_norm": 236.33877563476562,
"learning_rate": 0.0001943517296699384,
"loss": 464.0816,
"step": 120
},
{
"epoch": 0.14212504270584217,
"grad_norm": 147.4077606201172,
"learning_rate": 0.00019328010202420258,
"loss": 463.7079,
"step": 130
},
{
"epoch": 0.15305773829859926,
"grad_norm": 136.97933959960938,
"learning_rate": 0.00019211894241521758,
"loss": 460.7253,
"step": 140
},
{
"epoch": 0.16399043389135634,
"grad_norm": 67.10714721679688,
"learning_rate": 0.0001908693653454033,
"loss": 454.1115,
"step": 150
},
{
"epoch": 0.17492312948411343,
"grad_norm": 157.962158203125,
"learning_rate": 0.00018953257018189024,
"loss": 451.1601,
"step": 160
},
{
"epoch": 0.1858558250768705,
"grad_norm": 124.69608306884766,
"learning_rate": 0.00018810984000534458,
"loss": 454.4596,
"step": 170
},
{
"epoch": 0.1967885206696276,
"grad_norm": 102.23240661621094,
"learning_rate": 0.00018660254037844388,
"loss": 455.1331,
"step": 180
},
{
"epoch": 0.20772121626238468,
"grad_norm": 118.30726623535156,
"learning_rate": 0.00018501211803518468,
"loss": 453.586,
"step": 190
},
{
"epoch": 0.2186539118551418,
"grad_norm": 98.61111450195312,
"learning_rate": 0.00018334009949228061,
"loss": 451.1779,
"step": 200
},
{
"epoch": 0.2186539118551418,
"eval_loss": 6.999808311462402,
"eval_runtime": 58.2479,
"eval_samples_per_second": 160.795,
"eval_steps_per_second": 10.06,
"step": 200
},
{
"epoch": 0.22958660744789888,
"grad_norm": 105.02420043945312,
"learning_rate": 0.00018158808958398338,
"loss": 449.5652,
"step": 210
},
{
"epoch": 0.24051930304065597,
"grad_norm": 236.24205017089844,
"learning_rate": 0.00017975776992173344,
"loss": 449.9907,
"step": 220
},
{
"epoch": 0.25145199863341305,
"grad_norm": 111.67731475830078,
"learning_rate": 0.00017785089728011798,
"loss": 448.5767,
"step": 230
},
{
"epoch": 0.26238469422617017,
"grad_norm": 118.7121810913086,
"learning_rate": 0.00017586930191068655,
"loss": 448.7768,
"step": 240
},
{
"epoch": 0.2733173898189272,
"grad_norm": 78.55050659179688,
"learning_rate": 0.00017381488578524173,
"loss": 447.5193,
"step": 250
},
{
"epoch": 0.28425008541168434,
"grad_norm": 143.79925537109375,
"learning_rate": 0.00017168962077029147,
"loss": 443.3945,
"step": 260
},
{
"epoch": 0.2951827810044414,
"grad_norm": 107.96155548095703,
"learning_rate": 0.00016949554673441534,
"loss": 440.8445,
"step": 270
},
{
"epoch": 0.3061154765971985,
"grad_norm": 86.58111572265625,
"learning_rate": 0.00016723476959036083,
"loss": 439.1689,
"step": 280
},
{
"epoch": 0.31704817218995557,
"grad_norm": 224.989013671875,
"learning_rate": 0.0001649094592737497,
"loss": 444.2089,
"step": 290
},
{
"epoch": 0.3279808677827127,
"grad_norm": 114.99178314208984,
"learning_rate": 0.00016252184766033342,
"loss": 449.9366,
"step": 300
},
{
"epoch": 0.3279808677827127,
"eval_loss": 7.026218414306641,
"eval_runtime": 58.1336,
"eval_samples_per_second": 161.112,
"eval_steps_per_second": 10.08,
"step": 300
},
{
"epoch": 0.33891356337546974,
"grad_norm": 66.66151428222656,
"learning_rate": 0.0001600742264237979,
"loss": 446.0257,
"step": 310
},
{
"epoch": 0.34984625896822685,
"grad_norm": 83.69017791748047,
"learning_rate": 0.00015756894483617267,
"loss": 438.0339,
"step": 320
},
{
"epoch": 0.36077895456098397,
"grad_norm": 71.6341552734375,
"learning_rate": 0.0001550084075129563,
"loss": 435.6428,
"step": 330
},
{
"epoch": 0.371711650153741,
"grad_norm": 76.65645599365234,
"learning_rate": 0.00015239507210512194,
"loss": 437.0487,
"step": 340
},
{
"epoch": 0.38264434574649814,
"grad_norm": 207.73638916015625,
"learning_rate": 0.00014973144694021876,
"loss": 437.2165,
"step": 350
},
{
"epoch": 0.3935770413392552,
"grad_norm": 90.09326171875,
"learning_rate": 0.00014702008861483266,
"loss": 447.2774,
"step": 360
},
{
"epoch": 0.4045097369320123,
"grad_norm": 60.52599334716797,
"learning_rate": 0.00014426359954071796,
"loss": 442.0876,
"step": 370
},
{
"epoch": 0.41544243252476937,
"grad_norm": 55.786434173583984,
"learning_rate": 0.00014146462544695426,
"loss": 435.3542,
"step": 380
},
{
"epoch": 0.4263751281175265,
"grad_norm": 185.42239379882812,
"learning_rate": 0.00013862585284052714,
"loss": 431.46,
"step": 390
},
{
"epoch": 0.4373078237102836,
"grad_norm": 90.76365661621094,
"learning_rate": 0.00013575000642776893,
"loss": 435.4126,
"step": 400
},
{
"epoch": 0.4373078237102836,
"eval_loss": 6.82781982421875,
"eval_runtime": 58.072,
"eval_samples_per_second": 161.283,
"eval_steps_per_second": 10.091,
"step": 400
},
{
"epoch": 0.44824051930304065,
"grad_norm": 70.12236022949219,
"learning_rate": 0.0001328398464991355,
"loss": 436.4229,
"step": 410
},
{
"epoch": 0.45917321489579777,
"grad_norm": 55.41183090209961,
"learning_rate": 0.00012989816627982848,
"loss": 434.1986,
"step": 420
},
{
"epoch": 0.4701059104885548,
"grad_norm": 247.78492736816406,
"learning_rate": 0.00012692778924880603,
"loss": 432.3758,
"step": 430
},
{
"epoch": 0.48103860608131194,
"grad_norm": 130.10150146484375,
"learning_rate": 0.0001239315664287558,
"loss": 438.7516,
"step": 440
},
{
"epoch": 0.491971301674069,
"grad_norm": 124.00569915771484,
"learning_rate": 0.00012091237364963071,
"loss": 443.4303,
"step": 450
},
{
"epoch": 0.5029039972668261,
"grad_norm": 70.69654846191406,
"learning_rate": 0.00011787310878837422,
"loss": 439.3571,
"step": 460
},
{
"epoch": 0.5138366928595832,
"grad_norm": 48.626583099365234,
"learning_rate": 0.00011481668898748475,
"loss": 433.0221,
"step": 470
},
{
"epoch": 0.5247693884523403,
"grad_norm": 47.17863464355469,
"learning_rate": 0.00011174604785508813,
"loss": 428.8387,
"step": 480
},
{
"epoch": 0.5357020840450973,
"grad_norm": 84.66542053222656,
"learning_rate": 0.00010866413264920678,
"loss": 426.4283,
"step": 490
},
{
"epoch": 0.5466347796378545,
"grad_norm": 213.61886596679688,
"learning_rate": 0.00010557390144892684,
"loss": 427.7443,
"step": 500
},
{
"epoch": 0.5466347796378545,
"eval_loss": 6.722092151641846,
"eval_runtime": 58.1233,
"eval_samples_per_second": 161.14,
"eval_steps_per_second": 10.082,
"step": 500
},
{
"epoch": 0.5575674752306116,
"grad_norm": 122.73770141601562,
"learning_rate": 0.0001024783203151793,
"loss": 433.5011,
"step": 510
},
{
"epoch": 0.5685001708233687,
"grad_norm": 101.38385009765625,
"learning_rate": 9.938036044386005e-05,
"loss": 435.1738,
"step": 520
},
{
"epoch": 0.5794328664161257,
"grad_norm": 114.54944610595703,
"learning_rate": 9.628299531402117e-05,
"loss": 434.8967,
"step": 530
},
{
"epoch": 0.5903655620088828,
"grad_norm": 84.79987335205078,
"learning_rate": 9.318919783387094e-05,
"loss": 432.9765,
"step": 540
},
{
"epoch": 0.6012982576016399,
"grad_norm": 61.6598014831543,
"learning_rate": 9.010193748732155e-05,
"loss": 430.1559,
"step": 550
},
{
"epoch": 0.612230953194397,
"grad_norm": 99.44583129882812,
"learning_rate": 8.702417748382385e-05,
"loss": 429.6009,
"step": 560
},
{
"epoch": 0.6231636487871541,
"grad_norm": 74.24359130859375,
"learning_rate": 8.395887191422397e-05,
"loss": 430.6433,
"step": 570
},
{
"epoch": 0.6340963443799111,
"grad_norm": 62.34027099609375,
"learning_rate": 8.090896291537273e-05,
"loss": 429.9604,
"step": 580
},
{
"epoch": 0.6450290399726682,
"grad_norm": 142.7094268798828,
"learning_rate": 7.787737784620803e-05,
"loss": 432.6778,
"step": 590
},
{
"epoch": 0.6559617355654254,
"grad_norm": 86.1541748046875,
"learning_rate": 7.486702647802213e-05,
"loss": 433.9323,
"step": 600
},
{
"epoch": 0.6559617355654254,
"eval_loss": 6.775545120239258,
"eval_runtime": 58.0771,
"eval_samples_per_second": 161.268,
"eval_steps_per_second": 10.09,
"step": 600
},
{
"epoch": 0.6668944311581825,
"grad_norm": 62.604488372802734,
"learning_rate": 7.188079820160904e-05,
"loss": 431.3976,
"step": 610
},
{
"epoch": 0.6778271267509395,
"grad_norm": 86.99163818359375,
"learning_rate": 6.892155925397436e-05,
"loss": 429.0467,
"step": 620
},
{
"epoch": 0.6887598223436966,
"grad_norm": 60.84334945678711,
"learning_rate": 6.59921499672677e-05,
"loss": 426.724,
"step": 630
},
{
"epoch": 0.6996925179364537,
"grad_norm": 63.2689208984375,
"learning_rate": 6.309538204257977e-05,
"loss": 426.1021,
"step": 640
},
{
"epoch": 0.7106252135292108,
"grad_norm": 100.661865234375,
"learning_rate": 6.02340358512196e-05,
"loss": 426.2488,
"step": 650
},
{
"epoch": 0.7215579091219679,
"grad_norm": 98.77324676513672,
"learning_rate": 5.7410857766062966e-05,
"loss": 426.7232,
"step": 660
},
{
"epoch": 0.7324906047147249,
"grad_norm": 83.56670379638672,
"learning_rate": 5.4628557525532976e-05,
"loss": 429.1631,
"step": 670
},
{
"epoch": 0.743423300307482,
"grad_norm": 123.90058135986328,
"learning_rate": 5.188980563274315e-05,
"loss": 430.625,
"step": 680
},
{
"epoch": 0.7543559959002392,
"grad_norm": 96.76585388183594,
"learning_rate": 4.9197230792299195e-05,
"loss": 430.7337,
"step": 690
},
{
"epoch": 0.7652886914929963,
"grad_norm": 110.13320922851562,
"learning_rate": 4.6553417387219886e-05,
"loss": 429.4984,
"step": 700
},
{
"epoch": 0.7652886914929963,
"eval_loss": 6.70954704284668,
"eval_runtime": 58.0031,
"eval_samples_per_second": 161.474,
"eval_steps_per_second": 10.103,
"step": 700
},
{
"epoch": 0.7762213870857533,
"grad_norm": 88.53307342529297,
"learning_rate": 4.396090299839852e-05,
"loss": 429.292,
"step": 710
},
{
"epoch": 0.7871540826785104,
"grad_norm": 83.58421325683594,
"learning_rate": 4.1422175968985955e-05,
"loss": 428.1954,
"step": 720
},
{
"epoch": 0.7980867782712675,
"grad_norm": 68.06840515136719,
"learning_rate": 3.8939673016032953e-05,
"loss": 427.5196,
"step": 730
},
{
"epoch": 0.8090194738640246,
"grad_norm": 101.87394714355469,
"learning_rate": 3.651577689168405e-05,
"loss": 426.9831,
"step": 740
},
{
"epoch": 0.8199521694567817,
"grad_norm": 91.1787338256836,
"learning_rate": 3.415281409616844e-05,
"loss": 426.4039,
"step": 750
},
{
"epoch": 0.8308848650495387,
"grad_norm": 76.29208374023438,
"learning_rate": 3.185305264478159e-05,
"loss": 427.1363,
"step": 760
},
{
"epoch": 0.8418175606422958,
"grad_norm": 106.453369140625,
"learning_rate": 2.9618699891002843e-05,
"loss": 428.5345,
"step": 770
},
{
"epoch": 0.852750256235053,
"grad_norm": 77.79237365722656,
"learning_rate": 2.745190040783646e-05,
"loss": 428.4736,
"step": 780
},
{
"epoch": 0.8636829518278101,
"grad_norm": 78.0290756225586,
"learning_rate": 2.5354733929410977e-05,
"loss": 427.3446,
"step": 790
},
{
"epoch": 0.8746156474205672,
"grad_norm": 82.6236801147461,
"learning_rate": 2.332921335481205e-05,
"loss": 426.4013,
"step": 800
},
{
"epoch": 0.8746156474205672,
"eval_loss": 6.655261516571045,
"eval_runtime": 57.9158,
"eval_samples_per_second": 161.718,
"eval_steps_per_second": 10.118,
"step": 800
},
{
"epoch": 0.8855483430133242,
"grad_norm": 88.00137329101562,
"learning_rate": 2.137728281606475e-05,
"loss": 425.3787,
"step": 810
},
{
"epoch": 0.8964810386060813,
"grad_norm": 68.48820495605469,
"learning_rate": 2e-05,
"loss": 425.5655,
"step": 820
},
{
"epoch": 0.9074137341988384,
"grad_norm": 99.16329956054688,
"learning_rate": 2e-05,
"loss": 425.2563,
"step": 830
},
{
"epoch": 0.9183464297915955,
"grad_norm": 82.26581573486328,
"learning_rate": 2e-05,
"loss": 424.7282,
"step": 840
},
{
"epoch": 0.9292791253843525,
"grad_norm": 76.19690704345703,
"learning_rate": 2e-05,
"loss": 424.4672,
"step": 850
},
{
"epoch": 0.9402118209771096,
"grad_norm": 76.67041778564453,
"learning_rate": 2e-05,
"loss": 424.8196,
"step": 860
},
{
"epoch": 0.9511445165698668,
"grad_norm": 91.26239013671875,
"learning_rate": 2e-05,
"loss": 424.8494,
"step": 870
},
{
"epoch": 0.9620772121626239,
"grad_norm": 87.45671844482422,
"learning_rate": 2e-05,
"loss": 424.553,
"step": 880
},
{
"epoch": 0.973009907755381,
"grad_norm": 83.90388488769531,
"learning_rate": 2e-05,
"loss": 424.6736,
"step": 890
},
{
"epoch": 0.983942603348138,
"grad_norm": 92.74352264404297,
"learning_rate": 2e-05,
"loss": 425.6448,
"step": 900
},
{
"epoch": 0.983942603348138,
"eval_loss": 6.660056114196777,
"eval_runtime": 57.8375,
"eval_samples_per_second": 161.937,
"eval_steps_per_second": 10.132,
"step": 900
},
{
"epoch": 0.9948752989408951,
"grad_norm": 82.22098541259766,
"learning_rate": 2e-05,
"loss": 426.0456,
"step": 910
},
{
"epoch": 1.0054663477963786,
"grad_norm": 85.19371032714844,
"learning_rate": 2e-05,
"loss": 413.45,
"step": 920
},
{
"epoch": 1.0163990433891357,
"grad_norm": 114.56877899169922,
"learning_rate": 2e-05,
"loss": 426.7532,
"step": 930
},
{
"epoch": 1.0273317389818928,
"grad_norm": 101.91117858886719,
"learning_rate": 2e-05,
"loss": 426.5692,
"step": 940
},
{
"epoch": 1.03826443457465,
"grad_norm": 98.91850280761719,
"learning_rate": 2e-05,
"loss": 427.7487,
"step": 950
},
{
"epoch": 1.049197130167407,
"grad_norm": 124.97265625,
"learning_rate": 2e-05,
"loss": 427.6109,
"step": 960
},
{
"epoch": 1.060129825760164,
"grad_norm": 118.64958190917969,
"learning_rate": 2e-05,
"loss": 427.6372,
"step": 970
},
{
"epoch": 1.071062521352921,
"grad_norm": 109.47184753417969,
"learning_rate": 2e-05,
"loss": 428.8791,
"step": 980
},
{
"epoch": 1.0819952169456781,
"grad_norm": 130.62338256835938,
"learning_rate": 2e-05,
"loss": 429.3186,
"step": 990
},
{
"epoch": 1.0929279125384352,
"grad_norm": 127.20825958251953,
"learning_rate": 2e-05,
"loss": 429.1682,
"step": 1000
},
{
"epoch": 1.0929279125384352,
"eval_loss": 6.715859413146973,
"eval_runtime": 57.8946,
"eval_samples_per_second": 161.777,
"eval_steps_per_second": 10.122,
"step": 1000
},
{
"epoch": 1.1038606081311924,
"grad_norm": 105.07234191894531,
"learning_rate": 2e-05,
"loss": 429.3496,
"step": 1010
},
{
"epoch": 1.1147933037239495,
"grad_norm": 143.77236938476562,
"learning_rate": 2e-05,
"loss": 429.6381,
"step": 1020
}
],
"logging_steps": 10,
"max_steps": 1024,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1024,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.705517388910559e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}