training_sentiment_analysis / trainer_state.json
tuanio's picture
End of training
f633d2f verified
{
"best_metric": 0.8174300254452926,
"best_model_checkpoint": "training_sentiment_analysis/checkpoint-8600",
"epoch": 20.0,
"eval_steps": 200,
"global_step": 18680,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.21,
"grad_norm": 1.3381836414337158,
"learning_rate": 3.2119914346895075e-05,
"loss": 0.9299,
"step": 200
},
{
"epoch": 0.21,
"eval_accuracy": 0.638676844783715,
"eval_loss": 0.827367901802063,
"eval_runtime": 3.055,
"eval_samples_per_second": 514.569,
"eval_steps_per_second": 16.367,
"step": 200
},
{
"epoch": 0.43,
"grad_norm": 1.0220164060592651,
"learning_rate": 6.423982869379015e-05,
"loss": 0.7793,
"step": 400
},
{
"epoch": 0.43,
"eval_accuracy": 0.7188295165394402,
"eval_loss": 0.6643335223197937,
"eval_runtime": 3.0013,
"eval_samples_per_second": 523.77,
"eval_steps_per_second": 16.659,
"step": 400
},
{
"epoch": 0.64,
"grad_norm": 1.7421491146087646,
"learning_rate": 9.635974304068522e-05,
"loss": 0.6574,
"step": 600
},
{
"epoch": 0.64,
"eval_accuracy": 0.7659033078880407,
"eval_loss": 0.5868020057678223,
"eval_runtime": 2.9749,
"eval_samples_per_second": 528.422,
"eval_steps_per_second": 16.807,
"step": 600
},
{
"epoch": 0.86,
"grad_norm": 1.8133894205093384,
"learning_rate": 0.0001284796573875803,
"loss": 0.6132,
"step": 800
},
{
"epoch": 0.86,
"eval_accuracy": 0.772264631043257,
"eval_loss": 0.5582301616668701,
"eval_runtime": 2.9908,
"eval_samples_per_second": 525.617,
"eval_steps_per_second": 16.718,
"step": 800
},
{
"epoch": 1.07,
"grad_norm": 1.3071078062057495,
"learning_rate": 0.00016059957173447537,
"loss": 0.5791,
"step": 1000
},
{
"epoch": 1.07,
"eval_accuracy": 0.7830788804071247,
"eval_loss": 0.5515692234039307,
"eval_runtime": 2.9665,
"eval_samples_per_second": 529.915,
"eval_steps_per_second": 16.855,
"step": 1000
},
{
"epoch": 1.28,
"grad_norm": 1.0445743799209595,
"learning_rate": 0.00019271948608137044,
"loss": 0.554,
"step": 1200
},
{
"epoch": 1.28,
"eval_accuracy": 0.7964376590330788,
"eval_loss": 0.5187413692474365,
"eval_runtime": 2.9846,
"eval_samples_per_second": 526.705,
"eval_steps_per_second": 16.753,
"step": 1200
},
{
"epoch": 1.5,
"grad_norm": 1.0763362646102905,
"learning_rate": 0.0002248394004282655,
"loss": 0.5258,
"step": 1400
},
{
"epoch": 1.5,
"eval_accuracy": 0.8034351145038168,
"eval_loss": 0.5125576257705688,
"eval_runtime": 2.9831,
"eval_samples_per_second": 526.967,
"eval_steps_per_second": 16.761,
"step": 1400
},
{
"epoch": 1.71,
"grad_norm": 0.8554897308349609,
"learning_rate": 0.0002569593147751606,
"loss": 0.5373,
"step": 1600
},
{
"epoch": 1.71,
"eval_accuracy": 0.8002544529262087,
"eval_loss": 0.51680988073349,
"eval_runtime": 2.9726,
"eval_samples_per_second": 528.823,
"eval_steps_per_second": 16.82,
"step": 1600
},
{
"epoch": 1.93,
"grad_norm": 1.538806438446045,
"learning_rate": 0.0002890792291220556,
"loss": 0.5266,
"step": 1800
},
{
"epoch": 1.93,
"eval_accuracy": 0.8027989821882952,
"eval_loss": 0.5283887982368469,
"eval_runtime": 2.9766,
"eval_samples_per_second": 528.12,
"eval_steps_per_second": 16.798,
"step": 1800
},
{
"epoch": 2.14,
"grad_norm": 1.1234441995620728,
"learning_rate": 0.000297644539614561,
"loss": 0.5076,
"step": 2000
},
{
"epoch": 2.14,
"eval_accuracy": 0.7977099236641222,
"eval_loss": 0.5178301334381104,
"eval_runtime": 2.9829,
"eval_samples_per_second": 526.996,
"eval_steps_per_second": 16.762,
"step": 2000
},
{
"epoch": 2.36,
"grad_norm": 1.6212774515151978,
"learning_rate": 0.0002940756602426838,
"loss": 0.5094,
"step": 2200
},
{
"epoch": 2.36,
"eval_accuracy": 0.8027989821882952,
"eval_loss": 0.5134572982788086,
"eval_runtime": 2.981,
"eval_samples_per_second": 527.334,
"eval_steps_per_second": 16.773,
"step": 2200
},
{
"epoch": 2.57,
"grad_norm": 1.4514294862747192,
"learning_rate": 0.00029050678087080655,
"loss": 0.5032,
"step": 2400
},
{
"epoch": 2.57,
"eval_accuracy": 0.8104325699745547,
"eval_loss": 0.5022692084312439,
"eval_runtime": 2.963,
"eval_samples_per_second": 530.535,
"eval_steps_per_second": 16.875,
"step": 2400
},
{
"epoch": 2.78,
"grad_norm": 1.826932668685913,
"learning_rate": 0.0002869379014989293,
"loss": 0.5034,
"step": 2600
},
{
"epoch": 2.78,
"eval_accuracy": 0.80470737913486,
"eval_loss": 0.5088226199150085,
"eval_runtime": 2.9831,
"eval_samples_per_second": 526.969,
"eval_steps_per_second": 16.761,
"step": 2600
},
{
"epoch": 3.0,
"grad_norm": 1.4404336214065552,
"learning_rate": 0.0002833690221270521,
"loss": 0.4923,
"step": 2800
},
{
"epoch": 3.0,
"eval_accuracy": 0.799618320610687,
"eval_loss": 0.5219257473945618,
"eval_runtime": 2.9722,
"eval_samples_per_second": 528.9,
"eval_steps_per_second": 16.823,
"step": 2800
},
{
"epoch": 3.21,
"grad_norm": 0.8795878291130066,
"learning_rate": 0.00027980014275517484,
"loss": 0.4934,
"step": 3000
},
{
"epoch": 3.21,
"eval_accuracy": 0.8129770992366412,
"eval_loss": 0.4905295968055725,
"eval_runtime": 2.9734,
"eval_samples_per_second": 528.696,
"eval_steps_per_second": 16.816,
"step": 3000
},
{
"epoch": 3.43,
"grad_norm": 1.6092537641525269,
"learning_rate": 0.0002762312633832976,
"loss": 0.4798,
"step": 3200
},
{
"epoch": 3.43,
"eval_accuracy": 0.8097964376590331,
"eval_loss": 0.4907812178134918,
"eval_runtime": 2.9897,
"eval_samples_per_second": 525.803,
"eval_steps_per_second": 16.724,
"step": 3200
},
{
"epoch": 3.64,
"grad_norm": 1.6475110054016113,
"learning_rate": 0.0002726623840114204,
"loss": 0.4831,
"step": 3400
},
{
"epoch": 3.64,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.48748457431793213,
"eval_runtime": 2.9694,
"eval_samples_per_second": 529.396,
"eval_steps_per_second": 16.838,
"step": 3400
},
{
"epoch": 3.85,
"grad_norm": 1.1669467687606812,
"learning_rate": 0.00026909350463954313,
"loss": 0.4707,
"step": 3600
},
{
"epoch": 3.85,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.4985896944999695,
"eval_runtime": 2.991,
"eval_samples_per_second": 525.579,
"eval_steps_per_second": 16.717,
"step": 3600
},
{
"epoch": 4.07,
"grad_norm": 0.9440352320671082,
"learning_rate": 0.00026552462526766593,
"loss": 0.4674,
"step": 3800
},
{
"epoch": 4.07,
"eval_accuracy": 0.8104325699745547,
"eval_loss": 0.5195557475090027,
"eval_runtime": 2.9789,
"eval_samples_per_second": 527.711,
"eval_steps_per_second": 16.785,
"step": 3800
},
{
"epoch": 4.28,
"grad_norm": 1.8151628971099854,
"learning_rate": 0.0002619557458957887,
"loss": 0.4535,
"step": 4000
},
{
"epoch": 4.28,
"eval_accuracy": 0.8097964376590331,
"eval_loss": 0.4896373152732849,
"eval_runtime": 2.9869,
"eval_samples_per_second": 526.295,
"eval_steps_per_second": 16.74,
"step": 4000
},
{
"epoch": 4.5,
"grad_norm": 3.0790090560913086,
"learning_rate": 0.0002583868665239115,
"loss": 0.464,
"step": 4200
},
{
"epoch": 4.5,
"eval_accuracy": 0.8078880407124682,
"eval_loss": 0.517495334148407,
"eval_runtime": 2.9986,
"eval_samples_per_second": 524.246,
"eval_steps_per_second": 16.674,
"step": 4200
},
{
"epoch": 4.71,
"grad_norm": 1.1520639657974243,
"learning_rate": 0.0002548179871520343,
"loss": 0.4715,
"step": 4400
},
{
"epoch": 4.71,
"eval_accuracy": 0.8027989821882952,
"eval_loss": 0.5001667737960815,
"eval_runtime": 2.9723,
"eval_samples_per_second": 528.885,
"eval_steps_per_second": 16.822,
"step": 4400
},
{
"epoch": 4.93,
"grad_norm": 0.8184943795204163,
"learning_rate": 0.000251249107780157,
"loss": 0.468,
"step": 4600
},
{
"epoch": 4.93,
"eval_accuracy": 0.8110687022900763,
"eval_loss": 0.4883332848548889,
"eval_runtime": 2.9769,
"eval_samples_per_second": 528.068,
"eval_steps_per_second": 16.796,
"step": 4600
},
{
"epoch": 5.14,
"grad_norm": 1.155013084411621,
"learning_rate": 0.00024768022840827977,
"loss": 0.4645,
"step": 4800
},
{
"epoch": 5.14,
"eval_accuracy": 0.8040712468193384,
"eval_loss": 0.5186554789543152,
"eval_runtime": 2.9698,
"eval_samples_per_second": 529.333,
"eval_steps_per_second": 16.836,
"step": 4800
},
{
"epoch": 5.35,
"grad_norm": 1.6959339380264282,
"learning_rate": 0.00024411134903640257,
"loss": 0.445,
"step": 5000
},
{
"epoch": 5.35,
"eval_accuracy": 0.806615776081425,
"eval_loss": 0.4928103983402252,
"eval_runtime": 2.9782,
"eval_samples_per_second": 527.83,
"eval_steps_per_second": 16.789,
"step": 5000
},
{
"epoch": 5.57,
"grad_norm": 1.0461735725402832,
"learning_rate": 0.00024054246966452532,
"loss": 0.4558,
"step": 5200
},
{
"epoch": 5.57,
"eval_accuracy": 0.8078880407124682,
"eval_loss": 0.48704999685287476,
"eval_runtime": 2.9838,
"eval_samples_per_second": 526.839,
"eval_steps_per_second": 16.757,
"step": 5200
},
{
"epoch": 5.78,
"grad_norm": 0.9599233269691467,
"learning_rate": 0.00023697359029264806,
"loss": 0.4405,
"step": 5400
},
{
"epoch": 5.78,
"eval_accuracy": 0.8104325699745547,
"eval_loss": 0.4985482692718506,
"eval_runtime": 3.0065,
"eval_samples_per_second": 522.862,
"eval_steps_per_second": 16.63,
"step": 5400
},
{
"epoch": 6.0,
"grad_norm": 1.4131615161895752,
"learning_rate": 0.00023340471092077086,
"loss": 0.4648,
"step": 5600
},
{
"epoch": 6.0,
"eval_accuracy": 0.8059796437659033,
"eval_loss": 0.48415422439575195,
"eval_runtime": 2.9786,
"eval_samples_per_second": 527.759,
"eval_steps_per_second": 16.786,
"step": 5600
},
{
"epoch": 6.21,
"grad_norm": 1.189572811126709,
"learning_rate": 0.0002298358315488936,
"loss": 0.435,
"step": 5800
},
{
"epoch": 6.21,
"eval_accuracy": 0.811704834605598,
"eval_loss": 0.4911487102508545,
"eval_runtime": 2.9997,
"eval_samples_per_second": 524.044,
"eval_steps_per_second": 16.668,
"step": 5800
},
{
"epoch": 6.42,
"grad_norm": 1.5198345184326172,
"learning_rate": 0.00022626695217701638,
"loss": 0.437,
"step": 6000
},
{
"epoch": 6.42,
"eval_accuracy": 0.8085241730279898,
"eval_loss": 0.48542749881744385,
"eval_runtime": 3.0042,
"eval_samples_per_second": 523.274,
"eval_steps_per_second": 16.644,
"step": 6000
},
{
"epoch": 6.64,
"grad_norm": 1.1990240812301636,
"learning_rate": 0.00022269807280513918,
"loss": 0.4588,
"step": 6200
},
{
"epoch": 6.64,
"eval_accuracy": 0.8085241730279898,
"eval_loss": 0.48791924118995667,
"eval_runtime": 3.0014,
"eval_samples_per_second": 523.758,
"eval_steps_per_second": 16.659,
"step": 6200
},
{
"epoch": 6.85,
"grad_norm": 1.346658706665039,
"learning_rate": 0.00021912919343326193,
"loss": 0.4342,
"step": 6400
},
{
"epoch": 6.85,
"eval_accuracy": 0.8104325699745547,
"eval_loss": 0.49220582842826843,
"eval_runtime": 3.0046,
"eval_samples_per_second": 523.193,
"eval_steps_per_second": 16.641,
"step": 6400
},
{
"epoch": 7.07,
"grad_norm": 1.8644700050354004,
"learning_rate": 0.00021556031406138473,
"loss": 0.4347,
"step": 6600
},
{
"epoch": 7.07,
"eval_accuracy": 0.8142493638676844,
"eval_loss": 0.49111655354499817,
"eval_runtime": 2.985,
"eval_samples_per_second": 526.634,
"eval_steps_per_second": 16.75,
"step": 6600
},
{
"epoch": 7.28,
"grad_norm": 1.9364045858383179,
"learning_rate": 0.00021199143468950748,
"loss": 0.4326,
"step": 6800
},
{
"epoch": 7.28,
"eval_accuracy": 0.8078880407124682,
"eval_loss": 0.491384893655777,
"eval_runtime": 2.9723,
"eval_samples_per_second": 528.882,
"eval_steps_per_second": 16.822,
"step": 6800
},
{
"epoch": 7.49,
"grad_norm": 0.9911957383155823,
"learning_rate": 0.00020842255531763022,
"loss": 0.4267,
"step": 7000
},
{
"epoch": 7.49,
"eval_accuracy": 0.8104325699745547,
"eval_loss": 0.4917159080505371,
"eval_runtime": 2.9808,
"eval_samples_per_second": 527.373,
"eval_steps_per_second": 16.774,
"step": 7000
},
{
"epoch": 7.71,
"grad_norm": 1.2186638116836548,
"learning_rate": 0.00020485367594575302,
"loss": 0.4241,
"step": 7200
},
{
"epoch": 7.71,
"eval_accuracy": 0.8136132315521628,
"eval_loss": 0.4887010455131531,
"eval_runtime": 2.9872,
"eval_samples_per_second": 526.253,
"eval_steps_per_second": 16.738,
"step": 7200
},
{
"epoch": 7.92,
"grad_norm": 1.1467108726501465,
"learning_rate": 0.0002012847965738758,
"loss": 0.4376,
"step": 7400
},
{
"epoch": 7.92,
"eval_accuracy": 0.8078880407124682,
"eval_loss": 0.5122085213661194,
"eval_runtime": 2.9829,
"eval_samples_per_second": 527.007,
"eval_steps_per_second": 16.762,
"step": 7400
},
{
"epoch": 8.14,
"grad_norm": 0.8427834510803223,
"learning_rate": 0.00019771591720199854,
"loss": 0.4323,
"step": 7600
},
{
"epoch": 8.14,
"eval_accuracy": 0.8097964376590331,
"eval_loss": 0.49093857407569885,
"eval_runtime": 2.9738,
"eval_samples_per_second": 528.625,
"eval_steps_per_second": 16.814,
"step": 7600
},
{
"epoch": 8.35,
"grad_norm": 1.2060902118682861,
"learning_rate": 0.00019414703783012134,
"loss": 0.4264,
"step": 7800
},
{
"epoch": 8.35,
"eval_accuracy": 0.8142493638676844,
"eval_loss": 0.48821595311164856,
"eval_runtime": 2.9836,
"eval_samples_per_second": 526.88,
"eval_steps_per_second": 16.758,
"step": 7800
},
{
"epoch": 8.57,
"grad_norm": 1.7033394575119019,
"learning_rate": 0.0001905781584582441,
"loss": 0.4175,
"step": 8000
},
{
"epoch": 8.57,
"eval_accuracy": 0.8053435114503816,
"eval_loss": 0.5090692043304443,
"eval_runtime": 2.9978,
"eval_samples_per_second": 524.393,
"eval_steps_per_second": 16.679,
"step": 8000
},
{
"epoch": 8.78,
"grad_norm": 1.3033976554870605,
"learning_rate": 0.0001870092790863669,
"loss": 0.4228,
"step": 8200
},
{
"epoch": 8.78,
"eval_accuracy": 0.8097964376590331,
"eval_loss": 0.5060204863548279,
"eval_runtime": 2.9975,
"eval_samples_per_second": 524.436,
"eval_steps_per_second": 16.681,
"step": 8200
},
{
"epoch": 8.99,
"grad_norm": 1.2635438442230225,
"learning_rate": 0.00018344039971448964,
"loss": 0.4189,
"step": 8400
},
{
"epoch": 8.99,
"eval_accuracy": 0.8091603053435115,
"eval_loss": 0.4940575361251831,
"eval_runtime": 2.9634,
"eval_samples_per_second": 530.468,
"eval_steps_per_second": 16.872,
"step": 8400
},
{
"epoch": 9.21,
"grad_norm": 1.496982455253601,
"learning_rate": 0.0001798715203426124,
"loss": 0.4161,
"step": 8600
},
{
"epoch": 9.21,
"eval_accuracy": 0.8174300254452926,
"eval_loss": 0.5010442137718201,
"eval_runtime": 2.973,
"eval_samples_per_second": 528.758,
"eval_steps_per_second": 16.818,
"step": 8600
},
{
"epoch": 9.42,
"grad_norm": 1.355362892150879,
"learning_rate": 0.00017630264097073518,
"loss": 0.4078,
"step": 8800
},
{
"epoch": 9.42,
"eval_accuracy": 0.8078880407124682,
"eval_loss": 0.4949406683444977,
"eval_runtime": 2.9901,
"eval_samples_per_second": 525.736,
"eval_steps_per_second": 16.722,
"step": 8800
},
{
"epoch": 9.64,
"grad_norm": 1.180076241493225,
"learning_rate": 0.00017273376159885795,
"loss": 0.4201,
"step": 9000
},
{
"epoch": 9.64,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.5017107129096985,
"eval_runtime": 2.952,
"eval_samples_per_second": 532.525,
"eval_steps_per_second": 16.938,
"step": 9000
},
{
"epoch": 9.85,
"grad_norm": 1.1020286083221436,
"learning_rate": 0.0001691648822269807,
"loss": 0.4141,
"step": 9200
},
{
"epoch": 9.85,
"eval_accuracy": 0.8091603053435115,
"eval_loss": 0.4984731078147888,
"eval_runtime": 2.9633,
"eval_samples_per_second": 530.497,
"eval_steps_per_second": 16.873,
"step": 9200
},
{
"epoch": 10.06,
"grad_norm": 1.2666047811508179,
"learning_rate": 0.0001655960028551035,
"loss": 0.4132,
"step": 9400
},
{
"epoch": 10.06,
"eval_accuracy": 0.8053435114503816,
"eval_loss": 0.5031649470329285,
"eval_runtime": 2.9822,
"eval_samples_per_second": 527.133,
"eval_steps_per_second": 16.766,
"step": 9400
},
{
"epoch": 10.28,
"grad_norm": 0.6767197251319885,
"learning_rate": 0.00016202712348322625,
"loss": 0.4043,
"step": 9600
},
{
"epoch": 10.28,
"eval_accuracy": 0.8129770992366412,
"eval_loss": 0.5038406848907471,
"eval_runtime": 2.9816,
"eval_samples_per_second": 527.24,
"eval_steps_per_second": 16.77,
"step": 9600
},
{
"epoch": 10.49,
"grad_norm": 1.147275447845459,
"learning_rate": 0.00015845824411134902,
"loss": 0.4187,
"step": 9800
},
{
"epoch": 10.49,
"eval_accuracy": 0.8104325699745547,
"eval_loss": 0.4981047213077545,
"eval_runtime": 2.9858,
"eval_samples_per_second": 526.485,
"eval_steps_per_second": 16.746,
"step": 9800
},
{
"epoch": 10.71,
"grad_norm": 1.6172677278518677,
"learning_rate": 0.0001548893647394718,
"loss": 0.3827,
"step": 10000
},
{
"epoch": 10.71,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.5126467943191528,
"eval_runtime": 2.9825,
"eval_samples_per_second": 527.072,
"eval_steps_per_second": 16.764,
"step": 10000
},
{
"epoch": 10.92,
"grad_norm": 1.8639923334121704,
"learning_rate": 0.00015132048536759457,
"loss": 0.4074,
"step": 10200
},
{
"epoch": 10.92,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.5088323950767517,
"eval_runtime": 2.9816,
"eval_samples_per_second": 527.237,
"eval_steps_per_second": 16.77,
"step": 10200
},
{
"epoch": 11.13,
"grad_norm": 1.2519667148590088,
"learning_rate": 0.00014775160599571734,
"loss": 0.4013,
"step": 10400
},
{
"epoch": 11.13,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.5061373114585876,
"eval_runtime": 2.9811,
"eval_samples_per_second": 527.316,
"eval_steps_per_second": 16.772,
"step": 10400
},
{
"epoch": 11.35,
"grad_norm": 1.1711052656173706,
"learning_rate": 0.0001441827266238401,
"loss": 0.3888,
"step": 10600
},
{
"epoch": 11.35,
"eval_accuracy": 0.8085241730279898,
"eval_loss": 0.5013065338134766,
"eval_runtime": 2.9847,
"eval_samples_per_second": 526.681,
"eval_steps_per_second": 16.752,
"step": 10600
},
{
"epoch": 11.56,
"grad_norm": 1.8078001737594604,
"learning_rate": 0.00014061384725196286,
"loss": 0.3855,
"step": 10800
},
{
"epoch": 11.56,
"eval_accuracy": 0.8059796437659033,
"eval_loss": 0.4992610514163971,
"eval_runtime": 2.9927,
"eval_samples_per_second": 525.27,
"eval_steps_per_second": 16.707,
"step": 10800
},
{
"epoch": 11.78,
"grad_norm": 1.1071592569351196,
"learning_rate": 0.00013704496788008563,
"loss": 0.3924,
"step": 11000
},
{
"epoch": 11.78,
"eval_accuracy": 0.8085241730279898,
"eval_loss": 0.5075262188911438,
"eval_runtime": 3.0066,
"eval_samples_per_second": 522.844,
"eval_steps_per_second": 16.63,
"step": 11000
},
{
"epoch": 11.99,
"grad_norm": 1.3704427480697632,
"learning_rate": 0.0001334760885082084,
"loss": 0.4046,
"step": 11200
},
{
"epoch": 11.99,
"eval_accuracy": 0.8027989821882952,
"eval_loss": 0.49990707635879517,
"eval_runtime": 3.0049,
"eval_samples_per_second": 523.149,
"eval_steps_per_second": 16.64,
"step": 11200
},
{
"epoch": 12.21,
"grad_norm": 1.40170419216156,
"learning_rate": 0.00012990720913633118,
"loss": 0.3957,
"step": 11400
},
{
"epoch": 12.21,
"eval_accuracy": 0.8034351145038168,
"eval_loss": 0.5089264512062073,
"eval_runtime": 2.9942,
"eval_samples_per_second": 525.011,
"eval_steps_per_second": 16.699,
"step": 11400
},
{
"epoch": 12.42,
"grad_norm": 1.1885521411895752,
"learning_rate": 0.00012633832976445395,
"loss": 0.381,
"step": 11600
},
{
"epoch": 12.42,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.5207549929618835,
"eval_runtime": 2.9746,
"eval_samples_per_second": 528.479,
"eval_steps_per_second": 16.809,
"step": 11600
},
{
"epoch": 12.63,
"grad_norm": 0.8873888254165649,
"learning_rate": 0.00012276945039257673,
"loss": 0.3906,
"step": 11800
},
{
"epoch": 12.63,
"eval_accuracy": 0.806615776081425,
"eval_loss": 0.513671875,
"eval_runtime": 2.961,
"eval_samples_per_second": 530.901,
"eval_steps_per_second": 16.886,
"step": 11800
},
{
"epoch": 12.85,
"grad_norm": 1.6491570472717285,
"learning_rate": 0.0001192005710206995,
"loss": 0.3734,
"step": 12000
},
{
"epoch": 12.85,
"eval_accuracy": 0.8040712468193384,
"eval_loss": 0.5183374881744385,
"eval_runtime": 2.9533,
"eval_samples_per_second": 532.292,
"eval_steps_per_second": 16.93,
"step": 12000
},
{
"epoch": 13.06,
"grad_norm": 2.042646884918213,
"learning_rate": 0.00011563169164882227,
"loss": 0.3928,
"step": 12200
},
{
"epoch": 13.06,
"eval_accuracy": 0.806615776081425,
"eval_loss": 0.5069447159767151,
"eval_runtime": 2.959,
"eval_samples_per_second": 531.259,
"eval_steps_per_second": 16.898,
"step": 12200
},
{
"epoch": 13.28,
"grad_norm": 0.817425549030304,
"learning_rate": 0.00011206281227694502,
"loss": 0.3774,
"step": 12400
},
{
"epoch": 13.28,
"eval_accuracy": 0.8008905852417303,
"eval_loss": 0.5086419582366943,
"eval_runtime": 2.9547,
"eval_samples_per_second": 532.04,
"eval_steps_per_second": 16.922,
"step": 12400
},
{
"epoch": 13.49,
"grad_norm": 1.0988578796386719,
"learning_rate": 0.0001084939329050678,
"loss": 0.3892,
"step": 12600
},
{
"epoch": 13.49,
"eval_accuracy": 0.8059796437659033,
"eval_loss": 0.4966925382614136,
"eval_runtime": 2.9538,
"eval_samples_per_second": 532.194,
"eval_steps_per_second": 16.927,
"step": 12600
},
{
"epoch": 13.7,
"grad_norm": 1.312321662902832,
"learning_rate": 0.00010492505353319058,
"loss": 0.372,
"step": 12800
},
{
"epoch": 13.7,
"eval_accuracy": 0.8040712468193384,
"eval_loss": 0.5042534470558167,
"eval_runtime": 2.9651,
"eval_samples_per_second": 530.16,
"eval_steps_per_second": 16.863,
"step": 12800
},
{
"epoch": 13.92,
"grad_norm": 1.642741322517395,
"learning_rate": 0.00010135617416131332,
"loss": 0.388,
"step": 13000
},
{
"epoch": 13.92,
"eval_accuracy": 0.8072519083969466,
"eval_loss": 0.5095480680465698,
"eval_runtime": 2.9526,
"eval_samples_per_second": 532.404,
"eval_steps_per_second": 16.934,
"step": 13000
},
{
"epoch": 14.13,
"grad_norm": 1.10377836227417,
"learning_rate": 9.778729478943611e-05,
"loss": 0.3754,
"step": 13200
},
{
"epoch": 14.13,
"eval_accuracy": 0.8021628498727735,
"eval_loss": 0.5103972554206848,
"eval_runtime": 2.9663,
"eval_samples_per_second": 529.954,
"eval_steps_per_second": 16.856,
"step": 13200
},
{
"epoch": 14.35,
"grad_norm": 1.1614229679107666,
"learning_rate": 9.421841541755888e-05,
"loss": 0.3639,
"step": 13400
},
{
"epoch": 14.35,
"eval_accuracy": 0.7983460559796438,
"eval_loss": 0.5263165235519409,
"eval_runtime": 2.9391,
"eval_samples_per_second": 534.858,
"eval_steps_per_second": 17.012,
"step": 13400
},
{
"epoch": 14.56,
"grad_norm": 1.6049692630767822,
"learning_rate": 9.064953604568166e-05,
"loss": 0.3795,
"step": 13600
},
{
"epoch": 14.56,
"eval_accuracy": 0.8015267175572519,
"eval_loss": 0.5145931839942932,
"eval_runtime": 2.9465,
"eval_samples_per_second": 533.506,
"eval_steps_per_second": 16.969,
"step": 13600
},
{
"epoch": 14.78,
"grad_norm": 2.813002347946167,
"learning_rate": 8.708065667380442e-05,
"loss": 0.3792,
"step": 13800
},
{
"epoch": 14.78,
"eval_accuracy": 0.8040712468193384,
"eval_loss": 0.5066380500793457,
"eval_runtime": 2.9409,
"eval_samples_per_second": 534.523,
"eval_steps_per_second": 17.001,
"step": 13800
},
{
"epoch": 14.99,
"grad_norm": 1.2670201063156128,
"learning_rate": 8.351177730192719e-05,
"loss": 0.3589,
"step": 14000
},
{
"epoch": 14.99,
"eval_accuracy": 0.8078880407124682,
"eval_loss": 0.5135853886604309,
"eval_runtime": 2.962,
"eval_samples_per_second": 530.717,
"eval_steps_per_second": 16.88,
"step": 14000
},
{
"epoch": 15.2,
"grad_norm": 1.9681557416915894,
"learning_rate": 7.994289793004996e-05,
"loss": 0.3624,
"step": 14200
},
{
"epoch": 15.2,
"eval_accuracy": 0.8021628498727735,
"eval_loss": 0.5237164497375488,
"eval_runtime": 2.9535,
"eval_samples_per_second": 532.245,
"eval_steps_per_second": 16.929,
"step": 14200
},
{
"epoch": 15.42,
"grad_norm": 1.8548041582107544,
"learning_rate": 7.637401855817274e-05,
"loss": 0.3659,
"step": 14400
},
{
"epoch": 15.42,
"eval_accuracy": 0.8059796437659033,
"eval_loss": 0.5165674090385437,
"eval_runtime": 2.9482,
"eval_samples_per_second": 533.2,
"eval_steps_per_second": 16.959,
"step": 14400
},
{
"epoch": 15.63,
"grad_norm": 1.3727173805236816,
"learning_rate": 7.28051391862955e-05,
"loss": 0.3657,
"step": 14600
},
{
"epoch": 15.63,
"eval_accuracy": 0.8002544529262087,
"eval_loss": 0.5177738070487976,
"eval_runtime": 2.9451,
"eval_samples_per_second": 533.764,
"eval_steps_per_second": 16.977,
"step": 14600
},
{
"epoch": 15.85,
"grad_norm": 2.10198974609375,
"learning_rate": 6.923625981441827e-05,
"loss": 0.359,
"step": 14800
},
{
"epoch": 15.85,
"eval_accuracy": 0.7983460559796438,
"eval_loss": 0.5152426362037659,
"eval_runtime": 2.9473,
"eval_samples_per_second": 533.372,
"eval_steps_per_second": 16.965,
"step": 14800
},
{
"epoch": 16.06,
"grad_norm": 1.0453667640686035,
"learning_rate": 6.566738044254104e-05,
"loss": 0.3677,
"step": 15000
},
{
"epoch": 16.06,
"eval_accuracy": 0.8034351145038168,
"eval_loss": 0.5211815237998962,
"eval_runtime": 2.9478,
"eval_samples_per_second": 533.274,
"eval_steps_per_second": 16.962,
"step": 15000
},
{
"epoch": 16.27,
"grad_norm": 1.0645538568496704,
"learning_rate": 6.20985010706638e-05,
"loss": 0.3521,
"step": 15200
},
{
"epoch": 16.27,
"eval_accuracy": 0.8002544529262087,
"eval_loss": 0.5323696732521057,
"eval_runtime": 2.9594,
"eval_samples_per_second": 531.197,
"eval_steps_per_second": 16.896,
"step": 15200
},
{
"epoch": 16.49,
"grad_norm": 3.849015951156616,
"learning_rate": 5.852962169878657e-05,
"loss": 0.3589,
"step": 15400
},
{
"epoch": 16.49,
"eval_accuracy": 0.8040712468193384,
"eval_loss": 0.5237988829612732,
"eval_runtime": 2.9364,
"eval_samples_per_second": 535.357,
"eval_steps_per_second": 17.028,
"step": 15400
},
{
"epoch": 16.7,
"grad_norm": 1.3231987953186035,
"learning_rate": 5.496074232690935e-05,
"loss": 0.3695,
"step": 15600
},
{
"epoch": 16.7,
"eval_accuracy": 0.7977099236641222,
"eval_loss": 0.511340320110321,
"eval_runtime": 2.969,
"eval_samples_per_second": 529.468,
"eval_steps_per_second": 16.841,
"step": 15600
},
{
"epoch": 16.92,
"grad_norm": 1.7709985971450806,
"learning_rate": 5.139186295503211e-05,
"loss": 0.3606,
"step": 15800
},
{
"epoch": 16.92,
"eval_accuracy": 0.7983460559796438,
"eval_loss": 0.5136662721633911,
"eval_runtime": 2.9594,
"eval_samples_per_second": 531.193,
"eval_steps_per_second": 16.895,
"step": 15800
},
{
"epoch": 17.13,
"grad_norm": 1.5108495950698853,
"learning_rate": 4.782298358315489e-05,
"loss": 0.3581,
"step": 16000
},
{
"epoch": 17.13,
"eval_accuracy": 0.799618320610687,
"eval_loss": 0.5130853056907654,
"eval_runtime": 2.9611,
"eval_samples_per_second": 530.882,
"eval_steps_per_second": 16.886,
"step": 16000
},
{
"epoch": 17.34,
"grad_norm": 1.3634617328643799,
"learning_rate": 4.4254104211277655e-05,
"loss": 0.3488,
"step": 16200
},
{
"epoch": 17.34,
"eval_accuracy": 0.7989821882951654,
"eval_loss": 0.5270070433616638,
"eval_runtime": 2.9953,
"eval_samples_per_second": 524.824,
"eval_steps_per_second": 16.693,
"step": 16200
},
{
"epoch": 17.56,
"grad_norm": 1.0239213705062866,
"learning_rate": 4.068522483940043e-05,
"loss": 0.3499,
"step": 16400
},
{
"epoch": 17.56,
"eval_accuracy": 0.7964376590330788,
"eval_loss": 0.523576021194458,
"eval_runtime": 2.9356,
"eval_samples_per_second": 535.502,
"eval_steps_per_second": 17.033,
"step": 16400
},
{
"epoch": 17.77,
"grad_norm": 1.108484148979187,
"learning_rate": 3.7116345467523195e-05,
"loss": 0.3603,
"step": 16600
},
{
"epoch": 17.77,
"eval_accuracy": 0.8002544529262087,
"eval_loss": 0.5186541080474854,
"eval_runtime": 2.9666,
"eval_samples_per_second": 529.891,
"eval_steps_per_second": 16.854,
"step": 16600
},
{
"epoch": 17.99,
"grad_norm": 2.816092014312744,
"learning_rate": 3.354746609564596e-05,
"loss": 0.3578,
"step": 16800
},
{
"epoch": 17.99,
"eval_accuracy": 0.8021628498727735,
"eval_loss": 0.5223926901817322,
"eval_runtime": 2.9355,
"eval_samples_per_second": 535.521,
"eval_steps_per_second": 17.033,
"step": 16800
},
{
"epoch": 18.2,
"grad_norm": 1.5831489562988281,
"learning_rate": 2.997858672376873e-05,
"loss": 0.3449,
"step": 17000
},
{
"epoch": 18.2,
"eval_accuracy": 0.7989821882951654,
"eval_loss": 0.5227622389793396,
"eval_runtime": 2.9602,
"eval_samples_per_second": 531.048,
"eval_steps_per_second": 16.891,
"step": 17000
},
{
"epoch": 18.42,
"grad_norm": 1.0060327053070068,
"learning_rate": 2.64097073518915e-05,
"loss": 0.3418,
"step": 17200
},
{
"epoch": 18.42,
"eval_accuracy": 0.8008905852417303,
"eval_loss": 0.5287216901779175,
"eval_runtime": 2.9537,
"eval_samples_per_second": 532.21,
"eval_steps_per_second": 16.928,
"step": 17200
},
{
"epoch": 18.63,
"grad_norm": 1.8092093467712402,
"learning_rate": 2.2840827980014274e-05,
"loss": 0.3334,
"step": 17400
},
{
"epoch": 18.63,
"eval_accuracy": 0.799618320610687,
"eval_loss": 0.5322315096855164,
"eval_runtime": 2.9745,
"eval_samples_per_second": 528.484,
"eval_steps_per_second": 16.809,
"step": 17400
},
{
"epoch": 18.84,
"grad_norm": 1.4800430536270142,
"learning_rate": 1.9271948608137044e-05,
"loss": 0.3567,
"step": 17600
},
{
"epoch": 18.84,
"eval_accuracy": 0.7983460559796438,
"eval_loss": 0.5293812155723572,
"eval_runtime": 2.9485,
"eval_samples_per_second": 533.161,
"eval_steps_per_second": 16.958,
"step": 17600
},
{
"epoch": 19.06,
"grad_norm": 1.6271811723709106,
"learning_rate": 1.5703069236259814e-05,
"loss": 0.3541,
"step": 17800
},
{
"epoch": 19.06,
"eval_accuracy": 0.8002544529262087,
"eval_loss": 0.5250320434570312,
"eval_runtime": 2.9479,
"eval_samples_per_second": 533.268,
"eval_steps_per_second": 16.961,
"step": 17800
},
{
"epoch": 19.27,
"grad_norm": 0.7758527994155884,
"learning_rate": 1.2134189864382584e-05,
"loss": 0.365,
"step": 18000
},
{
"epoch": 19.27,
"eval_accuracy": 0.7983460559796438,
"eval_loss": 0.5246437788009644,
"eval_runtime": 2.9363,
"eval_samples_per_second": 535.369,
"eval_steps_per_second": 17.028,
"step": 18000
},
{
"epoch": 19.49,
"grad_norm": 0.9722337126731873,
"learning_rate": 8.565310492505352e-06,
"loss": 0.337,
"step": 18200
},
{
"epoch": 19.49,
"eval_accuracy": 0.7977099236641222,
"eval_loss": 0.527810275554657,
"eval_runtime": 2.9383,
"eval_samples_per_second": 535.006,
"eval_steps_per_second": 17.017,
"step": 18200
},
{
"epoch": 19.7,
"grad_norm": 1.5007203817367554,
"learning_rate": 4.996431120628123e-06,
"loss": 0.3301,
"step": 18400
},
{
"epoch": 19.7,
"eval_accuracy": 0.7989821882951654,
"eval_loss": 0.5283259153366089,
"eval_runtime": 2.9603,
"eval_samples_per_second": 531.035,
"eval_steps_per_second": 16.89,
"step": 18400
},
{
"epoch": 19.91,
"grad_norm": 1.1220752000808716,
"learning_rate": 1.4275517487508921e-06,
"loss": 0.3421,
"step": 18600
},
{
"epoch": 19.91,
"eval_accuracy": 0.7977099236641222,
"eval_loss": 0.5287136435508728,
"eval_runtime": 2.9398,
"eval_samples_per_second": 534.737,
"eval_steps_per_second": 17.008,
"step": 18600
},
{
"epoch": 20.0,
"step": 18680,
"total_flos": 1.44512252251488e+16,
"train_loss": 0.42864556159401346,
"train_runtime": 2680.553,
"train_samples_per_second": 222.82,
"train_steps_per_second": 6.969
}
],
"logging_steps": 200,
"max_steps": 18680,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 200,
"total_flos": 1.44512252251488e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}