wcyat's picture
Training in progress, step 1745, checkpoint
67bed29 verified
{
"best_metric": 0.2911098897457123,
"best_model_checkpoint": "./results/checkpoint-280",
"epoch": 5.0,
"eval_steps": 20,
"global_step": 1745,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05730659025787966,
"grad_norm": 14.95106029510498,
"learning_rate": 1.9770773638968482e-05,
"loss": 0.5908,
"step": 20
},
{
"epoch": 0.05730659025787966,
"eval_accuracy": 0.7974683544303798,
"eval_loss": 0.44761696457862854,
"eval_runtime": 12.826,
"eval_samples_per_second": 12.319,
"eval_steps_per_second": 3.119,
"step": 20
},
{
"epoch": 0.11461318051575932,
"grad_norm": 16.47698402404785,
"learning_rate": 1.9541547277936966e-05,
"loss": 0.543,
"step": 40
},
{
"epoch": 0.11461318051575932,
"eval_accuracy": 0.7721518987341772,
"eval_loss": 0.4422585070133209,
"eval_runtime": 13.5793,
"eval_samples_per_second": 11.635,
"eval_steps_per_second": 2.946,
"step": 40
},
{
"epoch": 0.17191977077363896,
"grad_norm": 22.875091552734375,
"learning_rate": 1.9312320916905443e-05,
"loss": 0.5093,
"step": 60
},
{
"epoch": 0.17191977077363896,
"eval_accuracy": 0.7721518987341772,
"eval_loss": 0.5881978869438171,
"eval_runtime": 14.7375,
"eval_samples_per_second": 10.721,
"eval_steps_per_second": 2.714,
"step": 60
},
{
"epoch": 0.22922636103151864,
"grad_norm": 6.222044944763184,
"learning_rate": 1.9083094555873927e-05,
"loss": 0.5186,
"step": 80
},
{
"epoch": 0.22922636103151864,
"eval_accuracy": 0.7658227848101266,
"eval_loss": 0.6422034502029419,
"eval_runtime": 14.6414,
"eval_samples_per_second": 10.791,
"eval_steps_per_second": 2.732,
"step": 80
},
{
"epoch": 0.28653295128939826,
"grad_norm": 10.637746810913086,
"learning_rate": 1.8853868194842408e-05,
"loss": 0.502,
"step": 100
},
{
"epoch": 0.28653295128939826,
"eval_accuracy": 0.7658227848101266,
"eval_loss": 0.9381818175315857,
"eval_runtime": 14.572,
"eval_samples_per_second": 10.843,
"eval_steps_per_second": 2.745,
"step": 100
},
{
"epoch": 0.3438395415472779,
"grad_norm": 8.144033432006836,
"learning_rate": 1.8624641833810892e-05,
"loss": 0.573,
"step": 120
},
{
"epoch": 0.3438395415472779,
"eval_accuracy": 0.8227848101265823,
"eval_loss": 0.4263954758644104,
"eval_runtime": 14.6662,
"eval_samples_per_second": 10.773,
"eval_steps_per_second": 2.727,
"step": 120
},
{
"epoch": 0.40114613180515757,
"grad_norm": 0.44048359990119934,
"learning_rate": 1.8395415472779372e-05,
"loss": 0.5269,
"step": 140
},
{
"epoch": 0.40114613180515757,
"eval_accuracy": 0.8481012658227848,
"eval_loss": 0.5453027486801147,
"eval_runtime": 14.5869,
"eval_samples_per_second": 10.832,
"eval_steps_per_second": 2.742,
"step": 140
},
{
"epoch": 0.4584527220630373,
"grad_norm": 18.155141830444336,
"learning_rate": 1.8166189111747853e-05,
"loss": 0.3545,
"step": 160
},
{
"epoch": 0.4584527220630373,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.4540826678276062,
"eval_runtime": 14.6402,
"eval_samples_per_second": 10.792,
"eval_steps_per_second": 2.732,
"step": 160
},
{
"epoch": 0.5157593123209169,
"grad_norm": 0.482028603553772,
"learning_rate": 1.7936962750716333e-05,
"loss": 0.4449,
"step": 180
},
{
"epoch": 0.5157593123209169,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.43535691499710083,
"eval_runtime": 14.6919,
"eval_samples_per_second": 10.754,
"eval_steps_per_second": 2.723,
"step": 180
},
{
"epoch": 0.5730659025787965,
"grad_norm": 108.88398742675781,
"learning_rate": 1.7707736389684814e-05,
"loss": 0.3868,
"step": 200
},
{
"epoch": 0.5730659025787965,
"eval_accuracy": 0.8481012658227848,
"eval_loss": 0.8784106373786926,
"eval_runtime": 14.6371,
"eval_samples_per_second": 10.794,
"eval_steps_per_second": 2.733,
"step": 200
},
{
"epoch": 0.6303724928366762,
"grad_norm": 12.77889347076416,
"learning_rate": 1.7478510028653298e-05,
"loss": 0.7576,
"step": 220
},
{
"epoch": 0.6303724928366762,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.38221749663352966,
"eval_runtime": 14.6383,
"eval_samples_per_second": 10.794,
"eval_steps_per_second": 2.733,
"step": 220
},
{
"epoch": 0.6876790830945558,
"grad_norm": 13.416525840759277,
"learning_rate": 1.724928366762178e-05,
"loss": 0.1956,
"step": 240
},
{
"epoch": 0.6876790830945558,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.4667538106441498,
"eval_runtime": 14.6113,
"eval_samples_per_second": 10.814,
"eval_steps_per_second": 2.738,
"step": 240
},
{
"epoch": 0.7449856733524355,
"grad_norm": 10.141700744628906,
"learning_rate": 1.702005730659026e-05,
"loss": 0.4942,
"step": 260
},
{
"epoch": 0.7449856733524355,
"eval_accuracy": 0.8481012658227848,
"eval_loss": 0.5736417174339294,
"eval_runtime": 14.603,
"eval_samples_per_second": 10.82,
"eval_steps_per_second": 2.739,
"step": 260
},
{
"epoch": 0.8022922636103151,
"grad_norm": 23.185056686401367,
"learning_rate": 1.679083094555874e-05,
"loss": 0.4762,
"step": 280
},
{
"epoch": 0.8022922636103151,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.2911098897457123,
"eval_runtime": 14.6519,
"eval_samples_per_second": 10.784,
"eval_steps_per_second": 2.73,
"step": 280
},
{
"epoch": 0.8595988538681948,
"grad_norm": 46.526451110839844,
"learning_rate": 1.6561604584527223e-05,
"loss": 0.4136,
"step": 300
},
{
"epoch": 0.8595988538681948,
"eval_accuracy": 0.8607594936708861,
"eval_loss": 0.3629298508167267,
"eval_runtime": 14.6627,
"eval_samples_per_second": 10.776,
"eval_steps_per_second": 2.728,
"step": 300
},
{
"epoch": 0.9169054441260746,
"grad_norm": 5.966210842132568,
"learning_rate": 1.6332378223495704e-05,
"loss": 0.5865,
"step": 320
},
{
"epoch": 0.9169054441260746,
"eval_accuracy": 0.7721518987341772,
"eval_loss": 0.9794216752052307,
"eval_runtime": 14.6593,
"eval_samples_per_second": 10.778,
"eval_steps_per_second": 2.729,
"step": 320
},
{
"epoch": 0.9742120343839542,
"grad_norm": 3.5877606868743896,
"learning_rate": 1.6103151862464185e-05,
"loss": 0.3758,
"step": 340
},
{
"epoch": 0.9742120343839542,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.46775683760643005,
"eval_runtime": 14.6442,
"eval_samples_per_second": 10.789,
"eval_steps_per_second": 2.731,
"step": 340
},
{
"epoch": 1.0315186246418337,
"grad_norm": 5.313683986663818,
"learning_rate": 1.5873925501432665e-05,
"loss": 0.4285,
"step": 360
},
{
"epoch": 1.0315186246418337,
"eval_accuracy": 0.8670886075949367,
"eval_loss": 0.5543066263198853,
"eval_runtime": 14.6827,
"eval_samples_per_second": 10.761,
"eval_steps_per_second": 2.724,
"step": 360
},
{
"epoch": 1.0888252148997135,
"grad_norm": 10.655978202819824,
"learning_rate": 1.5644699140401146e-05,
"loss": 0.44,
"step": 380
},
{
"epoch": 1.0888252148997135,
"eval_accuracy": 0.8607594936708861,
"eval_loss": 0.5150261521339417,
"eval_runtime": 14.6825,
"eval_samples_per_second": 10.761,
"eval_steps_per_second": 2.724,
"step": 380
},
{
"epoch": 1.146131805157593,
"grad_norm": 0.08064723014831543,
"learning_rate": 1.541547277936963e-05,
"loss": 0.3573,
"step": 400
},
{
"epoch": 1.146131805157593,
"eval_accuracy": 0.8607594936708861,
"eval_loss": 0.563529908657074,
"eval_runtime": 14.6349,
"eval_samples_per_second": 10.796,
"eval_steps_per_second": 2.733,
"step": 400
},
{
"epoch": 1.2034383954154728,
"grad_norm": 0.46097293496131897,
"learning_rate": 1.518624641833811e-05,
"loss": 0.4187,
"step": 420
},
{
"epoch": 1.2034383954154728,
"eval_accuracy": 0.8481012658227848,
"eval_loss": 0.6609386205673218,
"eval_runtime": 14.5517,
"eval_samples_per_second": 10.858,
"eval_steps_per_second": 2.749,
"step": 420
},
{
"epoch": 1.2607449856733524,
"grad_norm": 0.37571266293525696,
"learning_rate": 1.495702005730659e-05,
"loss": 0.3742,
"step": 440
},
{
"epoch": 1.2607449856733524,
"eval_accuracy": 0.8481012658227848,
"eval_loss": 0.5912802815437317,
"eval_runtime": 14.594,
"eval_samples_per_second": 10.826,
"eval_steps_per_second": 2.741,
"step": 440
},
{
"epoch": 1.3180515759312321,
"grad_norm": 0.4662785828113556,
"learning_rate": 1.4727793696275073e-05,
"loss": 0.5179,
"step": 460
},
{
"epoch": 1.3180515759312321,
"eval_accuracy": 0.8354430379746836,
"eval_loss": 0.3983699679374695,
"eval_runtime": 14.6982,
"eval_samples_per_second": 10.75,
"eval_steps_per_second": 2.721,
"step": 460
},
{
"epoch": 1.3753581661891117,
"grad_norm": 3.044969081878662,
"learning_rate": 1.4498567335243553e-05,
"loss": 0.1685,
"step": 480
},
{
"epoch": 1.3753581661891117,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.5606595873832703,
"eval_runtime": 14.5479,
"eval_samples_per_second": 10.861,
"eval_steps_per_second": 2.75,
"step": 480
},
{
"epoch": 1.4326647564469914,
"grad_norm": 9.852724075317383,
"learning_rate": 1.4269340974212036e-05,
"loss": 0.5284,
"step": 500
},
{
"epoch": 1.4326647564469914,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.35282623767852783,
"eval_runtime": 14.6738,
"eval_samples_per_second": 10.767,
"eval_steps_per_second": 2.726,
"step": 500
},
{
"epoch": 1.4899713467048712,
"grad_norm": 25.850496292114258,
"learning_rate": 1.4040114613180518e-05,
"loss": 0.4246,
"step": 520
},
{
"epoch": 1.4899713467048712,
"eval_accuracy": 0.8607594936708861,
"eval_loss": 0.5857312083244324,
"eval_runtime": 15.5144,
"eval_samples_per_second": 10.184,
"eval_steps_per_second": 2.578,
"step": 520
},
{
"epoch": 1.5472779369627507,
"grad_norm": 7.516841888427734,
"learning_rate": 1.3810888252148997e-05,
"loss": 0.2419,
"step": 540
},
{
"epoch": 1.5472779369627507,
"eval_accuracy": 0.9050632911392406,
"eval_loss": 0.34958717226982117,
"eval_runtime": 14.4393,
"eval_samples_per_second": 10.942,
"eval_steps_per_second": 2.77,
"step": 540
},
{
"epoch": 1.6045845272206303,
"grad_norm": 0.07038611173629761,
"learning_rate": 1.3581661891117479e-05,
"loss": 0.4416,
"step": 560
},
{
"epoch": 1.6045845272206303,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.4946177005767822,
"eval_runtime": 14.6819,
"eval_samples_per_second": 10.762,
"eval_steps_per_second": 2.724,
"step": 560
},
{
"epoch": 1.66189111747851,
"grad_norm": 9.443480491638184,
"learning_rate": 1.3352435530085961e-05,
"loss": 0.4426,
"step": 580
},
{
"epoch": 1.66189111747851,
"eval_accuracy": 0.9050632911392406,
"eval_loss": 0.34582754969596863,
"eval_runtime": 14.6267,
"eval_samples_per_second": 10.802,
"eval_steps_per_second": 2.735,
"step": 580
},
{
"epoch": 1.7191977077363898,
"grad_norm": 0.07343020290136337,
"learning_rate": 1.3123209169054444e-05,
"loss": 0.2122,
"step": 600
},
{
"epoch": 1.7191977077363898,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.6184278130531311,
"eval_runtime": 14.6949,
"eval_samples_per_second": 10.752,
"eval_steps_per_second": 2.722,
"step": 600
},
{
"epoch": 1.7765042979942693,
"grad_norm": 0.03269320726394653,
"learning_rate": 1.2893982808022924e-05,
"loss": 0.1734,
"step": 620
},
{
"epoch": 1.7765042979942693,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.7278411388397217,
"eval_runtime": 14.5541,
"eval_samples_per_second": 10.856,
"eval_steps_per_second": 2.748,
"step": 620
},
{
"epoch": 1.8338108882521489,
"grad_norm": 0.021946750581264496,
"learning_rate": 1.2664756446991405e-05,
"loss": 0.2314,
"step": 640
},
{
"epoch": 1.8338108882521489,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.543005645275116,
"eval_runtime": 14.6871,
"eval_samples_per_second": 10.758,
"eval_steps_per_second": 2.723,
"step": 640
},
{
"epoch": 1.8911174785100286,
"grad_norm": 0.17806316912174225,
"learning_rate": 1.2435530085959885e-05,
"loss": 0.4886,
"step": 660
},
{
"epoch": 1.8911174785100286,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.5081498622894287,
"eval_runtime": 14.6477,
"eval_samples_per_second": 10.787,
"eval_steps_per_second": 2.731,
"step": 660
},
{
"epoch": 1.9484240687679084,
"grad_norm": 18.20897674560547,
"learning_rate": 1.2206303724928367e-05,
"loss": 0.3429,
"step": 680
},
{
"epoch": 1.9484240687679084,
"eval_accuracy": 0.8481012658227848,
"eval_loss": 0.6000381708145142,
"eval_runtime": 14.5629,
"eval_samples_per_second": 10.849,
"eval_steps_per_second": 2.747,
"step": 680
},
{
"epoch": 2.005730659025788,
"grad_norm": 0.07220949977636337,
"learning_rate": 1.197707736389685e-05,
"loss": 0.3591,
"step": 700
},
{
"epoch": 2.005730659025788,
"eval_accuracy": 0.8607594936708861,
"eval_loss": 0.5183639526367188,
"eval_runtime": 14.6159,
"eval_samples_per_second": 10.81,
"eval_steps_per_second": 2.737,
"step": 700
},
{
"epoch": 2.0630372492836675,
"grad_norm": 0.03888562321662903,
"learning_rate": 1.1747851002865332e-05,
"loss": 0.3638,
"step": 720
},
{
"epoch": 2.0630372492836675,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.4008268415927887,
"eval_runtime": 14.6829,
"eval_samples_per_second": 10.761,
"eval_steps_per_second": 2.724,
"step": 720
},
{
"epoch": 2.1203438395415475,
"grad_norm": 0.05230604112148285,
"learning_rate": 1.151862464183381e-05,
"loss": 0.1881,
"step": 740
},
{
"epoch": 2.1203438395415475,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.616079568862915,
"eval_runtime": 14.646,
"eval_samples_per_second": 10.788,
"eval_steps_per_second": 2.731,
"step": 740
},
{
"epoch": 2.177650429799427,
"grad_norm": 0.6790505647659302,
"learning_rate": 1.1289398280802293e-05,
"loss": 0.241,
"step": 760
},
{
"epoch": 2.177650429799427,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.5249369144439697,
"eval_runtime": 14.6423,
"eval_samples_per_second": 10.791,
"eval_steps_per_second": 2.732,
"step": 760
},
{
"epoch": 2.2349570200573066,
"grad_norm": 0.8485791087150574,
"learning_rate": 1.1060171919770775e-05,
"loss": 0.4699,
"step": 780
},
{
"epoch": 2.2349570200573066,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.5322971343994141,
"eval_runtime": 14.6006,
"eval_samples_per_second": 10.821,
"eval_steps_per_second": 2.74,
"step": 780
},
{
"epoch": 2.292263610315186,
"grad_norm": 96.15169525146484,
"learning_rate": 1.0830945558739256e-05,
"loss": 0.3702,
"step": 800
},
{
"epoch": 2.292263610315186,
"eval_accuracy": 0.8481012658227848,
"eval_loss": 0.728390097618103,
"eval_runtime": 14.5807,
"eval_samples_per_second": 10.836,
"eval_steps_per_second": 2.743,
"step": 800
},
{
"epoch": 2.349570200573066,
"grad_norm": 0.1611723154783249,
"learning_rate": 1.0601719197707738e-05,
"loss": 0.4192,
"step": 820
},
{
"epoch": 2.349570200573066,
"eval_accuracy": 0.9050632911392406,
"eval_loss": 0.36709439754486084,
"eval_runtime": 14.5871,
"eval_samples_per_second": 10.831,
"eval_steps_per_second": 2.742,
"step": 820
},
{
"epoch": 2.4068767908309456,
"grad_norm": 0.11072923988103867,
"learning_rate": 1.0372492836676219e-05,
"loss": 0.1747,
"step": 840
},
{
"epoch": 2.4068767908309456,
"eval_accuracy": 0.9050632911392406,
"eval_loss": 0.42927253246307373,
"eval_runtime": 14.6133,
"eval_samples_per_second": 10.812,
"eval_steps_per_second": 2.737,
"step": 840
},
{
"epoch": 2.464183381088825,
"grad_norm": 0.03486654907464981,
"learning_rate": 1.01432664756447e-05,
"loss": 0.347,
"step": 860
},
{
"epoch": 2.464183381088825,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.40468934178352356,
"eval_runtime": 14.6475,
"eval_samples_per_second": 10.787,
"eval_steps_per_second": 2.731,
"step": 860
},
{
"epoch": 2.5214899713467047,
"grad_norm": 0.27154240012168884,
"learning_rate": 9.914040114613181e-06,
"loss": 0.0533,
"step": 880
},
{
"epoch": 2.5214899713467047,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.5134832859039307,
"eval_runtime": 14.6718,
"eval_samples_per_second": 10.769,
"eval_steps_per_second": 2.726,
"step": 880
},
{
"epoch": 2.5787965616045847,
"grad_norm": 24.125070571899414,
"learning_rate": 9.684813753581662e-06,
"loss": 0.2002,
"step": 900
},
{
"epoch": 2.5787965616045847,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.5535210967063904,
"eval_runtime": 14.6419,
"eval_samples_per_second": 10.791,
"eval_steps_per_second": 2.732,
"step": 900
},
{
"epoch": 2.6361031518624642,
"grad_norm": 0.03520410135388374,
"learning_rate": 9.455587392550144e-06,
"loss": 0.0274,
"step": 920
},
{
"epoch": 2.6361031518624642,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.6635323762893677,
"eval_runtime": 14.6418,
"eval_samples_per_second": 10.791,
"eval_steps_per_second": 2.732,
"step": 920
},
{
"epoch": 2.693409742120344,
"grad_norm": 0.09307877719402313,
"learning_rate": 9.226361031518626e-06,
"loss": 0.2339,
"step": 940
},
{
"epoch": 2.693409742120344,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.4939664602279663,
"eval_runtime": 14.6554,
"eval_samples_per_second": 10.781,
"eval_steps_per_second": 2.729,
"step": 940
},
{
"epoch": 2.7507163323782233,
"grad_norm": 80.65755462646484,
"learning_rate": 8.997134670487107e-06,
"loss": 0.3015,
"step": 960
},
{
"epoch": 2.7507163323782233,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.5513517260551453,
"eval_runtime": 14.6022,
"eval_samples_per_second": 10.82,
"eval_steps_per_second": 2.739,
"step": 960
},
{
"epoch": 2.8080229226361033,
"grad_norm": 180.23745727539062,
"learning_rate": 8.767908309455588e-06,
"loss": 0.4222,
"step": 980
},
{
"epoch": 2.8080229226361033,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.5411596298217773,
"eval_runtime": 14.6522,
"eval_samples_per_second": 10.783,
"eval_steps_per_second": 2.73,
"step": 980
},
{
"epoch": 2.865329512893983,
"grad_norm": 106.34879302978516,
"learning_rate": 8.53868194842407e-06,
"loss": 0.3243,
"step": 1000
},
{
"epoch": 2.865329512893983,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.5439683198928833,
"eval_runtime": 14.6662,
"eval_samples_per_second": 10.773,
"eval_steps_per_second": 2.727,
"step": 1000
},
{
"epoch": 2.9226361031518624,
"grad_norm": 43.02892303466797,
"learning_rate": 8.30945558739255e-06,
"loss": 0.3137,
"step": 1020
},
{
"epoch": 2.9226361031518624,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.45336952805519104,
"eval_runtime": 15.5419,
"eval_samples_per_second": 10.166,
"eval_steps_per_second": 2.574,
"step": 1020
},
{
"epoch": 2.9799426934097424,
"grad_norm": 0.05886560305953026,
"learning_rate": 8.080229226361033e-06,
"loss": 0.191,
"step": 1040
},
{
"epoch": 2.9799426934097424,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.6082937121391296,
"eval_runtime": 14.4222,
"eval_samples_per_second": 10.955,
"eval_steps_per_second": 2.774,
"step": 1040
},
{
"epoch": 3.037249283667622,
"grad_norm": 0.0684143528342247,
"learning_rate": 7.851002865329513e-06,
"loss": 0.1213,
"step": 1060
},
{
"epoch": 3.037249283667622,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.5798259377479553,
"eval_runtime": 14.8164,
"eval_samples_per_second": 10.664,
"eval_steps_per_second": 2.7,
"step": 1060
},
{
"epoch": 3.0945558739255015,
"grad_norm": 0.08387450873851776,
"learning_rate": 7.6217765042979954e-06,
"loss": 0.1582,
"step": 1080
},
{
"epoch": 3.0945558739255015,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.48295101523399353,
"eval_runtime": 14.6812,
"eval_samples_per_second": 10.762,
"eval_steps_per_second": 2.725,
"step": 1080
},
{
"epoch": 3.151862464183381,
"grad_norm": 0.056213777512311935,
"learning_rate": 7.392550143266476e-06,
"loss": 0.0546,
"step": 1100
},
{
"epoch": 3.151862464183381,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.7038730382919312,
"eval_runtime": 14.5146,
"eval_samples_per_second": 10.886,
"eval_steps_per_second": 2.756,
"step": 1100
},
{
"epoch": 3.2091690544412605,
"grad_norm": 0.013059821911156178,
"learning_rate": 7.163323782234957e-06,
"loss": 0.0387,
"step": 1120
},
{
"epoch": 3.2091690544412605,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.6058567762374878,
"eval_runtime": 14.7233,
"eval_samples_per_second": 10.731,
"eval_steps_per_second": 2.717,
"step": 1120
},
{
"epoch": 3.2664756446991405,
"grad_norm": 15.5554780960083,
"learning_rate": 6.934097421203439e-06,
"loss": 0.4619,
"step": 1140
},
{
"epoch": 3.2664756446991405,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.6933996677398682,
"eval_runtime": 14.6193,
"eval_samples_per_second": 10.808,
"eval_steps_per_second": 2.736,
"step": 1140
},
{
"epoch": 3.32378223495702,
"grad_norm": 2.1167819499969482,
"learning_rate": 6.70487106017192e-06,
"loss": 0.2789,
"step": 1160
},
{
"epoch": 3.32378223495702,
"eval_accuracy": 0.9050632911392406,
"eval_loss": 0.524710476398468,
"eval_runtime": 14.6186,
"eval_samples_per_second": 10.808,
"eval_steps_per_second": 2.736,
"step": 1160
},
{
"epoch": 3.3810888252148996,
"grad_norm": 0.020894192159175873,
"learning_rate": 6.475644699140402e-06,
"loss": 0.1361,
"step": 1180
},
{
"epoch": 3.3810888252148996,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.6307375431060791,
"eval_runtime": 14.6338,
"eval_samples_per_second": 10.797,
"eval_steps_per_second": 2.733,
"step": 1180
},
{
"epoch": 3.4383954154727796,
"grad_norm": 106.9233627319336,
"learning_rate": 6.246418338108883e-06,
"loss": 0.0475,
"step": 1200
},
{
"epoch": 3.4383954154727796,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.5455241203308105,
"eval_runtime": 14.6106,
"eval_samples_per_second": 10.814,
"eval_steps_per_second": 2.738,
"step": 1200
},
{
"epoch": 3.495702005730659,
"grad_norm": 10.43300724029541,
"learning_rate": 6.017191977077364e-06,
"loss": 0.2889,
"step": 1220
},
{
"epoch": 3.495702005730659,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.5864837169647217,
"eval_runtime": 14.7,
"eval_samples_per_second": 10.748,
"eval_steps_per_second": 2.721,
"step": 1220
},
{
"epoch": 3.5530085959885387,
"grad_norm": 0.143876850605011,
"learning_rate": 5.787965616045845e-06,
"loss": 0.2507,
"step": 1240
},
{
"epoch": 3.5530085959885387,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.5028768181800842,
"eval_runtime": 14.6373,
"eval_samples_per_second": 10.794,
"eval_steps_per_second": 2.733,
"step": 1240
},
{
"epoch": 3.6103151862464182,
"grad_norm": 41.49633026123047,
"learning_rate": 5.558739255014327e-06,
"loss": 0.1476,
"step": 1260
},
{
"epoch": 3.6103151862464182,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.651742160320282,
"eval_runtime": 14.5921,
"eval_samples_per_second": 10.828,
"eval_steps_per_second": 2.741,
"step": 1260
},
{
"epoch": 3.6676217765042978,
"grad_norm": 0.19821767508983612,
"learning_rate": 5.3295128939828086e-06,
"loss": 0.0709,
"step": 1280
},
{
"epoch": 3.6676217765042978,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.5607478618621826,
"eval_runtime": 14.6558,
"eval_samples_per_second": 10.781,
"eval_steps_per_second": 2.729,
"step": 1280
},
{
"epoch": 3.7249283667621778,
"grad_norm": 0.014833999797701836,
"learning_rate": 5.10028653295129e-06,
"loss": 0.2416,
"step": 1300
},
{
"epoch": 3.7249283667621778,
"eval_accuracy": 0.8670886075949367,
"eval_loss": 0.6906114220619202,
"eval_runtime": 14.699,
"eval_samples_per_second": 10.749,
"eval_steps_per_second": 2.721,
"step": 1300
},
{
"epoch": 3.7822349570200573,
"grad_norm": 13.687612533569336,
"learning_rate": 4.871060171919771e-06,
"loss": 0.2482,
"step": 1320
},
{
"epoch": 3.7822349570200573,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.45231887698173523,
"eval_runtime": 14.6807,
"eval_samples_per_second": 10.762,
"eval_steps_per_second": 2.725,
"step": 1320
},
{
"epoch": 3.839541547277937,
"grad_norm": 0.014498379081487656,
"learning_rate": 4.641833810888253e-06,
"loss": 0.1591,
"step": 1340
},
{
"epoch": 3.839541547277937,
"eval_accuracy": 0.9177215189873418,
"eval_loss": 0.3677010238170624,
"eval_runtime": 14.6812,
"eval_samples_per_second": 10.762,
"eval_steps_per_second": 2.725,
"step": 1340
},
{
"epoch": 3.896848137535817,
"grad_norm": 0.2034488171339035,
"learning_rate": 4.412607449856734e-06,
"loss": 0.1728,
"step": 1360
},
{
"epoch": 3.896848137535817,
"eval_accuracy": 0.9050632911392406,
"eval_loss": 0.4237450659275055,
"eval_runtime": 14.6536,
"eval_samples_per_second": 10.782,
"eval_steps_per_second": 2.73,
"step": 1360
},
{
"epoch": 3.9541547277936964,
"grad_norm": 1.0174587965011597,
"learning_rate": 4.1833810888252155e-06,
"loss": 0.1061,
"step": 1380
},
{
"epoch": 3.9541547277936964,
"eval_accuracy": 0.9240506329113924,
"eval_loss": 0.37083700299263,
"eval_runtime": 14.6215,
"eval_samples_per_second": 10.806,
"eval_steps_per_second": 2.736,
"step": 1380
},
{
"epoch": 4.011461318051576,
"grad_norm": 0.23911085724830627,
"learning_rate": 3.954154727793696e-06,
"loss": 0.1461,
"step": 1400
},
{
"epoch": 4.011461318051576,
"eval_accuracy": 0.9050632911392406,
"eval_loss": 0.4641564190387726,
"eval_runtime": 14.6444,
"eval_samples_per_second": 10.789,
"eval_steps_per_second": 2.731,
"step": 1400
},
{
"epoch": 4.0687679083094554,
"grad_norm": 0.13148854672908783,
"learning_rate": 3.724928366762178e-06,
"loss": 0.0671,
"step": 1420
},
{
"epoch": 4.0687679083094554,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.556703507900238,
"eval_runtime": 14.6395,
"eval_samples_per_second": 10.793,
"eval_steps_per_second": 2.732,
"step": 1420
},
{
"epoch": 4.126074498567335,
"grad_norm": 0.1307491660118103,
"learning_rate": 3.4957020057306597e-06,
"loss": 0.0363,
"step": 1440
},
{
"epoch": 4.126074498567335,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.6240283846855164,
"eval_runtime": 14.6021,
"eval_samples_per_second": 10.82,
"eval_steps_per_second": 2.739,
"step": 1440
},
{
"epoch": 4.1833810888252145,
"grad_norm": 0.055873971432447433,
"learning_rate": 3.2664756446991407e-06,
"loss": 0.1257,
"step": 1460
},
{
"epoch": 4.1833810888252145,
"eval_accuracy": 0.8734177215189873,
"eval_loss": 0.7053503394126892,
"eval_runtime": 14.6002,
"eval_samples_per_second": 10.822,
"eval_steps_per_second": 2.74,
"step": 1460
},
{
"epoch": 4.240687679083095,
"grad_norm": 0.10310907661914825,
"learning_rate": 3.037249283667622e-06,
"loss": 0.1307,
"step": 1480
},
{
"epoch": 4.240687679083095,
"eval_accuracy": 0.8860759493670886,
"eval_loss": 0.6526200771331787,
"eval_runtime": 14.6477,
"eval_samples_per_second": 10.787,
"eval_steps_per_second": 2.731,
"step": 1480
},
{
"epoch": 4.2979942693409745,
"grad_norm": 0.09674423187971115,
"learning_rate": 2.8080229226361035e-06,
"loss": 0.226,
"step": 1500
},
{
"epoch": 4.2979942693409745,
"eval_accuracy": 0.879746835443038,
"eval_loss": 0.588349461555481,
"eval_runtime": 14.6299,
"eval_samples_per_second": 10.8,
"eval_steps_per_second": 2.734,
"step": 1500
},
{
"epoch": 4.355300859598854,
"grad_norm": 3.432967185974121,
"learning_rate": 2.5787965616045845e-06,
"loss": 0.0714,
"step": 1520
},
{
"epoch": 4.355300859598854,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.5381926894187927,
"eval_runtime": 15.6025,
"eval_samples_per_second": 10.127,
"eval_steps_per_second": 2.564,
"step": 1520
},
{
"epoch": 4.412607449856734,
"grad_norm": 0.03264419734477997,
"learning_rate": 2.3495702005730663e-06,
"loss": 0.0617,
"step": 1540
},
{
"epoch": 4.412607449856734,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.6029611229896545,
"eval_runtime": 14.4132,
"eval_samples_per_second": 10.962,
"eval_steps_per_second": 2.775,
"step": 1540
},
{
"epoch": 4.469914040114613,
"grad_norm": 0.06593719124794006,
"learning_rate": 2.1203438395415473e-06,
"loss": 0.0802,
"step": 1560
},
{
"epoch": 4.469914040114613,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.567659318447113,
"eval_runtime": 14.8121,
"eval_samples_per_second": 10.667,
"eval_steps_per_second": 2.7,
"step": 1560
},
{
"epoch": 4.527220630372493,
"grad_norm": 0.1013946682214737,
"learning_rate": 1.8911174785100289e-06,
"loss": 0.2404,
"step": 1580
},
{
"epoch": 4.527220630372493,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.5836894512176514,
"eval_runtime": 14.7362,
"eval_samples_per_second": 10.722,
"eval_steps_per_second": 2.714,
"step": 1580
},
{
"epoch": 4.584527220630372,
"grad_norm": 6.956309795379639,
"learning_rate": 1.66189111747851e-06,
"loss": 0.2311,
"step": 1600
},
{
"epoch": 4.584527220630372,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.6191691160202026,
"eval_runtime": 14.4896,
"eval_samples_per_second": 10.904,
"eval_steps_per_second": 2.761,
"step": 1600
},
{
"epoch": 4.641833810888253,
"grad_norm": 0.13025854527950287,
"learning_rate": 1.4326647564469915e-06,
"loss": 0.0031,
"step": 1620
},
{
"epoch": 4.641833810888253,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.6153239011764526,
"eval_runtime": 14.714,
"eval_samples_per_second": 10.738,
"eval_steps_per_second": 2.718,
"step": 1620
},
{
"epoch": 4.699140401146132,
"grad_norm": 0.02252735011279583,
"learning_rate": 1.2034383954154729e-06,
"loss": 0.1621,
"step": 1640
},
{
"epoch": 4.699140401146132,
"eval_accuracy": 0.8924050632911392,
"eval_loss": 0.6008380651473999,
"eval_runtime": 14.6006,
"eval_samples_per_second": 10.821,
"eval_steps_per_second": 2.74,
"step": 1640
},
{
"epoch": 4.756446991404012,
"grad_norm": 0.03680579736828804,
"learning_rate": 9.742120343839543e-07,
"loss": 0.0841,
"step": 1660
},
{
"epoch": 4.756446991404012,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.5886847376823425,
"eval_runtime": 14.6522,
"eval_samples_per_second": 10.783,
"eval_steps_per_second": 2.73,
"step": 1660
},
{
"epoch": 4.813753581661891,
"grad_norm": 0.027355097234249115,
"learning_rate": 7.449856733524357e-07,
"loss": 0.0014,
"step": 1680
},
{
"epoch": 4.813753581661891,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.586622416973114,
"eval_runtime": 14.7046,
"eval_samples_per_second": 10.745,
"eval_steps_per_second": 2.72,
"step": 1680
},
{
"epoch": 4.871060171919771,
"grad_norm": 0.011458040215075016,
"learning_rate": 5.15759312320917e-07,
"loss": 0.1199,
"step": 1700
},
{
"epoch": 4.871060171919771,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.590861976146698,
"eval_runtime": 14.6646,
"eval_samples_per_second": 10.774,
"eval_steps_per_second": 2.728,
"step": 1700
},
{
"epoch": 4.92836676217765,
"grad_norm": 0.025075102224946022,
"learning_rate": 2.865329512893983e-07,
"loss": 0.0124,
"step": 1720
},
{
"epoch": 4.92836676217765,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.5905599594116211,
"eval_runtime": 14.686,
"eval_samples_per_second": 10.759,
"eval_steps_per_second": 2.724,
"step": 1720
},
{
"epoch": 4.98567335243553,
"grad_norm": 0.021264472976326942,
"learning_rate": 5.730659025787966e-08,
"loss": 0.046,
"step": 1740
},
{
"epoch": 4.98567335243553,
"eval_accuracy": 0.8987341772151899,
"eval_loss": 0.5924892425537109,
"eval_runtime": 14.595,
"eval_samples_per_second": 10.826,
"eval_steps_per_second": 2.741,
"step": 1740
}
],
"logging_steps": 20,
"max_steps": 1745,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5599966461345732.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}