|
{ |
|
"best_metric": 0.2911098897457123, |
|
"best_model_checkpoint": "./results/checkpoint-280", |
|
"epoch": 5.0, |
|
"eval_steps": 20, |
|
"global_step": 1745, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05730659025787966, |
|
"grad_norm": 14.95106029510498, |
|
"learning_rate": 1.9770773638968482e-05, |
|
"loss": 0.5908, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05730659025787966, |
|
"eval_accuracy": 0.7974683544303798, |
|
"eval_loss": 0.44761696457862854, |
|
"eval_runtime": 12.826, |
|
"eval_samples_per_second": 12.319, |
|
"eval_steps_per_second": 3.119, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11461318051575932, |
|
"grad_norm": 16.47698402404785, |
|
"learning_rate": 1.9541547277936966e-05, |
|
"loss": 0.543, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11461318051575932, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.4422585070133209, |
|
"eval_runtime": 13.5793, |
|
"eval_samples_per_second": 11.635, |
|
"eval_steps_per_second": 2.946, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.17191977077363896, |
|
"grad_norm": 22.875091552734375, |
|
"learning_rate": 1.9312320916905443e-05, |
|
"loss": 0.5093, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17191977077363896, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.5881978869438171, |
|
"eval_runtime": 14.7375, |
|
"eval_samples_per_second": 10.721, |
|
"eval_steps_per_second": 2.714, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22922636103151864, |
|
"grad_norm": 6.222044944763184, |
|
"learning_rate": 1.9083094555873927e-05, |
|
"loss": 0.5186, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22922636103151864, |
|
"eval_accuracy": 0.7658227848101266, |
|
"eval_loss": 0.6422034502029419, |
|
"eval_runtime": 14.6414, |
|
"eval_samples_per_second": 10.791, |
|
"eval_steps_per_second": 2.732, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.28653295128939826, |
|
"grad_norm": 10.637746810913086, |
|
"learning_rate": 1.8853868194842408e-05, |
|
"loss": 0.502, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.28653295128939826, |
|
"eval_accuracy": 0.7658227848101266, |
|
"eval_loss": 0.9381818175315857, |
|
"eval_runtime": 14.572, |
|
"eval_samples_per_second": 10.843, |
|
"eval_steps_per_second": 2.745, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3438395415472779, |
|
"grad_norm": 8.144033432006836, |
|
"learning_rate": 1.8624641833810892e-05, |
|
"loss": 0.573, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3438395415472779, |
|
"eval_accuracy": 0.8227848101265823, |
|
"eval_loss": 0.4263954758644104, |
|
"eval_runtime": 14.6662, |
|
"eval_samples_per_second": 10.773, |
|
"eval_steps_per_second": 2.727, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.40114613180515757, |
|
"grad_norm": 0.44048359990119934, |
|
"learning_rate": 1.8395415472779372e-05, |
|
"loss": 0.5269, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.40114613180515757, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.5453027486801147, |
|
"eval_runtime": 14.5869, |
|
"eval_samples_per_second": 10.832, |
|
"eval_steps_per_second": 2.742, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4584527220630373, |
|
"grad_norm": 18.155141830444336, |
|
"learning_rate": 1.8166189111747853e-05, |
|
"loss": 0.3545, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4584527220630373, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.4540826678276062, |
|
"eval_runtime": 14.6402, |
|
"eval_samples_per_second": 10.792, |
|
"eval_steps_per_second": 2.732, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5157593123209169, |
|
"grad_norm": 0.482028603553772, |
|
"learning_rate": 1.7936962750716333e-05, |
|
"loss": 0.4449, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5157593123209169, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.43535691499710083, |
|
"eval_runtime": 14.6919, |
|
"eval_samples_per_second": 10.754, |
|
"eval_steps_per_second": 2.723, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5730659025787965, |
|
"grad_norm": 108.88398742675781, |
|
"learning_rate": 1.7707736389684814e-05, |
|
"loss": 0.3868, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5730659025787965, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.8784106373786926, |
|
"eval_runtime": 14.6371, |
|
"eval_samples_per_second": 10.794, |
|
"eval_steps_per_second": 2.733, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6303724928366762, |
|
"grad_norm": 12.77889347076416, |
|
"learning_rate": 1.7478510028653298e-05, |
|
"loss": 0.7576, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6303724928366762, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.38221749663352966, |
|
"eval_runtime": 14.6383, |
|
"eval_samples_per_second": 10.794, |
|
"eval_steps_per_second": 2.733, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6876790830945558, |
|
"grad_norm": 13.416525840759277, |
|
"learning_rate": 1.724928366762178e-05, |
|
"loss": 0.1956, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6876790830945558, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.4667538106441498, |
|
"eval_runtime": 14.6113, |
|
"eval_samples_per_second": 10.814, |
|
"eval_steps_per_second": 2.738, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7449856733524355, |
|
"grad_norm": 10.141700744628906, |
|
"learning_rate": 1.702005730659026e-05, |
|
"loss": 0.4942, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7449856733524355, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.5736417174339294, |
|
"eval_runtime": 14.603, |
|
"eval_samples_per_second": 10.82, |
|
"eval_steps_per_second": 2.739, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8022922636103151, |
|
"grad_norm": 23.185056686401367, |
|
"learning_rate": 1.679083094555874e-05, |
|
"loss": 0.4762, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8022922636103151, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.2911098897457123, |
|
"eval_runtime": 14.6519, |
|
"eval_samples_per_second": 10.784, |
|
"eval_steps_per_second": 2.73, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8595988538681948, |
|
"grad_norm": 46.526451110839844, |
|
"learning_rate": 1.6561604584527223e-05, |
|
"loss": 0.4136, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8595988538681948, |
|
"eval_accuracy": 0.8607594936708861, |
|
"eval_loss": 0.3629298508167267, |
|
"eval_runtime": 14.6627, |
|
"eval_samples_per_second": 10.776, |
|
"eval_steps_per_second": 2.728, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9169054441260746, |
|
"grad_norm": 5.966210842132568, |
|
"learning_rate": 1.6332378223495704e-05, |
|
"loss": 0.5865, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9169054441260746, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.9794216752052307, |
|
"eval_runtime": 14.6593, |
|
"eval_samples_per_second": 10.778, |
|
"eval_steps_per_second": 2.729, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9742120343839542, |
|
"grad_norm": 3.5877606868743896, |
|
"learning_rate": 1.6103151862464185e-05, |
|
"loss": 0.3758, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9742120343839542, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.46775683760643005, |
|
"eval_runtime": 14.6442, |
|
"eval_samples_per_second": 10.789, |
|
"eval_steps_per_second": 2.731, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0315186246418337, |
|
"grad_norm": 5.313683986663818, |
|
"learning_rate": 1.5873925501432665e-05, |
|
"loss": 0.4285, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0315186246418337, |
|
"eval_accuracy": 0.8670886075949367, |
|
"eval_loss": 0.5543066263198853, |
|
"eval_runtime": 14.6827, |
|
"eval_samples_per_second": 10.761, |
|
"eval_steps_per_second": 2.724, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0888252148997135, |
|
"grad_norm": 10.655978202819824, |
|
"learning_rate": 1.5644699140401146e-05, |
|
"loss": 0.44, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0888252148997135, |
|
"eval_accuracy": 0.8607594936708861, |
|
"eval_loss": 0.5150261521339417, |
|
"eval_runtime": 14.6825, |
|
"eval_samples_per_second": 10.761, |
|
"eval_steps_per_second": 2.724, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.146131805157593, |
|
"grad_norm": 0.08064723014831543, |
|
"learning_rate": 1.541547277936963e-05, |
|
"loss": 0.3573, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.146131805157593, |
|
"eval_accuracy": 0.8607594936708861, |
|
"eval_loss": 0.563529908657074, |
|
"eval_runtime": 14.6349, |
|
"eval_samples_per_second": 10.796, |
|
"eval_steps_per_second": 2.733, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2034383954154728, |
|
"grad_norm": 0.46097293496131897, |
|
"learning_rate": 1.518624641833811e-05, |
|
"loss": 0.4187, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2034383954154728, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.6609386205673218, |
|
"eval_runtime": 14.5517, |
|
"eval_samples_per_second": 10.858, |
|
"eval_steps_per_second": 2.749, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2607449856733524, |
|
"grad_norm": 0.37571266293525696, |
|
"learning_rate": 1.495702005730659e-05, |
|
"loss": 0.3742, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.2607449856733524, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.5912802815437317, |
|
"eval_runtime": 14.594, |
|
"eval_samples_per_second": 10.826, |
|
"eval_steps_per_second": 2.741, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3180515759312321, |
|
"grad_norm": 0.4662785828113556, |
|
"learning_rate": 1.4727793696275073e-05, |
|
"loss": 0.5179, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.3180515759312321, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.3983699679374695, |
|
"eval_runtime": 14.6982, |
|
"eval_samples_per_second": 10.75, |
|
"eval_steps_per_second": 2.721, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.3753581661891117, |
|
"grad_norm": 3.044969081878662, |
|
"learning_rate": 1.4498567335243553e-05, |
|
"loss": 0.1685, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3753581661891117, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.5606595873832703, |
|
"eval_runtime": 14.5479, |
|
"eval_samples_per_second": 10.861, |
|
"eval_steps_per_second": 2.75, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.4326647564469914, |
|
"grad_norm": 9.852724075317383, |
|
"learning_rate": 1.4269340974212036e-05, |
|
"loss": 0.5284, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4326647564469914, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.35282623767852783, |
|
"eval_runtime": 14.6738, |
|
"eval_samples_per_second": 10.767, |
|
"eval_steps_per_second": 2.726, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4899713467048712, |
|
"grad_norm": 25.850496292114258, |
|
"learning_rate": 1.4040114613180518e-05, |
|
"loss": 0.4246, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4899713467048712, |
|
"eval_accuracy": 0.8607594936708861, |
|
"eval_loss": 0.5857312083244324, |
|
"eval_runtime": 15.5144, |
|
"eval_samples_per_second": 10.184, |
|
"eval_steps_per_second": 2.578, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.5472779369627507, |
|
"grad_norm": 7.516841888427734, |
|
"learning_rate": 1.3810888252148997e-05, |
|
"loss": 0.2419, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.5472779369627507, |
|
"eval_accuracy": 0.9050632911392406, |
|
"eval_loss": 0.34958717226982117, |
|
"eval_runtime": 14.4393, |
|
"eval_samples_per_second": 10.942, |
|
"eval_steps_per_second": 2.77, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6045845272206303, |
|
"grad_norm": 0.07038611173629761, |
|
"learning_rate": 1.3581661891117479e-05, |
|
"loss": 0.4416, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.6045845272206303, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.4946177005767822, |
|
"eval_runtime": 14.6819, |
|
"eval_samples_per_second": 10.762, |
|
"eval_steps_per_second": 2.724, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.66189111747851, |
|
"grad_norm": 9.443480491638184, |
|
"learning_rate": 1.3352435530085961e-05, |
|
"loss": 0.4426, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.66189111747851, |
|
"eval_accuracy": 0.9050632911392406, |
|
"eval_loss": 0.34582754969596863, |
|
"eval_runtime": 14.6267, |
|
"eval_samples_per_second": 10.802, |
|
"eval_steps_per_second": 2.735, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7191977077363898, |
|
"grad_norm": 0.07343020290136337, |
|
"learning_rate": 1.3123209169054444e-05, |
|
"loss": 0.2122, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7191977077363898, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.6184278130531311, |
|
"eval_runtime": 14.6949, |
|
"eval_samples_per_second": 10.752, |
|
"eval_steps_per_second": 2.722, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7765042979942693, |
|
"grad_norm": 0.03269320726394653, |
|
"learning_rate": 1.2893982808022924e-05, |
|
"loss": 0.1734, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.7765042979942693, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.7278411388397217, |
|
"eval_runtime": 14.5541, |
|
"eval_samples_per_second": 10.856, |
|
"eval_steps_per_second": 2.748, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.8338108882521489, |
|
"grad_norm": 0.021946750581264496, |
|
"learning_rate": 1.2664756446991405e-05, |
|
"loss": 0.2314, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.8338108882521489, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.543005645275116, |
|
"eval_runtime": 14.6871, |
|
"eval_samples_per_second": 10.758, |
|
"eval_steps_per_second": 2.723, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.8911174785100286, |
|
"grad_norm": 0.17806316912174225, |
|
"learning_rate": 1.2435530085959885e-05, |
|
"loss": 0.4886, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8911174785100286, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.5081498622894287, |
|
"eval_runtime": 14.6477, |
|
"eval_samples_per_second": 10.787, |
|
"eval_steps_per_second": 2.731, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.9484240687679084, |
|
"grad_norm": 18.20897674560547, |
|
"learning_rate": 1.2206303724928367e-05, |
|
"loss": 0.3429, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.9484240687679084, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.6000381708145142, |
|
"eval_runtime": 14.5629, |
|
"eval_samples_per_second": 10.849, |
|
"eval_steps_per_second": 2.747, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.005730659025788, |
|
"grad_norm": 0.07220949977636337, |
|
"learning_rate": 1.197707736389685e-05, |
|
"loss": 0.3591, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.005730659025788, |
|
"eval_accuracy": 0.8607594936708861, |
|
"eval_loss": 0.5183639526367188, |
|
"eval_runtime": 14.6159, |
|
"eval_samples_per_second": 10.81, |
|
"eval_steps_per_second": 2.737, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.0630372492836675, |
|
"grad_norm": 0.03888562321662903, |
|
"learning_rate": 1.1747851002865332e-05, |
|
"loss": 0.3638, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.0630372492836675, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.4008268415927887, |
|
"eval_runtime": 14.6829, |
|
"eval_samples_per_second": 10.761, |
|
"eval_steps_per_second": 2.724, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.1203438395415475, |
|
"grad_norm": 0.05230604112148285, |
|
"learning_rate": 1.151862464183381e-05, |
|
"loss": 0.1881, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.1203438395415475, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.616079568862915, |
|
"eval_runtime": 14.646, |
|
"eval_samples_per_second": 10.788, |
|
"eval_steps_per_second": 2.731, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.177650429799427, |
|
"grad_norm": 0.6790505647659302, |
|
"learning_rate": 1.1289398280802293e-05, |
|
"loss": 0.241, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.177650429799427, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.5249369144439697, |
|
"eval_runtime": 14.6423, |
|
"eval_samples_per_second": 10.791, |
|
"eval_steps_per_second": 2.732, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.2349570200573066, |
|
"grad_norm": 0.8485791087150574, |
|
"learning_rate": 1.1060171919770775e-05, |
|
"loss": 0.4699, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.2349570200573066, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.5322971343994141, |
|
"eval_runtime": 14.6006, |
|
"eval_samples_per_second": 10.821, |
|
"eval_steps_per_second": 2.74, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.292263610315186, |
|
"grad_norm": 96.15169525146484, |
|
"learning_rate": 1.0830945558739256e-05, |
|
"loss": 0.3702, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.292263610315186, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.728390097618103, |
|
"eval_runtime": 14.5807, |
|
"eval_samples_per_second": 10.836, |
|
"eval_steps_per_second": 2.743, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.349570200573066, |
|
"grad_norm": 0.1611723154783249, |
|
"learning_rate": 1.0601719197707738e-05, |
|
"loss": 0.4192, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.349570200573066, |
|
"eval_accuracy": 0.9050632911392406, |
|
"eval_loss": 0.36709439754486084, |
|
"eval_runtime": 14.5871, |
|
"eval_samples_per_second": 10.831, |
|
"eval_steps_per_second": 2.742, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.4068767908309456, |
|
"grad_norm": 0.11072923988103867, |
|
"learning_rate": 1.0372492836676219e-05, |
|
"loss": 0.1747, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.4068767908309456, |
|
"eval_accuracy": 0.9050632911392406, |
|
"eval_loss": 0.42927253246307373, |
|
"eval_runtime": 14.6133, |
|
"eval_samples_per_second": 10.812, |
|
"eval_steps_per_second": 2.737, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.464183381088825, |
|
"grad_norm": 0.03486654907464981, |
|
"learning_rate": 1.01432664756447e-05, |
|
"loss": 0.347, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.464183381088825, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.40468934178352356, |
|
"eval_runtime": 14.6475, |
|
"eval_samples_per_second": 10.787, |
|
"eval_steps_per_second": 2.731, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.5214899713467047, |
|
"grad_norm": 0.27154240012168884, |
|
"learning_rate": 9.914040114613181e-06, |
|
"loss": 0.0533, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.5214899713467047, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.5134832859039307, |
|
"eval_runtime": 14.6718, |
|
"eval_samples_per_second": 10.769, |
|
"eval_steps_per_second": 2.726, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.5787965616045847, |
|
"grad_norm": 24.125070571899414, |
|
"learning_rate": 9.684813753581662e-06, |
|
"loss": 0.2002, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.5787965616045847, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.5535210967063904, |
|
"eval_runtime": 14.6419, |
|
"eval_samples_per_second": 10.791, |
|
"eval_steps_per_second": 2.732, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.6361031518624642, |
|
"grad_norm": 0.03520410135388374, |
|
"learning_rate": 9.455587392550144e-06, |
|
"loss": 0.0274, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.6361031518624642, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.6635323762893677, |
|
"eval_runtime": 14.6418, |
|
"eval_samples_per_second": 10.791, |
|
"eval_steps_per_second": 2.732, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.693409742120344, |
|
"grad_norm": 0.09307877719402313, |
|
"learning_rate": 9.226361031518626e-06, |
|
"loss": 0.2339, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.693409742120344, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.4939664602279663, |
|
"eval_runtime": 14.6554, |
|
"eval_samples_per_second": 10.781, |
|
"eval_steps_per_second": 2.729, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.7507163323782233, |
|
"grad_norm": 80.65755462646484, |
|
"learning_rate": 8.997134670487107e-06, |
|
"loss": 0.3015, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.7507163323782233, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.5513517260551453, |
|
"eval_runtime": 14.6022, |
|
"eval_samples_per_second": 10.82, |
|
"eval_steps_per_second": 2.739, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.8080229226361033, |
|
"grad_norm": 180.23745727539062, |
|
"learning_rate": 8.767908309455588e-06, |
|
"loss": 0.4222, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.8080229226361033, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.5411596298217773, |
|
"eval_runtime": 14.6522, |
|
"eval_samples_per_second": 10.783, |
|
"eval_steps_per_second": 2.73, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.865329512893983, |
|
"grad_norm": 106.34879302978516, |
|
"learning_rate": 8.53868194842407e-06, |
|
"loss": 0.3243, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.865329512893983, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.5439683198928833, |
|
"eval_runtime": 14.6662, |
|
"eval_samples_per_second": 10.773, |
|
"eval_steps_per_second": 2.727, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.9226361031518624, |
|
"grad_norm": 43.02892303466797, |
|
"learning_rate": 8.30945558739255e-06, |
|
"loss": 0.3137, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.9226361031518624, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.45336952805519104, |
|
"eval_runtime": 15.5419, |
|
"eval_samples_per_second": 10.166, |
|
"eval_steps_per_second": 2.574, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.9799426934097424, |
|
"grad_norm": 0.05886560305953026, |
|
"learning_rate": 8.080229226361033e-06, |
|
"loss": 0.191, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.9799426934097424, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.6082937121391296, |
|
"eval_runtime": 14.4222, |
|
"eval_samples_per_second": 10.955, |
|
"eval_steps_per_second": 2.774, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.037249283667622, |
|
"grad_norm": 0.0684143528342247, |
|
"learning_rate": 7.851002865329513e-06, |
|
"loss": 0.1213, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.037249283667622, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.5798259377479553, |
|
"eval_runtime": 14.8164, |
|
"eval_samples_per_second": 10.664, |
|
"eval_steps_per_second": 2.7, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.0945558739255015, |
|
"grad_norm": 0.08387450873851776, |
|
"learning_rate": 7.6217765042979954e-06, |
|
"loss": 0.1582, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.0945558739255015, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.48295101523399353, |
|
"eval_runtime": 14.6812, |
|
"eval_samples_per_second": 10.762, |
|
"eval_steps_per_second": 2.725, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.151862464183381, |
|
"grad_norm": 0.056213777512311935, |
|
"learning_rate": 7.392550143266476e-06, |
|
"loss": 0.0546, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.151862464183381, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.7038730382919312, |
|
"eval_runtime": 14.5146, |
|
"eval_samples_per_second": 10.886, |
|
"eval_steps_per_second": 2.756, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.2091690544412605, |
|
"grad_norm": 0.013059821911156178, |
|
"learning_rate": 7.163323782234957e-06, |
|
"loss": 0.0387, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.2091690544412605, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.6058567762374878, |
|
"eval_runtime": 14.7233, |
|
"eval_samples_per_second": 10.731, |
|
"eval_steps_per_second": 2.717, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.2664756446991405, |
|
"grad_norm": 15.5554780960083, |
|
"learning_rate": 6.934097421203439e-06, |
|
"loss": 0.4619, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.2664756446991405, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.6933996677398682, |
|
"eval_runtime": 14.6193, |
|
"eval_samples_per_second": 10.808, |
|
"eval_steps_per_second": 2.736, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.32378223495702, |
|
"grad_norm": 2.1167819499969482, |
|
"learning_rate": 6.70487106017192e-06, |
|
"loss": 0.2789, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.32378223495702, |
|
"eval_accuracy": 0.9050632911392406, |
|
"eval_loss": 0.524710476398468, |
|
"eval_runtime": 14.6186, |
|
"eval_samples_per_second": 10.808, |
|
"eval_steps_per_second": 2.736, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.3810888252148996, |
|
"grad_norm": 0.020894192159175873, |
|
"learning_rate": 6.475644699140402e-06, |
|
"loss": 0.1361, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.3810888252148996, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.6307375431060791, |
|
"eval_runtime": 14.6338, |
|
"eval_samples_per_second": 10.797, |
|
"eval_steps_per_second": 2.733, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.4383954154727796, |
|
"grad_norm": 106.9233627319336, |
|
"learning_rate": 6.246418338108883e-06, |
|
"loss": 0.0475, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.4383954154727796, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.5455241203308105, |
|
"eval_runtime": 14.6106, |
|
"eval_samples_per_second": 10.814, |
|
"eval_steps_per_second": 2.738, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.495702005730659, |
|
"grad_norm": 10.43300724029541, |
|
"learning_rate": 6.017191977077364e-06, |
|
"loss": 0.2889, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.495702005730659, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.5864837169647217, |
|
"eval_runtime": 14.7, |
|
"eval_samples_per_second": 10.748, |
|
"eval_steps_per_second": 2.721, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.5530085959885387, |
|
"grad_norm": 0.143876850605011, |
|
"learning_rate": 5.787965616045845e-06, |
|
"loss": 0.2507, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.5530085959885387, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.5028768181800842, |
|
"eval_runtime": 14.6373, |
|
"eval_samples_per_second": 10.794, |
|
"eval_steps_per_second": 2.733, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.6103151862464182, |
|
"grad_norm": 41.49633026123047, |
|
"learning_rate": 5.558739255014327e-06, |
|
"loss": 0.1476, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.6103151862464182, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.651742160320282, |
|
"eval_runtime": 14.5921, |
|
"eval_samples_per_second": 10.828, |
|
"eval_steps_per_second": 2.741, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.6676217765042978, |
|
"grad_norm": 0.19821767508983612, |
|
"learning_rate": 5.3295128939828086e-06, |
|
"loss": 0.0709, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.6676217765042978, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.5607478618621826, |
|
"eval_runtime": 14.6558, |
|
"eval_samples_per_second": 10.781, |
|
"eval_steps_per_second": 2.729, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.7249283667621778, |
|
"grad_norm": 0.014833999797701836, |
|
"learning_rate": 5.10028653295129e-06, |
|
"loss": 0.2416, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.7249283667621778, |
|
"eval_accuracy": 0.8670886075949367, |
|
"eval_loss": 0.6906114220619202, |
|
"eval_runtime": 14.699, |
|
"eval_samples_per_second": 10.749, |
|
"eval_steps_per_second": 2.721, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.7822349570200573, |
|
"grad_norm": 13.687612533569336, |
|
"learning_rate": 4.871060171919771e-06, |
|
"loss": 0.2482, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.7822349570200573, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.45231887698173523, |
|
"eval_runtime": 14.6807, |
|
"eval_samples_per_second": 10.762, |
|
"eval_steps_per_second": 2.725, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.839541547277937, |
|
"grad_norm": 0.014498379081487656, |
|
"learning_rate": 4.641833810888253e-06, |
|
"loss": 0.1591, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.839541547277937, |
|
"eval_accuracy": 0.9177215189873418, |
|
"eval_loss": 0.3677010238170624, |
|
"eval_runtime": 14.6812, |
|
"eval_samples_per_second": 10.762, |
|
"eval_steps_per_second": 2.725, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.896848137535817, |
|
"grad_norm": 0.2034488171339035, |
|
"learning_rate": 4.412607449856734e-06, |
|
"loss": 0.1728, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.896848137535817, |
|
"eval_accuracy": 0.9050632911392406, |
|
"eval_loss": 0.4237450659275055, |
|
"eval_runtime": 14.6536, |
|
"eval_samples_per_second": 10.782, |
|
"eval_steps_per_second": 2.73, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.9541547277936964, |
|
"grad_norm": 1.0174587965011597, |
|
"learning_rate": 4.1833810888252155e-06, |
|
"loss": 0.1061, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.9541547277936964, |
|
"eval_accuracy": 0.9240506329113924, |
|
"eval_loss": 0.37083700299263, |
|
"eval_runtime": 14.6215, |
|
"eval_samples_per_second": 10.806, |
|
"eval_steps_per_second": 2.736, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.011461318051576, |
|
"grad_norm": 0.23911085724830627, |
|
"learning_rate": 3.954154727793696e-06, |
|
"loss": 0.1461, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.011461318051576, |
|
"eval_accuracy": 0.9050632911392406, |
|
"eval_loss": 0.4641564190387726, |
|
"eval_runtime": 14.6444, |
|
"eval_samples_per_second": 10.789, |
|
"eval_steps_per_second": 2.731, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.0687679083094554, |
|
"grad_norm": 0.13148854672908783, |
|
"learning_rate": 3.724928366762178e-06, |
|
"loss": 0.0671, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.0687679083094554, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.556703507900238, |
|
"eval_runtime": 14.6395, |
|
"eval_samples_per_second": 10.793, |
|
"eval_steps_per_second": 2.732, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.126074498567335, |
|
"grad_norm": 0.1307491660118103, |
|
"learning_rate": 3.4957020057306597e-06, |
|
"loss": 0.0363, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.126074498567335, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.6240283846855164, |
|
"eval_runtime": 14.6021, |
|
"eval_samples_per_second": 10.82, |
|
"eval_steps_per_second": 2.739, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.1833810888252145, |
|
"grad_norm": 0.055873971432447433, |
|
"learning_rate": 3.2664756446991407e-06, |
|
"loss": 0.1257, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.1833810888252145, |
|
"eval_accuracy": 0.8734177215189873, |
|
"eval_loss": 0.7053503394126892, |
|
"eval_runtime": 14.6002, |
|
"eval_samples_per_second": 10.822, |
|
"eval_steps_per_second": 2.74, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.240687679083095, |
|
"grad_norm": 0.10310907661914825, |
|
"learning_rate": 3.037249283667622e-06, |
|
"loss": 0.1307, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.240687679083095, |
|
"eval_accuracy": 0.8860759493670886, |
|
"eval_loss": 0.6526200771331787, |
|
"eval_runtime": 14.6477, |
|
"eval_samples_per_second": 10.787, |
|
"eval_steps_per_second": 2.731, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.2979942693409745, |
|
"grad_norm": 0.09674423187971115, |
|
"learning_rate": 2.8080229226361035e-06, |
|
"loss": 0.226, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.2979942693409745, |
|
"eval_accuracy": 0.879746835443038, |
|
"eval_loss": 0.588349461555481, |
|
"eval_runtime": 14.6299, |
|
"eval_samples_per_second": 10.8, |
|
"eval_steps_per_second": 2.734, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.355300859598854, |
|
"grad_norm": 3.432967185974121, |
|
"learning_rate": 2.5787965616045845e-06, |
|
"loss": 0.0714, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.355300859598854, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.5381926894187927, |
|
"eval_runtime": 15.6025, |
|
"eval_samples_per_second": 10.127, |
|
"eval_steps_per_second": 2.564, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.412607449856734, |
|
"grad_norm": 0.03264419734477997, |
|
"learning_rate": 2.3495702005730663e-06, |
|
"loss": 0.0617, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.412607449856734, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.6029611229896545, |
|
"eval_runtime": 14.4132, |
|
"eval_samples_per_second": 10.962, |
|
"eval_steps_per_second": 2.775, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.469914040114613, |
|
"grad_norm": 0.06593719124794006, |
|
"learning_rate": 2.1203438395415473e-06, |
|
"loss": 0.0802, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.469914040114613, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.567659318447113, |
|
"eval_runtime": 14.8121, |
|
"eval_samples_per_second": 10.667, |
|
"eval_steps_per_second": 2.7, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.527220630372493, |
|
"grad_norm": 0.1013946682214737, |
|
"learning_rate": 1.8911174785100289e-06, |
|
"loss": 0.2404, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.527220630372493, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.5836894512176514, |
|
"eval_runtime": 14.7362, |
|
"eval_samples_per_second": 10.722, |
|
"eval_steps_per_second": 2.714, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.584527220630372, |
|
"grad_norm": 6.956309795379639, |
|
"learning_rate": 1.66189111747851e-06, |
|
"loss": 0.2311, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.584527220630372, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.6191691160202026, |
|
"eval_runtime": 14.4896, |
|
"eval_samples_per_second": 10.904, |
|
"eval_steps_per_second": 2.761, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.641833810888253, |
|
"grad_norm": 0.13025854527950287, |
|
"learning_rate": 1.4326647564469915e-06, |
|
"loss": 0.0031, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.641833810888253, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.6153239011764526, |
|
"eval_runtime": 14.714, |
|
"eval_samples_per_second": 10.738, |
|
"eval_steps_per_second": 2.718, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.699140401146132, |
|
"grad_norm": 0.02252735011279583, |
|
"learning_rate": 1.2034383954154729e-06, |
|
"loss": 0.1621, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.699140401146132, |
|
"eval_accuracy": 0.8924050632911392, |
|
"eval_loss": 0.6008380651473999, |
|
"eval_runtime": 14.6006, |
|
"eval_samples_per_second": 10.821, |
|
"eval_steps_per_second": 2.74, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.756446991404012, |
|
"grad_norm": 0.03680579736828804, |
|
"learning_rate": 9.742120343839543e-07, |
|
"loss": 0.0841, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.756446991404012, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.5886847376823425, |
|
"eval_runtime": 14.6522, |
|
"eval_samples_per_second": 10.783, |
|
"eval_steps_per_second": 2.73, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.813753581661891, |
|
"grad_norm": 0.027355097234249115, |
|
"learning_rate": 7.449856733524357e-07, |
|
"loss": 0.0014, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.813753581661891, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.586622416973114, |
|
"eval_runtime": 14.7046, |
|
"eval_samples_per_second": 10.745, |
|
"eval_steps_per_second": 2.72, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.871060171919771, |
|
"grad_norm": 0.011458040215075016, |
|
"learning_rate": 5.15759312320917e-07, |
|
"loss": 0.1199, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.871060171919771, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.590861976146698, |
|
"eval_runtime": 14.6646, |
|
"eval_samples_per_second": 10.774, |
|
"eval_steps_per_second": 2.728, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.92836676217765, |
|
"grad_norm": 0.025075102224946022, |
|
"learning_rate": 2.865329512893983e-07, |
|
"loss": 0.0124, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.92836676217765, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.5905599594116211, |
|
"eval_runtime": 14.686, |
|
"eval_samples_per_second": 10.759, |
|
"eval_steps_per_second": 2.724, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.98567335243553, |
|
"grad_norm": 0.021264472976326942, |
|
"learning_rate": 5.730659025787966e-08, |
|
"loss": 0.046, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.98567335243553, |
|
"eval_accuracy": 0.8987341772151899, |
|
"eval_loss": 0.5924892425537109, |
|
"eval_runtime": 14.595, |
|
"eval_samples_per_second": 10.826, |
|
"eval_steps_per_second": 2.741, |
|
"step": 1740 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1745, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5599966461345732.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|