{ "best_metric": 0.2911098897457123, "best_model_checkpoint": "./results/checkpoint-280", "epoch": 5.0, "eval_steps": 20, "global_step": 1745, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05730659025787966, "grad_norm": 14.95106029510498, "learning_rate": 1.9770773638968482e-05, "loss": 0.5908, "step": 20 }, { "epoch": 0.05730659025787966, "eval_accuracy": 0.7974683544303798, "eval_loss": 0.44761696457862854, "eval_runtime": 12.826, "eval_samples_per_second": 12.319, "eval_steps_per_second": 3.119, "step": 20 }, { "epoch": 0.11461318051575932, "grad_norm": 16.47698402404785, "learning_rate": 1.9541547277936966e-05, "loss": 0.543, "step": 40 }, { "epoch": 0.11461318051575932, "eval_accuracy": 0.7721518987341772, "eval_loss": 0.4422585070133209, "eval_runtime": 13.5793, "eval_samples_per_second": 11.635, "eval_steps_per_second": 2.946, "step": 40 }, { "epoch": 0.17191977077363896, "grad_norm": 22.875091552734375, "learning_rate": 1.9312320916905443e-05, "loss": 0.5093, "step": 60 }, { "epoch": 0.17191977077363896, "eval_accuracy": 0.7721518987341772, "eval_loss": 0.5881978869438171, "eval_runtime": 14.7375, "eval_samples_per_second": 10.721, "eval_steps_per_second": 2.714, "step": 60 }, { "epoch": 0.22922636103151864, "grad_norm": 6.222044944763184, "learning_rate": 1.9083094555873927e-05, "loss": 0.5186, "step": 80 }, { "epoch": 0.22922636103151864, "eval_accuracy": 0.7658227848101266, "eval_loss": 0.6422034502029419, "eval_runtime": 14.6414, "eval_samples_per_second": 10.791, "eval_steps_per_second": 2.732, "step": 80 }, { "epoch": 0.28653295128939826, "grad_norm": 10.637746810913086, "learning_rate": 1.8853868194842408e-05, "loss": 0.502, "step": 100 }, { "epoch": 0.28653295128939826, "eval_accuracy": 0.7658227848101266, "eval_loss": 0.9381818175315857, "eval_runtime": 14.572, "eval_samples_per_second": 10.843, "eval_steps_per_second": 2.745, "step": 100 }, { "epoch": 0.3438395415472779, "grad_norm": 8.144033432006836, "learning_rate": 1.8624641833810892e-05, "loss": 0.573, "step": 120 }, { "epoch": 0.3438395415472779, "eval_accuracy": 0.8227848101265823, "eval_loss": 0.4263954758644104, "eval_runtime": 14.6662, "eval_samples_per_second": 10.773, "eval_steps_per_second": 2.727, "step": 120 }, { "epoch": 0.40114613180515757, "grad_norm": 0.44048359990119934, "learning_rate": 1.8395415472779372e-05, "loss": 0.5269, "step": 140 }, { "epoch": 0.40114613180515757, "eval_accuracy": 0.8481012658227848, "eval_loss": 0.5453027486801147, "eval_runtime": 14.5869, "eval_samples_per_second": 10.832, "eval_steps_per_second": 2.742, "step": 140 }, { "epoch": 0.4584527220630373, "grad_norm": 18.155141830444336, "learning_rate": 1.8166189111747853e-05, "loss": 0.3545, "step": 160 }, { "epoch": 0.4584527220630373, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.4540826678276062, "eval_runtime": 14.6402, "eval_samples_per_second": 10.792, "eval_steps_per_second": 2.732, "step": 160 }, { "epoch": 0.5157593123209169, "grad_norm": 0.482028603553772, "learning_rate": 1.7936962750716333e-05, "loss": 0.4449, "step": 180 }, { "epoch": 0.5157593123209169, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.43535691499710083, "eval_runtime": 14.6919, "eval_samples_per_second": 10.754, "eval_steps_per_second": 2.723, "step": 180 }, { "epoch": 0.5730659025787965, "grad_norm": 108.88398742675781, "learning_rate": 1.7707736389684814e-05, "loss": 0.3868, "step": 200 }, { "epoch": 0.5730659025787965, "eval_accuracy": 0.8481012658227848, "eval_loss": 0.8784106373786926, "eval_runtime": 14.6371, "eval_samples_per_second": 10.794, "eval_steps_per_second": 2.733, "step": 200 }, { "epoch": 0.6303724928366762, "grad_norm": 12.77889347076416, "learning_rate": 1.7478510028653298e-05, "loss": 0.7576, "step": 220 }, { "epoch": 0.6303724928366762, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.38221749663352966, "eval_runtime": 14.6383, "eval_samples_per_second": 10.794, "eval_steps_per_second": 2.733, "step": 220 }, { "epoch": 0.6876790830945558, "grad_norm": 13.416525840759277, "learning_rate": 1.724928366762178e-05, "loss": 0.1956, "step": 240 }, { "epoch": 0.6876790830945558, "eval_accuracy": 0.879746835443038, "eval_loss": 0.4667538106441498, "eval_runtime": 14.6113, "eval_samples_per_second": 10.814, "eval_steps_per_second": 2.738, "step": 240 }, { "epoch": 0.7449856733524355, "grad_norm": 10.141700744628906, "learning_rate": 1.702005730659026e-05, "loss": 0.4942, "step": 260 }, { "epoch": 0.7449856733524355, "eval_accuracy": 0.8481012658227848, "eval_loss": 0.5736417174339294, "eval_runtime": 14.603, "eval_samples_per_second": 10.82, "eval_steps_per_second": 2.739, "step": 260 }, { "epoch": 0.8022922636103151, "grad_norm": 23.185056686401367, "learning_rate": 1.679083094555874e-05, "loss": 0.4762, "step": 280 }, { "epoch": 0.8022922636103151, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.2911098897457123, "eval_runtime": 14.6519, "eval_samples_per_second": 10.784, "eval_steps_per_second": 2.73, "step": 280 }, { "epoch": 0.8595988538681948, "grad_norm": 46.526451110839844, "learning_rate": 1.6561604584527223e-05, "loss": 0.4136, "step": 300 }, { "epoch": 0.8595988538681948, "eval_accuracy": 0.8607594936708861, "eval_loss": 0.3629298508167267, "eval_runtime": 14.6627, "eval_samples_per_second": 10.776, "eval_steps_per_second": 2.728, "step": 300 }, { "epoch": 0.9169054441260746, "grad_norm": 5.966210842132568, "learning_rate": 1.6332378223495704e-05, "loss": 0.5865, "step": 320 }, { "epoch": 0.9169054441260746, "eval_accuracy": 0.7721518987341772, "eval_loss": 0.9794216752052307, "eval_runtime": 14.6593, "eval_samples_per_second": 10.778, "eval_steps_per_second": 2.729, "step": 320 }, { "epoch": 0.9742120343839542, "grad_norm": 3.5877606868743896, "learning_rate": 1.6103151862464185e-05, "loss": 0.3758, "step": 340 }, { "epoch": 0.9742120343839542, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.46775683760643005, "eval_runtime": 14.6442, "eval_samples_per_second": 10.789, "eval_steps_per_second": 2.731, "step": 340 }, { "epoch": 1.0315186246418337, "grad_norm": 5.313683986663818, "learning_rate": 1.5873925501432665e-05, "loss": 0.4285, "step": 360 }, { "epoch": 1.0315186246418337, "eval_accuracy": 0.8670886075949367, "eval_loss": 0.5543066263198853, "eval_runtime": 14.6827, "eval_samples_per_second": 10.761, "eval_steps_per_second": 2.724, "step": 360 }, { "epoch": 1.0888252148997135, "grad_norm": 10.655978202819824, "learning_rate": 1.5644699140401146e-05, "loss": 0.44, "step": 380 }, { "epoch": 1.0888252148997135, "eval_accuracy": 0.8607594936708861, "eval_loss": 0.5150261521339417, "eval_runtime": 14.6825, "eval_samples_per_second": 10.761, "eval_steps_per_second": 2.724, "step": 380 }, { "epoch": 1.146131805157593, "grad_norm": 0.08064723014831543, "learning_rate": 1.541547277936963e-05, "loss": 0.3573, "step": 400 }, { "epoch": 1.146131805157593, "eval_accuracy": 0.8607594936708861, "eval_loss": 0.563529908657074, "eval_runtime": 14.6349, "eval_samples_per_second": 10.796, "eval_steps_per_second": 2.733, "step": 400 }, { "epoch": 1.2034383954154728, "grad_norm": 0.46097293496131897, "learning_rate": 1.518624641833811e-05, "loss": 0.4187, "step": 420 }, { "epoch": 1.2034383954154728, "eval_accuracy": 0.8481012658227848, "eval_loss": 0.6609386205673218, "eval_runtime": 14.5517, "eval_samples_per_second": 10.858, "eval_steps_per_second": 2.749, "step": 420 }, { "epoch": 1.2607449856733524, "grad_norm": 0.37571266293525696, "learning_rate": 1.495702005730659e-05, "loss": 0.3742, "step": 440 }, { "epoch": 1.2607449856733524, "eval_accuracy": 0.8481012658227848, "eval_loss": 0.5912802815437317, "eval_runtime": 14.594, "eval_samples_per_second": 10.826, "eval_steps_per_second": 2.741, "step": 440 }, { "epoch": 1.3180515759312321, "grad_norm": 0.4662785828113556, "learning_rate": 1.4727793696275073e-05, "loss": 0.5179, "step": 460 }, { "epoch": 1.3180515759312321, "eval_accuracy": 0.8354430379746836, "eval_loss": 0.3983699679374695, "eval_runtime": 14.6982, "eval_samples_per_second": 10.75, "eval_steps_per_second": 2.721, "step": 460 }, { "epoch": 1.3753581661891117, "grad_norm": 3.044969081878662, "learning_rate": 1.4498567335243553e-05, "loss": 0.1685, "step": 480 }, { "epoch": 1.3753581661891117, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.5606595873832703, "eval_runtime": 14.5479, "eval_samples_per_second": 10.861, "eval_steps_per_second": 2.75, "step": 480 }, { "epoch": 1.4326647564469914, "grad_norm": 9.852724075317383, "learning_rate": 1.4269340974212036e-05, "loss": 0.5284, "step": 500 }, { "epoch": 1.4326647564469914, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.35282623767852783, "eval_runtime": 14.6738, "eval_samples_per_second": 10.767, "eval_steps_per_second": 2.726, "step": 500 }, { "epoch": 1.4899713467048712, "grad_norm": 25.850496292114258, "learning_rate": 1.4040114613180518e-05, "loss": 0.4246, "step": 520 }, { "epoch": 1.4899713467048712, "eval_accuracy": 0.8607594936708861, "eval_loss": 0.5857312083244324, "eval_runtime": 15.5144, "eval_samples_per_second": 10.184, "eval_steps_per_second": 2.578, "step": 520 }, { "epoch": 1.5472779369627507, "grad_norm": 7.516841888427734, "learning_rate": 1.3810888252148997e-05, "loss": 0.2419, "step": 540 }, { "epoch": 1.5472779369627507, "eval_accuracy": 0.9050632911392406, "eval_loss": 0.34958717226982117, "eval_runtime": 14.4393, "eval_samples_per_second": 10.942, "eval_steps_per_second": 2.77, "step": 540 }, { "epoch": 1.6045845272206303, "grad_norm": 0.07038611173629761, "learning_rate": 1.3581661891117479e-05, "loss": 0.4416, "step": 560 }, { "epoch": 1.6045845272206303, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.4946177005767822, "eval_runtime": 14.6819, "eval_samples_per_second": 10.762, "eval_steps_per_second": 2.724, "step": 560 }, { "epoch": 1.66189111747851, "grad_norm": 9.443480491638184, "learning_rate": 1.3352435530085961e-05, "loss": 0.4426, "step": 580 }, { "epoch": 1.66189111747851, "eval_accuracy": 0.9050632911392406, "eval_loss": 0.34582754969596863, "eval_runtime": 14.6267, "eval_samples_per_second": 10.802, "eval_steps_per_second": 2.735, "step": 580 }, { "epoch": 1.7191977077363898, "grad_norm": 0.07343020290136337, "learning_rate": 1.3123209169054444e-05, "loss": 0.2122, "step": 600 }, { "epoch": 1.7191977077363898, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.6184278130531311, "eval_runtime": 14.6949, "eval_samples_per_second": 10.752, "eval_steps_per_second": 2.722, "step": 600 }, { "epoch": 1.7765042979942693, "grad_norm": 0.03269320726394653, "learning_rate": 1.2893982808022924e-05, "loss": 0.1734, "step": 620 }, { "epoch": 1.7765042979942693, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.7278411388397217, "eval_runtime": 14.5541, "eval_samples_per_second": 10.856, "eval_steps_per_second": 2.748, "step": 620 }, { "epoch": 1.8338108882521489, "grad_norm": 0.021946750581264496, "learning_rate": 1.2664756446991405e-05, "loss": 0.2314, "step": 640 }, { "epoch": 1.8338108882521489, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.543005645275116, "eval_runtime": 14.6871, "eval_samples_per_second": 10.758, "eval_steps_per_second": 2.723, "step": 640 }, { "epoch": 1.8911174785100286, "grad_norm": 0.17806316912174225, "learning_rate": 1.2435530085959885e-05, "loss": 0.4886, "step": 660 }, { "epoch": 1.8911174785100286, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.5081498622894287, "eval_runtime": 14.6477, "eval_samples_per_second": 10.787, "eval_steps_per_second": 2.731, "step": 660 }, { "epoch": 1.9484240687679084, "grad_norm": 18.20897674560547, "learning_rate": 1.2206303724928367e-05, "loss": 0.3429, "step": 680 }, { "epoch": 1.9484240687679084, "eval_accuracy": 0.8481012658227848, "eval_loss": 0.6000381708145142, "eval_runtime": 14.5629, "eval_samples_per_second": 10.849, "eval_steps_per_second": 2.747, "step": 680 }, { "epoch": 2.005730659025788, "grad_norm": 0.07220949977636337, "learning_rate": 1.197707736389685e-05, "loss": 0.3591, "step": 700 }, { "epoch": 2.005730659025788, "eval_accuracy": 0.8607594936708861, "eval_loss": 0.5183639526367188, "eval_runtime": 14.6159, "eval_samples_per_second": 10.81, "eval_steps_per_second": 2.737, "step": 700 }, { "epoch": 2.0630372492836675, "grad_norm": 0.03888562321662903, "learning_rate": 1.1747851002865332e-05, "loss": 0.3638, "step": 720 }, { "epoch": 2.0630372492836675, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.4008268415927887, "eval_runtime": 14.6829, "eval_samples_per_second": 10.761, "eval_steps_per_second": 2.724, "step": 720 }, { "epoch": 2.1203438395415475, "grad_norm": 0.05230604112148285, "learning_rate": 1.151862464183381e-05, "loss": 0.1881, "step": 740 }, { "epoch": 2.1203438395415475, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.616079568862915, "eval_runtime": 14.646, "eval_samples_per_second": 10.788, "eval_steps_per_second": 2.731, "step": 740 }, { "epoch": 2.177650429799427, "grad_norm": 0.6790505647659302, "learning_rate": 1.1289398280802293e-05, "loss": 0.241, "step": 760 }, { "epoch": 2.177650429799427, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.5249369144439697, "eval_runtime": 14.6423, "eval_samples_per_second": 10.791, "eval_steps_per_second": 2.732, "step": 760 }, { "epoch": 2.2349570200573066, "grad_norm": 0.8485791087150574, "learning_rate": 1.1060171919770775e-05, "loss": 0.4699, "step": 780 }, { "epoch": 2.2349570200573066, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.5322971343994141, "eval_runtime": 14.6006, "eval_samples_per_second": 10.821, "eval_steps_per_second": 2.74, "step": 780 }, { "epoch": 2.292263610315186, "grad_norm": 96.15169525146484, "learning_rate": 1.0830945558739256e-05, "loss": 0.3702, "step": 800 }, { "epoch": 2.292263610315186, "eval_accuracy": 0.8481012658227848, "eval_loss": 0.728390097618103, "eval_runtime": 14.5807, "eval_samples_per_second": 10.836, "eval_steps_per_second": 2.743, "step": 800 }, { "epoch": 2.349570200573066, "grad_norm": 0.1611723154783249, "learning_rate": 1.0601719197707738e-05, "loss": 0.4192, "step": 820 }, { "epoch": 2.349570200573066, "eval_accuracy": 0.9050632911392406, "eval_loss": 0.36709439754486084, "eval_runtime": 14.5871, "eval_samples_per_second": 10.831, "eval_steps_per_second": 2.742, "step": 820 }, { "epoch": 2.4068767908309456, "grad_norm": 0.11072923988103867, "learning_rate": 1.0372492836676219e-05, "loss": 0.1747, "step": 840 }, { "epoch": 2.4068767908309456, "eval_accuracy": 0.9050632911392406, "eval_loss": 0.42927253246307373, "eval_runtime": 14.6133, "eval_samples_per_second": 10.812, "eval_steps_per_second": 2.737, "step": 840 }, { "epoch": 2.464183381088825, "grad_norm": 0.03486654907464981, "learning_rate": 1.01432664756447e-05, "loss": 0.347, "step": 860 }, { "epoch": 2.464183381088825, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.40468934178352356, "eval_runtime": 14.6475, "eval_samples_per_second": 10.787, "eval_steps_per_second": 2.731, "step": 860 }, { "epoch": 2.5214899713467047, "grad_norm": 0.27154240012168884, "learning_rate": 9.914040114613181e-06, "loss": 0.0533, "step": 880 }, { "epoch": 2.5214899713467047, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.5134832859039307, "eval_runtime": 14.6718, "eval_samples_per_second": 10.769, "eval_steps_per_second": 2.726, "step": 880 }, { "epoch": 2.5787965616045847, "grad_norm": 24.125070571899414, "learning_rate": 9.684813753581662e-06, "loss": 0.2002, "step": 900 }, { "epoch": 2.5787965616045847, "eval_accuracy": 0.879746835443038, "eval_loss": 0.5535210967063904, "eval_runtime": 14.6419, "eval_samples_per_second": 10.791, "eval_steps_per_second": 2.732, "step": 900 }, { "epoch": 2.6361031518624642, "grad_norm": 0.03520410135388374, "learning_rate": 9.455587392550144e-06, "loss": 0.0274, "step": 920 }, { "epoch": 2.6361031518624642, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.6635323762893677, "eval_runtime": 14.6418, "eval_samples_per_second": 10.791, "eval_steps_per_second": 2.732, "step": 920 }, { "epoch": 2.693409742120344, "grad_norm": 0.09307877719402313, "learning_rate": 9.226361031518626e-06, "loss": 0.2339, "step": 940 }, { "epoch": 2.693409742120344, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.4939664602279663, "eval_runtime": 14.6554, "eval_samples_per_second": 10.781, "eval_steps_per_second": 2.729, "step": 940 }, { "epoch": 2.7507163323782233, "grad_norm": 80.65755462646484, "learning_rate": 8.997134670487107e-06, "loss": 0.3015, "step": 960 }, { "epoch": 2.7507163323782233, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.5513517260551453, "eval_runtime": 14.6022, "eval_samples_per_second": 10.82, "eval_steps_per_second": 2.739, "step": 960 }, { "epoch": 2.8080229226361033, "grad_norm": 180.23745727539062, "learning_rate": 8.767908309455588e-06, "loss": 0.4222, "step": 980 }, { "epoch": 2.8080229226361033, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.5411596298217773, "eval_runtime": 14.6522, "eval_samples_per_second": 10.783, "eval_steps_per_second": 2.73, "step": 980 }, { "epoch": 2.865329512893983, "grad_norm": 106.34879302978516, "learning_rate": 8.53868194842407e-06, "loss": 0.3243, "step": 1000 }, { "epoch": 2.865329512893983, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.5439683198928833, "eval_runtime": 14.6662, "eval_samples_per_second": 10.773, "eval_steps_per_second": 2.727, "step": 1000 }, { "epoch": 2.9226361031518624, "grad_norm": 43.02892303466797, "learning_rate": 8.30945558739255e-06, "loss": 0.3137, "step": 1020 }, { "epoch": 2.9226361031518624, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.45336952805519104, "eval_runtime": 15.5419, "eval_samples_per_second": 10.166, "eval_steps_per_second": 2.574, "step": 1020 }, { "epoch": 2.9799426934097424, "grad_norm": 0.05886560305953026, "learning_rate": 8.080229226361033e-06, "loss": 0.191, "step": 1040 }, { "epoch": 2.9799426934097424, "eval_accuracy": 0.879746835443038, "eval_loss": 0.6082937121391296, "eval_runtime": 14.4222, "eval_samples_per_second": 10.955, "eval_steps_per_second": 2.774, "step": 1040 }, { "epoch": 3.037249283667622, "grad_norm": 0.0684143528342247, "learning_rate": 7.851002865329513e-06, "loss": 0.1213, "step": 1060 }, { "epoch": 3.037249283667622, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.5798259377479553, "eval_runtime": 14.8164, "eval_samples_per_second": 10.664, "eval_steps_per_second": 2.7, "step": 1060 }, { "epoch": 3.0945558739255015, "grad_norm": 0.08387450873851776, "learning_rate": 7.6217765042979954e-06, "loss": 0.1582, "step": 1080 }, { "epoch": 3.0945558739255015, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.48295101523399353, "eval_runtime": 14.6812, "eval_samples_per_second": 10.762, "eval_steps_per_second": 2.725, "step": 1080 }, { "epoch": 3.151862464183381, "grad_norm": 0.056213777512311935, "learning_rate": 7.392550143266476e-06, "loss": 0.0546, "step": 1100 }, { "epoch": 3.151862464183381, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.7038730382919312, "eval_runtime": 14.5146, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.756, "step": 1100 }, { "epoch": 3.2091690544412605, "grad_norm": 0.013059821911156178, "learning_rate": 7.163323782234957e-06, "loss": 0.0387, "step": 1120 }, { "epoch": 3.2091690544412605, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.6058567762374878, "eval_runtime": 14.7233, "eval_samples_per_second": 10.731, "eval_steps_per_second": 2.717, "step": 1120 }, { "epoch": 3.2664756446991405, "grad_norm": 15.5554780960083, "learning_rate": 6.934097421203439e-06, "loss": 0.4619, "step": 1140 }, { "epoch": 3.2664756446991405, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.6933996677398682, "eval_runtime": 14.6193, "eval_samples_per_second": 10.808, "eval_steps_per_second": 2.736, "step": 1140 }, { "epoch": 3.32378223495702, "grad_norm": 2.1167819499969482, "learning_rate": 6.70487106017192e-06, "loss": 0.2789, "step": 1160 }, { "epoch": 3.32378223495702, "eval_accuracy": 0.9050632911392406, "eval_loss": 0.524710476398468, "eval_runtime": 14.6186, "eval_samples_per_second": 10.808, "eval_steps_per_second": 2.736, "step": 1160 }, { "epoch": 3.3810888252148996, "grad_norm": 0.020894192159175873, "learning_rate": 6.475644699140402e-06, "loss": 0.1361, "step": 1180 }, { "epoch": 3.3810888252148996, "eval_accuracy": 0.879746835443038, "eval_loss": 0.6307375431060791, "eval_runtime": 14.6338, "eval_samples_per_second": 10.797, "eval_steps_per_second": 2.733, "step": 1180 }, { "epoch": 3.4383954154727796, "grad_norm": 106.9233627319336, "learning_rate": 6.246418338108883e-06, "loss": 0.0475, "step": 1200 }, { "epoch": 3.4383954154727796, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.5455241203308105, "eval_runtime": 14.6106, "eval_samples_per_second": 10.814, "eval_steps_per_second": 2.738, "step": 1200 }, { "epoch": 3.495702005730659, "grad_norm": 10.43300724029541, "learning_rate": 6.017191977077364e-06, "loss": 0.2889, "step": 1220 }, { "epoch": 3.495702005730659, "eval_accuracy": 0.879746835443038, "eval_loss": 0.5864837169647217, "eval_runtime": 14.7, "eval_samples_per_second": 10.748, "eval_steps_per_second": 2.721, "step": 1220 }, { "epoch": 3.5530085959885387, "grad_norm": 0.143876850605011, "learning_rate": 5.787965616045845e-06, "loss": 0.2507, "step": 1240 }, { "epoch": 3.5530085959885387, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.5028768181800842, "eval_runtime": 14.6373, "eval_samples_per_second": 10.794, "eval_steps_per_second": 2.733, "step": 1240 }, { "epoch": 3.6103151862464182, "grad_norm": 41.49633026123047, "learning_rate": 5.558739255014327e-06, "loss": 0.1476, "step": 1260 }, { "epoch": 3.6103151862464182, "eval_accuracy": 0.879746835443038, "eval_loss": 0.651742160320282, "eval_runtime": 14.5921, "eval_samples_per_second": 10.828, "eval_steps_per_second": 2.741, "step": 1260 }, { "epoch": 3.6676217765042978, "grad_norm": 0.19821767508983612, "learning_rate": 5.3295128939828086e-06, "loss": 0.0709, "step": 1280 }, { "epoch": 3.6676217765042978, "eval_accuracy": 0.879746835443038, "eval_loss": 0.5607478618621826, "eval_runtime": 14.6558, "eval_samples_per_second": 10.781, "eval_steps_per_second": 2.729, "step": 1280 }, { "epoch": 3.7249283667621778, "grad_norm": 0.014833999797701836, "learning_rate": 5.10028653295129e-06, "loss": 0.2416, "step": 1300 }, { "epoch": 3.7249283667621778, "eval_accuracy": 0.8670886075949367, "eval_loss": 0.6906114220619202, "eval_runtime": 14.699, "eval_samples_per_second": 10.749, "eval_steps_per_second": 2.721, "step": 1300 }, { "epoch": 3.7822349570200573, "grad_norm": 13.687612533569336, "learning_rate": 4.871060171919771e-06, "loss": 0.2482, "step": 1320 }, { "epoch": 3.7822349570200573, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.45231887698173523, "eval_runtime": 14.6807, "eval_samples_per_second": 10.762, "eval_steps_per_second": 2.725, "step": 1320 }, { "epoch": 3.839541547277937, "grad_norm": 0.014498379081487656, "learning_rate": 4.641833810888253e-06, "loss": 0.1591, "step": 1340 }, { "epoch": 3.839541547277937, "eval_accuracy": 0.9177215189873418, "eval_loss": 0.3677010238170624, "eval_runtime": 14.6812, "eval_samples_per_second": 10.762, "eval_steps_per_second": 2.725, "step": 1340 }, { "epoch": 3.896848137535817, "grad_norm": 0.2034488171339035, "learning_rate": 4.412607449856734e-06, "loss": 0.1728, "step": 1360 }, { "epoch": 3.896848137535817, "eval_accuracy": 0.9050632911392406, "eval_loss": 0.4237450659275055, "eval_runtime": 14.6536, "eval_samples_per_second": 10.782, "eval_steps_per_second": 2.73, "step": 1360 }, { "epoch": 3.9541547277936964, "grad_norm": 1.0174587965011597, "learning_rate": 4.1833810888252155e-06, "loss": 0.1061, "step": 1380 }, { "epoch": 3.9541547277936964, "eval_accuracy": 0.9240506329113924, "eval_loss": 0.37083700299263, "eval_runtime": 14.6215, "eval_samples_per_second": 10.806, "eval_steps_per_second": 2.736, "step": 1380 }, { "epoch": 4.011461318051576, "grad_norm": 0.23911085724830627, "learning_rate": 3.954154727793696e-06, "loss": 0.1461, "step": 1400 }, { "epoch": 4.011461318051576, "eval_accuracy": 0.9050632911392406, "eval_loss": 0.4641564190387726, "eval_runtime": 14.6444, "eval_samples_per_second": 10.789, "eval_steps_per_second": 2.731, "step": 1400 }, { "epoch": 4.0687679083094554, "grad_norm": 0.13148854672908783, "learning_rate": 3.724928366762178e-06, "loss": 0.0671, "step": 1420 }, { "epoch": 4.0687679083094554, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.556703507900238, "eval_runtime": 14.6395, "eval_samples_per_second": 10.793, "eval_steps_per_second": 2.732, "step": 1420 }, { "epoch": 4.126074498567335, "grad_norm": 0.1307491660118103, "learning_rate": 3.4957020057306597e-06, "loss": 0.0363, "step": 1440 }, { "epoch": 4.126074498567335, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.6240283846855164, "eval_runtime": 14.6021, "eval_samples_per_second": 10.82, "eval_steps_per_second": 2.739, "step": 1440 }, { "epoch": 4.1833810888252145, "grad_norm": 0.055873971432447433, "learning_rate": 3.2664756446991407e-06, "loss": 0.1257, "step": 1460 }, { "epoch": 4.1833810888252145, "eval_accuracy": 0.8734177215189873, "eval_loss": 0.7053503394126892, "eval_runtime": 14.6002, "eval_samples_per_second": 10.822, "eval_steps_per_second": 2.74, "step": 1460 }, { "epoch": 4.240687679083095, "grad_norm": 0.10310907661914825, "learning_rate": 3.037249283667622e-06, "loss": 0.1307, "step": 1480 }, { "epoch": 4.240687679083095, "eval_accuracy": 0.8860759493670886, "eval_loss": 0.6526200771331787, "eval_runtime": 14.6477, "eval_samples_per_second": 10.787, "eval_steps_per_second": 2.731, "step": 1480 }, { "epoch": 4.2979942693409745, "grad_norm": 0.09674423187971115, "learning_rate": 2.8080229226361035e-06, "loss": 0.226, "step": 1500 }, { "epoch": 4.2979942693409745, "eval_accuracy": 0.879746835443038, "eval_loss": 0.588349461555481, "eval_runtime": 14.6299, "eval_samples_per_second": 10.8, "eval_steps_per_second": 2.734, "step": 1500 }, { "epoch": 4.355300859598854, "grad_norm": 3.432967185974121, "learning_rate": 2.5787965616045845e-06, "loss": 0.0714, "step": 1520 }, { "epoch": 4.355300859598854, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.5381926894187927, "eval_runtime": 15.6025, "eval_samples_per_second": 10.127, "eval_steps_per_second": 2.564, "step": 1520 }, { "epoch": 4.412607449856734, "grad_norm": 0.03264419734477997, "learning_rate": 2.3495702005730663e-06, "loss": 0.0617, "step": 1540 }, { "epoch": 4.412607449856734, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.6029611229896545, "eval_runtime": 14.4132, "eval_samples_per_second": 10.962, "eval_steps_per_second": 2.775, "step": 1540 }, { "epoch": 4.469914040114613, "grad_norm": 0.06593719124794006, "learning_rate": 2.1203438395415473e-06, "loss": 0.0802, "step": 1560 }, { "epoch": 4.469914040114613, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.567659318447113, "eval_runtime": 14.8121, "eval_samples_per_second": 10.667, "eval_steps_per_second": 2.7, "step": 1560 }, { "epoch": 4.527220630372493, "grad_norm": 0.1013946682214737, "learning_rate": 1.8911174785100289e-06, "loss": 0.2404, "step": 1580 }, { "epoch": 4.527220630372493, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.5836894512176514, "eval_runtime": 14.7362, "eval_samples_per_second": 10.722, "eval_steps_per_second": 2.714, "step": 1580 }, { "epoch": 4.584527220630372, "grad_norm": 6.956309795379639, "learning_rate": 1.66189111747851e-06, "loss": 0.2311, "step": 1600 }, { "epoch": 4.584527220630372, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.6191691160202026, "eval_runtime": 14.4896, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.761, "step": 1600 }, { "epoch": 4.641833810888253, "grad_norm": 0.13025854527950287, "learning_rate": 1.4326647564469915e-06, "loss": 0.0031, "step": 1620 }, { "epoch": 4.641833810888253, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.6153239011764526, "eval_runtime": 14.714, "eval_samples_per_second": 10.738, "eval_steps_per_second": 2.718, "step": 1620 }, { "epoch": 4.699140401146132, "grad_norm": 0.02252735011279583, "learning_rate": 1.2034383954154729e-06, "loss": 0.1621, "step": 1640 }, { "epoch": 4.699140401146132, "eval_accuracy": 0.8924050632911392, "eval_loss": 0.6008380651473999, "eval_runtime": 14.6006, "eval_samples_per_second": 10.821, "eval_steps_per_second": 2.74, "step": 1640 }, { "epoch": 4.756446991404012, "grad_norm": 0.03680579736828804, "learning_rate": 9.742120343839543e-07, "loss": 0.0841, "step": 1660 }, { "epoch": 4.756446991404012, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.5886847376823425, "eval_runtime": 14.6522, "eval_samples_per_second": 10.783, "eval_steps_per_second": 2.73, "step": 1660 }, { "epoch": 4.813753581661891, "grad_norm": 0.027355097234249115, "learning_rate": 7.449856733524357e-07, "loss": 0.0014, "step": 1680 }, { "epoch": 4.813753581661891, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.586622416973114, "eval_runtime": 14.7046, "eval_samples_per_second": 10.745, "eval_steps_per_second": 2.72, "step": 1680 }, { "epoch": 4.871060171919771, "grad_norm": 0.011458040215075016, "learning_rate": 5.15759312320917e-07, "loss": 0.1199, "step": 1700 }, { "epoch": 4.871060171919771, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.590861976146698, "eval_runtime": 14.6646, "eval_samples_per_second": 10.774, "eval_steps_per_second": 2.728, "step": 1700 }, { "epoch": 4.92836676217765, "grad_norm": 0.025075102224946022, "learning_rate": 2.865329512893983e-07, "loss": 0.0124, "step": 1720 }, { "epoch": 4.92836676217765, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.5905599594116211, "eval_runtime": 14.686, "eval_samples_per_second": 10.759, "eval_steps_per_second": 2.724, "step": 1720 }, { "epoch": 4.98567335243553, "grad_norm": 0.021264472976326942, "learning_rate": 5.730659025787966e-08, "loss": 0.046, "step": 1740 }, { "epoch": 4.98567335243553, "eval_accuracy": 0.8987341772151899, "eval_loss": 0.5924892425537109, "eval_runtime": 14.595, "eval_samples_per_second": 10.826, "eval_steps_per_second": 2.741, "step": 1740 } ], "logging_steps": 20, "max_steps": 1745, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5599966461345732.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }