|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.8, |
|
"eval_steps": 500, |
|
"global_step": 320, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"Batch Mean": -3.411376953125, |
|
"accuracy": 0.40625, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"Batch Mean": -3.425048828125, |
|
"accuracy": 0.4375, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"Batch Mean": -3.405029296875, |
|
"accuracy": 0.53125, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"Batch Mean": -3.482421875, |
|
"accuracy": 0.3125, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 7.395800590515137, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": 0.7083, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -3.413330078125, |
|
"accuracy": 0.46875, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -3.407958984375, |
|
"accuracy": 0.5, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -3.4326171875, |
|
"accuracy": 0.46875, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -3.40673828125, |
|
"accuracy": 0.46875, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 6.054078102111816, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": 0.7038, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -3.418212890625, |
|
"accuracy": 0.46875, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -3.4130859375, |
|
"accuracy": 0.5625, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -3.4619140625, |
|
"accuracy": 0.375, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -3.51220703125, |
|
"accuracy": 0.5, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 6.486208915710449, |
|
"learning_rate": 4.5e-07, |
|
"loss": 0.6902, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -3.40234375, |
|
"accuracy": 0.5, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -3.492431640625, |
|
"accuracy": 0.46875, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -3.45166015625, |
|
"accuracy": 0.46875, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -3.4267578125, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.4884724617004395, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.6921, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -3.444580078125, |
|
"accuracy": 0.59375, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -3.358154296875, |
|
"accuracy": 0.65625, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -3.40966796875, |
|
"accuracy": 0.5625, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -3.458251953125, |
|
"accuracy": 0.40625, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 6.3613505363464355, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.6776, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -3.421875, |
|
"accuracy": 0.5, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -3.458740234375, |
|
"accuracy": 0.375, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -3.420166015625, |
|
"accuracy": 0.5, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -3.45556640625, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 8.138617515563965, |
|
"learning_rate": 9e-07, |
|
"loss": 0.6939, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -3.35302734375, |
|
"accuracy": 0.40625, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -3.459228515625, |
|
"accuracy": 0.5, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -3.4326171875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -3.396240234375, |
|
"accuracy": 0.4375, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 6.334463596343994, |
|
"learning_rate": 1.05e-06, |
|
"loss": 0.6898, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -3.45947265625, |
|
"accuracy": 0.4375, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -3.437744140625, |
|
"accuracy": 0.4375, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -3.44287109375, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -3.466552734375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.167725563049316, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.6892, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -3.371826171875, |
|
"accuracy": 0.4375, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -3.379150390625, |
|
"accuracy": 0.5, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -3.42626953125, |
|
"accuracy": 0.59375, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -3.397216796875, |
|
"accuracy": 0.3125, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 6.235499382019043, |
|
"learning_rate": 1.35e-06, |
|
"loss": 0.699, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -3.39697265625, |
|
"accuracy": 0.5, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -3.365966796875, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -3.386474609375, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -3.41552734375, |
|
"accuracy": 0.375, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 7.100496292114258, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.696, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -3.426025390625, |
|
"accuracy": 0.65625, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -3.349853515625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -3.46533203125, |
|
"accuracy": 0.5625, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -3.40234375, |
|
"accuracy": 0.5625, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 6.040953636169434, |
|
"learning_rate": 1.65e-06, |
|
"loss": 0.6655, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -3.449462890625, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -3.42236328125, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -3.34033203125, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -3.392578125, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.257072448730469, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.6685, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -3.33984375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -3.25537109375, |
|
"accuracy": 0.5625, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -3.3603515625, |
|
"accuracy": 0.59375, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -3.239501953125, |
|
"accuracy": 0.4375, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 6.011251449584961, |
|
"learning_rate": 1.95e-06, |
|
"loss": 0.6608, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -3.31787109375, |
|
"accuracy": 0.75, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -3.3154296875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -3.2490234375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -3.3046875, |
|
"accuracy": 0.75, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 6.238981246948242, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.6297, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -3.24609375, |
|
"accuracy": 0.75, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -3.34130859375, |
|
"accuracy": 0.625, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -3.279052734375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -3.28564453125, |
|
"accuracy": 0.46875, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 8.44164752960205, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.6589, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -3.275146484375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -3.27294921875, |
|
"accuracy": 0.5, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -3.300048828125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -3.251708984375, |
|
"accuracy": 0.75, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.432738304138184, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.6441, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -3.21826171875, |
|
"accuracy": 0.6875, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -3.194580078125, |
|
"accuracy": 0.65625, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -3.182373046875, |
|
"accuracy": 0.59375, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -3.169677734375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 5.111855983734131, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.6361, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -3.256103515625, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -3.240234375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -3.067138671875, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -3.288330078125, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 5.055493354797363, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.6128, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -3.41650390625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -3.077392578125, |
|
"accuracy": 0.75, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -3.1966552734375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -3.2864990234375, |
|
"accuracy": 0.625, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 6.847475051879883, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.5578, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": -3.0543212890625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": -3.05322265625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": -2.981201171875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": -3.3494873046875, |
|
"accuracy": 0.625, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.472431659698486, |
|
"learning_rate": 3e-06, |
|
"loss": 0.5922, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": -3.2989501953125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": -3.1324462890625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": -3.1845703125, |
|
"accuracy": 0.625, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": -3.0384521484375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 10.34755802154541, |
|
"learning_rate": 2.992105263157895e-06, |
|
"loss": 0.5812, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": -3.0819091796875, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": -3.2308349609375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": -3.0845947265625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": -3.302001953125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 6.563014030456543, |
|
"learning_rate": 2.9842105263157896e-06, |
|
"loss": 0.5442, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": -3.26788330078125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": -3.18896484375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": -3.02227783203125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": -3.3311767578125, |
|
"accuracy": 0.75, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 6.604343891143799, |
|
"learning_rate": 2.9763157894736843e-06, |
|
"loss": 0.5241, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": -3.5736083984375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": -3.3741493225097656, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": -3.36822509765625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": -3.2643585205078125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.35377311706543, |
|
"learning_rate": 2.968421052631579e-06, |
|
"loss": 0.5049, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": -3.401031494140625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": -3.914642333984375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": -3.43243408203125, |
|
"accuracy": 0.65625, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": -3.382415771484375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 11.941130638122559, |
|
"learning_rate": 2.960526315789474e-06, |
|
"loss": 0.5471, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -4.3116302490234375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -3.766510009765625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -3.777069091796875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -3.7879981994628906, |
|
"accuracy": 0.84375, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 9.934557914733887, |
|
"learning_rate": 2.9526315789473685e-06, |
|
"loss": 0.423, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -3.805145263671875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -3.8750686645507812, |
|
"accuracy": 0.71875, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -3.9338016510009766, |
|
"accuracy": 0.53125, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -3.4400253295898438, |
|
"accuracy": 0.78125, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 11.906075477600098, |
|
"learning_rate": 2.9447368421052633e-06, |
|
"loss": 0.4943, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -4.218650817871094, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -3.3725433349609375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -4.4810791015625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -4.03173828125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 14.714795112609863, |
|
"learning_rate": 2.936842105263158e-06, |
|
"loss": 0.5273, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -4.141151428222656, |
|
"accuracy": 0.6875, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -3.8033447265625, |
|
"accuracy": 0.5625, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -3.9724884033203125, |
|
"accuracy": 0.59375, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -4.264892578125, |
|
"accuracy": 0.65625, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 24.04404640197754, |
|
"learning_rate": 2.9289473684210528e-06, |
|
"loss": 0.7219, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -4.1270751953125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -3.89520263671875, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -3.8218345642089844, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -4.262153625488281, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 12.918435096740723, |
|
"learning_rate": 2.9210526315789475e-06, |
|
"loss": 0.5446, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -4.5279541015625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -4.4937744140625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -4.4219970703125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -4.4163818359375, |
|
"accuracy": 0.75, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 12.408222198486328, |
|
"learning_rate": 2.9131578947368423e-06, |
|
"loss": 0.4516, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": -4.74365234375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": -4.267730712890625, |
|
"accuracy": 0.75, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": -4.415737152099609, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": -4.21026611328125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 11.342855453491211, |
|
"learning_rate": 2.905263157894737e-06, |
|
"loss": 0.4729, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -3.95538330078125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -4.6029052734375, |
|
"accuracy": 0.75, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -4.25469970703125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -4.155879974365234, |
|
"accuracy": 0.75, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 12.777791976928711, |
|
"learning_rate": 2.8973684210526318e-06, |
|
"loss": 0.5232, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": -4.009521484375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": -3.910858154296875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": -4.120941162109375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": -4.4345703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 11.19157886505127, |
|
"learning_rate": 2.8894736842105265e-06, |
|
"loss": 0.4562, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": -3.889312744140625, |
|
"accuracy": 0.75, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": -3.822784423828125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": -4.01025390625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": -4.082855224609375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 12.107293128967285, |
|
"learning_rate": 2.8815789473684213e-06, |
|
"loss": 0.5443, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": -3.8009033203125, |
|
"accuracy": 0.875, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": -3.89312744140625, |
|
"accuracy": 0.75, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": -4.010528564453125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": -3.480602264404297, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 10.730036735534668, |
|
"learning_rate": 2.873684210526316e-06, |
|
"loss": 0.4968, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": -3.891571044921875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": -3.62432861328125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": -3.650390625, |
|
"accuracy": 0.65625, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": -3.6742515563964844, |
|
"accuracy": 0.75, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 8.428532600402832, |
|
"learning_rate": 2.8657894736842103e-06, |
|
"loss": 0.4515, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": -3.5828857421875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": -3.687744140625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": -3.60595703125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": -4.343017578125, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 8.982612609863281, |
|
"learning_rate": 2.857894736842105e-06, |
|
"loss": 0.4853, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": -4.35125732421875, |
|
"accuracy": 0.875, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": -4.2618408203125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": -3.67474365234375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": -4.13818359375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 8.4310302734375, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.4476, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": -4.23419189453125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": -4.0997314453125, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": -4.07379150390625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": -4.32989501953125, |
|
"accuracy": 1.0, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 7.986425399780273, |
|
"learning_rate": 2.8421052631578946e-06, |
|
"loss": 0.4334, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": -4.50189208984375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": -3.809326171875, |
|
"accuracy": 0.75, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": -4.21588134765625, |
|
"accuracy": 0.875, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": -3.85302734375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 8.613731384277344, |
|
"learning_rate": 2.8342105263157897e-06, |
|
"loss": 0.4477, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": -3.824188232421875, |
|
"accuracy": 0.875, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": -3.658496856689453, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": -3.964691162109375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": -3.8968505859375, |
|
"accuracy": 0.75, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 8.035860061645508, |
|
"learning_rate": 2.8263157894736845e-06, |
|
"loss": 0.3786, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": -4.138427734375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": -3.946533203125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": -4.2305908203125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": -3.92535400390625, |
|
"accuracy": 0.875, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 8.79111385345459, |
|
"learning_rate": 2.8184210526315792e-06, |
|
"loss": 0.3971, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": -3.7923431396484375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": -3.5310258865356445, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": -3.6854248046875, |
|
"accuracy": 0.875, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": -3.468961715698242, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 10.596081733703613, |
|
"learning_rate": 2.810526315789474e-06, |
|
"loss": 0.4792, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": -3.435342788696289, |
|
"accuracy": 0.875, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": -3.907958984375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": -3.551055908203125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": -3.1349716186523438, |
|
"accuracy": 0.90625, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 8.473621368408203, |
|
"learning_rate": 2.8026315789473687e-06, |
|
"loss": 0.438, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": -3.0492935180664062, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": -3.042724609375, |
|
"accuracy": 0.875, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": -3.8280296325683594, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": -3.4905853271484375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 7.537892818450928, |
|
"learning_rate": 2.7947368421052635e-06, |
|
"loss": 0.2883, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": -2.8027992248535156, |
|
"accuracy": 0.625, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": -2.9038753509521484, |
|
"accuracy": 0.875, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": -3.4294891357421875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": -2.933074951171875, |
|
"accuracy": 0.75, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 12.333584785461426, |
|
"learning_rate": 2.7868421052631578e-06, |
|
"loss": 0.5348, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": -2.7773094177246094, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": -2.03399658203125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": -2.9137496948242188, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": -3.201608657836914, |
|
"accuracy": 0.875, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 10.547114372253418, |
|
"learning_rate": 2.7789473684210525e-06, |
|
"loss": 0.4621, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": -2.280449867248535, |
|
"accuracy": 0.78125, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": -2.9716339111328125, |
|
"accuracy": 0.875, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": -2.959075927734375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": -2.8322715759277344, |
|
"accuracy": 0.71875, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 12.012454986572266, |
|
"learning_rate": 2.7710526315789473e-06, |
|
"loss": 0.489, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -3.118091583251953, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -2.2162322998046875, |
|
"accuracy": 0.75, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -2.1783409118652344, |
|
"accuracy": 0.59375, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -2.5770187377929688, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 13.930920600891113, |
|
"learning_rate": 2.763157894736842e-06, |
|
"loss": 0.5659, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -2.640003204345703, |
|
"accuracy": 0.875, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -3.2294464111328125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -3.3978271484375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -2.858466148376465, |
|
"accuracy": 0.8125, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 11.848394393920898, |
|
"learning_rate": 2.7552631578947368e-06, |
|
"loss": 0.5105, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -2.7342262268066406, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -3.48663330078125, |
|
"accuracy": 0.65625, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -3.0019655227661133, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -3.550537109375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 10.681953430175781, |
|
"learning_rate": 2.7473684210526315e-06, |
|
"loss": 0.5472, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -3.7133026123046875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -3.6298141479492188, |
|
"accuracy": 0.71875, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -4.31036376953125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -3.61016845703125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 7.710223197937012, |
|
"learning_rate": 2.7394736842105263e-06, |
|
"loss": 0.5108, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -4.007965087890625, |
|
"accuracy": 0.75, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -3.752777099609375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -3.687549591064453, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -3.5942840576171875, |
|
"accuracy": 0.75, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 6.719354152679443, |
|
"learning_rate": 2.7315789473684214e-06, |
|
"loss": 0.4691, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": -4.132904052734375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": -4.61541748046875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": -4.22991943359375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": -4.14007568359375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 6.091842174530029, |
|
"learning_rate": 2.723684210526316e-06, |
|
"loss": 0.4276, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": -4.385498046875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": -4.61492919921875, |
|
"accuracy": 0.625, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": -4.2962646484375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": -4.18157958984375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.7918620109558105, |
|
"learning_rate": 2.715789473684211e-06, |
|
"loss": 0.4102, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -3.7819137573242188, |
|
"accuracy": 0.78125, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -3.8838958740234375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -4.2425537109375, |
|
"accuracy": 0.625, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -4.3662109375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 7.213972568511963, |
|
"learning_rate": 2.7078947368421052e-06, |
|
"loss": 0.457, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -3.624969482421875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -4.58807373046875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -4.09814453125, |
|
"accuracy": 0.75, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -3.97235107421875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 5.783757209777832, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.4118, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -3.2208633422851562, |
|
"accuracy": 0.90625, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -3.5696563720703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -3.245067596435547, |
|
"accuracy": 0.6875, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -3.427947998046875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 6.49120569229126, |
|
"learning_rate": 2.6921052631578947e-06, |
|
"loss": 0.4278, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -3.1783447265625, |
|
"accuracy": 0.625, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -3.128032684326172, |
|
"accuracy": 0.875, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -3.291412353515625, |
|
"accuracy": 0.75, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -3.10302734375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.3600850105285645, |
|
"learning_rate": 2.6842105263157895e-06, |
|
"loss": 0.4719, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -2.8273773193359375, |
|
"accuracy": 0.75, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -3.152629852294922, |
|
"accuracy": 0.71875, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -2.8756637573242188, |
|
"accuracy": 0.6875, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -2.9803466796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 6.642537593841553, |
|
"learning_rate": 2.6763157894736842e-06, |
|
"loss": 0.4796, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -2.676004409790039, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -2.830108642578125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -2.6599464416503906, |
|
"accuracy": 0.875, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -2.9864349365234375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 6.946270942687988, |
|
"learning_rate": 2.668421052631579e-06, |
|
"loss": 0.3668, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -2.9581680297851562, |
|
"accuracy": 0.875, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -2.5045700073242188, |
|
"accuracy": 0.84375, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -3.3170013427734375, |
|
"accuracy": 0.75, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -3.1421661376953125, |
|
"accuracy": 0.75, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 7.357174396514893, |
|
"learning_rate": 2.6605263157894737e-06, |
|
"loss": 0.4197, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -2.3607635498046875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -3.1617660522460938, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -3.3164749145507812, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -3.1099395751953125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 7.538585662841797, |
|
"learning_rate": 2.6526315789473685e-06, |
|
"loss": 0.4143, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -2.6910171508789062, |
|
"accuracy": 0.84375, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -3.074167013168335, |
|
"accuracy": 0.8125, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -3.1822357177734375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -3.3923683166503906, |
|
"accuracy": 0.78125, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 7.79094123840332, |
|
"learning_rate": 2.644736842105263e-06, |
|
"loss": 0.4276, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -2.9289722442626953, |
|
"accuracy": 0.75, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -3.164053440093994, |
|
"accuracy": 0.75, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -3.19818115234375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -3.493072509765625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 8.591028213500977, |
|
"learning_rate": 2.636842105263158e-06, |
|
"loss": 0.5094, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -3.191915512084961, |
|
"accuracy": 0.75, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -3.100576400756836, |
|
"accuracy": 0.8125, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -3.4085922241210938, |
|
"accuracy": 0.96875, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -3.1348419189453125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 8.065413475036621, |
|
"learning_rate": 2.6289473684210527e-06, |
|
"loss": 0.4552, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -4.05078125, |
|
"accuracy": 0.75, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -3.196155548095703, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -4.133514404296875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -3.7607345581054688, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 8.609820365905762, |
|
"learning_rate": 2.6210526315789474e-06, |
|
"loss": 0.4536, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -3.8547210693359375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -3.7351226806640625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -4.212074279785156, |
|
"accuracy": 0.78125, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -3.53668212890625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 7.895672798156738, |
|
"learning_rate": 2.613157894736842e-06, |
|
"loss": 0.3879, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -4.088775634765625, |
|
"accuracy": 0.75, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -3.75189208984375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -4.203272819519043, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -3.900634765625, |
|
"accuracy": 0.75, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 7.224689483642578, |
|
"learning_rate": 2.605263157894737e-06, |
|
"loss": 0.422, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -4.03101921081543, |
|
"accuracy": 0.75, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -3.585662841796875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -3.6973724365234375, |
|
"accuracy": 0.875, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -4.1641082763671875, |
|
"accuracy": 0.65625, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 6.848752498626709, |
|
"learning_rate": 2.5973684210526317e-06, |
|
"loss": 0.4386, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -3.96783447265625, |
|
"accuracy": 0.75, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -4.127544403076172, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -4.292510986328125, |
|
"accuracy": 0.53125, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -3.774566650390625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.717357635498047, |
|
"learning_rate": 2.5894736842105264e-06, |
|
"loss": 0.4317, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -4.0881805419921875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -4.007743835449219, |
|
"accuracy": 0.84375, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -4.3395538330078125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -3.7432098388671875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 6.629477500915527, |
|
"learning_rate": 2.581578947368421e-06, |
|
"loss": 0.3347, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -4.6607513427734375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -4.265266418457031, |
|
"accuracy": 0.875, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -3.5947265625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -4.227779388427734, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 6.081233501434326, |
|
"learning_rate": 2.573684210526316e-06, |
|
"loss": 0.3813, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -3.6422882080078125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -4.190940856933594, |
|
"accuracy": 0.6875, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -3.8231201171875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -4.39495849609375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 7.641036033630371, |
|
"learning_rate": 2.5657894736842107e-06, |
|
"loss": 0.5041, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": -3.682525634765625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": -3.192474365234375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": -2.80194091796875, |
|
"accuracy": 0.875, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": -3.7286148071289062, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.7662200927734375, |
|
"learning_rate": 2.5578947368421054e-06, |
|
"loss": 0.4441, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -3.60430908203125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -3.5869674682617188, |
|
"accuracy": 0.875, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -3.536773681640625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -3.7320709228515625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 7.16535758972168, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.4266, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": -2.750619411468506, |
|
"accuracy": 0.875, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": -3.563629150390625, |
|
"accuracy": 0.625, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": -3.890869140625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": -3.540557861328125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 8.778853416442871, |
|
"learning_rate": 2.542105263157895e-06, |
|
"loss": 0.4598, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": -3.7507858276367188, |
|
"accuracy": 0.84375, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": -4.087003707885742, |
|
"accuracy": 0.875, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": -3.482025146484375, |
|
"accuracy": 0.75, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": -3.1190052032470703, |
|
"accuracy": 0.9375, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 5.622673034667969, |
|
"learning_rate": 2.5342105263157892e-06, |
|
"loss": 0.3767, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": -4.231414794921875, |
|
"accuracy": 0.65625, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": -3.4138107299804688, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": -3.5726470947265625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": -3.738006591796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 6.790273189544678, |
|
"learning_rate": 2.526315789473684e-06, |
|
"loss": 0.437, |
|
"step": 80 |
|
}, |
|
{ |
|
"Batch Mean": -3.6096343994140625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.2, |
|
"step": 80 |
|
}, |
|
{ |
|
"Batch Mean": -3.303373336791992, |
|
"accuracy": 0.9375, |
|
"epoch": 0.2, |
|
"step": 80 |
|
}, |
|
{ |
|
"Batch Mean": -3.778522491455078, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2, |
|
"step": 80 |
|
}, |
|
{ |
|
"Batch Mean": -3.6217997074127197, |
|
"accuracy": 0.875, |
|
"epoch": 0.2, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 6.186145305633545, |
|
"learning_rate": 2.5184210526315787e-06, |
|
"loss": 0.3668, |
|
"step": 81 |
|
}, |
|
{ |
|
"Batch Mean": -2.8534088134765625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.2025, |
|
"step": 81 |
|
}, |
|
{ |
|
"Batch Mean": -3.2958984375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.2025, |
|
"step": 81 |
|
}, |
|
{ |
|
"Batch Mean": -3.643817901611328, |
|
"accuracy": 0.75, |
|
"epoch": 0.2025, |
|
"step": 81 |
|
}, |
|
{ |
|
"Batch Mean": -3.3983230590820312, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2025, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 6.6807379722595215, |
|
"learning_rate": 2.510526315789474e-06, |
|
"loss": 0.4352, |
|
"step": 82 |
|
}, |
|
{ |
|
"Batch Mean": -3.2677268981933594, |
|
"accuracy": 0.75, |
|
"epoch": 0.205, |
|
"step": 82 |
|
}, |
|
{ |
|
"Batch Mean": -3.808116912841797, |
|
"accuracy": 0.8125, |
|
"epoch": 0.205, |
|
"step": 82 |
|
}, |
|
{ |
|
"Batch Mean": -3.621933937072754, |
|
"accuracy": 0.78125, |
|
"epoch": 0.205, |
|
"step": 82 |
|
}, |
|
{ |
|
"Batch Mean": -3.544036865234375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.205, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 6.687833309173584, |
|
"learning_rate": 2.5026315789473686e-06, |
|
"loss": 0.4091, |
|
"step": 83 |
|
}, |
|
{ |
|
"Batch Mean": -3.397430419921875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.2075, |
|
"step": 83 |
|
}, |
|
{ |
|
"Batch Mean": -3.124340057373047, |
|
"accuracy": 0.875, |
|
"epoch": 0.2075, |
|
"step": 83 |
|
}, |
|
{ |
|
"Batch Mean": -3.5037002563476562, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2075, |
|
"step": 83 |
|
}, |
|
{ |
|
"Batch Mean": -2.901216506958008, |
|
"accuracy": 0.75, |
|
"epoch": 0.2075, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.103653907775879, |
|
"learning_rate": 2.4947368421052634e-06, |
|
"loss": 0.3172, |
|
"step": 84 |
|
}, |
|
{ |
|
"Batch Mean": -3.0877227783203125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.21, |
|
"step": 84 |
|
}, |
|
{ |
|
"Batch Mean": -2.7988204956054688, |
|
"accuracy": 0.78125, |
|
"epoch": 0.21, |
|
"step": 84 |
|
}, |
|
{ |
|
"Batch Mean": -3.3592453002929688, |
|
"accuracy": 0.8125, |
|
"epoch": 0.21, |
|
"step": 84 |
|
}, |
|
{ |
|
"Batch Mean": -2.706587314605713, |
|
"accuracy": 0.6875, |
|
"epoch": 0.21, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 6.470891952514648, |
|
"learning_rate": 2.486842105263158e-06, |
|
"loss": 0.3878, |
|
"step": 85 |
|
}, |
|
{ |
|
"Batch Mean": -2.30181884765625, |
|
"accuracy": 0.875, |
|
"epoch": 0.2125, |
|
"step": 85 |
|
}, |
|
{ |
|
"Batch Mean": -2.7299463748931885, |
|
"accuracy": 0.71875, |
|
"epoch": 0.2125, |
|
"step": 85 |
|
}, |
|
{ |
|
"Batch Mean": -2.6263694763183594, |
|
"accuracy": 0.6875, |
|
"epoch": 0.2125, |
|
"step": 85 |
|
}, |
|
{ |
|
"Batch Mean": -2.6631546020507812, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2125, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 7.898273944854736, |
|
"learning_rate": 2.478947368421053e-06, |
|
"loss": 0.474, |
|
"step": 86 |
|
}, |
|
{ |
|
"Batch Mean": -2.9964141845703125, |
|
"accuracy": 0.875, |
|
"epoch": 0.215, |
|
"step": 86 |
|
}, |
|
{ |
|
"Batch Mean": -1.6867389678955078, |
|
"accuracy": 0.75, |
|
"epoch": 0.215, |
|
"step": 86 |
|
}, |
|
{ |
|
"Batch Mean": -2.084423065185547, |
|
"accuracy": 0.875, |
|
"epoch": 0.215, |
|
"step": 86 |
|
}, |
|
{ |
|
"Batch Mean": -2.3247833251953125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.215, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 7.01542329788208, |
|
"learning_rate": 2.4710526315789476e-06, |
|
"loss": 0.354, |
|
"step": 87 |
|
}, |
|
{ |
|
"Batch Mean": -1.62408447265625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.2175, |
|
"step": 87 |
|
}, |
|
{ |
|
"Batch Mean": -1.8180961608886719, |
|
"accuracy": 0.75, |
|
"epoch": 0.2175, |
|
"step": 87 |
|
}, |
|
{ |
|
"Batch Mean": -2.39483642578125, |
|
"accuracy": 0.75, |
|
"epoch": 0.2175, |
|
"step": 87 |
|
}, |
|
{ |
|
"Batch Mean": -2.1982975006103516, |
|
"accuracy": 0.9375, |
|
"epoch": 0.2175, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 9.38858413696289, |
|
"learning_rate": 2.4631578947368424e-06, |
|
"loss": 0.463, |
|
"step": 88 |
|
}, |
|
{ |
|
"Batch Mean": -1.4812946319580078, |
|
"accuracy": 0.8125, |
|
"epoch": 0.22, |
|
"step": 88 |
|
}, |
|
{ |
|
"Batch Mean": -2.553070068359375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.22, |
|
"step": 88 |
|
}, |
|
{ |
|
"Batch Mean": -1.1305127143859863, |
|
"accuracy": 0.78125, |
|
"epoch": 0.22, |
|
"step": 88 |
|
}, |
|
{ |
|
"Batch Mean": -1.9516029357910156, |
|
"accuracy": 0.875, |
|
"epoch": 0.22, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 8.485116004943848, |
|
"learning_rate": 2.4552631578947367e-06, |
|
"loss": 0.4295, |
|
"step": 89 |
|
}, |
|
{ |
|
"Batch Mean": -1.5622539520263672, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2225, |
|
"step": 89 |
|
}, |
|
{ |
|
"Batch Mean": -2.8045387268066406, |
|
"accuracy": 0.875, |
|
"epoch": 0.2225, |
|
"step": 89 |
|
}, |
|
{ |
|
"Batch Mean": -1.2656402587890625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2225, |
|
"step": 89 |
|
}, |
|
{ |
|
"Batch Mean": -1.0702705383300781, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2225, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 6.422337532043457, |
|
"learning_rate": 2.4473684210526314e-06, |
|
"loss": 0.3633, |
|
"step": 90 |
|
}, |
|
{ |
|
"Batch Mean": -1.141378402709961, |
|
"accuracy": 0.71875, |
|
"epoch": 0.225, |
|
"step": 90 |
|
}, |
|
{ |
|
"Batch Mean": -1.7803688049316406, |
|
"accuracy": 0.875, |
|
"epoch": 0.225, |
|
"step": 90 |
|
}, |
|
{ |
|
"Batch Mean": -1.991696834564209, |
|
"accuracy": 0.75, |
|
"epoch": 0.225, |
|
"step": 90 |
|
}, |
|
{ |
|
"Batch Mean": -0.957183837890625, |
|
"accuracy": 0.875, |
|
"epoch": 0.225, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2275, |
|
"grad_norm": 8.08924388885498, |
|
"learning_rate": 2.439473684210526e-06, |
|
"loss": 0.4245, |
|
"step": 91 |
|
}, |
|
{ |
|
"Batch Mean": -1.1860132217407227, |
|
"accuracy": 0.71875, |
|
"epoch": 0.2275, |
|
"step": 91 |
|
}, |
|
{ |
|
"Batch Mean": -0.84332275390625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2275, |
|
"step": 91 |
|
}, |
|
{ |
|
"Batch Mean": -2.0398807525634766, |
|
"accuracy": 0.75, |
|
"epoch": 0.2275, |
|
"step": 91 |
|
}, |
|
{ |
|
"Batch Mean": -1.4838409423828125, |
|
"accuracy": 0.875, |
|
"epoch": 0.2275, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.4404425621032715, |
|
"learning_rate": 2.431578947368421e-06, |
|
"loss": 0.3759, |
|
"step": 92 |
|
}, |
|
{ |
|
"Batch Mean": -1.5761184692382812, |
|
"accuracy": 0.96875, |
|
"epoch": 0.23, |
|
"step": 92 |
|
}, |
|
{ |
|
"Batch Mean": -2.564685821533203, |
|
"accuracy": 0.71875, |
|
"epoch": 0.23, |
|
"step": 92 |
|
}, |
|
{ |
|
"Batch Mean": -1.5132369995117188, |
|
"accuracy": 0.6875, |
|
"epoch": 0.23, |
|
"step": 92 |
|
}, |
|
{ |
|
"Batch Mean": -2.0555648803710938, |
|
"accuracy": 0.875, |
|
"epoch": 0.23, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2325, |
|
"grad_norm": 6.906028747558594, |
|
"learning_rate": 2.4236842105263157e-06, |
|
"loss": 0.4027, |
|
"step": 93 |
|
}, |
|
{ |
|
"Batch Mean": -1.4121761322021484, |
|
"accuracy": 0.875, |
|
"epoch": 0.2325, |
|
"step": 93 |
|
}, |
|
{ |
|
"Batch Mean": -1.95318603515625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2325, |
|
"step": 93 |
|
}, |
|
{ |
|
"Batch Mean": -1.6953773498535156, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2325, |
|
"step": 93 |
|
}, |
|
{ |
|
"Batch Mean": -1.5923538208007812, |
|
"accuracy": 0.875, |
|
"epoch": 0.2325, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 6.271661758422852, |
|
"learning_rate": 2.4157894736842104e-06, |
|
"loss": 0.3762, |
|
"step": 94 |
|
}, |
|
{ |
|
"Batch Mean": -2.2878494262695312, |
|
"accuracy": 0.71875, |
|
"epoch": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"Batch Mean": -2.286632537841797, |
|
"accuracy": 0.78125, |
|
"epoch": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"Batch Mean": -2.0052947998046875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"Batch Mean": -2.1512603759765625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 7.963528156280518, |
|
"learning_rate": 2.4078947368421056e-06, |
|
"loss": 0.4721, |
|
"step": 95 |
|
}, |
|
{ |
|
"Batch Mean": -1.9903564453125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2375, |
|
"step": 95 |
|
}, |
|
{ |
|
"Batch Mean": -2.5484466552734375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2375, |
|
"step": 95 |
|
}, |
|
{ |
|
"Batch Mean": -2.6534271240234375, |
|
"accuracy": 0.75, |
|
"epoch": 0.2375, |
|
"step": 95 |
|
}, |
|
{ |
|
"Batch Mean": -1.515106201171875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2375, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.653286933898926, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.4651, |
|
"step": 96 |
|
}, |
|
{ |
|
"Batch Mean": -2.765472412109375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.24, |
|
"step": 96 |
|
}, |
|
{ |
|
"Batch Mean": -2.008441925048828, |
|
"accuracy": 0.875, |
|
"epoch": 0.24, |
|
"step": 96 |
|
}, |
|
{ |
|
"Batch Mean": -2.3530349731445312, |
|
"accuracy": 0.8125, |
|
"epoch": 0.24, |
|
"step": 96 |
|
}, |
|
{ |
|
"Batch Mean": -2.2541656494140625, |
|
"accuracy": 0.75, |
|
"epoch": 0.24, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2425, |
|
"grad_norm": 6.258959770202637, |
|
"learning_rate": 2.392105263157895e-06, |
|
"loss": 0.3895, |
|
"step": 97 |
|
}, |
|
{ |
|
"Batch Mean": -2.4180755615234375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2425, |
|
"step": 97 |
|
}, |
|
{ |
|
"Batch Mean": -2.4765663146972656, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2425, |
|
"step": 97 |
|
}, |
|
{ |
|
"Batch Mean": -2.5517921447753906, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2425, |
|
"step": 97 |
|
}, |
|
{ |
|
"Batch Mean": -2.794464111328125, |
|
"accuracy": 0.75, |
|
"epoch": 0.2425, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 6.597422122955322, |
|
"learning_rate": 2.38421052631579e-06, |
|
"loss": 0.3898, |
|
"step": 98 |
|
}, |
|
{ |
|
"Batch Mean": -2.1800403594970703, |
|
"accuracy": 0.6875, |
|
"epoch": 0.245, |
|
"step": 98 |
|
}, |
|
{ |
|
"Batch Mean": -2.8349609375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.245, |
|
"step": 98 |
|
}, |
|
{ |
|
"Batch Mean": -2.4977874755859375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.245, |
|
"step": 98 |
|
}, |
|
{ |
|
"Batch Mean": -2.3620100021362305, |
|
"accuracy": 0.78125, |
|
"epoch": 0.245, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2475, |
|
"grad_norm": 7.636416912078857, |
|
"learning_rate": 2.376315789473684e-06, |
|
"loss": 0.5111, |
|
"step": 99 |
|
}, |
|
{ |
|
"Batch Mean": -2.890850067138672, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2475, |
|
"step": 99 |
|
}, |
|
{ |
|
"Batch Mean": -2.570037841796875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2475, |
|
"step": 99 |
|
}, |
|
{ |
|
"Batch Mean": -2.489429473876953, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2475, |
|
"step": 99 |
|
}, |
|
{ |
|
"Batch Mean": -2.4925973415374756, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2475, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.276621341705322, |
|
"learning_rate": 2.368421052631579e-06, |
|
"loss": 0.3493, |
|
"step": 100 |
|
}, |
|
{ |
|
"Batch Mean": -2.685060501098633, |
|
"accuracy": 0.8125, |
|
"epoch": 0.25, |
|
"step": 100 |
|
}, |
|
{ |
|
"Batch Mean": -3.0683860778808594, |
|
"accuracy": 0.75, |
|
"epoch": 0.25, |
|
"step": 100 |
|
}, |
|
{ |
|
"Batch Mean": -2.406707763671875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.25, |
|
"step": 100 |
|
}, |
|
{ |
|
"Batch Mean": -3.204519271850586, |
|
"accuracy": 0.75, |
|
"epoch": 0.25, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2525, |
|
"grad_norm": 7.729552745819092, |
|
"learning_rate": 2.3605263157894736e-06, |
|
"loss": 0.4712, |
|
"step": 101 |
|
}, |
|
{ |
|
"Batch Mean": -2.516643524169922, |
|
"accuracy": 0.875, |
|
"epoch": 0.2525, |
|
"step": 101 |
|
}, |
|
{ |
|
"Batch Mean": -2.89825439453125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2525, |
|
"step": 101 |
|
}, |
|
{ |
|
"Batch Mean": -2.921875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2525, |
|
"step": 101 |
|
}, |
|
{ |
|
"Batch Mean": -2.51778507232666, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2525, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.255, |
|
"grad_norm": 6.247341156005859, |
|
"learning_rate": 2.3526315789473684e-06, |
|
"loss": 0.3784, |
|
"step": 102 |
|
}, |
|
{ |
|
"Batch Mean": -2.3507378101348877, |
|
"accuracy": 0.8125, |
|
"epoch": 0.255, |
|
"step": 102 |
|
}, |
|
{ |
|
"Batch Mean": -2.923431396484375, |
|
"accuracy": 0.875, |
|
"epoch": 0.255, |
|
"step": 102 |
|
}, |
|
{ |
|
"Batch Mean": -2.9839935302734375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.255, |
|
"step": 102 |
|
}, |
|
{ |
|
"Batch Mean": -2.5507564544677734, |
|
"accuracy": 0.90625, |
|
"epoch": 0.255, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2575, |
|
"grad_norm": 5.437046051025391, |
|
"learning_rate": 2.344736842105263e-06, |
|
"loss": 0.3234, |
|
"step": 103 |
|
}, |
|
{ |
|
"Batch Mean": -2.7343482971191406, |
|
"accuracy": 0.75, |
|
"epoch": 0.2575, |
|
"step": 103 |
|
}, |
|
{ |
|
"Batch Mean": -3.000023365020752, |
|
"accuracy": 0.75, |
|
"epoch": 0.2575, |
|
"step": 103 |
|
}, |
|
{ |
|
"Batch Mean": -3.117097854614258, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2575, |
|
"step": 103 |
|
}, |
|
{ |
|
"Batch Mean": -3.2165985107421875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2575, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 7.148366451263428, |
|
"learning_rate": 2.336842105263158e-06, |
|
"loss": 0.4215, |
|
"step": 104 |
|
}, |
|
{ |
|
"Batch Mean": -3.3616867065429688, |
|
"accuracy": 0.84375, |
|
"epoch": 0.26, |
|
"step": 104 |
|
}, |
|
{ |
|
"Batch Mean": -3.173351287841797, |
|
"accuracy": 0.875, |
|
"epoch": 0.26, |
|
"step": 104 |
|
}, |
|
{ |
|
"Batch Mean": -2.648517608642578, |
|
"accuracy": 0.71875, |
|
"epoch": 0.26, |
|
"step": 104 |
|
}, |
|
{ |
|
"Batch Mean": -3.0380096435546875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.26, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 7.0915021896362305, |
|
"learning_rate": 2.3289473684210526e-06, |
|
"loss": 0.3746, |
|
"step": 105 |
|
}, |
|
{ |
|
"Batch Mean": -2.9630088806152344, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2625, |
|
"step": 105 |
|
}, |
|
{ |
|
"Batch Mean": -2.8781280517578125, |
|
"accuracy": 0.75, |
|
"epoch": 0.2625, |
|
"step": 105 |
|
}, |
|
{ |
|
"Batch Mean": -3.432220458984375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.2625, |
|
"step": 105 |
|
}, |
|
{ |
|
"Batch Mean": -2.7377090454101562, |
|
"accuracy": 0.875, |
|
"epoch": 0.2625, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.265, |
|
"grad_norm": 6.841515064239502, |
|
"learning_rate": 2.3210526315789473e-06, |
|
"loss": 0.3739, |
|
"step": 106 |
|
}, |
|
{ |
|
"Batch Mean": -2.955535888671875, |
|
"accuracy": 0.875, |
|
"epoch": 0.265, |
|
"step": 106 |
|
}, |
|
{ |
|
"Batch Mean": -2.5553359985351562, |
|
"accuracy": 0.71875, |
|
"epoch": 0.265, |
|
"step": 106 |
|
}, |
|
{ |
|
"Batch Mean": -3.257892608642578, |
|
"accuracy": 0.75, |
|
"epoch": 0.265, |
|
"step": 106 |
|
}, |
|
{ |
|
"Batch Mean": -2.7374954223632812, |
|
"accuracy": 0.8125, |
|
"epoch": 0.265, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2675, |
|
"grad_norm": 6.91084623336792, |
|
"learning_rate": 2.313157894736842e-06, |
|
"loss": 0.3924, |
|
"step": 107 |
|
}, |
|
{ |
|
"Batch Mean": -3.3003501892089844, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2675, |
|
"step": 107 |
|
}, |
|
{ |
|
"Batch Mean": -2.7673654556274414, |
|
"accuracy": 0.65625, |
|
"epoch": 0.2675, |
|
"step": 107 |
|
}, |
|
{ |
|
"Batch Mean": -2.5366382598876953, |
|
"accuracy": 0.75, |
|
"epoch": 0.2675, |
|
"step": 107 |
|
}, |
|
{ |
|
"Batch Mean": -2.509608507156372, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2675, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 7.229374885559082, |
|
"learning_rate": 2.305263157894737e-06, |
|
"loss": 0.4284, |
|
"step": 108 |
|
}, |
|
{ |
|
"Batch Mean": -3.148345947265625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.27, |
|
"step": 108 |
|
}, |
|
{ |
|
"Batch Mean": -3.0621566772460938, |
|
"accuracy": 0.90625, |
|
"epoch": 0.27, |
|
"step": 108 |
|
}, |
|
{ |
|
"Batch Mean": -2.859434127807617, |
|
"accuracy": 0.71875, |
|
"epoch": 0.27, |
|
"step": 108 |
|
}, |
|
{ |
|
"Batch Mean": -3.1439666748046875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.27, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2725, |
|
"grad_norm": 8.356084823608398, |
|
"learning_rate": 2.2973684210526316e-06, |
|
"loss": 0.4372, |
|
"step": 109 |
|
}, |
|
{ |
|
"Batch Mean": -2.7968883514404297, |
|
"accuracy": 0.875, |
|
"epoch": 0.2725, |
|
"step": 109 |
|
}, |
|
{ |
|
"Batch Mean": -2.700042724609375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2725, |
|
"step": 109 |
|
}, |
|
{ |
|
"Batch Mean": -2.6135177612304688, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2725, |
|
"step": 109 |
|
}, |
|
{ |
|
"Batch Mean": -3.498870849609375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2725, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 6.937132358551025, |
|
"learning_rate": 2.2894736842105263e-06, |
|
"loss": 0.338, |
|
"step": 110 |
|
}, |
|
{ |
|
"Batch Mean": -2.8419265747070312, |
|
"accuracy": 0.90625, |
|
"epoch": 0.275, |
|
"step": 110 |
|
}, |
|
{ |
|
"Batch Mean": -1.7743732929229736, |
|
"accuracy": 0.90625, |
|
"epoch": 0.275, |
|
"step": 110 |
|
}, |
|
{ |
|
"Batch Mean": -2.9847335815429688, |
|
"accuracy": 0.90625, |
|
"epoch": 0.275, |
|
"step": 110 |
|
}, |
|
{ |
|
"Batch Mean": -2.8534889221191406, |
|
"accuracy": 0.84375, |
|
"epoch": 0.275, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2775, |
|
"grad_norm": 6.012710094451904, |
|
"learning_rate": 2.281578947368421e-06, |
|
"loss": 0.2648, |
|
"step": 111 |
|
}, |
|
{ |
|
"Batch Mean": -2.719818115234375, |
|
"accuracy": 0.96875, |
|
"epoch": 0.2775, |
|
"step": 111 |
|
}, |
|
{ |
|
"Batch Mean": -2.7094573974609375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2775, |
|
"step": 111 |
|
}, |
|
{ |
|
"Batch Mean": -3.1852569580078125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.2775, |
|
"step": 111 |
|
}, |
|
{ |
|
"Batch Mean": -2.9012527465820312, |
|
"accuracy": 0.9375, |
|
"epoch": 0.2775, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.613126754760742, |
|
"learning_rate": 2.273684210526316e-06, |
|
"loss": 0.2261, |
|
"step": 112 |
|
}, |
|
{ |
|
"Batch Mean": -2.317493438720703, |
|
"accuracy": 0.84375, |
|
"epoch": 0.28, |
|
"step": 112 |
|
}, |
|
{ |
|
"Batch Mean": -2.7786102294921875, |
|
"accuracy": 0.875, |
|
"epoch": 0.28, |
|
"step": 112 |
|
}, |
|
{ |
|
"Batch Mean": -2.4584197998046875, |
|
"accuracy": 0.75, |
|
"epoch": 0.28, |
|
"step": 112 |
|
}, |
|
{ |
|
"Batch Mean": -3.82269287109375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.28, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2825, |
|
"grad_norm": 8.895098686218262, |
|
"learning_rate": 2.2657894736842106e-06, |
|
"loss": 0.4254, |
|
"step": 113 |
|
}, |
|
{ |
|
"Batch Mean": -2.82708740234375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2825, |
|
"step": 113 |
|
}, |
|
{ |
|
"Batch Mean": -2.834275245666504, |
|
"accuracy": 0.96875, |
|
"epoch": 0.2825, |
|
"step": 113 |
|
}, |
|
{ |
|
"Batch Mean": -2.4896316528320312, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2825, |
|
"step": 113 |
|
}, |
|
{ |
|
"Batch Mean": -2.9547224044799805, |
|
"accuracy": 0.875, |
|
"epoch": 0.2825, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.285, |
|
"grad_norm": 6.938596725463867, |
|
"learning_rate": 2.2578947368421053e-06, |
|
"loss": 0.2756, |
|
"step": 114 |
|
}, |
|
{ |
|
"Batch Mean": -3.3860321044921875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.285, |
|
"step": 114 |
|
}, |
|
{ |
|
"Batch Mean": -1.289031982421875, |
|
"accuracy": 0.75, |
|
"epoch": 0.285, |
|
"step": 114 |
|
}, |
|
{ |
|
"Batch Mean": -1.973724365234375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.285, |
|
"step": 114 |
|
}, |
|
{ |
|
"Batch Mean": -1.4242782592773438, |
|
"accuracy": 0.78125, |
|
"epoch": 0.285, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 10.572776794433594, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.4922, |
|
"step": 115 |
|
}, |
|
{ |
|
"Batch Mean": -2.7473220825195312, |
|
"accuracy": 0.6875, |
|
"epoch": 0.2875, |
|
"step": 115 |
|
}, |
|
{ |
|
"Batch Mean": -2.382610321044922, |
|
"accuracy": 0.90625, |
|
"epoch": 0.2875, |
|
"step": 115 |
|
}, |
|
{ |
|
"Batch Mean": -2.4156417846679688, |
|
"accuracy": 0.75, |
|
"epoch": 0.2875, |
|
"step": 115 |
|
}, |
|
{ |
|
"Batch Mean": -2.8010101318359375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.2875, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 8.355461120605469, |
|
"learning_rate": 2.242105263157895e-06, |
|
"loss": 0.3628, |
|
"step": 116 |
|
}, |
|
{ |
|
"Batch Mean": -3.490283966064453, |
|
"accuracy": 0.96875, |
|
"epoch": 0.29, |
|
"step": 116 |
|
}, |
|
{ |
|
"Batch Mean": -3.179001808166504, |
|
"accuracy": 0.84375, |
|
"epoch": 0.29, |
|
"step": 116 |
|
}, |
|
{ |
|
"Batch Mean": -2.5853271484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.29, |
|
"step": 116 |
|
}, |
|
{ |
|
"Batch Mean": -2.0840072631835938, |
|
"accuracy": 0.75, |
|
"epoch": 0.29, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2925, |
|
"grad_norm": 7.448988437652588, |
|
"learning_rate": 2.2342105263157895e-06, |
|
"loss": 0.3418, |
|
"step": 117 |
|
}, |
|
{ |
|
"Batch Mean": -3.2340087890625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2925, |
|
"step": 117 |
|
}, |
|
{ |
|
"Batch Mean": -2.6928272247314453, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2925, |
|
"step": 117 |
|
}, |
|
{ |
|
"Batch Mean": -2.367229461669922, |
|
"accuracy": 0.9375, |
|
"epoch": 0.2925, |
|
"step": 117 |
|
}, |
|
{ |
|
"Batch Mean": -2.940413475036621, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2925, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.295, |
|
"grad_norm": 7.369281768798828, |
|
"learning_rate": 2.2263157894736843e-06, |
|
"loss": 0.3758, |
|
"step": 118 |
|
}, |
|
{ |
|
"Batch Mean": -3.154327392578125, |
|
"accuracy": 0.75, |
|
"epoch": 0.295, |
|
"step": 118 |
|
}, |
|
{ |
|
"Batch Mean": -3.162120819091797, |
|
"accuracy": 0.78125, |
|
"epoch": 0.295, |
|
"step": 118 |
|
}, |
|
{ |
|
"Batch Mean": -2.571208953857422, |
|
"accuracy": 0.75, |
|
"epoch": 0.295, |
|
"step": 118 |
|
}, |
|
{ |
|
"Batch Mean": -2.0688514709472656, |
|
"accuracy": 0.78125, |
|
"epoch": 0.295, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2975, |
|
"grad_norm": 9.859661102294922, |
|
"learning_rate": 2.218421052631579e-06, |
|
"loss": 0.4781, |
|
"step": 119 |
|
}, |
|
{ |
|
"Batch Mean": -2.288646697998047, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2975, |
|
"step": 119 |
|
}, |
|
{ |
|
"Batch Mean": -2.9682788848876953, |
|
"accuracy": 0.75, |
|
"epoch": 0.2975, |
|
"step": 119 |
|
}, |
|
{ |
|
"Batch Mean": -3.6280517578125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2975, |
|
"step": 119 |
|
}, |
|
{ |
|
"Batch Mean": -2.7621307373046875, |
|
"accuracy": 0.875, |
|
"epoch": 0.2975, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.310153961181641, |
|
"learning_rate": 2.2105263157894738e-06, |
|
"loss": 0.3414, |
|
"step": 120 |
|
}, |
|
{ |
|
"Batch Mean": -3.35638427734375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3, |
|
"step": 120 |
|
}, |
|
{ |
|
"Batch Mean": -3.4458541870117188, |
|
"accuracy": 0.75, |
|
"epoch": 0.3, |
|
"step": 120 |
|
}, |
|
{ |
|
"Batch Mean": -3.32684326171875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3, |
|
"step": 120 |
|
}, |
|
{ |
|
"Batch Mean": -3.52008056640625, |
|
"accuracy": 0.875, |
|
"epoch": 0.3, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3025, |
|
"grad_norm": 6.901857852935791, |
|
"learning_rate": 2.2026315789473685e-06, |
|
"loss": 0.3663, |
|
"step": 121 |
|
}, |
|
{ |
|
"Batch Mean": -3.2964706420898438, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3025, |
|
"step": 121 |
|
}, |
|
{ |
|
"Batch Mean": -4.0569610595703125, |
|
"accuracy": 0.75, |
|
"epoch": 0.3025, |
|
"step": 121 |
|
}, |
|
{ |
|
"Batch Mean": -3.8861026763916016, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3025, |
|
"step": 121 |
|
}, |
|
{ |
|
"Batch Mean": -4.03857421875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3025, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.305, |
|
"grad_norm": 6.965332508087158, |
|
"learning_rate": 2.1947368421052633e-06, |
|
"loss": 0.3583, |
|
"step": 122 |
|
}, |
|
{ |
|
"Batch Mean": -4.39910888671875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.305, |
|
"step": 122 |
|
}, |
|
{ |
|
"Batch Mean": -3.1727142333984375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.305, |
|
"step": 122 |
|
}, |
|
{ |
|
"Batch Mean": -3.89581298828125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.305, |
|
"step": 122 |
|
}, |
|
{ |
|
"Batch Mean": -3.6239490509033203, |
|
"accuracy": 0.84375, |
|
"epoch": 0.305, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3075, |
|
"grad_norm": 6.396092414855957, |
|
"learning_rate": 2.186842105263158e-06, |
|
"loss": 0.321, |
|
"step": 123 |
|
}, |
|
{ |
|
"Batch Mean": -3.5579376220703125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3075, |
|
"step": 123 |
|
}, |
|
{ |
|
"Batch Mean": -3.6140670776367188, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3075, |
|
"step": 123 |
|
}, |
|
{ |
|
"Batch Mean": -3.25006103515625, |
|
"accuracy": 0.875, |
|
"epoch": 0.3075, |
|
"step": 123 |
|
}, |
|
{ |
|
"Batch Mean": -3.859375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3075, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 6.925971984863281, |
|
"learning_rate": 2.1789473684210528e-06, |
|
"loss": 0.3773, |
|
"step": 124 |
|
}, |
|
{ |
|
"Batch Mean": -3.74835205078125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.31, |
|
"step": 124 |
|
}, |
|
{ |
|
"Batch Mean": -2.98968505859375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.31, |
|
"step": 124 |
|
}, |
|
{ |
|
"Batch Mean": -3.8388538360595703, |
|
"accuracy": 0.78125, |
|
"epoch": 0.31, |
|
"step": 124 |
|
}, |
|
{ |
|
"Batch Mean": -3.46600341796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.31, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 7.271778106689453, |
|
"learning_rate": 2.1710526315789475e-06, |
|
"loss": 0.4283, |
|
"step": 125 |
|
}, |
|
{ |
|
"Batch Mean": -3.1362533569335938, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3125, |
|
"step": 125 |
|
}, |
|
{ |
|
"Batch Mean": -3.978717803955078, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3125, |
|
"step": 125 |
|
}, |
|
{ |
|
"Batch Mean": -2.7917633056640625, |
|
"accuracy": 0.75, |
|
"epoch": 0.3125, |
|
"step": 125 |
|
}, |
|
{ |
|
"Batch Mean": -3.4448699951171875, |
|
"accuracy": 0.75, |
|
"epoch": 0.3125, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.315, |
|
"grad_norm": 6.391500473022461, |
|
"learning_rate": 2.1631578947368423e-06, |
|
"loss": 0.377, |
|
"step": 126 |
|
}, |
|
{ |
|
"Batch Mean": -3.4984054565429688, |
|
"accuracy": 0.90625, |
|
"epoch": 0.315, |
|
"step": 126 |
|
}, |
|
{ |
|
"Batch Mean": -3.4216766357421875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.315, |
|
"step": 126 |
|
}, |
|
{ |
|
"Batch Mean": -3.199005126953125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.315, |
|
"step": 126 |
|
}, |
|
{ |
|
"Batch Mean": -3.3887786865234375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.315, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3175, |
|
"grad_norm": 7.132022380828857, |
|
"learning_rate": 2.155263157894737e-06, |
|
"loss": 0.348, |
|
"step": 127 |
|
}, |
|
{ |
|
"Batch Mean": -3.533205032348633, |
|
"accuracy": 0.875, |
|
"epoch": 0.3175, |
|
"step": 127 |
|
}, |
|
{ |
|
"Batch Mean": -3.0708465576171875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3175, |
|
"step": 127 |
|
}, |
|
{ |
|
"Batch Mean": -2.5532379150390625, |
|
"accuracy": 0.875, |
|
"epoch": 0.3175, |
|
"step": 127 |
|
}, |
|
{ |
|
"Batch Mean": -3.242755889892578, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3175, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.543447971343994, |
|
"learning_rate": 2.1473684210526317e-06, |
|
"loss": 0.3184, |
|
"step": 128 |
|
}, |
|
{ |
|
"Batch Mean": -3.64508056640625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.32, |
|
"step": 128 |
|
}, |
|
{ |
|
"Batch Mean": -4.324256896972656, |
|
"accuracy": 0.9375, |
|
"epoch": 0.32, |
|
"step": 128 |
|
}, |
|
{ |
|
"Batch Mean": -3.880157470703125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.32, |
|
"step": 128 |
|
}, |
|
{ |
|
"Batch Mean": -3.4844741821289062, |
|
"accuracy": 0.78125, |
|
"epoch": 0.32, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3225, |
|
"grad_norm": 7.542953014373779, |
|
"learning_rate": 2.1394736842105265e-06, |
|
"loss": 0.3573, |
|
"step": 129 |
|
}, |
|
{ |
|
"Batch Mean": -3.59429931640625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3225, |
|
"step": 129 |
|
}, |
|
{ |
|
"Batch Mean": -3.3390655517578125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3225, |
|
"step": 129 |
|
}, |
|
{ |
|
"Batch Mean": -3.0296974182128906, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3225, |
|
"step": 129 |
|
}, |
|
{ |
|
"Batch Mean": -3.78515625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3225, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 6.553402900695801, |
|
"learning_rate": 2.1315789473684212e-06, |
|
"loss": 0.3219, |
|
"step": 130 |
|
}, |
|
{ |
|
"Batch Mean": -2.7899856567382812, |
|
"accuracy": 0.8125, |
|
"epoch": 0.325, |
|
"step": 130 |
|
}, |
|
{ |
|
"Batch Mean": -3.6958160400390625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.325, |
|
"step": 130 |
|
}, |
|
{ |
|
"Batch Mean": -2.806610107421875, |
|
"accuracy": 0.75, |
|
"epoch": 0.325, |
|
"step": 130 |
|
}, |
|
{ |
|
"Batch Mean": -3.182114601135254, |
|
"accuracy": 0.78125, |
|
"epoch": 0.325, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3275, |
|
"grad_norm": 7.584489822387695, |
|
"learning_rate": 2.123684210526316e-06, |
|
"loss": 0.4417, |
|
"step": 131 |
|
}, |
|
{ |
|
"Batch Mean": -3.492685317993164, |
|
"accuracy": 0.875, |
|
"epoch": 0.3275, |
|
"step": 131 |
|
}, |
|
{ |
|
"Batch Mean": -3.4088897705078125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3275, |
|
"step": 131 |
|
}, |
|
{ |
|
"Batch Mean": -3.0204315185546875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3275, |
|
"step": 131 |
|
}, |
|
{ |
|
"Batch Mean": -3.449188232421875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3275, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.928311347961426, |
|
"learning_rate": 2.1157894736842103e-06, |
|
"loss": 0.3043, |
|
"step": 132 |
|
}, |
|
{ |
|
"Batch Mean": -2.6694793701171875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.33, |
|
"step": 132 |
|
}, |
|
{ |
|
"Batch Mean": -2.967529296875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.33, |
|
"step": 132 |
|
}, |
|
{ |
|
"Batch Mean": -4.03662109375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.33, |
|
"step": 132 |
|
}, |
|
{ |
|
"Batch Mean": -3.179962158203125, |
|
"accuracy": 0.65625, |
|
"epoch": 0.33, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3325, |
|
"grad_norm": 8.017102241516113, |
|
"learning_rate": 2.107894736842105e-06, |
|
"loss": 0.4024, |
|
"step": 133 |
|
}, |
|
{ |
|
"Batch Mean": -3.1867218017578125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3325, |
|
"step": 133 |
|
}, |
|
{ |
|
"Batch Mean": -3.424471855163574, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3325, |
|
"step": 133 |
|
}, |
|
{ |
|
"Batch Mean": -3.25, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3325, |
|
"step": 133 |
|
}, |
|
{ |
|
"Batch Mean": -3.68438720703125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.3325, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.335, |
|
"grad_norm": 7.597751140594482, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.4027, |
|
"step": 134 |
|
}, |
|
{ |
|
"Batch Mean": -2.4028396606445312, |
|
"accuracy": 0.75, |
|
"epoch": 0.335, |
|
"step": 134 |
|
}, |
|
{ |
|
"Batch Mean": -3.6143722534179688, |
|
"accuracy": 0.8125, |
|
"epoch": 0.335, |
|
"step": 134 |
|
}, |
|
{ |
|
"Batch Mean": -3.1368408203125, |
|
"accuracy": 0.875, |
|
"epoch": 0.335, |
|
"step": 134 |
|
}, |
|
{ |
|
"Batch Mean": -2.922149658203125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.335, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 7.709555149078369, |
|
"learning_rate": 2.0921052631578945e-06, |
|
"loss": 0.4432, |
|
"step": 135 |
|
}, |
|
{ |
|
"Batch Mean": -4.095916748046875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3375, |
|
"step": 135 |
|
}, |
|
{ |
|
"Batch Mean": -3.7461395263671875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3375, |
|
"step": 135 |
|
}, |
|
{ |
|
"Batch Mean": -3.748504161834717, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3375, |
|
"step": 135 |
|
}, |
|
{ |
|
"Batch Mean": -2.7383270263671875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3375, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.083080291748047, |
|
"learning_rate": 2.0842105263157897e-06, |
|
"loss": 0.3501, |
|
"step": 136 |
|
}, |
|
{ |
|
"Batch Mean": -3.571258544921875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.34, |
|
"step": 136 |
|
}, |
|
{ |
|
"Batch Mean": -3.6791038513183594, |
|
"accuracy": 0.875, |
|
"epoch": 0.34, |
|
"step": 136 |
|
}, |
|
{ |
|
"Batch Mean": -3.7348480224609375, |
|
"accuracy": 0.875, |
|
"epoch": 0.34, |
|
"step": 136 |
|
}, |
|
{ |
|
"Batch Mean": -3.2222461700439453, |
|
"accuracy": 0.71875, |
|
"epoch": 0.34, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3425, |
|
"grad_norm": 6.974395275115967, |
|
"learning_rate": 2.0763157894736845e-06, |
|
"loss": 0.358, |
|
"step": 137 |
|
}, |
|
{ |
|
"Batch Mean": -4.6036376953125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3425, |
|
"step": 137 |
|
}, |
|
{ |
|
"Batch Mean": -3.4961585998535156, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3425, |
|
"step": 137 |
|
}, |
|
{ |
|
"Batch Mean": -3.3908309936523438, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3425, |
|
"step": 137 |
|
}, |
|
{ |
|
"Batch Mean": -4.088287353515625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3425, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.345, |
|
"grad_norm": 7.275962829589844, |
|
"learning_rate": 2.068421052631579e-06, |
|
"loss": 0.3928, |
|
"step": 138 |
|
}, |
|
{ |
|
"Batch Mean": -4.08306884765625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.345, |
|
"step": 138 |
|
}, |
|
{ |
|
"Batch Mean": -3.62261962890625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.345, |
|
"step": 138 |
|
}, |
|
{ |
|
"Batch Mean": -4.1116790771484375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.345, |
|
"step": 138 |
|
}, |
|
{ |
|
"Batch Mean": -4.20538330078125, |
|
"accuracy": 0.875, |
|
"epoch": 0.345, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3475, |
|
"grad_norm": 6.272771835327148, |
|
"learning_rate": 2.060526315789474e-06, |
|
"loss": 0.3163, |
|
"step": 139 |
|
}, |
|
{ |
|
"Batch Mean": -4.465492248535156, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3475, |
|
"step": 139 |
|
}, |
|
{ |
|
"Batch Mean": -4.651458740234375, |
|
"accuracy": 0.875, |
|
"epoch": 0.3475, |
|
"step": 139 |
|
}, |
|
{ |
|
"Batch Mean": -4.1647186279296875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3475, |
|
"step": 139 |
|
}, |
|
{ |
|
"Batch Mean": -2.7436389923095703, |
|
"accuracy": 0.875, |
|
"epoch": 0.3475, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.057549953460693, |
|
"learning_rate": 2.0526315789473687e-06, |
|
"loss": 0.3358, |
|
"step": 140 |
|
}, |
|
{ |
|
"Batch Mean": -3.641510009765625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.35, |
|
"step": 140 |
|
}, |
|
{ |
|
"Batch Mean": -3.993194580078125, |
|
"accuracy": 0.875, |
|
"epoch": 0.35, |
|
"step": 140 |
|
}, |
|
{ |
|
"Batch Mean": -4.205657958984375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.35, |
|
"step": 140 |
|
}, |
|
{ |
|
"Batch Mean": -3.8518409729003906, |
|
"accuracy": 0.90625, |
|
"epoch": 0.35, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3525, |
|
"grad_norm": 6.492579936981201, |
|
"learning_rate": 2.0447368421052634e-06, |
|
"loss": 0.36, |
|
"step": 141 |
|
}, |
|
{ |
|
"Batch Mean": -3.0039520263671875, |
|
"accuracy": 0.625, |
|
"epoch": 0.3525, |
|
"step": 141 |
|
}, |
|
{ |
|
"Batch Mean": -3.5678176879882812, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3525, |
|
"step": 141 |
|
}, |
|
{ |
|
"Batch Mean": -3.959442138671875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3525, |
|
"step": 141 |
|
}, |
|
{ |
|
"Batch Mean": -3.90399169921875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3525, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.355, |
|
"grad_norm": 8.822011947631836, |
|
"learning_rate": 2.0368421052631578e-06, |
|
"loss": 0.4625, |
|
"step": 142 |
|
}, |
|
{ |
|
"Batch Mean": -4.228359222412109, |
|
"accuracy": 0.8125, |
|
"epoch": 0.355, |
|
"step": 142 |
|
}, |
|
{ |
|
"Batch Mean": -4.1596527099609375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.355, |
|
"step": 142 |
|
}, |
|
{ |
|
"Batch Mean": -3.49688720703125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.355, |
|
"step": 142 |
|
}, |
|
{ |
|
"Batch Mean": -3.519866943359375, |
|
"accuracy": 0.875, |
|
"epoch": 0.355, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3575, |
|
"grad_norm": 7.293342590332031, |
|
"learning_rate": 2.0289473684210525e-06, |
|
"loss": 0.4173, |
|
"step": 143 |
|
}, |
|
{ |
|
"Batch Mean": -4.350372314453125, |
|
"accuracy": 0.875, |
|
"epoch": 0.3575, |
|
"step": 143 |
|
}, |
|
{ |
|
"Batch Mean": -4.087388038635254, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3575, |
|
"step": 143 |
|
}, |
|
{ |
|
"Batch Mean": -3.1829357147216797, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3575, |
|
"step": 143 |
|
}, |
|
{ |
|
"Batch Mean": -4.2586517333984375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3575, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.839393138885498, |
|
"learning_rate": 2.0210526315789473e-06, |
|
"loss": 0.3149, |
|
"step": 144 |
|
}, |
|
{ |
|
"Batch Mean": -3.851898193359375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.36, |
|
"step": 144 |
|
}, |
|
{ |
|
"Batch Mean": -3.5305938720703125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.36, |
|
"step": 144 |
|
}, |
|
{ |
|
"Batch Mean": -4.296562194824219, |
|
"accuracy": 0.71875, |
|
"epoch": 0.36, |
|
"step": 144 |
|
}, |
|
{ |
|
"Batch Mean": -4.097373962402344, |
|
"accuracy": 0.84375, |
|
"epoch": 0.36, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 6.38557767868042, |
|
"learning_rate": 2.013157894736842e-06, |
|
"loss": 0.3809, |
|
"step": 145 |
|
}, |
|
{ |
|
"Batch Mean": -3.644855499267578, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3625, |
|
"step": 145 |
|
}, |
|
{ |
|
"Batch Mean": -4.092620849609375, |
|
"accuracy": 0.875, |
|
"epoch": 0.3625, |
|
"step": 145 |
|
}, |
|
{ |
|
"Batch Mean": -3.9543728828430176, |
|
"accuracy": 0.875, |
|
"epoch": 0.3625, |
|
"step": 145 |
|
}, |
|
{ |
|
"Batch Mean": -3.901203155517578, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3625, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.365, |
|
"grad_norm": 6.386345386505127, |
|
"learning_rate": 2.0052631578947367e-06, |
|
"loss": 0.3469, |
|
"step": 146 |
|
}, |
|
{ |
|
"Batch Mean": -3.872467041015625, |
|
"accuracy": 0.875, |
|
"epoch": 0.365, |
|
"step": 146 |
|
}, |
|
{ |
|
"Batch Mean": -3.0312156677246094, |
|
"accuracy": 0.8125, |
|
"epoch": 0.365, |
|
"step": 146 |
|
}, |
|
{ |
|
"Batch Mean": -3.2101287841796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.365, |
|
"step": 146 |
|
}, |
|
{ |
|
"Batch Mean": -3.727203369140625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.365, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3675, |
|
"grad_norm": 7.31825590133667, |
|
"learning_rate": 1.9973684210526315e-06, |
|
"loss": 0.3775, |
|
"step": 147 |
|
}, |
|
{ |
|
"Batch Mean": -4.53240966796875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3675, |
|
"step": 147 |
|
}, |
|
{ |
|
"Batch Mean": -4.39666748046875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3675, |
|
"step": 147 |
|
}, |
|
{ |
|
"Batch Mean": -4.245336532592773, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3675, |
|
"step": 147 |
|
}, |
|
{ |
|
"Batch Mean": -3.6735334396362305, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3675, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 8.167656898498535, |
|
"learning_rate": 1.9894736842105262e-06, |
|
"loss": 0.4151, |
|
"step": 148 |
|
}, |
|
{ |
|
"Batch Mean": -4.223419189453125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.37, |
|
"step": 148 |
|
}, |
|
{ |
|
"Batch Mean": -2.857851028442383, |
|
"accuracy": 0.78125, |
|
"epoch": 0.37, |
|
"step": 148 |
|
}, |
|
{ |
|
"Batch Mean": -3.902324676513672, |
|
"accuracy": 0.71875, |
|
"epoch": 0.37, |
|
"step": 148 |
|
}, |
|
{ |
|
"Batch Mean": -4.022743225097656, |
|
"accuracy": 0.75, |
|
"epoch": 0.37, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3725, |
|
"grad_norm": 6.420680522918701, |
|
"learning_rate": 1.9815789473684214e-06, |
|
"loss": 0.4168, |
|
"step": 149 |
|
}, |
|
{ |
|
"Batch Mean": -4.02081298828125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3725, |
|
"step": 149 |
|
}, |
|
{ |
|
"Batch Mean": -3.3986377716064453, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3725, |
|
"step": 149 |
|
}, |
|
{ |
|
"Batch Mean": -4.357936859130859, |
|
"accuracy": 0.875, |
|
"epoch": 0.3725, |
|
"step": 149 |
|
}, |
|
{ |
|
"Batch Mean": -3.590606689453125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3725, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 5.989567279815674, |
|
"learning_rate": 1.973684210526316e-06, |
|
"loss": 0.2872, |
|
"step": 150 |
|
}, |
|
{ |
|
"Batch Mean": -3.69549560546875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.375, |
|
"step": 150 |
|
}, |
|
{ |
|
"Batch Mean": -3.0214691162109375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.375, |
|
"step": 150 |
|
}, |
|
{ |
|
"Batch Mean": -3.49407958984375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.375, |
|
"step": 150 |
|
}, |
|
{ |
|
"Batch Mean": -3.97283935546875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.375, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3775, |
|
"grad_norm": 6.451083660125732, |
|
"learning_rate": 1.965789473684211e-06, |
|
"loss": 0.3594, |
|
"step": 151 |
|
}, |
|
{ |
|
"Batch Mean": -3.495819091796875, |
|
"accuracy": 0.875, |
|
"epoch": 0.3775, |
|
"step": 151 |
|
}, |
|
{ |
|
"Batch Mean": -4.1375274658203125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3775, |
|
"step": 151 |
|
}, |
|
{ |
|
"Batch Mean": -3.330834150314331, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3775, |
|
"step": 151 |
|
}, |
|
{ |
|
"Batch Mean": -3.2013702392578125, |
|
"accuracy": 0.875, |
|
"epoch": 0.3775, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 7.380356788635254, |
|
"learning_rate": 1.9578947368421052e-06, |
|
"loss": 0.4359, |
|
"step": 152 |
|
}, |
|
{ |
|
"Batch Mean": -3.5101318359375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.38, |
|
"step": 152 |
|
}, |
|
{ |
|
"Batch Mean": -3.5372161865234375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.38, |
|
"step": 152 |
|
}, |
|
{ |
|
"Batch Mean": -2.7825088500976562, |
|
"accuracy": 0.8125, |
|
"epoch": 0.38, |
|
"step": 152 |
|
}, |
|
{ |
|
"Batch Mean": -3.661346435546875, |
|
"accuracy": 0.96875, |
|
"epoch": 0.38, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3825, |
|
"grad_norm": 5.823904514312744, |
|
"learning_rate": 1.95e-06, |
|
"loss": 0.3266, |
|
"step": 153 |
|
}, |
|
{ |
|
"Batch Mean": -2.8103408813476562, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3825, |
|
"step": 153 |
|
}, |
|
{ |
|
"Batch Mean": -3.345386505126953, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3825, |
|
"step": 153 |
|
}, |
|
{ |
|
"Batch Mean": -3.250551223754883, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3825, |
|
"step": 153 |
|
}, |
|
{ |
|
"Batch Mean": -3.2271499633789062, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3825, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.385, |
|
"grad_norm": 8.050492286682129, |
|
"learning_rate": 1.9421052631578947e-06, |
|
"loss": 0.3497, |
|
"step": 154 |
|
}, |
|
{ |
|
"Batch Mean": -2.0474777221679688, |
|
"accuracy": 0.90625, |
|
"epoch": 0.385, |
|
"step": 154 |
|
}, |
|
{ |
|
"Batch Mean": -3.3988208770751953, |
|
"accuracy": 0.90625, |
|
"epoch": 0.385, |
|
"step": 154 |
|
}, |
|
{ |
|
"Batch Mean": -3.7056427001953125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.385, |
|
"step": 154 |
|
}, |
|
{ |
|
"Batch Mean": -2.6432647705078125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.385, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 5.581839561462402, |
|
"learning_rate": 1.9342105263157895e-06, |
|
"loss": 0.3112, |
|
"step": 155 |
|
}, |
|
{ |
|
"Batch Mean": -2.814176559448242, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3875, |
|
"step": 155 |
|
}, |
|
{ |
|
"Batch Mean": -2.0308380126953125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3875, |
|
"step": 155 |
|
}, |
|
{ |
|
"Batch Mean": -3.164947509765625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.3875, |
|
"step": 155 |
|
}, |
|
{ |
|
"Batch Mean": -2.7253189086914062, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3875, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 6.545515537261963, |
|
"learning_rate": 1.926315789473684e-06, |
|
"loss": 0.3156, |
|
"step": 156 |
|
}, |
|
{ |
|
"Batch Mean": -2.7420883178710938, |
|
"accuracy": 0.8125, |
|
"epoch": 0.39, |
|
"step": 156 |
|
}, |
|
{ |
|
"Batch Mean": -1.8706493377685547, |
|
"accuracy": 0.875, |
|
"epoch": 0.39, |
|
"step": 156 |
|
}, |
|
{ |
|
"Batch Mean": -1.612701416015625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.39, |
|
"step": 156 |
|
}, |
|
{ |
|
"Batch Mean": -2.74774169921875, |
|
"accuracy": 0.75, |
|
"epoch": 0.39, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3925, |
|
"grad_norm": 6.551059246063232, |
|
"learning_rate": 1.918421052631579e-06, |
|
"loss": 0.3152, |
|
"step": 157 |
|
}, |
|
{ |
|
"Batch Mean": -3.313770294189453, |
|
"accuracy": 0.875, |
|
"epoch": 0.3925, |
|
"step": 157 |
|
}, |
|
{ |
|
"Batch Mean": -2.3776092529296875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.3925, |
|
"step": 157 |
|
}, |
|
{ |
|
"Batch Mean": -2.6562118530273438, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3925, |
|
"step": 157 |
|
}, |
|
{ |
|
"Batch Mean": -2.5965747833251953, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3925, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.395, |
|
"grad_norm": 7.611530303955078, |
|
"learning_rate": 1.9105263157894737e-06, |
|
"loss": 0.2848, |
|
"step": 158 |
|
}, |
|
{ |
|
"Batch Mean": -2.9923248291015625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.395, |
|
"step": 158 |
|
}, |
|
{ |
|
"Batch Mean": -3.1744384765625, |
|
"accuracy": 0.875, |
|
"epoch": 0.395, |
|
"step": 158 |
|
}, |
|
{ |
|
"Batch Mean": -1.654348373413086, |
|
"accuracy": 0.84375, |
|
"epoch": 0.395, |
|
"step": 158 |
|
}, |
|
{ |
|
"Batch Mean": -2.4622344970703125, |
|
"accuracy": 0.875, |
|
"epoch": 0.395, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3975, |
|
"grad_norm": 6.978616237640381, |
|
"learning_rate": 1.9026315789473684e-06, |
|
"loss": 0.3135, |
|
"step": 159 |
|
}, |
|
{ |
|
"Batch Mean": -2.66357421875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.3975, |
|
"step": 159 |
|
}, |
|
{ |
|
"Batch Mean": -0.5262050628662109, |
|
"accuracy": 0.875, |
|
"epoch": 0.3975, |
|
"step": 159 |
|
}, |
|
{ |
|
"Batch Mean": -2.160125732421875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3975, |
|
"step": 159 |
|
}, |
|
{ |
|
"Batch Mean": -2.8070220947265625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3975, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.540273666381836, |
|
"learning_rate": 1.8947368421052632e-06, |
|
"loss": 0.2625, |
|
"step": 160 |
|
}, |
|
{ |
|
"Batch Mean": -1.2659626007080078, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4, |
|
"step": 160 |
|
}, |
|
{ |
|
"Batch Mean": -1.5436820983886719, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4, |
|
"step": 160 |
|
}, |
|
{ |
|
"Batch Mean": -1.8903522491455078, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4, |
|
"step": 160 |
|
}, |
|
{ |
|
"Batch Mean": -2.8146209716796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4025, |
|
"grad_norm": 8.280950546264648, |
|
"learning_rate": 1.8868421052631577e-06, |
|
"loss": 0.3544, |
|
"step": 161 |
|
}, |
|
{ |
|
"Batch Mean": -0.720062255859375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4025, |
|
"step": 161 |
|
}, |
|
{ |
|
"Batch Mean": -1.4061965942382812, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4025, |
|
"step": 161 |
|
}, |
|
{ |
|
"Batch Mean": -1.1033363342285156, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4025, |
|
"step": 161 |
|
}, |
|
{ |
|
"Batch Mean": -1.5503730773925781, |
|
"accuracy": 0.96875, |
|
"epoch": 0.4025, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.405, |
|
"grad_norm": 7.286800384521484, |
|
"learning_rate": 1.8789473684210525e-06, |
|
"loss": 0.3147, |
|
"step": 162 |
|
}, |
|
{ |
|
"Batch Mean": -1.6750932931900024, |
|
"accuracy": 0.6875, |
|
"epoch": 0.405, |
|
"step": 162 |
|
}, |
|
{ |
|
"Batch Mean": -2.2614898681640625, |
|
"accuracy": 0.875, |
|
"epoch": 0.405, |
|
"step": 162 |
|
}, |
|
{ |
|
"Batch Mean": -2.33111572265625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.405, |
|
"step": 162 |
|
}, |
|
{ |
|
"Batch Mean": -1.9058303833007812, |
|
"accuracy": 0.8125, |
|
"epoch": 0.405, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4075, |
|
"grad_norm": 8.709750175476074, |
|
"learning_rate": 1.8710526315789476e-06, |
|
"loss": 0.3208, |
|
"step": 163 |
|
}, |
|
{ |
|
"Batch Mean": -1.1716461181640625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4075, |
|
"step": 163 |
|
}, |
|
{ |
|
"Batch Mean": -2.4022750854492188, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4075, |
|
"step": 163 |
|
}, |
|
{ |
|
"Batch Mean": -2.6826629638671875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4075, |
|
"step": 163 |
|
}, |
|
{ |
|
"Batch Mean": -2.4820892810821533, |
|
"accuracy": 0.875, |
|
"epoch": 0.4075, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 8.75256633758545, |
|
"learning_rate": 1.8631578947368424e-06, |
|
"loss": 0.3381, |
|
"step": 164 |
|
}, |
|
{ |
|
"Batch Mean": -2.472442626953125, |
|
"accuracy": 0.875, |
|
"epoch": 0.41, |
|
"step": 164 |
|
}, |
|
{ |
|
"Batch Mean": -1.6739931106567383, |
|
"accuracy": 0.875, |
|
"epoch": 0.41, |
|
"step": 164 |
|
}, |
|
{ |
|
"Batch Mean": -0.72857666015625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.41, |
|
"step": 164 |
|
}, |
|
{ |
|
"Batch Mean": -3.306723117828369, |
|
"accuracy": 0.84375, |
|
"epoch": 0.41, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 10.206024169921875, |
|
"learning_rate": 1.855263157894737e-06, |
|
"loss": 0.3731, |
|
"step": 165 |
|
}, |
|
{ |
|
"Batch Mean": -3.639404296875, |
|
"accuracy": 0.875, |
|
"epoch": 0.4125, |
|
"step": 165 |
|
}, |
|
{ |
|
"Batch Mean": -2.616802215576172, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4125, |
|
"step": 165 |
|
}, |
|
{ |
|
"Batch Mean": -2.4031362533569336, |
|
"accuracy": 0.875, |
|
"epoch": 0.4125, |
|
"step": 165 |
|
}, |
|
{ |
|
"Batch Mean": -2.225605010986328, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4125, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.415, |
|
"grad_norm": 9.089224815368652, |
|
"learning_rate": 1.8473684210526317e-06, |
|
"loss": 0.2955, |
|
"step": 166 |
|
}, |
|
{ |
|
"Batch Mean": -2.55029296875, |
|
"accuracy": 0.875, |
|
"epoch": 0.415, |
|
"step": 166 |
|
}, |
|
{ |
|
"Batch Mean": -2.450855255126953, |
|
"accuracy": 0.875, |
|
"epoch": 0.415, |
|
"step": 166 |
|
}, |
|
{ |
|
"Batch Mean": -2.3015074729919434, |
|
"accuracy": 0.8125, |
|
"epoch": 0.415, |
|
"step": 166 |
|
}, |
|
{ |
|
"Batch Mean": -1.580474853515625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.415, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.4175, |
|
"grad_norm": 8.66561222076416, |
|
"learning_rate": 1.8394736842105264e-06, |
|
"loss": 0.3365, |
|
"step": 167 |
|
}, |
|
{ |
|
"Batch Mean": -2.9533843994140625, |
|
"accuracy": 0.96875, |
|
"epoch": 0.4175, |
|
"step": 167 |
|
}, |
|
{ |
|
"Batch Mean": -2.373016357421875, |
|
"accuracy": 0.875, |
|
"epoch": 0.4175, |
|
"step": 167 |
|
}, |
|
{ |
|
"Batch Mean": -0.9237747192382812, |
|
"accuracy": 0.875, |
|
"epoch": 0.4175, |
|
"step": 167 |
|
}, |
|
{ |
|
"Batch Mean": -2.4250221252441406, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4175, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 7.428339004516602, |
|
"learning_rate": 1.8315789473684211e-06, |
|
"loss": 0.2774, |
|
"step": 168 |
|
}, |
|
{ |
|
"Batch Mean": -3.4917097091674805, |
|
"accuracy": 0.84375, |
|
"epoch": 0.42, |
|
"step": 168 |
|
}, |
|
{ |
|
"Batch Mean": -3.013397216796875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.42, |
|
"step": 168 |
|
}, |
|
{ |
|
"Batch Mean": -2.8196258544921875, |
|
"accuracy": 0.875, |
|
"epoch": 0.42, |
|
"step": 168 |
|
}, |
|
{ |
|
"Batch Mean": -3.11566162109375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.42, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4225, |
|
"grad_norm": 10.0361328125, |
|
"learning_rate": 1.8236842105263159e-06, |
|
"loss": 0.3954, |
|
"step": 169 |
|
}, |
|
{ |
|
"Batch Mean": -2.8026123046875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4225, |
|
"step": 169 |
|
}, |
|
{ |
|
"Batch Mean": -2.0240249633789062, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4225, |
|
"step": 169 |
|
}, |
|
{ |
|
"Batch Mean": -3.3115146160125732, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4225, |
|
"step": 169 |
|
}, |
|
{ |
|
"Batch Mean": -3.3820533752441406, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4225, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 8.027361869812012, |
|
"learning_rate": 1.8157894736842106e-06, |
|
"loss": 0.3511, |
|
"step": 170 |
|
}, |
|
{ |
|
"Batch Mean": -3.3656005859375, |
|
"accuracy": 0.875, |
|
"epoch": 0.425, |
|
"step": 170 |
|
}, |
|
{ |
|
"Batch Mean": -2.1733436584472656, |
|
"accuracy": 0.875, |
|
"epoch": 0.425, |
|
"step": 170 |
|
}, |
|
{ |
|
"Batch Mean": -4.290916442871094, |
|
"accuracy": 0.75, |
|
"epoch": 0.425, |
|
"step": 170 |
|
}, |
|
{ |
|
"Batch Mean": -2.6019363403320312, |
|
"accuracy": 0.9375, |
|
"epoch": 0.425, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4275, |
|
"grad_norm": 7.092214584350586, |
|
"learning_rate": 1.8078947368421052e-06, |
|
"loss": 0.2611, |
|
"step": 171 |
|
}, |
|
{ |
|
"Batch Mean": -3.9479827880859375, |
|
"accuracy": 0.75, |
|
"epoch": 0.4275, |
|
"step": 171 |
|
}, |
|
{ |
|
"Batch Mean": -4.346954345703125, |
|
"accuracy": 0.875, |
|
"epoch": 0.4275, |
|
"step": 171 |
|
}, |
|
{ |
|
"Batch Mean": -4.29962158203125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4275, |
|
"step": 171 |
|
}, |
|
{ |
|
"Batch Mean": -2.8163528442382812, |
|
"accuracy": 0.71875, |
|
"epoch": 0.4275, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 8.232298851013184, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.3665, |
|
"step": 172 |
|
}, |
|
{ |
|
"Batch Mean": -4.021289825439453, |
|
"accuracy": 0.875, |
|
"epoch": 0.43, |
|
"step": 172 |
|
}, |
|
{ |
|
"Batch Mean": -4.265247344970703, |
|
"accuracy": 0.75, |
|
"epoch": 0.43, |
|
"step": 172 |
|
}, |
|
{ |
|
"Batch Mean": -3.1769113540649414, |
|
"accuracy": 0.8125, |
|
"epoch": 0.43, |
|
"step": 172 |
|
}, |
|
{ |
|
"Batch Mean": -3.112884521484375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.43, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4325, |
|
"grad_norm": 6.9392218589782715, |
|
"learning_rate": 1.7921052631578947e-06, |
|
"loss": 0.3095, |
|
"step": 173 |
|
}, |
|
{ |
|
"Batch Mean": -5.00799560546875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4325, |
|
"step": 173 |
|
}, |
|
{ |
|
"Batch Mean": -4.173049449920654, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4325, |
|
"step": 173 |
|
}, |
|
{ |
|
"Batch Mean": -4.1201171875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4325, |
|
"step": 173 |
|
}, |
|
{ |
|
"Batch Mean": -3.61419677734375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4325, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.435, |
|
"grad_norm": 7.176342964172363, |
|
"learning_rate": 1.7842105263157894e-06, |
|
"loss": 0.3698, |
|
"step": 174 |
|
}, |
|
{ |
|
"Batch Mean": -3.4647297859191895, |
|
"accuracy": 0.78125, |
|
"epoch": 0.435, |
|
"step": 174 |
|
}, |
|
{ |
|
"Batch Mean": -3.5292892456054688, |
|
"accuracy": 0.875, |
|
"epoch": 0.435, |
|
"step": 174 |
|
}, |
|
{ |
|
"Batch Mean": -3.6618194580078125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.435, |
|
"step": 174 |
|
}, |
|
{ |
|
"Batch Mean": -3.8795318603515625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.435, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 7.459259510040283, |
|
"learning_rate": 1.7763157894736842e-06, |
|
"loss": 0.3615, |
|
"step": 175 |
|
}, |
|
{ |
|
"Batch Mean": -5.0499267578125, |
|
"accuracy": 0.875, |
|
"epoch": 0.4375, |
|
"step": 175 |
|
}, |
|
{ |
|
"Batch Mean": -3.0543928146362305, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4375, |
|
"step": 175 |
|
}, |
|
{ |
|
"Batch Mean": -4.691841125488281, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4375, |
|
"step": 175 |
|
}, |
|
{ |
|
"Batch Mean": -4.546966552734375, |
|
"accuracy": 0.75, |
|
"epoch": 0.4375, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 6.923594951629639, |
|
"learning_rate": 1.768421052631579e-06, |
|
"loss": 0.3677, |
|
"step": 176 |
|
}, |
|
{ |
|
"Batch Mean": -3.6771240234375, |
|
"accuracy": 0.875, |
|
"epoch": 0.44, |
|
"step": 176 |
|
}, |
|
{ |
|
"Batch Mean": -4.0308990478515625, |
|
"accuracy": 0.96875, |
|
"epoch": 0.44, |
|
"step": 176 |
|
}, |
|
{ |
|
"Batch Mean": -3.9250564575195312, |
|
"accuracy": 0.78125, |
|
"epoch": 0.44, |
|
"step": 176 |
|
}, |
|
{ |
|
"Batch Mean": -4.5969390869140625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.44, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4425, |
|
"grad_norm": 6.972391128540039, |
|
"learning_rate": 1.7605263157894739e-06, |
|
"loss": 0.3915, |
|
"step": 177 |
|
}, |
|
{ |
|
"Batch Mean": -4.8792572021484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4425, |
|
"step": 177 |
|
}, |
|
{ |
|
"Batch Mean": -4.31494140625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4425, |
|
"step": 177 |
|
}, |
|
{ |
|
"Batch Mean": -4.7841796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4425, |
|
"step": 177 |
|
}, |
|
{ |
|
"Batch Mean": -3.2111968994140625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4425, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.445, |
|
"grad_norm": 6.1882805824279785, |
|
"learning_rate": 1.7526315789473686e-06, |
|
"loss": 0.2838, |
|
"step": 178 |
|
}, |
|
{ |
|
"Batch Mean": -4.4821577072143555, |
|
"accuracy": 0.90625, |
|
"epoch": 0.445, |
|
"step": 178 |
|
}, |
|
{ |
|
"Batch Mean": -3.9442481994628906, |
|
"accuracy": 0.78125, |
|
"epoch": 0.445, |
|
"step": 178 |
|
}, |
|
{ |
|
"Batch Mean": -3.8130874633789062, |
|
"accuracy": 0.84375, |
|
"epoch": 0.445, |
|
"step": 178 |
|
}, |
|
{ |
|
"Batch Mean": -4.3104095458984375, |
|
"accuracy": 0.875, |
|
"epoch": 0.445, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4475, |
|
"grad_norm": 8.147394180297852, |
|
"learning_rate": 1.7447368421052633e-06, |
|
"loss": 0.33, |
|
"step": 179 |
|
}, |
|
{ |
|
"Batch Mean": -5.016021728515625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4475, |
|
"step": 179 |
|
}, |
|
{ |
|
"Batch Mean": -5.16357421875, |
|
"accuracy": 0.75, |
|
"epoch": 0.4475, |
|
"step": 179 |
|
}, |
|
{ |
|
"Batch Mean": -5.1396484375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4475, |
|
"step": 179 |
|
}, |
|
{ |
|
"Batch Mean": -4.346986770629883, |
|
"accuracy": 0.875, |
|
"epoch": 0.4475, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 6.089241027832031, |
|
"learning_rate": 1.736842105263158e-06, |
|
"loss": 0.3783, |
|
"step": 180 |
|
}, |
|
{ |
|
"Batch Mean": -4.29742431640625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.45, |
|
"step": 180 |
|
}, |
|
{ |
|
"Batch Mean": -4.288475036621094, |
|
"accuracy": 0.875, |
|
"epoch": 0.45, |
|
"step": 180 |
|
}, |
|
{ |
|
"Batch Mean": -3.9849853515625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.45, |
|
"step": 180 |
|
}, |
|
{ |
|
"Batch Mean": -3.968719482421875, |
|
"accuracy": 0.875, |
|
"epoch": 0.45, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4525, |
|
"grad_norm": 6.3386101722717285, |
|
"learning_rate": 1.7289473684210526e-06, |
|
"loss": 0.3564, |
|
"step": 181 |
|
}, |
|
{ |
|
"Batch Mean": -4.3591766357421875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4525, |
|
"step": 181 |
|
}, |
|
{ |
|
"Batch Mean": -3.95111083984375, |
|
"accuracy": 0.875, |
|
"epoch": 0.4525, |
|
"step": 181 |
|
}, |
|
{ |
|
"Batch Mean": -3.8838043212890625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4525, |
|
"step": 181 |
|
}, |
|
{ |
|
"Batch Mean": -4.525177001953125, |
|
"accuracy": 0.875, |
|
"epoch": 0.4525, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.455, |
|
"grad_norm": 5.666683197021484, |
|
"learning_rate": 1.7210526315789474e-06, |
|
"loss": 0.2878, |
|
"step": 182 |
|
}, |
|
{ |
|
"Batch Mean": -3.9508824348449707, |
|
"accuracy": 0.71875, |
|
"epoch": 0.455, |
|
"step": 182 |
|
}, |
|
{ |
|
"Batch Mean": -4.3285064697265625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.455, |
|
"step": 182 |
|
}, |
|
{ |
|
"Batch Mean": -4.2314043045043945, |
|
"accuracy": 0.75, |
|
"epoch": 0.455, |
|
"step": 182 |
|
}, |
|
{ |
|
"Batch Mean": -3.8064422607421875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.455, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4575, |
|
"grad_norm": 8.353093147277832, |
|
"learning_rate": 1.7131578947368421e-06, |
|
"loss": 0.469, |
|
"step": 183 |
|
}, |
|
{ |
|
"Batch Mean": -4.291778564453125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4575, |
|
"step": 183 |
|
}, |
|
{ |
|
"Batch Mean": -4.387228488922119, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4575, |
|
"step": 183 |
|
}, |
|
{ |
|
"Batch Mean": -4.6067657470703125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4575, |
|
"step": 183 |
|
}, |
|
{ |
|
"Batch Mean": -4.279502868652344, |
|
"accuracy": 0.75, |
|
"epoch": 0.4575, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 6.954326629638672, |
|
"learning_rate": 1.7052631578947369e-06, |
|
"loss": 0.4025, |
|
"step": 184 |
|
}, |
|
{ |
|
"Batch Mean": -4.3614501953125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.46, |
|
"step": 184 |
|
}, |
|
{ |
|
"Batch Mean": -5.14569091796875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.46, |
|
"step": 184 |
|
}, |
|
{ |
|
"Batch Mean": -4.415924072265625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.46, |
|
"step": 184 |
|
}, |
|
{ |
|
"Batch Mean": -4.2044525146484375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.46, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 8.812761306762695, |
|
"learning_rate": 1.6973684210526316e-06, |
|
"loss": 0.4675, |
|
"step": 185 |
|
}, |
|
{ |
|
"Batch Mean": -4.315742492675781, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4625, |
|
"step": 185 |
|
}, |
|
{ |
|
"Batch Mean": -2.924093246459961, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4625, |
|
"step": 185 |
|
}, |
|
{ |
|
"Batch Mean": -4.596168518066406, |
|
"accuracy": 0.71875, |
|
"epoch": 0.4625, |
|
"step": 185 |
|
}, |
|
{ |
|
"Batch Mean": -4.245391845703125, |
|
"accuracy": 0.96875, |
|
"epoch": 0.4625, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.465, |
|
"grad_norm": 6.642449378967285, |
|
"learning_rate": 1.6894736842105264e-06, |
|
"loss": 0.343, |
|
"step": 186 |
|
}, |
|
{ |
|
"Batch Mean": -3.6098060607910156, |
|
"accuracy": 0.78125, |
|
"epoch": 0.465, |
|
"step": 186 |
|
}, |
|
{ |
|
"Batch Mean": -3.5883445739746094, |
|
"accuracy": 0.96875, |
|
"epoch": 0.465, |
|
"step": 186 |
|
}, |
|
{ |
|
"Batch Mean": -3.1055908203125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.465, |
|
"step": 186 |
|
}, |
|
{ |
|
"Batch Mean": -4.519744873046875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.465, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4675, |
|
"grad_norm": 5.832669258117676, |
|
"learning_rate": 1.6815789473684209e-06, |
|
"loss": 0.3375, |
|
"step": 187 |
|
}, |
|
{ |
|
"Batch Mean": -4.752597808837891, |
|
"accuracy": 0.75, |
|
"epoch": 0.4675, |
|
"step": 187 |
|
}, |
|
{ |
|
"Batch Mean": -4.298095703125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4675, |
|
"step": 187 |
|
}, |
|
{ |
|
"Batch Mean": -3.3523178100585938, |
|
"accuracy": 0.96875, |
|
"epoch": 0.4675, |
|
"step": 187 |
|
}, |
|
{ |
|
"Batch Mean": -4.147176742553711, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4675, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 5.989990234375, |
|
"learning_rate": 1.6736842105263156e-06, |
|
"loss": 0.3268, |
|
"step": 188 |
|
}, |
|
{ |
|
"Batch Mean": -3.7307815551757812, |
|
"accuracy": 0.90625, |
|
"epoch": 0.47, |
|
"step": 188 |
|
}, |
|
{ |
|
"Batch Mean": -3.8831558227539062, |
|
"accuracy": 0.84375, |
|
"epoch": 0.47, |
|
"step": 188 |
|
}, |
|
{ |
|
"Batch Mean": -4.1096649169921875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.47, |
|
"step": 188 |
|
}, |
|
{ |
|
"Batch Mean": -5.082061767578125, |
|
"accuracy": 0.75, |
|
"epoch": 0.47, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4725, |
|
"grad_norm": 7.336109161376953, |
|
"learning_rate": 1.6657894736842104e-06, |
|
"loss": 0.3981, |
|
"step": 189 |
|
}, |
|
{ |
|
"Batch Mean": -4.101593017578125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4725, |
|
"step": 189 |
|
}, |
|
{ |
|
"Batch Mean": -4.2955322265625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4725, |
|
"step": 189 |
|
}, |
|
{ |
|
"Batch Mean": -4.952667236328125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4725, |
|
"step": 189 |
|
}, |
|
{ |
|
"Batch Mean": -3.91229248046875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4725, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 6.2155442237854, |
|
"learning_rate": 1.6578947368421056e-06, |
|
"loss": 0.2867, |
|
"step": 190 |
|
}, |
|
{ |
|
"Batch Mean": -3.2933349609375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.475, |
|
"step": 190 |
|
}, |
|
{ |
|
"Batch Mean": -4.35699462890625, |
|
"accuracy": 0.875, |
|
"epoch": 0.475, |
|
"step": 190 |
|
}, |
|
{ |
|
"Batch Mean": -3.69720458984375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.475, |
|
"step": 190 |
|
}, |
|
{ |
|
"Batch Mean": -3.815704345703125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.475, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4775, |
|
"grad_norm": 8.016613960266113, |
|
"learning_rate": 1.65e-06, |
|
"loss": 0.3731, |
|
"step": 191 |
|
}, |
|
{ |
|
"Batch Mean": -4.8086395263671875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4775, |
|
"step": 191 |
|
}, |
|
{ |
|
"Batch Mean": -4.474151611328125, |
|
"accuracy": 0.875, |
|
"epoch": 0.4775, |
|
"step": 191 |
|
}, |
|
{ |
|
"Batch Mean": -4.273475646972656, |
|
"accuracy": 0.96875, |
|
"epoch": 0.4775, |
|
"step": 191 |
|
}, |
|
{ |
|
"Batch Mean": -3.0001220703125, |
|
"accuracy": 0.875, |
|
"epoch": 0.4775, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 5.50937032699585, |
|
"learning_rate": 1.6421052631578948e-06, |
|
"loss": 0.2694, |
|
"step": 192 |
|
}, |
|
{ |
|
"Batch Mean": -3.7930126190185547, |
|
"accuracy": 0.75, |
|
"epoch": 0.48, |
|
"step": 192 |
|
}, |
|
{ |
|
"Batch Mean": -4.763885498046875, |
|
"accuracy": 0.96875, |
|
"epoch": 0.48, |
|
"step": 192 |
|
}, |
|
{ |
|
"Batch Mean": -4.5084228515625, |
|
"accuracy": 0.875, |
|
"epoch": 0.48, |
|
"step": 192 |
|
}, |
|
{ |
|
"Batch Mean": -3.997528076171875, |
|
"accuracy": 0.875, |
|
"epoch": 0.48, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4825, |
|
"grad_norm": 5.860997676849365, |
|
"learning_rate": 1.6342105263157896e-06, |
|
"loss": 0.3189, |
|
"step": 193 |
|
}, |
|
{ |
|
"Batch Mean": -4.019233703613281, |
|
"accuracy": 0.875, |
|
"epoch": 0.4825, |
|
"step": 193 |
|
}, |
|
{ |
|
"Batch Mean": -2.669097900390625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4825, |
|
"step": 193 |
|
}, |
|
{ |
|
"Batch Mean": -3.9401063919067383, |
|
"accuracy": 0.875, |
|
"epoch": 0.4825, |
|
"step": 193 |
|
}, |
|
{ |
|
"Batch Mean": -3.343963623046875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4825, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.485, |
|
"grad_norm": 7.13978910446167, |
|
"learning_rate": 1.6263157894736843e-06, |
|
"loss": 0.3526, |
|
"step": 194 |
|
}, |
|
{ |
|
"Batch Mean": -3.36846923828125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.485, |
|
"step": 194 |
|
}, |
|
{ |
|
"Batch Mean": -2.7085227966308594, |
|
"accuracy": 0.8125, |
|
"epoch": 0.485, |
|
"step": 194 |
|
}, |
|
{ |
|
"Batch Mean": -3.2451019287109375, |
|
"accuracy": 0.875, |
|
"epoch": 0.485, |
|
"step": 194 |
|
}, |
|
{ |
|
"Batch Mean": -3.0539398193359375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.485, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 7.440718173980713, |
|
"learning_rate": 1.618421052631579e-06, |
|
"loss": 0.3791, |
|
"step": 195 |
|
}, |
|
{ |
|
"Batch Mean": -3.2764711380004883, |
|
"accuracy": 0.78125, |
|
"epoch": 0.4875, |
|
"step": 195 |
|
}, |
|
{ |
|
"Batch Mean": -3.397979736328125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4875, |
|
"step": 195 |
|
}, |
|
{ |
|
"Batch Mean": -4.21051025390625, |
|
"accuracy": 0.75, |
|
"epoch": 0.4875, |
|
"step": 195 |
|
}, |
|
{ |
|
"Batch Mean": -3.2050552368164062, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4875, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 7.265270233154297, |
|
"learning_rate": 1.6105263157894738e-06, |
|
"loss": 0.3847, |
|
"step": 196 |
|
}, |
|
{ |
|
"Batch Mean": -3.199951171875, |
|
"accuracy": 0.875, |
|
"epoch": 0.49, |
|
"step": 196 |
|
}, |
|
{ |
|
"Batch Mean": -4.533942699432373, |
|
"accuracy": 0.8125, |
|
"epoch": 0.49, |
|
"step": 196 |
|
}, |
|
{ |
|
"Batch Mean": -3.7108230590820312, |
|
"accuracy": 0.84375, |
|
"epoch": 0.49, |
|
"step": 196 |
|
}, |
|
{ |
|
"Batch Mean": -3.08642578125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.49, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4925, |
|
"grad_norm": 7.191283226013184, |
|
"learning_rate": 1.6026315789473683e-06, |
|
"loss": 0.2875, |
|
"step": 197 |
|
}, |
|
{ |
|
"Batch Mean": -3.5214691162109375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4925, |
|
"step": 197 |
|
}, |
|
{ |
|
"Batch Mean": -4.107030868530273, |
|
"accuracy": 0.8125, |
|
"epoch": 0.4925, |
|
"step": 197 |
|
}, |
|
{ |
|
"Batch Mean": -3.35797119140625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4925, |
|
"step": 197 |
|
}, |
|
{ |
|
"Batch Mean": -3.8758201599121094, |
|
"accuracy": 0.875, |
|
"epoch": 0.4925, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.495, |
|
"grad_norm": 6.3946709632873535, |
|
"learning_rate": 1.594736842105263e-06, |
|
"loss": 0.2981, |
|
"step": 198 |
|
}, |
|
{ |
|
"Batch Mean": -2.5393829345703125, |
|
"accuracy": 0.96875, |
|
"epoch": 0.495, |
|
"step": 198 |
|
}, |
|
{ |
|
"Batch Mean": -3.3964996337890625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.495, |
|
"step": 198 |
|
}, |
|
{ |
|
"Batch Mean": -4.194858551025391, |
|
"accuracy": 0.84375, |
|
"epoch": 0.495, |
|
"step": 198 |
|
}, |
|
{ |
|
"Batch Mean": -4.430961608886719, |
|
"accuracy": 0.875, |
|
"epoch": 0.495, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.4975, |
|
"grad_norm": 6.2794599533081055, |
|
"learning_rate": 1.5868421052631578e-06, |
|
"loss": 0.2904, |
|
"step": 199 |
|
}, |
|
{ |
|
"Batch Mean": -3.3924102783203125, |
|
"accuracy": 0.96875, |
|
"epoch": 0.4975, |
|
"step": 199 |
|
}, |
|
{ |
|
"Batch Mean": -3.5726470947265625, |
|
"accuracy": 0.875, |
|
"epoch": 0.4975, |
|
"step": 199 |
|
}, |
|
{ |
|
"Batch Mean": -2.8596115112304688, |
|
"accuracy": 0.9375, |
|
"epoch": 0.4975, |
|
"step": 199 |
|
}, |
|
{ |
|
"Batch Mean": -4.352705001831055, |
|
"accuracy": 0.75, |
|
"epoch": 0.4975, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 6.9940595626831055, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.2861, |
|
"step": 200 |
|
}, |
|
{ |
|
"Batch Mean": -4.12518310546875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5, |
|
"step": 200 |
|
}, |
|
{ |
|
"Batch Mean": -1.9889678955078125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.5, |
|
"step": 200 |
|
}, |
|
{ |
|
"Batch Mean": -4.243873596191406, |
|
"accuracy": 0.875, |
|
"epoch": 0.5, |
|
"step": 200 |
|
}, |
|
{ |
|
"Batch Mean": -3.7152099609375, |
|
"accuracy": 0.875, |
|
"epoch": 0.5, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5025, |
|
"grad_norm": 7.452747344970703, |
|
"learning_rate": 1.5710526315789473e-06, |
|
"loss": 0.3522, |
|
"step": 201 |
|
}, |
|
{ |
|
"Batch Mean": -3.8014869689941406, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5025, |
|
"step": 201 |
|
}, |
|
{ |
|
"Batch Mean": -3.8201751708984375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5025, |
|
"step": 201 |
|
}, |
|
{ |
|
"Batch Mean": -4.052543640136719, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5025, |
|
"step": 201 |
|
}, |
|
{ |
|
"Batch Mean": -4.5098114013671875, |
|
"accuracy": 0.875, |
|
"epoch": 0.5025, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.505, |
|
"grad_norm": 6.652421951293945, |
|
"learning_rate": 1.563157894736842e-06, |
|
"loss": 0.2527, |
|
"step": 202 |
|
}, |
|
{ |
|
"Batch Mean": -4.42303466796875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.505, |
|
"step": 202 |
|
}, |
|
{ |
|
"Batch Mean": -3.4556884765625, |
|
"accuracy": 0.875, |
|
"epoch": 0.505, |
|
"step": 202 |
|
}, |
|
{ |
|
"Batch Mean": -3.7808380126953125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.505, |
|
"step": 202 |
|
}, |
|
{ |
|
"Batch Mean": -3.71234130859375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.505, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5075, |
|
"grad_norm": 6.491587162017822, |
|
"learning_rate": 1.5552631578947368e-06, |
|
"loss": 0.2616, |
|
"step": 203 |
|
}, |
|
{ |
|
"Batch Mean": -3.1049137115478516, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5075, |
|
"step": 203 |
|
}, |
|
{ |
|
"Batch Mean": -4.4124908447265625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5075, |
|
"step": 203 |
|
}, |
|
{ |
|
"Batch Mean": -3.9703140258789062, |
|
"accuracy": 0.875, |
|
"epoch": 0.5075, |
|
"step": 203 |
|
}, |
|
{ |
|
"Batch Mean": -3.132171630859375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5075, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 8.836193084716797, |
|
"learning_rate": 1.5473684210526318e-06, |
|
"loss": 0.3677, |
|
"step": 204 |
|
}, |
|
{ |
|
"Batch Mean": -3.517852783203125, |
|
"accuracy": 0.875, |
|
"epoch": 0.51, |
|
"step": 204 |
|
}, |
|
{ |
|
"Batch Mean": -3.1400108337402344, |
|
"accuracy": 0.75, |
|
"epoch": 0.51, |
|
"step": 204 |
|
}, |
|
{ |
|
"Batch Mean": -3.8603515625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.51, |
|
"step": 204 |
|
}, |
|
{ |
|
"Batch Mean": -3.963531494140625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.51, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5125, |
|
"grad_norm": 9.294251441955566, |
|
"learning_rate": 1.5394736842105265e-06, |
|
"loss": 0.3596, |
|
"step": 205 |
|
}, |
|
{ |
|
"Batch Mean": -2.659269332885742, |
|
"accuracy": 0.71875, |
|
"epoch": 0.5125, |
|
"step": 205 |
|
}, |
|
{ |
|
"Batch Mean": -3.768524169921875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5125, |
|
"step": 205 |
|
}, |
|
{ |
|
"Batch Mean": -2.410369873046875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.5125, |
|
"step": 205 |
|
}, |
|
{ |
|
"Batch Mean": -4.1054840087890625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5125, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.515, |
|
"grad_norm": 8.42018985748291, |
|
"learning_rate": 1.5315789473684213e-06, |
|
"loss": 0.373, |
|
"step": 206 |
|
}, |
|
{ |
|
"Batch Mean": -3.9416961669921875, |
|
"accuracy": 0.96875, |
|
"epoch": 0.515, |
|
"step": 206 |
|
}, |
|
{ |
|
"Batch Mean": -4.67938232421875, |
|
"accuracy": 0.75, |
|
"epoch": 0.515, |
|
"step": 206 |
|
}, |
|
{ |
|
"Batch Mean": -3.5211868286132812, |
|
"accuracy": 0.75, |
|
"epoch": 0.515, |
|
"step": 206 |
|
}, |
|
{ |
|
"Batch Mean": -3.649818181991577, |
|
"accuracy": 0.90625, |
|
"epoch": 0.515, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5175, |
|
"grad_norm": 8.131293296813965, |
|
"learning_rate": 1.5236842105263158e-06, |
|
"loss": 0.316, |
|
"step": 207 |
|
}, |
|
{ |
|
"Batch Mean": -3.995166778564453, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5175, |
|
"step": 207 |
|
}, |
|
{ |
|
"Batch Mean": -3.2599563598632812, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5175, |
|
"step": 207 |
|
}, |
|
{ |
|
"Batch Mean": -4.12115478515625, |
|
"accuracy": 0.875, |
|
"epoch": 0.5175, |
|
"step": 207 |
|
}, |
|
{ |
|
"Batch Mean": -3.947113037109375, |
|
"accuracy": 0.875, |
|
"epoch": 0.5175, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 8.336821556091309, |
|
"learning_rate": 1.5157894736842105e-06, |
|
"loss": 0.2943, |
|
"step": 208 |
|
}, |
|
{ |
|
"Batch Mean": -3.06756591796875, |
|
"accuracy": 0.875, |
|
"epoch": 0.52, |
|
"step": 208 |
|
}, |
|
{ |
|
"Batch Mean": -2.88372802734375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.52, |
|
"step": 208 |
|
}, |
|
{ |
|
"Batch Mean": -2.9774584770202637, |
|
"accuracy": 0.84375, |
|
"epoch": 0.52, |
|
"step": 208 |
|
}, |
|
{ |
|
"Batch Mean": -3.4167098999023438, |
|
"accuracy": 0.9375, |
|
"epoch": 0.52, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5225, |
|
"grad_norm": 7.195529937744141, |
|
"learning_rate": 1.5078947368421053e-06, |
|
"loss": 0.3139, |
|
"step": 209 |
|
}, |
|
{ |
|
"Batch Mean": -3.744873046875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5225, |
|
"step": 209 |
|
}, |
|
{ |
|
"Batch Mean": -3.461726188659668, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5225, |
|
"step": 209 |
|
}, |
|
{ |
|
"Batch Mean": -3.9631195068359375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5225, |
|
"step": 209 |
|
}, |
|
{ |
|
"Batch Mean": -3.3913650512695312, |
|
"accuracy": 0.78125, |
|
"epoch": 0.5225, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.525, |
|
"grad_norm": 7.90736722946167, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.332, |
|
"step": 210 |
|
}, |
|
{ |
|
"Batch Mean": -3.917583465576172, |
|
"accuracy": 0.75, |
|
"epoch": 0.525, |
|
"step": 210 |
|
}, |
|
{ |
|
"Batch Mean": -3.137542724609375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.525, |
|
"step": 210 |
|
}, |
|
{ |
|
"Batch Mean": -3.7052459716796875, |
|
"accuracy": 0.75, |
|
"epoch": 0.525, |
|
"step": 210 |
|
}, |
|
{ |
|
"Batch Mean": -4.4464111328125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.525, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5275, |
|
"grad_norm": 11.173701286315918, |
|
"learning_rate": 1.4921052631578948e-06, |
|
"loss": 0.3781, |
|
"step": 211 |
|
}, |
|
{ |
|
"Batch Mean": -2.9488754272460938, |
|
"accuracy": 0.71875, |
|
"epoch": 0.5275, |
|
"step": 211 |
|
}, |
|
{ |
|
"Batch Mean": -3.9233360290527344, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5275, |
|
"step": 211 |
|
}, |
|
{ |
|
"Batch Mean": -3.617340087890625, |
|
"accuracy": 0.875, |
|
"epoch": 0.5275, |
|
"step": 211 |
|
}, |
|
{ |
|
"Batch Mean": -3.932525634765625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5275, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 7.895691394805908, |
|
"learning_rate": 1.4842105263157895e-06, |
|
"loss": 0.3269, |
|
"step": 212 |
|
}, |
|
{ |
|
"Batch Mean": -3.774271011352539, |
|
"accuracy": 0.78125, |
|
"epoch": 0.53, |
|
"step": 212 |
|
}, |
|
{ |
|
"Batch Mean": -3.649505615234375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.53, |
|
"step": 212 |
|
}, |
|
{ |
|
"Batch Mean": -3.3257598876953125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.53, |
|
"step": 212 |
|
}, |
|
{ |
|
"Batch Mean": -2.99652099609375, |
|
"accuracy": 0.875, |
|
"epoch": 0.53, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5325, |
|
"grad_norm": 7.724608898162842, |
|
"learning_rate": 1.4763157894736843e-06, |
|
"loss": 0.3798, |
|
"step": 213 |
|
}, |
|
{ |
|
"Batch Mean": -3.405029296875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5325, |
|
"step": 213 |
|
}, |
|
{ |
|
"Batch Mean": -3.267181396484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5325, |
|
"step": 213 |
|
}, |
|
{ |
|
"Batch Mean": -3.329376220703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5325, |
|
"step": 213 |
|
}, |
|
{ |
|
"Batch Mean": -3.267730712890625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5325, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.535, |
|
"grad_norm": 7.690370082855225, |
|
"learning_rate": 1.468421052631579e-06, |
|
"loss": 0.3397, |
|
"step": 214 |
|
}, |
|
{ |
|
"Batch Mean": -3.646892547607422, |
|
"accuracy": 0.875, |
|
"epoch": 0.535, |
|
"step": 214 |
|
}, |
|
{ |
|
"Batch Mean": -2.780792236328125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.535, |
|
"step": 214 |
|
}, |
|
{ |
|
"Batch Mean": -3.8013041019439697, |
|
"accuracy": 0.875, |
|
"epoch": 0.535, |
|
"step": 214 |
|
}, |
|
{ |
|
"Batch Mean": -3.426971435546875, |
|
"accuracy": 0.875, |
|
"epoch": 0.535, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5375, |
|
"grad_norm": 8.358918190002441, |
|
"learning_rate": 1.4605263157894738e-06, |
|
"loss": 0.3611, |
|
"step": 215 |
|
}, |
|
{ |
|
"Batch Mean": -3.7685012817382812, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5375, |
|
"step": 215 |
|
}, |
|
{ |
|
"Batch Mean": -3.950042724609375, |
|
"accuracy": 0.875, |
|
"epoch": 0.5375, |
|
"step": 215 |
|
}, |
|
{ |
|
"Batch Mean": -3.021089553833008, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5375, |
|
"step": 215 |
|
}, |
|
{ |
|
"Batch Mean": -3.802825927734375, |
|
"accuracy": 0.75, |
|
"epoch": 0.5375, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 8.37688159942627, |
|
"learning_rate": 1.4526315789473685e-06, |
|
"loss": 0.3714, |
|
"step": 216 |
|
}, |
|
{ |
|
"Batch Mean": -2.695831298828125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.54, |
|
"step": 216 |
|
}, |
|
{ |
|
"Batch Mean": -3.305999755859375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.54, |
|
"step": 216 |
|
}, |
|
{ |
|
"Batch Mean": -4.085424423217773, |
|
"accuracy": 0.84375, |
|
"epoch": 0.54, |
|
"step": 216 |
|
}, |
|
{ |
|
"Batch Mean": -3.650482177734375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.54, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5425, |
|
"grad_norm": 7.070443153381348, |
|
"learning_rate": 1.4447368421052633e-06, |
|
"loss": 0.3216, |
|
"step": 217 |
|
}, |
|
{ |
|
"Batch Mean": -3.5268707275390625, |
|
"accuracy": 0.875, |
|
"epoch": 0.5425, |
|
"step": 217 |
|
}, |
|
{ |
|
"Batch Mean": -2.8703842163085938, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5425, |
|
"step": 217 |
|
}, |
|
{ |
|
"Batch Mean": -3.6777610778808594, |
|
"accuracy": 0.875, |
|
"epoch": 0.5425, |
|
"step": 217 |
|
}, |
|
{ |
|
"Batch Mean": -3.273221015930176, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5425, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.545, |
|
"grad_norm": 7.268734455108643, |
|
"learning_rate": 1.436842105263158e-06, |
|
"loss": 0.3018, |
|
"step": 218 |
|
}, |
|
{ |
|
"Batch Mean": -4.14947509765625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.545, |
|
"step": 218 |
|
}, |
|
{ |
|
"Batch Mean": -3.9094696044921875, |
|
"accuracy": 0.875, |
|
"epoch": 0.545, |
|
"step": 218 |
|
}, |
|
{ |
|
"Batch Mean": -4.183448791503906, |
|
"accuracy": 0.90625, |
|
"epoch": 0.545, |
|
"step": 218 |
|
}, |
|
{ |
|
"Batch Mean": -3.0260162353515625, |
|
"accuracy": 0.875, |
|
"epoch": 0.545, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5475, |
|
"grad_norm": 6.658324718475342, |
|
"learning_rate": 1.4289473684210525e-06, |
|
"loss": 0.3162, |
|
"step": 219 |
|
}, |
|
{ |
|
"Batch Mean": -3.3106613159179688, |
|
"accuracy": 0.96875, |
|
"epoch": 0.5475, |
|
"step": 219 |
|
}, |
|
{ |
|
"Batch Mean": -3.244659423828125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5475, |
|
"step": 219 |
|
}, |
|
{ |
|
"Batch Mean": -3.70648193359375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5475, |
|
"step": 219 |
|
}, |
|
{ |
|
"Batch Mean": -3.5211639404296875, |
|
"accuracy": 0.875, |
|
"epoch": 0.5475, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 6.178283214569092, |
|
"learning_rate": 1.4210526315789473e-06, |
|
"loss": 0.3051, |
|
"step": 220 |
|
}, |
|
{ |
|
"Batch Mean": -4.166107177734375, |
|
"accuracy": 0.875, |
|
"epoch": 0.55, |
|
"step": 220 |
|
}, |
|
{ |
|
"Batch Mean": -3.3558349609375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.55, |
|
"step": 220 |
|
}, |
|
{ |
|
"Batch Mean": -4.33941650390625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.55, |
|
"step": 220 |
|
}, |
|
{ |
|
"Batch Mean": -4.407003402709961, |
|
"accuracy": 0.78125, |
|
"epoch": 0.55, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5525, |
|
"grad_norm": 7.264347076416016, |
|
"learning_rate": 1.4131578947368422e-06, |
|
"loss": 0.3635, |
|
"step": 221 |
|
}, |
|
{ |
|
"Batch Mean": -3.1469879150390625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.5525, |
|
"step": 221 |
|
}, |
|
{ |
|
"Batch Mean": -3.2413787841796875, |
|
"accuracy": 0.75, |
|
"epoch": 0.5525, |
|
"step": 221 |
|
}, |
|
{ |
|
"Batch Mean": -3.6842041015625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5525, |
|
"step": 221 |
|
}, |
|
{ |
|
"Batch Mean": -3.679645538330078, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5525, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.555, |
|
"grad_norm": 7.31732177734375, |
|
"learning_rate": 1.405263157894737e-06, |
|
"loss": 0.3973, |
|
"step": 222 |
|
}, |
|
{ |
|
"Batch Mean": -3.34716796875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.555, |
|
"step": 222 |
|
}, |
|
{ |
|
"Batch Mean": -3.9062652587890625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.555, |
|
"step": 222 |
|
}, |
|
{ |
|
"Batch Mean": -3.495391845703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.555, |
|
"step": 222 |
|
}, |
|
{ |
|
"Batch Mean": -2.7776336669921875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.555, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5575, |
|
"grad_norm": 8.380857467651367, |
|
"learning_rate": 1.3973684210526317e-06, |
|
"loss": 0.3659, |
|
"step": 223 |
|
}, |
|
{ |
|
"Batch Mean": -3.2745742797851562, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5575, |
|
"step": 223 |
|
}, |
|
{ |
|
"Batch Mean": -3.9029083251953125, |
|
"accuracy": 0.875, |
|
"epoch": 0.5575, |
|
"step": 223 |
|
}, |
|
{ |
|
"Batch Mean": -4.170623779296875, |
|
"accuracy": 0.75, |
|
"epoch": 0.5575, |
|
"step": 223 |
|
}, |
|
{ |
|
"Batch Mean": -2.839531898498535, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5575, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 6.545968055725098, |
|
"learning_rate": 1.3894736842105263e-06, |
|
"loss": 0.2995, |
|
"step": 224 |
|
}, |
|
{ |
|
"Batch Mean": -3.87921142578125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.56, |
|
"step": 224 |
|
}, |
|
{ |
|
"Batch Mean": -3.514899730682373, |
|
"accuracy": 0.84375, |
|
"epoch": 0.56, |
|
"step": 224 |
|
}, |
|
{ |
|
"Batch Mean": -4.507991790771484, |
|
"accuracy": 0.84375, |
|
"epoch": 0.56, |
|
"step": 224 |
|
}, |
|
{ |
|
"Batch Mean": -4.523223876953125, |
|
"accuracy": 0.875, |
|
"epoch": 0.56, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 8.860806465148926, |
|
"learning_rate": 1.381578947368421e-06, |
|
"loss": 0.3704, |
|
"step": 225 |
|
}, |
|
{ |
|
"Batch Mean": -3.1665496826171875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5625, |
|
"step": 225 |
|
}, |
|
{ |
|
"Batch Mean": -3.83599853515625, |
|
"accuracy": 0.875, |
|
"epoch": 0.5625, |
|
"step": 225 |
|
}, |
|
{ |
|
"Batch Mean": -3.524555206298828, |
|
"accuracy": 0.75, |
|
"epoch": 0.5625, |
|
"step": 225 |
|
}, |
|
{ |
|
"Batch Mean": -3.394775390625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5625, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.565, |
|
"grad_norm": 6.077930450439453, |
|
"learning_rate": 1.3736842105263158e-06, |
|
"loss": 0.318, |
|
"step": 226 |
|
}, |
|
{ |
|
"Batch Mean": -3.356473922729492, |
|
"accuracy": 0.96875, |
|
"epoch": 0.565, |
|
"step": 226 |
|
}, |
|
{ |
|
"Batch Mean": -4.112098693847656, |
|
"accuracy": 0.8125, |
|
"epoch": 0.565, |
|
"step": 226 |
|
}, |
|
{ |
|
"Batch Mean": -4.091426849365234, |
|
"accuracy": 0.90625, |
|
"epoch": 0.565, |
|
"step": 226 |
|
}, |
|
{ |
|
"Batch Mean": -3.0035018920898438, |
|
"accuracy": 0.90625, |
|
"epoch": 0.565, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5675, |
|
"grad_norm": 6.0616631507873535, |
|
"learning_rate": 1.3657894736842107e-06, |
|
"loss": 0.2958, |
|
"step": 227 |
|
}, |
|
{ |
|
"Batch Mean": -3.436004638671875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5675, |
|
"step": 227 |
|
}, |
|
{ |
|
"Batch Mean": -3.202392578125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5675, |
|
"step": 227 |
|
}, |
|
{ |
|
"Batch Mean": -3.9043502807617188, |
|
"accuracy": 0.65625, |
|
"epoch": 0.5675, |
|
"step": 227 |
|
}, |
|
{ |
|
"Batch Mean": -3.1567840576171875, |
|
"accuracy": 0.75, |
|
"epoch": 0.5675, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 7.531952381134033, |
|
"learning_rate": 1.3578947368421055e-06, |
|
"loss": 0.4195, |
|
"step": 228 |
|
}, |
|
{ |
|
"Batch Mean": -3.8647918701171875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.57, |
|
"step": 228 |
|
}, |
|
{ |
|
"Batch Mean": -3.0856170654296875, |
|
"accuracy": 0.875, |
|
"epoch": 0.57, |
|
"step": 228 |
|
}, |
|
{ |
|
"Batch Mean": -4.292762756347656, |
|
"accuracy": 0.90625, |
|
"epoch": 0.57, |
|
"step": 228 |
|
}, |
|
{ |
|
"Batch Mean": -3.574859619140625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.57, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5725, |
|
"grad_norm": 7.590925216674805, |
|
"learning_rate": 1.35e-06, |
|
"loss": 0.3986, |
|
"step": 229 |
|
}, |
|
{ |
|
"Batch Mean": -4.2539825439453125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5725, |
|
"step": 229 |
|
}, |
|
{ |
|
"Batch Mean": -4.0859375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5725, |
|
"step": 229 |
|
}, |
|
{ |
|
"Batch Mean": -4.0404815673828125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.5725, |
|
"step": 229 |
|
}, |
|
{ |
|
"Batch Mean": -4.4549407958984375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5725, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.575, |
|
"grad_norm": 6.640595436096191, |
|
"learning_rate": 1.3421052631578947e-06, |
|
"loss": 0.283, |
|
"step": 230 |
|
}, |
|
{ |
|
"Batch Mean": -3.2370572090148926, |
|
"accuracy": 0.8125, |
|
"epoch": 0.575, |
|
"step": 230 |
|
}, |
|
{ |
|
"Batch Mean": -4.13724422454834, |
|
"accuracy": 0.75, |
|
"epoch": 0.575, |
|
"step": 230 |
|
}, |
|
{ |
|
"Batch Mean": -3.491596221923828, |
|
"accuracy": 0.78125, |
|
"epoch": 0.575, |
|
"step": 230 |
|
}, |
|
{ |
|
"Batch Mean": -3.4756851196289062, |
|
"accuracy": 0.8125, |
|
"epoch": 0.575, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5775, |
|
"grad_norm": 8.793740272521973, |
|
"learning_rate": 1.3342105263157895e-06, |
|
"loss": 0.4038, |
|
"step": 231 |
|
}, |
|
{ |
|
"Batch Mean": -4.296232223510742, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5775, |
|
"step": 231 |
|
}, |
|
{ |
|
"Batch Mean": -3.5285520553588867, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5775, |
|
"step": 231 |
|
}, |
|
{ |
|
"Batch Mean": -4.08984375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.5775, |
|
"step": 231 |
|
}, |
|
{ |
|
"Batch Mean": -3.6899566650390625, |
|
"accuracy": 0.875, |
|
"epoch": 0.5775, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 7.992417335510254, |
|
"learning_rate": 1.3263157894736842e-06, |
|
"loss": 0.3747, |
|
"step": 232 |
|
}, |
|
{ |
|
"Batch Mean": -3.7143096923828125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.58, |
|
"step": 232 |
|
}, |
|
{ |
|
"Batch Mean": -3.961048126220703, |
|
"accuracy": 0.84375, |
|
"epoch": 0.58, |
|
"step": 232 |
|
}, |
|
{ |
|
"Batch Mean": -3.7069091796875, |
|
"accuracy": 0.75, |
|
"epoch": 0.58, |
|
"step": 232 |
|
}, |
|
{ |
|
"Batch Mean": -3.9899253845214844, |
|
"accuracy": 0.84375, |
|
"epoch": 0.58, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5825, |
|
"grad_norm": 7.035994052886963, |
|
"learning_rate": 1.318421052631579e-06, |
|
"loss": 0.3526, |
|
"step": 233 |
|
}, |
|
{ |
|
"Batch Mean": -3.759960174560547, |
|
"accuracy": 0.96875, |
|
"epoch": 0.5825, |
|
"step": 233 |
|
}, |
|
{ |
|
"Batch Mean": -3.8244781494140625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.5825, |
|
"step": 233 |
|
}, |
|
{ |
|
"Batch Mean": -3.9124221801757812, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5825, |
|
"step": 233 |
|
}, |
|
{ |
|
"Batch Mean": -3.5677642822265625, |
|
"accuracy": 0.875, |
|
"epoch": 0.5825, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.585, |
|
"grad_norm": 6.324237823486328, |
|
"learning_rate": 1.3105263157894737e-06, |
|
"loss": 0.2934, |
|
"step": 234 |
|
}, |
|
{ |
|
"Batch Mean": -3.8089447021484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.585, |
|
"step": 234 |
|
}, |
|
{ |
|
"Batch Mean": -3.088848114013672, |
|
"accuracy": 0.84375, |
|
"epoch": 0.585, |
|
"step": 234 |
|
}, |
|
{ |
|
"Batch Mean": -3.75396728515625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.585, |
|
"step": 234 |
|
}, |
|
{ |
|
"Batch Mean": -3.730072021484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.585, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5875, |
|
"grad_norm": 6.127070426940918, |
|
"learning_rate": 1.3026315789473685e-06, |
|
"loss": 0.2932, |
|
"step": 235 |
|
}, |
|
{ |
|
"Batch Mean": -3.4142894744873047, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5875, |
|
"step": 235 |
|
}, |
|
{ |
|
"Batch Mean": -2.889068603515625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.5875, |
|
"step": 235 |
|
}, |
|
{ |
|
"Batch Mean": -3.9111099243164062, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5875, |
|
"step": 235 |
|
}, |
|
{ |
|
"Batch Mean": -2.477039337158203, |
|
"accuracy": 0.875, |
|
"epoch": 0.5875, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 6.313676357269287, |
|
"learning_rate": 1.2947368421052632e-06, |
|
"loss": 0.3009, |
|
"step": 236 |
|
}, |
|
{ |
|
"Batch Mean": -3.6894073486328125, |
|
"accuracy": 0.96875, |
|
"epoch": 0.59, |
|
"step": 236 |
|
}, |
|
{ |
|
"Batch Mean": -3.5066070556640625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.59, |
|
"step": 236 |
|
}, |
|
{ |
|
"Batch Mean": -3.67718505859375, |
|
"accuracy": 0.875, |
|
"epoch": 0.59, |
|
"step": 236 |
|
}, |
|
{ |
|
"Batch Mean": -2.9031982421875, |
|
"accuracy": 0.875, |
|
"epoch": 0.59, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5925, |
|
"grad_norm": 5.945924758911133, |
|
"learning_rate": 1.286842105263158e-06, |
|
"loss": 0.2873, |
|
"step": 237 |
|
}, |
|
{ |
|
"Batch Mean": -4.34759521484375, |
|
"accuracy": 0.875, |
|
"epoch": 0.5925, |
|
"step": 237 |
|
}, |
|
{ |
|
"Batch Mean": -3.4844703674316406, |
|
"accuracy": 0.875, |
|
"epoch": 0.5925, |
|
"step": 237 |
|
}, |
|
{ |
|
"Batch Mean": -3.9878082275390625, |
|
"accuracy": 0.75, |
|
"epoch": 0.5925, |
|
"step": 237 |
|
}, |
|
{ |
|
"Batch Mean": -3.9766464233398438, |
|
"accuracy": 0.96875, |
|
"epoch": 0.5925, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.595, |
|
"grad_norm": 6.510786533355713, |
|
"learning_rate": 1.2789473684210527e-06, |
|
"loss": 0.2744, |
|
"step": 238 |
|
}, |
|
{ |
|
"Batch Mean": -3.688953399658203, |
|
"accuracy": 0.84375, |
|
"epoch": 0.595, |
|
"step": 238 |
|
}, |
|
{ |
|
"Batch Mean": -3.54766845703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.595, |
|
"step": 238 |
|
}, |
|
{ |
|
"Batch Mean": -2.0823516845703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.595, |
|
"step": 238 |
|
}, |
|
{ |
|
"Batch Mean": -3.4054336547851562, |
|
"accuracy": 0.8125, |
|
"epoch": 0.595, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5975, |
|
"grad_norm": 6.6380228996276855, |
|
"learning_rate": 1.2710526315789474e-06, |
|
"loss": 0.2938, |
|
"step": 239 |
|
}, |
|
{ |
|
"Batch Mean": -2.8431396484375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.5975, |
|
"step": 239 |
|
}, |
|
{ |
|
"Batch Mean": -2.9769973754882812, |
|
"accuracy": 0.75, |
|
"epoch": 0.5975, |
|
"step": 239 |
|
}, |
|
{ |
|
"Batch Mean": -2.3684401512145996, |
|
"accuracy": 0.875, |
|
"epoch": 0.5975, |
|
"step": 239 |
|
}, |
|
{ |
|
"Batch Mean": -1.8329415321350098, |
|
"accuracy": 0.8125, |
|
"epoch": 0.5975, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 8.833738327026367, |
|
"learning_rate": 1.263157894736842e-06, |
|
"loss": 0.3339, |
|
"step": 240 |
|
}, |
|
{ |
|
"Batch Mean": -2.77374267578125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6, |
|
"step": 240 |
|
}, |
|
{ |
|
"Batch Mean": -2.96490478515625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6, |
|
"step": 240 |
|
}, |
|
{ |
|
"Batch Mean": -3.2462158203125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6, |
|
"step": 240 |
|
}, |
|
{ |
|
"Batch Mean": -3.204761505126953, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6025, |
|
"grad_norm": 7.8431501388549805, |
|
"learning_rate": 1.255263157894737e-06, |
|
"loss": 0.377, |
|
"step": 241 |
|
}, |
|
{ |
|
"Batch Mean": -2.714019775390625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6025, |
|
"step": 241 |
|
}, |
|
{ |
|
"Batch Mean": -2.3566207885742188, |
|
"accuracy": 0.9375, |
|
"epoch": 0.6025, |
|
"step": 241 |
|
}, |
|
{ |
|
"Batch Mean": -2.1364517211914062, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6025, |
|
"step": 241 |
|
}, |
|
{ |
|
"Batch Mean": -2.121917724609375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.6025, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.605, |
|
"grad_norm": 6.37907600402832, |
|
"learning_rate": 1.2473684210526317e-06, |
|
"loss": 0.2626, |
|
"step": 242 |
|
}, |
|
{ |
|
"Batch Mean": -2.95263671875, |
|
"accuracy": 0.875, |
|
"epoch": 0.605, |
|
"step": 242 |
|
}, |
|
{ |
|
"Batch Mean": -3.3397293090820312, |
|
"accuracy": 0.84375, |
|
"epoch": 0.605, |
|
"step": 242 |
|
}, |
|
{ |
|
"Batch Mean": -3.6249160766601562, |
|
"accuracy": 0.90625, |
|
"epoch": 0.605, |
|
"step": 242 |
|
}, |
|
{ |
|
"Batch Mean": -3.3070068359375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.605, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6075, |
|
"grad_norm": 7.880499362945557, |
|
"learning_rate": 1.2394736842105264e-06, |
|
"loss": 0.3124, |
|
"step": 243 |
|
}, |
|
{ |
|
"Batch Mean": -2.3552703857421875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6075, |
|
"step": 243 |
|
}, |
|
{ |
|
"Batch Mean": -2.283773422241211, |
|
"accuracy": 0.75, |
|
"epoch": 0.6075, |
|
"step": 243 |
|
}, |
|
{ |
|
"Batch Mean": -3.1748132705688477, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6075, |
|
"step": 243 |
|
}, |
|
{ |
|
"Batch Mean": -1.2377548217773438, |
|
"accuracy": 0.875, |
|
"epoch": 0.6075, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 9.455083847045898, |
|
"learning_rate": 1.2315789473684212e-06, |
|
"loss": 0.4212, |
|
"step": 244 |
|
}, |
|
{ |
|
"Batch Mean": -2.047351837158203, |
|
"accuracy": 1.0, |
|
"epoch": 0.61, |
|
"step": 244 |
|
}, |
|
{ |
|
"Batch Mean": -3.2393999099731445, |
|
"accuracy": 0.71875, |
|
"epoch": 0.61, |
|
"step": 244 |
|
}, |
|
{ |
|
"Batch Mean": -2.2280426025390625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.61, |
|
"step": 244 |
|
}, |
|
{ |
|
"Batch Mean": -3.0465545654296875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.61, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6125, |
|
"grad_norm": 8.660555839538574, |
|
"learning_rate": 1.2236842105263157e-06, |
|
"loss": 0.3798, |
|
"step": 245 |
|
}, |
|
{ |
|
"Batch Mean": -2.966043472290039, |
|
"accuracy": 0.71875, |
|
"epoch": 0.6125, |
|
"step": 245 |
|
}, |
|
{ |
|
"Batch Mean": -2.2515945434570312, |
|
"accuracy": 0.875, |
|
"epoch": 0.6125, |
|
"step": 245 |
|
}, |
|
{ |
|
"Batch Mean": -4.3316497802734375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6125, |
|
"step": 245 |
|
}, |
|
{ |
|
"Batch Mean": -2.0645751953125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6125, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.615, |
|
"grad_norm": 11.141060829162598, |
|
"learning_rate": 1.2157894736842105e-06, |
|
"loss": 0.4981, |
|
"step": 246 |
|
}, |
|
{ |
|
"Batch Mean": -2.4443702697753906, |
|
"accuracy": 0.8125, |
|
"epoch": 0.615, |
|
"step": 246 |
|
}, |
|
{ |
|
"Batch Mean": -1.937652587890625, |
|
"accuracy": 0.875, |
|
"epoch": 0.615, |
|
"step": 246 |
|
}, |
|
{ |
|
"Batch Mean": -4.549560546875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.615, |
|
"step": 246 |
|
}, |
|
{ |
|
"Batch Mean": -2.3571548461914062, |
|
"accuracy": 0.84375, |
|
"epoch": 0.615, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6175, |
|
"grad_norm": 7.784694194793701, |
|
"learning_rate": 1.2078947368421052e-06, |
|
"loss": 0.3372, |
|
"step": 247 |
|
}, |
|
{ |
|
"Batch Mean": -3.070587158203125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6175, |
|
"step": 247 |
|
}, |
|
{ |
|
"Batch Mean": -3.364349365234375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6175, |
|
"step": 247 |
|
}, |
|
{ |
|
"Batch Mean": -3.2482681274414062, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6175, |
|
"step": 247 |
|
}, |
|
{ |
|
"Batch Mean": -2.4936294555664062, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6175, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 6.7486982345581055, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.3296, |
|
"step": 248 |
|
}, |
|
{ |
|
"Batch Mean": -2.79473876953125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.62, |
|
"step": 248 |
|
}, |
|
{ |
|
"Batch Mean": -2.90679931640625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.62, |
|
"step": 248 |
|
}, |
|
{ |
|
"Batch Mean": -3.117279052734375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.62, |
|
"step": 248 |
|
}, |
|
{ |
|
"Batch Mean": -3.5787353515625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.62, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6225, |
|
"grad_norm": 8.39495849609375, |
|
"learning_rate": 1.192105263157895e-06, |
|
"loss": 0.3904, |
|
"step": 249 |
|
}, |
|
{ |
|
"Batch Mean": -2.7425079345703125, |
|
"accuracy": 0.75, |
|
"epoch": 0.6225, |
|
"step": 249 |
|
}, |
|
{ |
|
"Batch Mean": -4.147431373596191, |
|
"accuracy": 0.75, |
|
"epoch": 0.6225, |
|
"step": 249 |
|
}, |
|
{ |
|
"Batch Mean": -3.2316246032714844, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6225, |
|
"step": 249 |
|
}, |
|
{ |
|
"Batch Mean": -2.8546524047851562, |
|
"accuracy": 0.875, |
|
"epoch": 0.6225, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 7.723196029663086, |
|
"learning_rate": 1.1842105263157894e-06, |
|
"loss": 0.3809, |
|
"step": 250 |
|
}, |
|
{ |
|
"Batch Mean": -2.8010005950927734, |
|
"accuracy": 0.90625, |
|
"epoch": 0.625, |
|
"step": 250 |
|
}, |
|
{ |
|
"Batch Mean": -3.46478271484375, |
|
"accuracy": 0.875, |
|
"epoch": 0.625, |
|
"step": 250 |
|
}, |
|
{ |
|
"Batch Mean": -4.16009521484375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.625, |
|
"step": 250 |
|
}, |
|
{ |
|
"Batch Mean": -3.533355712890625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.625, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6275, |
|
"grad_norm": 5.696111679077148, |
|
"learning_rate": 1.1763157894736842e-06, |
|
"loss": 0.2774, |
|
"step": 251 |
|
}, |
|
{ |
|
"Batch Mean": -3.386138916015625, |
|
"accuracy": 0.875, |
|
"epoch": 0.6275, |
|
"step": 251 |
|
}, |
|
{ |
|
"Batch Mean": -3.6047067642211914, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6275, |
|
"step": 251 |
|
}, |
|
{ |
|
"Batch Mean": -3.3834381103515625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6275, |
|
"step": 251 |
|
}, |
|
{ |
|
"Batch Mean": -3.3063201904296875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6275, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 6.556052207946777, |
|
"learning_rate": 1.168421052631579e-06, |
|
"loss": 0.3641, |
|
"step": 252 |
|
}, |
|
{ |
|
"Batch Mean": -3.0179519653320312, |
|
"accuracy": 0.65625, |
|
"epoch": 0.63, |
|
"step": 252 |
|
}, |
|
{ |
|
"Batch Mean": -3.4812660217285156, |
|
"accuracy": 0.9375, |
|
"epoch": 0.63, |
|
"step": 252 |
|
}, |
|
{ |
|
"Batch Mean": -3.9097061157226562, |
|
"accuracy": 0.90625, |
|
"epoch": 0.63, |
|
"step": 252 |
|
}, |
|
{ |
|
"Batch Mean": -2.435272216796875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.63, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6325, |
|
"grad_norm": 6.269940376281738, |
|
"learning_rate": 1.1605263157894737e-06, |
|
"loss": 0.3371, |
|
"step": 253 |
|
}, |
|
{ |
|
"Batch Mean": -4.071680068969727, |
|
"accuracy": 0.96875, |
|
"epoch": 0.6325, |
|
"step": 253 |
|
}, |
|
{ |
|
"Batch Mean": -2.9765701293945312, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6325, |
|
"step": 253 |
|
}, |
|
{ |
|
"Batch Mean": -3.8900146484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6325, |
|
"step": 253 |
|
}, |
|
{ |
|
"Batch Mean": -2.765000343322754, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6325, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.635, |
|
"grad_norm": 6.220406532287598, |
|
"learning_rate": 1.1526315789473684e-06, |
|
"loss": 0.2673, |
|
"step": 254 |
|
}, |
|
{ |
|
"Batch Mean": -3.6123905181884766, |
|
"accuracy": 0.71875, |
|
"epoch": 0.635, |
|
"step": 254 |
|
}, |
|
{ |
|
"Batch Mean": -4.041259765625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.635, |
|
"step": 254 |
|
}, |
|
{ |
|
"Batch Mean": -3.0314865112304688, |
|
"accuracy": 0.875, |
|
"epoch": 0.635, |
|
"step": 254 |
|
}, |
|
{ |
|
"Batch Mean": -2.9288330078125, |
|
"accuracy": 0.75, |
|
"epoch": 0.635, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6375, |
|
"grad_norm": 8.09074592590332, |
|
"learning_rate": 1.1447368421052632e-06, |
|
"loss": 0.4475, |
|
"step": 255 |
|
}, |
|
{ |
|
"Batch Mean": -3.0372314453125, |
|
"accuracy": 0.75, |
|
"epoch": 0.6375, |
|
"step": 255 |
|
}, |
|
{ |
|
"Batch Mean": -2.77508544921875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6375, |
|
"step": 255 |
|
}, |
|
{ |
|
"Batch Mean": -3.7425918579101562, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6375, |
|
"step": 255 |
|
}, |
|
{ |
|
"Batch Mean": -3.914762496948242, |
|
"accuracy": 0.71875, |
|
"epoch": 0.6375, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 6.071317672729492, |
|
"learning_rate": 1.136842105263158e-06, |
|
"loss": 0.378, |
|
"step": 256 |
|
}, |
|
{ |
|
"Batch Mean": -3.5071868896484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.64, |
|
"step": 256 |
|
}, |
|
{ |
|
"Batch Mean": -3.4954605102539062, |
|
"accuracy": 0.90625, |
|
"epoch": 0.64, |
|
"step": 256 |
|
}, |
|
{ |
|
"Batch Mean": -2.872222900390625, |
|
"accuracy": 0.875, |
|
"epoch": 0.64, |
|
"step": 256 |
|
}, |
|
{ |
|
"Batch Mean": -3.0280609130859375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.64, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6425, |
|
"grad_norm": 6.662141799926758, |
|
"learning_rate": 1.1289473684210527e-06, |
|
"loss": 0.3832, |
|
"step": 257 |
|
}, |
|
{ |
|
"Batch Mean": -3.240509033203125, |
|
"accuracy": 0.875, |
|
"epoch": 0.6425, |
|
"step": 257 |
|
}, |
|
{ |
|
"Batch Mean": -3.8335418701171875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6425, |
|
"step": 257 |
|
}, |
|
{ |
|
"Batch Mean": -3.164426803588867, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6425, |
|
"step": 257 |
|
}, |
|
{ |
|
"Batch Mean": -3.7109527587890625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6425, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.645, |
|
"grad_norm": 5.634103298187256, |
|
"learning_rate": 1.1210526315789474e-06, |
|
"loss": 0.3235, |
|
"step": 258 |
|
}, |
|
{ |
|
"Batch Mean": -3.70538330078125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.645, |
|
"step": 258 |
|
}, |
|
{ |
|
"Batch Mean": -4.567169189453125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.645, |
|
"step": 258 |
|
}, |
|
{ |
|
"Batch Mean": -3.396778106689453, |
|
"accuracy": 0.96875, |
|
"epoch": 0.645, |
|
"step": 258 |
|
}, |
|
{ |
|
"Batch Mean": -4.2032470703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.645, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6475, |
|
"grad_norm": 5.471774101257324, |
|
"learning_rate": 1.1131578947368421e-06, |
|
"loss": 0.3283, |
|
"step": 259 |
|
}, |
|
{ |
|
"Batch Mean": -4.028106689453125, |
|
"accuracy": 0.875, |
|
"epoch": 0.6475, |
|
"step": 259 |
|
}, |
|
{ |
|
"Batch Mean": -4.26116943359375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6475, |
|
"step": 259 |
|
}, |
|
{ |
|
"Batch Mean": -2.7089462280273438, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6475, |
|
"step": 259 |
|
}, |
|
{ |
|
"Batch Mean": -2.54803466796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6475, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 5.883737564086914, |
|
"learning_rate": 1.1052631578947369e-06, |
|
"loss": 0.3364, |
|
"step": 260 |
|
}, |
|
{ |
|
"Batch Mean": -3.8737640380859375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.65, |
|
"step": 260 |
|
}, |
|
{ |
|
"Batch Mean": -2.4150657653808594, |
|
"accuracy": 0.90625, |
|
"epoch": 0.65, |
|
"step": 260 |
|
}, |
|
{ |
|
"Batch Mean": -3.8108320236206055, |
|
"accuracy": 0.8125, |
|
"epoch": 0.65, |
|
"step": 260 |
|
}, |
|
{ |
|
"Batch Mean": -3.62054443359375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.65, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6525, |
|
"grad_norm": 5.747334957122803, |
|
"learning_rate": 1.0973684210526316e-06, |
|
"loss": 0.3397, |
|
"step": 261 |
|
}, |
|
{ |
|
"Batch Mean": -3.2949142456054688, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6525, |
|
"step": 261 |
|
}, |
|
{ |
|
"Batch Mean": -3.8467559814453125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6525, |
|
"step": 261 |
|
}, |
|
{ |
|
"Batch Mean": -3.0838470458984375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6525, |
|
"step": 261 |
|
}, |
|
{ |
|
"Batch Mean": -3.0746688842773438, |
|
"accuracy": 0.875, |
|
"epoch": 0.6525, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.655, |
|
"grad_norm": 5.860691547393799, |
|
"learning_rate": 1.0894736842105264e-06, |
|
"loss": 0.3437, |
|
"step": 262 |
|
}, |
|
{ |
|
"Batch Mean": -3.8531265258789062, |
|
"accuracy": 0.78125, |
|
"epoch": 0.655, |
|
"step": 262 |
|
}, |
|
{ |
|
"Batch Mean": -2.9960174560546875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.655, |
|
"step": 262 |
|
}, |
|
{ |
|
"Batch Mean": -3.86285400390625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.655, |
|
"step": 262 |
|
}, |
|
{ |
|
"Batch Mean": -4.966447830200195, |
|
"accuracy": 0.8125, |
|
"epoch": 0.655, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6575, |
|
"grad_norm": 7.351145267486572, |
|
"learning_rate": 1.0815789473684211e-06, |
|
"loss": 0.3677, |
|
"step": 263 |
|
}, |
|
{ |
|
"Batch Mean": -3.3981552124023438, |
|
"accuracy": 0.96875, |
|
"epoch": 0.6575, |
|
"step": 263 |
|
}, |
|
{ |
|
"Batch Mean": -3.6040496826171875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6575, |
|
"step": 263 |
|
}, |
|
{ |
|
"Batch Mean": -4.476898193359375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.6575, |
|
"step": 263 |
|
}, |
|
{ |
|
"Batch Mean": -4.4954986572265625, |
|
"accuracy": 0.875, |
|
"epoch": 0.6575, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 5.997040748596191, |
|
"learning_rate": 1.0736842105263159e-06, |
|
"loss": 0.3175, |
|
"step": 264 |
|
}, |
|
{ |
|
"Batch Mean": -4.134864807128906, |
|
"accuracy": 0.90625, |
|
"epoch": 0.66, |
|
"step": 264 |
|
}, |
|
{ |
|
"Batch Mean": -3.3722915649414062, |
|
"accuracy": 0.8125, |
|
"epoch": 0.66, |
|
"step": 264 |
|
}, |
|
{ |
|
"Batch Mean": -3.886749267578125, |
|
"accuracy": 0.875, |
|
"epoch": 0.66, |
|
"step": 264 |
|
}, |
|
{ |
|
"Batch Mean": -3.9037017822265625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.66, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6625, |
|
"grad_norm": 6.369904041290283, |
|
"learning_rate": 1.0657894736842106e-06, |
|
"loss": 0.3005, |
|
"step": 265 |
|
}, |
|
{ |
|
"Batch Mean": -2.43988037109375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6625, |
|
"step": 265 |
|
}, |
|
{ |
|
"Batch Mean": -3.7969512939453125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6625, |
|
"step": 265 |
|
}, |
|
{ |
|
"Batch Mean": -2.8551597595214844, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6625, |
|
"step": 265 |
|
}, |
|
{ |
|
"Batch Mean": -4.376708984375, |
|
"accuracy": 0.875, |
|
"epoch": 0.6625, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.665, |
|
"grad_norm": 7.589053630828857, |
|
"learning_rate": 1.0578947368421052e-06, |
|
"loss": 0.378, |
|
"step": 266 |
|
}, |
|
{ |
|
"Batch Mean": -2.819580078125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.665, |
|
"step": 266 |
|
}, |
|
{ |
|
"Batch Mean": -3.7125625610351562, |
|
"accuracy": 0.875, |
|
"epoch": 0.665, |
|
"step": 266 |
|
}, |
|
{ |
|
"Batch Mean": -3.899505615234375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.665, |
|
"step": 266 |
|
}, |
|
{ |
|
"Batch Mean": -3.084259033203125, |
|
"accuracy": 0.875, |
|
"epoch": 0.665, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6675, |
|
"grad_norm": 6.894853591918945, |
|
"learning_rate": 1.05e-06, |
|
"loss": 0.31, |
|
"step": 267 |
|
}, |
|
{ |
|
"Batch Mean": -4.47222900390625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6675, |
|
"step": 267 |
|
}, |
|
{ |
|
"Batch Mean": -3.929351806640625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.6675, |
|
"step": 267 |
|
}, |
|
{ |
|
"Batch Mean": -3.6689300537109375, |
|
"accuracy": 0.875, |
|
"epoch": 0.6675, |
|
"step": 267 |
|
}, |
|
{ |
|
"Batch Mean": -3.8176116943359375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6675, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 6.852884292602539, |
|
"learning_rate": 1.0421052631578949e-06, |
|
"loss": 0.3472, |
|
"step": 268 |
|
}, |
|
{ |
|
"Batch Mean": -2.9620208740234375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.67, |
|
"step": 268 |
|
}, |
|
{ |
|
"Batch Mean": -2.3269195556640625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.67, |
|
"step": 268 |
|
}, |
|
{ |
|
"Batch Mean": -2.8743438720703125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.67, |
|
"step": 268 |
|
}, |
|
{ |
|
"Batch Mean": -2.6758041381835938, |
|
"accuracy": 0.8125, |
|
"epoch": 0.67, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6725, |
|
"grad_norm": 7.455336093902588, |
|
"learning_rate": 1.0342105263157896e-06, |
|
"loss": 0.3242, |
|
"step": 269 |
|
}, |
|
{ |
|
"Batch Mean": -2.933929443359375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6725, |
|
"step": 269 |
|
}, |
|
{ |
|
"Batch Mean": -2.581918716430664, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6725, |
|
"step": 269 |
|
}, |
|
{ |
|
"Batch Mean": -3.84466552734375, |
|
"accuracy": 0.875, |
|
"epoch": 0.6725, |
|
"step": 269 |
|
}, |
|
{ |
|
"Batch Mean": -2.2253456115722656, |
|
"accuracy": 0.75, |
|
"epoch": 0.6725, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 7.862179756164551, |
|
"learning_rate": 1.0263157894736843e-06, |
|
"loss": 0.3216, |
|
"step": 270 |
|
}, |
|
{ |
|
"Batch Mean": -3.59698486328125, |
|
"accuracy": 0.875, |
|
"epoch": 0.675, |
|
"step": 270 |
|
}, |
|
{ |
|
"Batch Mean": -3.780181884765625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.675, |
|
"step": 270 |
|
}, |
|
{ |
|
"Batch Mean": -3.2669754028320312, |
|
"accuracy": 0.875, |
|
"epoch": 0.675, |
|
"step": 270 |
|
}, |
|
{ |
|
"Batch Mean": -2.4827651977539062, |
|
"accuracy": 0.90625, |
|
"epoch": 0.675, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6775, |
|
"grad_norm": 6.436431884765625, |
|
"learning_rate": 1.0184210526315789e-06, |
|
"loss": 0.2718, |
|
"step": 271 |
|
}, |
|
{ |
|
"Batch Mean": -2.3623504638671875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.6775, |
|
"step": 271 |
|
}, |
|
{ |
|
"Batch Mean": -3.0159378051757812, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6775, |
|
"step": 271 |
|
}, |
|
{ |
|
"Batch Mean": -3.6621856689453125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.6775, |
|
"step": 271 |
|
}, |
|
{ |
|
"Batch Mean": -2.6310272216796875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6775, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 7.512716293334961, |
|
"learning_rate": 1.0105263157894736e-06, |
|
"loss": 0.279, |
|
"step": 272 |
|
}, |
|
{ |
|
"Batch Mean": -2.4361724853515625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.68, |
|
"step": 272 |
|
}, |
|
{ |
|
"Batch Mean": -2.6321334838867188, |
|
"accuracy": 0.90625, |
|
"epoch": 0.68, |
|
"step": 272 |
|
}, |
|
{ |
|
"Batch Mean": -2.1671600341796875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.68, |
|
"step": 272 |
|
}, |
|
{ |
|
"Batch Mean": -2.6406402587890625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.68, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6825, |
|
"grad_norm": 8.025399208068848, |
|
"learning_rate": 1.0026315789473684e-06, |
|
"loss": 0.3236, |
|
"step": 273 |
|
}, |
|
{ |
|
"Batch Mean": -2.757819175720215, |
|
"accuracy": 0.75, |
|
"epoch": 0.6825, |
|
"step": 273 |
|
}, |
|
{ |
|
"Batch Mean": -3.958160400390625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6825, |
|
"step": 273 |
|
}, |
|
{ |
|
"Batch Mean": -2.927225112915039, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6825, |
|
"step": 273 |
|
}, |
|
{ |
|
"Batch Mean": -2.726654052734375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6825, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.685, |
|
"grad_norm": 7.602370262145996, |
|
"learning_rate": 9.947368421052631e-07, |
|
"loss": 0.3538, |
|
"step": 274 |
|
}, |
|
{ |
|
"Batch Mean": -2.7696990966796875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.685, |
|
"step": 274 |
|
}, |
|
{ |
|
"Batch Mean": -3.3070907592773438, |
|
"accuracy": 0.8125, |
|
"epoch": 0.685, |
|
"step": 274 |
|
}, |
|
{ |
|
"Batch Mean": -2.14569091796875, |
|
"accuracy": 0.875, |
|
"epoch": 0.685, |
|
"step": 274 |
|
}, |
|
{ |
|
"Batch Mean": -2.2299423217773438, |
|
"accuracy": 0.90625, |
|
"epoch": 0.685, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 7.543124675750732, |
|
"learning_rate": 9.86842105263158e-07, |
|
"loss": 0.3338, |
|
"step": 275 |
|
}, |
|
{ |
|
"Batch Mean": -2.778106689453125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.6875, |
|
"step": 275 |
|
}, |
|
{ |
|
"Batch Mean": -2.34002685546875, |
|
"accuracy": 0.875, |
|
"epoch": 0.6875, |
|
"step": 275 |
|
}, |
|
{ |
|
"Batch Mean": -2.372264862060547, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6875, |
|
"step": 275 |
|
}, |
|
{ |
|
"Batch Mean": -3.2867889404296875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6875, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 7.420588493347168, |
|
"learning_rate": 9.789473684210526e-07, |
|
"loss": 0.3661, |
|
"step": 276 |
|
}, |
|
{ |
|
"Batch Mean": -1.7787017822265625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.69, |
|
"step": 276 |
|
}, |
|
{ |
|
"Batch Mean": -2.858257293701172, |
|
"accuracy": 0.8125, |
|
"epoch": 0.69, |
|
"step": 276 |
|
}, |
|
{ |
|
"Batch Mean": -3.493408203125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.69, |
|
"step": 276 |
|
}, |
|
{ |
|
"Batch Mean": -2.6409072875976562, |
|
"accuracy": 0.8125, |
|
"epoch": 0.69, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6925, |
|
"grad_norm": 9.459863662719727, |
|
"learning_rate": 9.710526315789474e-07, |
|
"loss": 0.4359, |
|
"step": 277 |
|
}, |
|
{ |
|
"Batch Mean": -2.4638671875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.6925, |
|
"step": 277 |
|
}, |
|
{ |
|
"Batch Mean": -2.5003557205200195, |
|
"accuracy": 0.875, |
|
"epoch": 0.6925, |
|
"step": 277 |
|
}, |
|
{ |
|
"Batch Mean": -3.018157958984375, |
|
"accuracy": 0.875, |
|
"epoch": 0.6925, |
|
"step": 277 |
|
}, |
|
{ |
|
"Batch Mean": -3.19573974609375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6925, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.695, |
|
"grad_norm": 7.942737102508545, |
|
"learning_rate": 9.63157894736842e-07, |
|
"loss": 0.2998, |
|
"step": 278 |
|
}, |
|
{ |
|
"Batch Mean": -3.3094139099121094, |
|
"accuracy": 0.8125, |
|
"epoch": 0.695, |
|
"step": 278 |
|
}, |
|
{ |
|
"Batch Mean": -2.495997905731201, |
|
"accuracy": 0.65625, |
|
"epoch": 0.695, |
|
"step": 278 |
|
}, |
|
{ |
|
"Batch Mean": -2.2965316772460938, |
|
"accuracy": 0.90625, |
|
"epoch": 0.695, |
|
"step": 278 |
|
}, |
|
{ |
|
"Batch Mean": -2.8788223266601562, |
|
"accuracy": 0.8125, |
|
"epoch": 0.695, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.6975, |
|
"grad_norm": 8.83698558807373, |
|
"learning_rate": 9.552631578947368e-07, |
|
"loss": 0.3886, |
|
"step": 279 |
|
}, |
|
{ |
|
"Batch Mean": -2.6972923278808594, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6975, |
|
"step": 279 |
|
}, |
|
{ |
|
"Batch Mean": -3.4521484375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.6975, |
|
"step": 279 |
|
}, |
|
{ |
|
"Batch Mean": -2.5295982360839844, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6975, |
|
"step": 279 |
|
}, |
|
{ |
|
"Batch Mean": -3.016693115234375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.6975, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 7.647303104400635, |
|
"learning_rate": 9.473684210526316e-07, |
|
"loss": 0.2891, |
|
"step": 280 |
|
}, |
|
{ |
|
"Batch Mean": -3.5346527099609375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7, |
|
"step": 280 |
|
}, |
|
{ |
|
"Batch Mean": -3.563547134399414, |
|
"accuracy": 0.71875, |
|
"epoch": 0.7, |
|
"step": 280 |
|
}, |
|
{ |
|
"Batch Mean": -3.076751708984375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.7, |
|
"step": 280 |
|
}, |
|
{ |
|
"Batch Mean": -2.9289207458496094, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7025, |
|
"grad_norm": 8.476974487304688, |
|
"learning_rate": 9.394736842105262e-07, |
|
"loss": 0.3541, |
|
"step": 281 |
|
}, |
|
{ |
|
"Batch Mean": -2.8744544982910156, |
|
"accuracy": 0.96875, |
|
"epoch": 0.7025, |
|
"step": 281 |
|
}, |
|
{ |
|
"Batch Mean": -3.2609710693359375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7025, |
|
"step": 281 |
|
}, |
|
{ |
|
"Batch Mean": -2.6938629150390625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7025, |
|
"step": 281 |
|
}, |
|
{ |
|
"Batch Mean": -3.2952957153320312, |
|
"accuracy": 0.75, |
|
"epoch": 0.7025, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.705, |
|
"grad_norm": 8.024421691894531, |
|
"learning_rate": 9.315789473684212e-07, |
|
"loss": 0.3344, |
|
"step": 282 |
|
}, |
|
{ |
|
"Batch Mean": -2.7999496459960938, |
|
"accuracy": 0.90625, |
|
"epoch": 0.705, |
|
"step": 282 |
|
}, |
|
{ |
|
"Batch Mean": -2.3118896484375, |
|
"accuracy": 0.75, |
|
"epoch": 0.705, |
|
"step": 282 |
|
}, |
|
{ |
|
"Batch Mean": -3.025981903076172, |
|
"accuracy": 0.90625, |
|
"epoch": 0.705, |
|
"step": 282 |
|
}, |
|
{ |
|
"Batch Mean": -3.867767333984375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.705, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7075, |
|
"grad_norm": 8.309261322021484, |
|
"learning_rate": 9.236842105263158e-07, |
|
"loss": 0.4361, |
|
"step": 283 |
|
}, |
|
{ |
|
"Batch Mean": -2.7422637939453125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7075, |
|
"step": 283 |
|
}, |
|
{ |
|
"Batch Mean": -3.6142730712890625, |
|
"accuracy": 0.75, |
|
"epoch": 0.7075, |
|
"step": 283 |
|
}, |
|
{ |
|
"Batch Mean": -3.0303115844726562, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7075, |
|
"step": 283 |
|
}, |
|
{ |
|
"Batch Mean": -3.4545669555664062, |
|
"accuracy": 0.9375, |
|
"epoch": 0.7075, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 6.4524455070495605, |
|
"learning_rate": 9.157894736842106e-07, |
|
"loss": 0.2779, |
|
"step": 284 |
|
}, |
|
{ |
|
"Batch Mean": -3.4946441650390625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.71, |
|
"step": 284 |
|
}, |
|
{ |
|
"Batch Mean": -2.9883852005004883, |
|
"accuracy": 0.96875, |
|
"epoch": 0.71, |
|
"step": 284 |
|
}, |
|
{ |
|
"Batch Mean": -3.276304244995117, |
|
"accuracy": 0.875, |
|
"epoch": 0.71, |
|
"step": 284 |
|
}, |
|
{ |
|
"Batch Mean": -3.5324325561523438, |
|
"accuracy": 0.9375, |
|
"epoch": 0.71, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7125, |
|
"grad_norm": 6.129977226257324, |
|
"learning_rate": 9.078947368421053e-07, |
|
"loss": 0.2836, |
|
"step": 285 |
|
}, |
|
{ |
|
"Batch Mean": -3.501049041748047, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7125, |
|
"step": 285 |
|
}, |
|
{ |
|
"Batch Mean": -3.31304931640625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7125, |
|
"step": 285 |
|
}, |
|
{ |
|
"Batch Mean": -3.850574493408203, |
|
"accuracy": 0.9375, |
|
"epoch": 0.7125, |
|
"step": 285 |
|
}, |
|
{ |
|
"Batch Mean": -2.6171798706054688, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7125, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.715, |
|
"grad_norm": 6.9938459396362305, |
|
"learning_rate": 9e-07, |
|
"loss": 0.2995, |
|
"step": 286 |
|
}, |
|
{ |
|
"Batch Mean": -3.8350830078125, |
|
"accuracy": 0.75, |
|
"epoch": 0.715, |
|
"step": 286 |
|
}, |
|
{ |
|
"Batch Mean": -2.8526458740234375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.715, |
|
"step": 286 |
|
}, |
|
{ |
|
"Batch Mean": -3.446859836578369, |
|
"accuracy": 0.875, |
|
"epoch": 0.715, |
|
"step": 286 |
|
}, |
|
{ |
|
"Batch Mean": -3.232410430908203, |
|
"accuracy": 0.90625, |
|
"epoch": 0.715, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7175, |
|
"grad_norm": 8.538209915161133, |
|
"learning_rate": 8.921052631578947e-07, |
|
"loss": 0.3235, |
|
"step": 287 |
|
}, |
|
{ |
|
"Batch Mean": -3.7576637268066406, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7175, |
|
"step": 287 |
|
}, |
|
{ |
|
"Batch Mean": -3.164440155029297, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7175, |
|
"step": 287 |
|
}, |
|
{ |
|
"Batch Mean": -3.6189346313476562, |
|
"accuracy": 0.875, |
|
"epoch": 0.7175, |
|
"step": 287 |
|
}, |
|
{ |
|
"Batch Mean": -3.303243637084961, |
|
"accuracy": 0.75, |
|
"epoch": 0.7175, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 8.347675323486328, |
|
"learning_rate": 8.842105263157895e-07, |
|
"loss": 0.3859, |
|
"step": 288 |
|
}, |
|
{ |
|
"Batch Mean": -3.249699831008911, |
|
"accuracy": 0.875, |
|
"epoch": 0.72, |
|
"step": 288 |
|
}, |
|
{ |
|
"Batch Mean": -3.5829315185546875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.72, |
|
"step": 288 |
|
}, |
|
{ |
|
"Batch Mean": -3.14984130859375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.72, |
|
"step": 288 |
|
}, |
|
{ |
|
"Batch Mean": -3.806365966796875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.72, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7225, |
|
"grad_norm": 6.650171279907227, |
|
"learning_rate": 8.763157894736843e-07, |
|
"loss": 0.2523, |
|
"step": 289 |
|
}, |
|
{ |
|
"Batch Mean": -3.68951416015625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7225, |
|
"step": 289 |
|
}, |
|
{ |
|
"Batch Mean": -3.31353759765625, |
|
"accuracy": 0.875, |
|
"epoch": 0.7225, |
|
"step": 289 |
|
}, |
|
{ |
|
"Batch Mean": -3.738893508911133, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7225, |
|
"step": 289 |
|
}, |
|
{ |
|
"Batch Mean": -3.700286865234375, |
|
"accuracy": 0.75, |
|
"epoch": 0.7225, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 6.943315029144287, |
|
"learning_rate": 8.68421052631579e-07, |
|
"loss": 0.3015, |
|
"step": 290 |
|
}, |
|
{ |
|
"Batch Mean": -3.887226104736328, |
|
"accuracy": 0.84375, |
|
"epoch": 0.725, |
|
"step": 290 |
|
}, |
|
{ |
|
"Batch Mean": -2.4902381896972656, |
|
"accuracy": 0.875, |
|
"epoch": 0.725, |
|
"step": 290 |
|
}, |
|
{ |
|
"Batch Mean": -3.1428375244140625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.725, |
|
"step": 290 |
|
}, |
|
{ |
|
"Batch Mean": -2.7413482666015625, |
|
"accuracy": 0.875, |
|
"epoch": 0.725, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7275, |
|
"grad_norm": 7.108777046203613, |
|
"learning_rate": 8.605263157894737e-07, |
|
"loss": 0.3567, |
|
"step": 291 |
|
}, |
|
{ |
|
"Batch Mean": -3.3324928283691406, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7275, |
|
"step": 291 |
|
}, |
|
{ |
|
"Batch Mean": -3.8308334350585938, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7275, |
|
"step": 291 |
|
}, |
|
{ |
|
"Batch Mean": -4.030181884765625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7275, |
|
"step": 291 |
|
}, |
|
{ |
|
"Batch Mean": -3.4538726806640625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7275, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 8.231657981872559, |
|
"learning_rate": 8.526315789473684e-07, |
|
"loss": 0.3771, |
|
"step": 292 |
|
}, |
|
{ |
|
"Batch Mean": -3.4822425842285156, |
|
"accuracy": 0.75, |
|
"epoch": 0.73, |
|
"step": 292 |
|
}, |
|
{ |
|
"Batch Mean": -4.32940673828125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.73, |
|
"step": 292 |
|
}, |
|
{ |
|
"Batch Mean": -4.422943115234375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.73, |
|
"step": 292 |
|
}, |
|
{ |
|
"Batch Mean": -4.202728271484375, |
|
"accuracy": 0.875, |
|
"epoch": 0.73, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7325, |
|
"grad_norm": 7.5367560386657715, |
|
"learning_rate": 8.447368421052632e-07, |
|
"loss": 0.4246, |
|
"step": 293 |
|
}, |
|
{ |
|
"Batch Mean": -3.4039154052734375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7325, |
|
"step": 293 |
|
}, |
|
{ |
|
"Batch Mean": -3.3543243408203125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7325, |
|
"step": 293 |
|
}, |
|
{ |
|
"Batch Mean": -3.695354461669922, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7325, |
|
"step": 293 |
|
}, |
|
{ |
|
"Batch Mean": -3.03192138671875, |
|
"accuracy": 0.875, |
|
"epoch": 0.7325, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.735, |
|
"grad_norm": 7.1592583656311035, |
|
"learning_rate": 8.368421052631578e-07, |
|
"loss": 0.3243, |
|
"step": 294 |
|
}, |
|
{ |
|
"Batch Mean": -3.390155792236328, |
|
"accuracy": 0.84375, |
|
"epoch": 0.735, |
|
"step": 294 |
|
}, |
|
{ |
|
"Batch Mean": -3.437049388885498, |
|
"accuracy": 0.90625, |
|
"epoch": 0.735, |
|
"step": 294 |
|
}, |
|
{ |
|
"Batch Mean": -3.310375213623047, |
|
"accuracy": 0.90625, |
|
"epoch": 0.735, |
|
"step": 294 |
|
}, |
|
{ |
|
"Batch Mean": -3.1389846801757812, |
|
"accuracy": 0.71875, |
|
"epoch": 0.735, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7375, |
|
"grad_norm": 7.694336891174316, |
|
"learning_rate": 8.289473684210528e-07, |
|
"loss": 0.3502, |
|
"step": 295 |
|
}, |
|
{ |
|
"Batch Mean": -3.582538604736328, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7375, |
|
"step": 295 |
|
}, |
|
{ |
|
"Batch Mean": -3.919708251953125, |
|
"accuracy": 0.75, |
|
"epoch": 0.7375, |
|
"step": 295 |
|
}, |
|
{ |
|
"Batch Mean": -3.33111572265625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7375, |
|
"step": 295 |
|
}, |
|
{ |
|
"Batch Mean": -4.291971206665039, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7375, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 6.617946147918701, |
|
"learning_rate": 8.210526315789474e-07, |
|
"loss": 0.3229, |
|
"step": 296 |
|
}, |
|
{ |
|
"Batch Mean": -2.8900604248046875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.74, |
|
"step": 296 |
|
}, |
|
{ |
|
"Batch Mean": -3.4481353759765625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.74, |
|
"step": 296 |
|
}, |
|
{ |
|
"Batch Mean": -3.49090576171875, |
|
"accuracy": 0.875, |
|
"epoch": 0.74, |
|
"step": 296 |
|
}, |
|
{ |
|
"Batch Mean": -3.353851318359375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.74, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7425, |
|
"grad_norm": 6.788773059844971, |
|
"learning_rate": 8.131578947368422e-07, |
|
"loss": 0.3667, |
|
"step": 297 |
|
}, |
|
{ |
|
"Batch Mean": -3.618988037109375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7425, |
|
"step": 297 |
|
}, |
|
{ |
|
"Batch Mean": -3.5942840576171875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7425, |
|
"step": 297 |
|
}, |
|
{ |
|
"Batch Mean": -4.110076904296875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7425, |
|
"step": 297 |
|
}, |
|
{ |
|
"Batch Mean": -3.9837026596069336, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7425, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.745, |
|
"grad_norm": 6.459832191467285, |
|
"learning_rate": 8.052631578947369e-07, |
|
"loss": 0.3202, |
|
"step": 298 |
|
}, |
|
{ |
|
"Batch Mean": -3.5243988037109375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.745, |
|
"step": 298 |
|
}, |
|
{ |
|
"Batch Mean": -3.4703292846679688, |
|
"accuracy": 0.78125, |
|
"epoch": 0.745, |
|
"step": 298 |
|
}, |
|
{ |
|
"Batch Mean": -3.6161651611328125, |
|
"accuracy": 0.875, |
|
"epoch": 0.745, |
|
"step": 298 |
|
}, |
|
{ |
|
"Batch Mean": -3.5712432861328125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.745, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7475, |
|
"grad_norm": 6.366523265838623, |
|
"learning_rate": 7.973684210526315e-07, |
|
"loss": 0.3409, |
|
"step": 299 |
|
}, |
|
{ |
|
"Batch Mean": -3.39794921875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7475, |
|
"step": 299 |
|
}, |
|
{ |
|
"Batch Mean": -4.16961669921875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7475, |
|
"step": 299 |
|
}, |
|
{ |
|
"Batch Mean": -3.1703014373779297, |
|
"accuracy": 0.75, |
|
"epoch": 0.7475, |
|
"step": 299 |
|
}, |
|
{ |
|
"Batch Mean": -2.985076904296875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.7475, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 6.7196125984191895, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.3255, |
|
"step": 300 |
|
}, |
|
{ |
|
"Batch Mean": -3.1924591064453125, |
|
"accuracy": 0.875, |
|
"epoch": 0.75, |
|
"step": 300 |
|
}, |
|
{ |
|
"Batch Mean": -2.8484268188476562, |
|
"accuracy": 0.90625, |
|
"epoch": 0.75, |
|
"step": 300 |
|
}, |
|
{ |
|
"Batch Mean": -2.9788818359375, |
|
"accuracy": 0.875, |
|
"epoch": 0.75, |
|
"step": 300 |
|
}, |
|
{ |
|
"Batch Mean": -3.427490234375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.75, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7525, |
|
"grad_norm": 6.829746246337891, |
|
"learning_rate": 7.81578947368421e-07, |
|
"loss": 0.2915, |
|
"step": 301 |
|
}, |
|
{ |
|
"Batch Mean": -4.0568695068359375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.7525, |
|
"step": 301 |
|
}, |
|
{ |
|
"Batch Mean": -3.538299560546875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7525, |
|
"step": 301 |
|
}, |
|
{ |
|
"Batch Mean": -4.1434783935546875, |
|
"accuracy": 0.96875, |
|
"epoch": 0.7525, |
|
"step": 301 |
|
}, |
|
{ |
|
"Batch Mean": -3.912379264831543, |
|
"accuracy": 0.875, |
|
"epoch": 0.7525, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.755, |
|
"grad_norm": 6.381470680236816, |
|
"learning_rate": 7.736842105263159e-07, |
|
"loss": 0.3439, |
|
"step": 302 |
|
}, |
|
{ |
|
"Batch Mean": -3.84674072265625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.755, |
|
"step": 302 |
|
}, |
|
{ |
|
"Batch Mean": -3.388824462890625, |
|
"accuracy": 0.9375, |
|
"epoch": 0.755, |
|
"step": 302 |
|
}, |
|
{ |
|
"Batch Mean": -3.8634033203125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.755, |
|
"step": 302 |
|
}, |
|
{ |
|
"Batch Mean": -3.41217041015625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.755, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7575, |
|
"grad_norm": 7.850198268890381, |
|
"learning_rate": 7.657894736842106e-07, |
|
"loss": 0.4118, |
|
"step": 303 |
|
}, |
|
{ |
|
"Batch Mean": -3.5252227783203125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7575, |
|
"step": 303 |
|
}, |
|
{ |
|
"Batch Mean": -2.8803558349609375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7575, |
|
"step": 303 |
|
}, |
|
{ |
|
"Batch Mean": -3.93988037109375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.7575, |
|
"step": 303 |
|
}, |
|
{ |
|
"Batch Mean": -3.7777557373046875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7575, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 6.541781425476074, |
|
"learning_rate": 7.578947368421053e-07, |
|
"loss": 0.2979, |
|
"step": 304 |
|
}, |
|
{ |
|
"Batch Mean": -3.21771240234375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.76, |
|
"step": 304 |
|
}, |
|
{ |
|
"Batch Mean": -2.7990989685058594, |
|
"accuracy": 0.84375, |
|
"epoch": 0.76, |
|
"step": 304 |
|
}, |
|
{ |
|
"Batch Mean": -2.48345947265625, |
|
"accuracy": 1.0, |
|
"epoch": 0.76, |
|
"step": 304 |
|
}, |
|
{ |
|
"Batch Mean": -2.9448890686035156, |
|
"accuracy": 0.875, |
|
"epoch": 0.76, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7625, |
|
"grad_norm": 6.998548984527588, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.2772, |
|
"step": 305 |
|
}, |
|
{ |
|
"Batch Mean": -3.30224609375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7625, |
|
"step": 305 |
|
}, |
|
{ |
|
"Batch Mean": -4.053798675537109, |
|
"accuracy": 0.65625, |
|
"epoch": 0.7625, |
|
"step": 305 |
|
}, |
|
{ |
|
"Batch Mean": -3.5540008544921875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7625, |
|
"step": 305 |
|
}, |
|
{ |
|
"Batch Mean": -2.8664073944091797, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7625, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.765, |
|
"grad_norm": 7.624053001403809, |
|
"learning_rate": 7.421052631578948e-07, |
|
"loss": 0.4125, |
|
"step": 306 |
|
}, |
|
{ |
|
"Batch Mean": -3.087240219116211, |
|
"accuracy": 0.75, |
|
"epoch": 0.765, |
|
"step": 306 |
|
}, |
|
{ |
|
"Batch Mean": -2.1680727005004883, |
|
"accuracy": 0.71875, |
|
"epoch": 0.765, |
|
"step": 306 |
|
}, |
|
{ |
|
"Batch Mean": -2.568817138671875, |
|
"accuracy": 0.90625, |
|
"epoch": 0.765, |
|
"step": 306 |
|
}, |
|
{ |
|
"Batch Mean": -2.698699951171875, |
|
"accuracy": 0.6875, |
|
"epoch": 0.765, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7675, |
|
"grad_norm": 9.134117126464844, |
|
"learning_rate": 7.342105263157895e-07, |
|
"loss": 0.4755, |
|
"step": 307 |
|
}, |
|
{ |
|
"Batch Mean": -3.3601226806640625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.7675, |
|
"step": 307 |
|
}, |
|
{ |
|
"Batch Mean": -3.3802719116210938, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7675, |
|
"step": 307 |
|
}, |
|
{ |
|
"Batch Mean": -3.57061767578125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7675, |
|
"step": 307 |
|
}, |
|
{ |
|
"Batch Mean": -3.086669921875, |
|
"accuracy": 0.875, |
|
"epoch": 0.7675, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 7.370970249176025, |
|
"learning_rate": 7.263157894736843e-07, |
|
"loss": 0.3724, |
|
"step": 308 |
|
}, |
|
{ |
|
"Batch Mean": -3.758331298828125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.77, |
|
"step": 308 |
|
}, |
|
{ |
|
"Batch Mean": -3.1384429931640625, |
|
"accuracy": 0.875, |
|
"epoch": 0.77, |
|
"step": 308 |
|
}, |
|
{ |
|
"Batch Mean": -3.933807373046875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.77, |
|
"step": 308 |
|
}, |
|
{ |
|
"Batch Mean": -1.7305450439453125, |
|
"accuracy": 0.875, |
|
"epoch": 0.77, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7725, |
|
"grad_norm": 6.031234264373779, |
|
"learning_rate": 7.18421052631579e-07, |
|
"loss": 0.3028, |
|
"step": 309 |
|
}, |
|
{ |
|
"Batch Mean": -2.9827880859375, |
|
"accuracy": 0.875, |
|
"epoch": 0.7725, |
|
"step": 309 |
|
}, |
|
{ |
|
"Batch Mean": -2.880035400390625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7725, |
|
"step": 309 |
|
}, |
|
{ |
|
"Batch Mean": -3.112762451171875, |
|
"accuracy": 0.9375, |
|
"epoch": 0.7725, |
|
"step": 309 |
|
}, |
|
{ |
|
"Batch Mean": -3.1557464599609375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7725, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 6.867088794708252, |
|
"learning_rate": 7.105263157894736e-07, |
|
"loss": 0.3139, |
|
"step": 310 |
|
}, |
|
{ |
|
"Batch Mean": -2.5328445434570312, |
|
"accuracy": 0.90625, |
|
"epoch": 0.775, |
|
"step": 310 |
|
}, |
|
{ |
|
"Batch Mean": -2.6245956420898438, |
|
"accuracy": 0.875, |
|
"epoch": 0.775, |
|
"step": 310 |
|
}, |
|
{ |
|
"Batch Mean": -2.5271682739257812, |
|
"accuracy": 0.78125, |
|
"epoch": 0.775, |
|
"step": 310 |
|
}, |
|
{ |
|
"Batch Mean": -2.7974395751953125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.775, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7775, |
|
"grad_norm": 6.109833717346191, |
|
"learning_rate": 7.026315789473685e-07, |
|
"loss": 0.2965, |
|
"step": 311 |
|
}, |
|
{ |
|
"Batch Mean": -3.696605682373047, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7775, |
|
"step": 311 |
|
}, |
|
{ |
|
"Batch Mean": -2.560495376586914, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7775, |
|
"step": 311 |
|
}, |
|
{ |
|
"Batch Mean": -2.85882568359375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7775, |
|
"step": 311 |
|
}, |
|
{ |
|
"Batch Mean": -3.4066162109375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7775, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 8.343950271606445, |
|
"learning_rate": 6.947368421052631e-07, |
|
"loss": 0.3525, |
|
"step": 312 |
|
}, |
|
{ |
|
"Batch Mean": -3.6371707916259766, |
|
"accuracy": 0.90625, |
|
"epoch": 0.78, |
|
"step": 312 |
|
}, |
|
{ |
|
"Batch Mean": -3.43695068359375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.78, |
|
"step": 312 |
|
}, |
|
{ |
|
"Batch Mean": -2.969696044921875, |
|
"accuracy": 0.875, |
|
"epoch": 0.78, |
|
"step": 312 |
|
}, |
|
{ |
|
"Batch Mean": -2.048126220703125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.78, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7825, |
|
"grad_norm": 7.288078308105469, |
|
"learning_rate": 6.868421052631579e-07, |
|
"loss": 0.3558, |
|
"step": 313 |
|
}, |
|
{ |
|
"Batch Mean": -1.7478790283203125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7825, |
|
"step": 313 |
|
}, |
|
{ |
|
"Batch Mean": -3.667694091796875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7825, |
|
"step": 313 |
|
}, |
|
{ |
|
"Batch Mean": -2.718048095703125, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7825, |
|
"step": 313 |
|
}, |
|
{ |
|
"Batch Mean": -2.8699588775634766, |
|
"accuracy": 0.875, |
|
"epoch": 0.7825, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.785, |
|
"grad_norm": 5.961294174194336, |
|
"learning_rate": 6.789473684210527e-07, |
|
"loss": 0.2646, |
|
"step": 314 |
|
}, |
|
{ |
|
"Batch Mean": -3.25496768951416, |
|
"accuracy": 0.75, |
|
"epoch": 0.785, |
|
"step": 314 |
|
}, |
|
{ |
|
"Batch Mean": -2.7613067626953125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.785, |
|
"step": 314 |
|
}, |
|
{ |
|
"Batch Mean": -2.9433860778808594, |
|
"accuracy": 0.84375, |
|
"epoch": 0.785, |
|
"step": 314 |
|
}, |
|
{ |
|
"Batch Mean": -2.6829519271850586, |
|
"accuracy": 0.875, |
|
"epoch": 0.785, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7875, |
|
"grad_norm": 7.829719066619873, |
|
"learning_rate": 6.710526315789474e-07, |
|
"loss": 0.3657, |
|
"step": 315 |
|
}, |
|
{ |
|
"Batch Mean": -3.2679595947265625, |
|
"accuracy": 0.875, |
|
"epoch": 0.7875, |
|
"step": 315 |
|
}, |
|
{ |
|
"Batch Mean": -2.335235595703125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7875, |
|
"step": 315 |
|
}, |
|
{ |
|
"Batch Mean": -3.3640594482421875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7875, |
|
"step": 315 |
|
}, |
|
{ |
|
"Batch Mean": -2.4273529052734375, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7875, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 7.626279830932617, |
|
"learning_rate": 6.631578947368421e-07, |
|
"loss": 0.3543, |
|
"step": 316 |
|
}, |
|
{ |
|
"Batch Mean": -2.5838470458984375, |
|
"accuracy": 0.9375, |
|
"epoch": 0.79, |
|
"step": 316 |
|
}, |
|
{ |
|
"Batch Mean": -3.27642822265625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.79, |
|
"step": 316 |
|
}, |
|
{ |
|
"Batch Mean": -2.986175537109375, |
|
"accuracy": 0.875, |
|
"epoch": 0.79, |
|
"step": 316 |
|
}, |
|
{ |
|
"Batch Mean": -1.9454727172851562, |
|
"accuracy": 0.875, |
|
"epoch": 0.79, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7925, |
|
"grad_norm": 6.644615650177002, |
|
"learning_rate": 6.552631578947369e-07, |
|
"loss": 0.2865, |
|
"step": 317 |
|
}, |
|
{ |
|
"Batch Mean": -2.931793212890625, |
|
"accuracy": 0.875, |
|
"epoch": 0.7925, |
|
"step": 317 |
|
}, |
|
{ |
|
"Batch Mean": -2.5732688903808594, |
|
"accuracy": 0.875, |
|
"epoch": 0.7925, |
|
"step": 317 |
|
}, |
|
{ |
|
"Batch Mean": -2.371246337890625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7925, |
|
"step": 317 |
|
}, |
|
{ |
|
"Batch Mean": -2.889852523803711, |
|
"accuracy": 0.8125, |
|
"epoch": 0.7925, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.795, |
|
"grad_norm": 7.724899768829346, |
|
"learning_rate": 6.473684210526316e-07, |
|
"loss": 0.3715, |
|
"step": 318 |
|
}, |
|
{ |
|
"Batch Mean": -2.809967041015625, |
|
"accuracy": 0.875, |
|
"epoch": 0.795, |
|
"step": 318 |
|
}, |
|
{ |
|
"Batch Mean": -2.647411346435547, |
|
"accuracy": 0.8125, |
|
"epoch": 0.795, |
|
"step": 318 |
|
}, |
|
{ |
|
"Batch Mean": -3.0574254989624023, |
|
"accuracy": 0.875, |
|
"epoch": 0.795, |
|
"step": 318 |
|
}, |
|
{ |
|
"Batch Mean": -2.959758758544922, |
|
"accuracy": 0.875, |
|
"epoch": 0.795, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7975, |
|
"grad_norm": 6.657589912414551, |
|
"learning_rate": 6.394736842105264e-07, |
|
"loss": 0.329, |
|
"step": 319 |
|
}, |
|
{ |
|
"Batch Mean": -2.9253692626953125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7975, |
|
"step": 319 |
|
}, |
|
{ |
|
"Batch Mean": -3.6897506713867188, |
|
"accuracy": 0.875, |
|
"epoch": 0.7975, |
|
"step": 319 |
|
}, |
|
{ |
|
"Batch Mean": -2.2700228691101074, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7975, |
|
"step": 319 |
|
}, |
|
{ |
|
"Batch Mean": -2.148712158203125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.7975, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.154412746429443, |
|
"learning_rate": 6.31578947368421e-07, |
|
"loss": 0.3156, |
|
"step": 320 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 80, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|