|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.25, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"Batch Mean": 2.61431884765625, |
|
"accuracy": 0.4765625, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 42.59496307373047, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": 0.8092, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": 2.574005126953125, |
|
"accuracy": 0.578125, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 46.709930419921875, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": 0.7799, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": 2.560516357421875, |
|
"accuracy": 0.484375, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 44.51314163208008, |
|
"learning_rate": 4.5e-07, |
|
"loss": 0.8043, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": 2.6197509765625, |
|
"accuracy": 0.5, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 42.86121368408203, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.8014, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": 2.565338134765625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 44.409908294677734, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.7983, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": 2.522857666015625, |
|
"accuracy": 0.484375, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 43.77534484863281, |
|
"learning_rate": 9e-07, |
|
"loss": 0.7975, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": 2.3875732421875, |
|
"accuracy": 0.4921875, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 41.981407165527344, |
|
"learning_rate": 1.05e-06, |
|
"loss": 0.7951, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": 2.317047119140625, |
|
"accuracy": 0.515625, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 40.53751754760742, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.791, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": 1.757080078125, |
|
"accuracy": 0.5078125, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 31.105939865112305, |
|
"learning_rate": 1.35e-06, |
|
"loss": 0.7571, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": 1.6220855712890625, |
|
"accuracy": 0.578125, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 33.61161804199219, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.7234, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": 1.400146484375, |
|
"accuracy": 0.53125, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 34.554622650146484, |
|
"learning_rate": 1.65e-06, |
|
"loss": 0.725, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -0.43697381019592285, |
|
"accuracy": 0.5234375, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 12.222326278686523, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.6986, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -0.8854951858520508, |
|
"accuracy": 0.5234375, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 16.398786544799805, |
|
"learning_rate": 1.95e-06, |
|
"loss": 0.7081, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -0.9508838653564453, |
|
"accuracy": 0.640625, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 19.590002059936523, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.6784, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -1.0560526847839355, |
|
"accuracy": 0.4921875, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 22.669742584228516, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.7235, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -1.1405725479125977, |
|
"accuracy": 0.625, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 22.189626693725586, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.6714, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -1.1522831916809082, |
|
"accuracy": 0.640625, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 19.9060115814209, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.6757, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -1.1666946411132812, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 23.661216735839844, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.7002, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -0.999359130859375, |
|
"accuracy": 0.6640625, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 15.433859825134277, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.6471, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": -0.6625549793243408, |
|
"accuracy": 0.6171875, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 13.135390281677246, |
|
"learning_rate": 3e-06, |
|
"loss": 0.662, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": -0.13487493991851807, |
|
"accuracy": 0.6171875, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 7.336737632751465, |
|
"learning_rate": 2.992105263157895e-06, |
|
"loss": 0.629, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": 0.2938040494918823, |
|
"accuracy": 0.6640625, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 9.586155891418457, |
|
"learning_rate": 2.9842105263157896e-06, |
|
"loss": 0.6225, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": 0.6780391931533813, |
|
"accuracy": 0.7421875, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 14.825817108154297, |
|
"learning_rate": 2.9763157894736843e-06, |
|
"loss": 0.607, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": 0.9421095848083496, |
|
"accuracy": 0.609375, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 16.7529354095459, |
|
"learning_rate": 2.968421052631579e-06, |
|
"loss": 0.6542, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": 1.0722179412841797, |
|
"accuracy": 0.71875, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 18.940032958984375, |
|
"learning_rate": 2.960526315789474e-06, |
|
"loss": 0.5882, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": 0.8396664261817932, |
|
"accuracy": 0.6484375, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 14.680763244628906, |
|
"learning_rate": 2.9526315789473685e-06, |
|
"loss": 0.6463, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": 0.2986793518066406, |
|
"accuracy": 0.671875, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 7.3362345695495605, |
|
"learning_rate": 2.9447368421052633e-06, |
|
"loss": 0.5923, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -0.29196763038635254, |
|
"accuracy": 0.734375, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.053507804870605, |
|
"learning_rate": 2.936842105263158e-06, |
|
"loss": 0.5261, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -0.8161113262176514, |
|
"accuracy": 0.7109375, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 14.036270141601562, |
|
"learning_rate": 2.9289473684210528e-06, |
|
"loss": 0.5482, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -0.7824737429618835, |
|
"accuracy": 0.75, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 13.984892845153809, |
|
"learning_rate": 2.9210526315789475e-06, |
|
"loss": 0.5939, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -0.4927825927734375, |
|
"accuracy": 0.734375, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 10.751565933227539, |
|
"learning_rate": 2.9131578947368423e-06, |
|
"loss": 0.5901, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": 0.15497040748596191, |
|
"accuracy": 0.6796875, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.30102252960205, |
|
"learning_rate": 2.905263157894737e-06, |
|
"loss": 0.5731, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": 0.16421844065189362, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 9.087366104125977, |
|
"learning_rate": 2.8973684210526318e-06, |
|
"loss": 0.4958, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": 0.3596491813659668, |
|
"accuracy": 0.859375, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 9.984371185302734, |
|
"learning_rate": 2.8894736842105265e-06, |
|
"loss": 0.4321, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": 0.2128266543149948, |
|
"accuracy": 0.734375, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 7.400077819824219, |
|
"learning_rate": 2.8815789473684213e-06, |
|
"loss": 0.5178, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": 0.10449030995368958, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.669457912445068, |
|
"learning_rate": 2.873684210526316e-06, |
|
"loss": 0.4914, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": -0.24382781982421875, |
|
"accuracy": 0.6953125, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 10.831747055053711, |
|
"learning_rate": 2.8657894736842103e-06, |
|
"loss": 0.5305, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": -0.29154396057128906, |
|
"accuracy": 0.7109375, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 11.060967445373535, |
|
"learning_rate": 2.857894736842105e-06, |
|
"loss": 0.5589, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": -0.07830595970153809, |
|
"accuracy": 0.765625, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 8.426444053649902, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.4663, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": -0.09856069087982178, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 11.046717643737793, |
|
"learning_rate": 2.8421052631578946e-06, |
|
"loss": 0.455, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": 0.29904642701148987, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 11.232060432434082, |
|
"learning_rate": 2.8342105263157897e-06, |
|
"loss": 0.437, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": 0.43527090549468994, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 12.555906295776367, |
|
"learning_rate": 2.8263157894736845e-06, |
|
"loss": 0.5316, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": 0.02639901638031006, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 9.500948905944824, |
|
"learning_rate": 2.8184210526315792e-06, |
|
"loss": 0.4874, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": -0.3524761199951172, |
|
"accuracy": 0.7421875, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 10.417457580566406, |
|
"learning_rate": 2.810526315789474e-06, |
|
"loss": 0.5011, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": -0.32391130924224854, |
|
"accuracy": 0.8125, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 9.867128372192383, |
|
"learning_rate": 2.8026315789473687e-06, |
|
"loss": 0.4908, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": -0.42060422897338867, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 10.613035202026367, |
|
"learning_rate": 2.7947368421052635e-06, |
|
"loss": 0.4642, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": 0.04145359992980957, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 12.13752555847168, |
|
"learning_rate": 2.7868421052631578e-06, |
|
"loss": 0.4943, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": 0.24553179740905762, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 11.048128128051758, |
|
"learning_rate": 2.7789473684210525e-06, |
|
"loss": 0.3946, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": -0.05052506923675537, |
|
"accuracy": 0.796875, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 9.053565979003906, |
|
"learning_rate": 2.7710526315789473e-06, |
|
"loss": 0.4474, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -0.43336963653564453, |
|
"accuracy": 0.7265625, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 15.04668140411377, |
|
"learning_rate": 2.763157894736842e-06, |
|
"loss": 0.5496, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -0.29952335357666016, |
|
"accuracy": 0.765625, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 11.302435874938965, |
|
"learning_rate": 2.7552631578947368e-06, |
|
"loss": 0.4337, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -0.01873302459716797, |
|
"accuracy": 0.828125, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 10.582301139831543, |
|
"learning_rate": 2.7473684210526315e-06, |
|
"loss": 0.4073, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": 0.4117751717567444, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 9.776220321655273, |
|
"learning_rate": 2.7394736842105263e-06, |
|
"loss": 0.4235, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": 0.5990171432495117, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 12.647154808044434, |
|
"learning_rate": 2.7315789473684214e-06, |
|
"loss": 0.4858, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": 0.2538492679595947, |
|
"accuracy": 0.7265625, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 11.564103126525879, |
|
"learning_rate": 2.723684210526316e-06, |
|
"loss": 0.5233, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": 0.17059040069580078, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 10.589515686035156, |
|
"learning_rate": 2.715789473684211e-06, |
|
"loss": 0.4762, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -0.3642357587814331, |
|
"accuracy": 0.78125, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 10.727502822875977, |
|
"learning_rate": 2.7078947368421052e-06, |
|
"loss": 0.5289, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -0.5980481505393982, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 8.915611267089844, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.4038, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -0.46013855934143066, |
|
"accuracy": 0.859375, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 9.178686141967773, |
|
"learning_rate": 2.6921052631578947e-06, |
|
"loss": 0.3708, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -0.1478586494922638, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.339688301086426, |
|
"learning_rate": 2.6842105263157895e-06, |
|
"loss": 0.4036, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": 0.003389716148376465, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 6.763300895690918, |
|
"learning_rate": 2.6763157894736842e-06, |
|
"loss": 0.5163, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": 0.4018087387084961, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 8.631525039672852, |
|
"learning_rate": 2.668421052631579e-06, |
|
"loss": 0.4219, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": 0.342004656791687, |
|
"accuracy": 0.734375, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 9.190362930297852, |
|
"learning_rate": 2.6605263157894737e-06, |
|
"loss": 0.4708, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": 0.5178697109222412, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 10.207452774047852, |
|
"learning_rate": 2.6526315789473685e-06, |
|
"loss": 0.4458, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": 0.03012150526046753, |
|
"accuracy": 0.8125, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 8.050507545471191, |
|
"learning_rate": 2.644736842105263e-06, |
|
"loss": 0.4344, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -0.3691895604133606, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 11.163555145263672, |
|
"learning_rate": 2.636842105263158e-06, |
|
"loss": 0.4276, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -0.2624788284301758, |
|
"accuracy": 0.8125, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 9.523059844970703, |
|
"learning_rate": 2.6289473684210527e-06, |
|
"loss": 0.4001, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -0.0055138468742370605, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.094910621643066, |
|
"learning_rate": 2.6210526315789474e-06, |
|
"loss": 0.4024, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": 0.2572704553604126, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 10.00645923614502, |
|
"learning_rate": 2.613157894736842e-06, |
|
"loss": 0.4802, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": 0.5112218856811523, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 11.776649475097656, |
|
"learning_rate": 2.605263157894737e-06, |
|
"loss": 0.4194, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": 0.29661768674850464, |
|
"accuracy": 0.8125, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 9.701162338256836, |
|
"learning_rate": 2.5973684210526317e-06, |
|
"loss": 0.4356, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": 0.049837589263916016, |
|
"accuracy": 0.859375, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 8.692646980285645, |
|
"learning_rate": 2.5894736842105264e-06, |
|
"loss": 0.2881, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -0.3786022663116455, |
|
"accuracy": 0.8125, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 10.834145545959473, |
|
"learning_rate": 2.581578947368421e-06, |
|
"loss": 0.4654, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -0.20977401733398438, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 7.816598892211914, |
|
"learning_rate": 2.573684210526316e-06, |
|
"loss": 0.4101, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -0.6599991321563721, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 13.089007377624512, |
|
"learning_rate": 2.5657894736842107e-06, |
|
"loss": 0.427, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": -0.37617337703704834, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 12.075210571289062, |
|
"learning_rate": 2.5578947368421054e-06, |
|
"loss": 0.4797, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": 0.1281442642211914, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 9.70462703704834, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.4425, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": 0.6071650981903076, |
|
"accuracy": 0.828125, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 14.395353317260742, |
|
"learning_rate": 2.542105263157895e-06, |
|
"loss": 0.4016, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": 0.34730714559555054, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 9.122536659240723, |
|
"learning_rate": 2.5342105263157892e-06, |
|
"loss": 0.3994, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": 0.36483922600746155, |
|
"accuracy": 0.828125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.289307594299316, |
|
"learning_rate": 2.526315789473684e-06, |
|
"loss": 0.368, |
|
"step": 80 |
|
}, |
|
{ |
|
"Batch Mean": -0.20039799809455872, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 7.813342094421387, |
|
"learning_rate": 2.5184210526315787e-06, |
|
"loss": 0.3679, |
|
"step": 81 |
|
}, |
|
{ |
|
"Batch Mean": -0.1629079282283783, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.2025, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 7.504952430725098, |
|
"learning_rate": 2.510526315789474e-06, |
|
"loss": 0.3823, |
|
"step": 82 |
|
}, |
|
{ |
|
"Batch Mean": -0.07863587141036987, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.205, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 7.721461296081543, |
|
"learning_rate": 2.5026315789473686e-06, |
|
"loss": 0.3967, |
|
"step": 83 |
|
}, |
|
{ |
|
"Batch Mean": 0.17194491624832153, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2075, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 8.062063217163086, |
|
"learning_rate": 2.4947368421052634e-06, |
|
"loss": 0.4195, |
|
"step": 84 |
|
}, |
|
{ |
|
"Batch Mean": 0.11748838424682617, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.21, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 8.320355415344238, |
|
"learning_rate": 2.486842105263158e-06, |
|
"loss": 0.3864, |
|
"step": 85 |
|
}, |
|
{ |
|
"Batch Mean": -0.40801382064819336, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.2125, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 14.8066987991333, |
|
"learning_rate": 2.478947368421053e-06, |
|
"loss": 0.4639, |
|
"step": 86 |
|
}, |
|
{ |
|
"Batch Mean": -0.35811758041381836, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.215, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 14.074177742004395, |
|
"learning_rate": 2.4710526315789476e-06, |
|
"loss": 0.4999, |
|
"step": 87 |
|
}, |
|
{ |
|
"Batch Mean": 0.278584361076355, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.2175, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 10.59004020690918, |
|
"learning_rate": 2.4631578947368424e-06, |
|
"loss": 0.404, |
|
"step": 88 |
|
}, |
|
{ |
|
"Batch Mean": -0.21054387092590332, |
|
"accuracy": 0.84375, |
|
"epoch": 0.22, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 8.488351821899414, |
|
"learning_rate": 2.4552631578947367e-06, |
|
"loss": 0.3517, |
|
"step": 89 |
|
}, |
|
{ |
|
"Batch Mean": -0.22967231273651123, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2225, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 10.347502708435059, |
|
"learning_rate": 2.4473684210526314e-06, |
|
"loss": 0.3382, |
|
"step": 90 |
|
}, |
|
{ |
|
"Batch Mean": 0.1892259418964386, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.225, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2275, |
|
"grad_norm": 10.092534065246582, |
|
"learning_rate": 2.439473684210526e-06, |
|
"loss": 0.4586, |
|
"step": 91 |
|
}, |
|
{ |
|
"Batch Mean": 0.516247570514679, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.2275, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 13.944212913513184, |
|
"learning_rate": 2.431578947368421e-06, |
|
"loss": 0.3067, |
|
"step": 92 |
|
}, |
|
{ |
|
"Batch Mean": 0.056681275367736816, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.23, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2325, |
|
"grad_norm": 8.53646183013916, |
|
"learning_rate": 2.4236842105263157e-06, |
|
"loss": 0.3946, |
|
"step": 93 |
|
}, |
|
{ |
|
"Batch Mean": -0.47129684686660767, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.2325, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 11.441537857055664, |
|
"learning_rate": 2.4157894736842104e-06, |
|
"loss": 0.3891, |
|
"step": 94 |
|
}, |
|
{ |
|
"Batch Mean": -0.3262195587158203, |
|
"accuracy": 0.84375, |
|
"epoch": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 10.751629829406738, |
|
"learning_rate": 2.4078947368421056e-06, |
|
"loss": 0.3578, |
|
"step": 95 |
|
}, |
|
{ |
|
"Batch Mean": -0.6593484878540039, |
|
"accuracy": 0.828125, |
|
"epoch": 0.2375, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 15.914350509643555, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.4204, |
|
"step": 96 |
|
}, |
|
{ |
|
"Batch Mean": -0.32147085666656494, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.24, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2425, |
|
"grad_norm": 11.814691543579102, |
|
"learning_rate": 2.392105263157895e-06, |
|
"loss": 0.3892, |
|
"step": 97 |
|
}, |
|
{ |
|
"Batch Mean": 0.49291136860847473, |
|
"accuracy": 0.875, |
|
"epoch": 0.2425, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 11.86365795135498, |
|
"learning_rate": 2.38421052631579e-06, |
|
"loss": 0.3264, |
|
"step": 98 |
|
}, |
|
{ |
|
"Batch Mean": 0.49174070358276367, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.245, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2475, |
|
"grad_norm": 10.874544143676758, |
|
"learning_rate": 2.376315789473684e-06, |
|
"loss": 0.3847, |
|
"step": 99 |
|
}, |
|
{ |
|
"Batch Mean": 0.6919900178909302, |
|
"accuracy": 0.796875, |
|
"epoch": 0.2475, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 15.105185508728027, |
|
"learning_rate": 2.368421052631579e-06, |
|
"loss": 0.4066, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|