|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.427528675703858, |
|
"global_step": 100000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.0, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 1.0000000000000001e-07, |
|
"loss": 3.5777, |
|
"max_norm": 11.326861381530762, |
|
"max_norm/layer0": 11.326861381530762, |
|
"mean_norm": 7.967035204172134, |
|
"mean_norm/layer0": 7.967035204172134, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.1, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0001, |
|
"loss": 2.2319, |
|
"max_norm": 11.350375175476074, |
|
"max_norm/layer0": 11.350375175476074, |
|
"mean_norm": 8.056939780712128, |
|
"mean_norm/layer0": 8.056939780712128, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5317416829745597, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.9134443998336792, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.034, |
|
"eval_samples_per_second": 96.716, |
|
"eval_steps_per_second": 1.934, |
|
"step": 1000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.21, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8521, |
|
"max_norm": 11.451685905456543, |
|
"max_norm/layer0": 11.451685905456543, |
|
"mean_norm": 8.345297634601593, |
|
"mean_norm/layer0": 8.345297634601593, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5495499021526419, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.798967719078064, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0243, |
|
"eval_samples_per_second": 97.629, |
|
"eval_steps_per_second": 1.953, |
|
"step": 2000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.31, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003, |
|
"loss": 1.7879, |
|
"max_norm": 12.298419952392578, |
|
"max_norm/layer0": 12.298419952392578, |
|
"mean_norm": 8.877436935901642, |
|
"mean_norm/layer0": 8.877436935901642, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5556751467710371, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7738969326019287, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0304, |
|
"eval_samples_per_second": 97.052, |
|
"eval_steps_per_second": 1.941, |
|
"step": 3000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.42, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004, |
|
"loss": 1.7728, |
|
"max_norm": 13.492940902709961, |
|
"max_norm/layer0": 13.492940902709961, |
|
"mean_norm": 9.665014863014221, |
|
"mean_norm/layer0": 9.665014863014221, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5564187866927593, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7666442394256592, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0276, |
|
"eval_samples_per_second": 97.317, |
|
"eval_steps_per_second": 1.946, |
|
"step": 4000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.52, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7686, |
|
"max_norm": 15.771540641784668, |
|
"max_norm/layer0": 15.771540641784668, |
|
"mean_norm": 10.713956594467163, |
|
"mean_norm/layer0": 10.713956594467163, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5594520547945205, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7608648538589478, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0276, |
|
"eval_samples_per_second": 97.311, |
|
"eval_steps_per_second": 1.946, |
|
"step": 5000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.63, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004947368421052632, |
|
"loss": 1.7635, |
|
"max_norm": 18.436635971069336, |
|
"max_norm/layer0": 18.436635971069336, |
|
"mean_norm": 11.895205318927765, |
|
"mean_norm/layer0": 11.895205318927765, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5597651663405088, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7554824352264404, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0292, |
|
"eval_samples_per_second": 97.159, |
|
"eval_steps_per_second": 1.943, |
|
"step": 6000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.73, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004894736842105264, |
|
"loss": 1.7523, |
|
"max_norm": 21.048765182495117, |
|
"max_norm/layer0": 21.048765182495117, |
|
"mean_norm": 13.066829144954681, |
|
"mean_norm/layer0": 13.066829144954681, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5631506849315069, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.738256812095642, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0284, |
|
"eval_samples_per_second": 97.237, |
|
"eval_steps_per_second": 1.945, |
|
"step": 7000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.83, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004842105263157895, |
|
"loss": 1.7471, |
|
"max_norm": 23.563966751098633, |
|
"max_norm/layer0": 23.563966751098633, |
|
"mean_norm": 14.23173063993454, |
|
"mean_norm/layer0": 14.23173063993454, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5643248532289629, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7368353605270386, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0291, |
|
"eval_samples_per_second": 97.172, |
|
"eval_steps_per_second": 1.943, |
|
"step": 8000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 0.94, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00047894736842105264, |
|
"loss": 1.7404, |
|
"max_norm": 26.197235107421875, |
|
"max_norm/layer0": 26.197235107421875, |
|
"mean_norm": 15.391284584999084, |
|
"mean_norm/layer0": 15.391284584999084, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5658904109589041, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7276737689971924, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.029, |
|
"eval_samples_per_second": 97.186, |
|
"eval_steps_per_second": 1.944, |
|
"step": 9000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.04, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00047368421052631577, |
|
"loss": 1.728, |
|
"max_norm": 28.83698272705078, |
|
"max_norm/layer0": 28.83698272705078, |
|
"mean_norm": 16.548602163791656, |
|
"mean_norm/layer0": 16.548602163791656, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5647358121330724, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7289787530899048, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0306, |
|
"eval_samples_per_second": 97.031, |
|
"eval_steps_per_second": 1.941, |
|
"step": 10000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.15, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00046842105263157895, |
|
"loss": 1.7195, |
|
"max_norm": 31.372026443481445, |
|
"max_norm/layer0": 31.372026443481445, |
|
"mean_norm": 17.70964866876602, |
|
"mean_norm/layer0": 17.70964866876602, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5667318982387476, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7244290113449097, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0293, |
|
"eval_samples_per_second": 97.153, |
|
"eval_steps_per_second": 1.943, |
|
"step": 11000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.25, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00046315789473684214, |
|
"loss": 1.7198, |
|
"max_norm": 33.9889030456543, |
|
"max_norm/layer0": 33.9889030456543, |
|
"mean_norm": 18.86298167705536, |
|
"mean_norm/layer0": 18.86298167705536, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5671037181996086, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7230280637741089, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0314, |
|
"eval_samples_per_second": 96.953, |
|
"eval_steps_per_second": 1.939, |
|
"step": 12000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.36, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00045789473684210527, |
|
"loss": 1.7171, |
|
"max_norm": 36.61670684814453, |
|
"max_norm/layer0": 36.61670684814453, |
|
"mean_norm": 20.012963116168976, |
|
"mean_norm/layer0": 20.012963116168976, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5689432485322896, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7177398204803467, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0289, |
|
"eval_samples_per_second": 97.193, |
|
"eval_steps_per_second": 1.944, |
|
"step": 13000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.46, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00045263157894736845, |
|
"loss": 1.7185, |
|
"max_norm": 39.28107452392578, |
|
"max_norm/layer0": 39.28107452392578, |
|
"mean_norm": 21.156790494918823, |
|
"mean_norm/layer0": 21.156790494918823, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5688258317025441, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.715006709098816, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0296, |
|
"eval_samples_per_second": 97.126, |
|
"eval_steps_per_second": 1.943, |
|
"step": 14000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.56, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004473684210526316, |
|
"loss": 1.7149, |
|
"max_norm": 41.809288024902344, |
|
"max_norm/layer0": 41.809288024902344, |
|
"mean_norm": 22.29547154903412, |
|
"mean_norm/layer0": 22.29547154903412, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5694520547945205, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7125060558319092, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0282, |
|
"eval_samples_per_second": 97.259, |
|
"eval_steps_per_second": 1.945, |
|
"step": 15000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.67, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004421052631578947, |
|
"loss": 1.7105, |
|
"max_norm": 44.21694564819336, |
|
"max_norm/layer0": 44.21694564819336, |
|
"mean_norm": 23.42733907699585, |
|
"mean_norm/layer0": 23.42733907699585, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5695303326810176, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.709671139717102, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0286, |
|
"eval_samples_per_second": 97.217, |
|
"eval_steps_per_second": 1.944, |
|
"step": 16000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.77, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00043684210526315795, |
|
"loss": 1.7107, |
|
"max_norm": 46.647300720214844, |
|
"max_norm/layer0": 46.647300720214844, |
|
"mean_norm": 24.55408787727356, |
|
"mean_norm/layer0": 24.55408787727356, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5689236790606653, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7072749137878418, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0377, |
|
"eval_samples_per_second": 96.366, |
|
"eval_steps_per_second": 1.927, |
|
"step": 17000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.88, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004315789473684211, |
|
"loss": 1.7113, |
|
"max_norm": 49.0349235534668, |
|
"max_norm/layer0": 49.0349235534668, |
|
"mean_norm": 25.673280954360962, |
|
"mean_norm/layer0": 25.673280954360962, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5711937377690802, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7024654150009155, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.031, |
|
"eval_samples_per_second": 96.991, |
|
"eval_steps_per_second": 1.94, |
|
"step": 18000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 1.98, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004263157894736842, |
|
"loss": 1.7078, |
|
"max_norm": 51.330352783203125, |
|
"max_norm/layer0": 51.330352783203125, |
|
"mean_norm": 26.78341281414032, |
|
"mean_norm/layer0": 26.78341281414032, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5701956947162427, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7047913074493408, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0324, |
|
"eval_samples_per_second": 96.861, |
|
"eval_steps_per_second": 1.937, |
|
"step": 19000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.09, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00042105263157894734, |
|
"loss": 1.693, |
|
"max_norm": 53.77473449707031, |
|
"max_norm/layer0": 53.77473449707031, |
|
"mean_norm": 27.891030192375183, |
|
"mean_norm/layer0": 27.891030192375183, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5695694716242662, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7044708728790283, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0296, |
|
"eval_samples_per_second": 97.121, |
|
"eval_steps_per_second": 1.942, |
|
"step": 20000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.19, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004157894736842106, |
|
"loss": 1.6935, |
|
"max_norm": 56.13137435913086, |
|
"max_norm/layer0": 56.13137435913086, |
|
"mean_norm": 28.993399620056152, |
|
"mean_norm/layer0": 28.993399620056152, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5694716242661448, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.706821322441101, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.03, |
|
"eval_samples_per_second": 97.083, |
|
"eval_steps_per_second": 1.942, |
|
"step": 21000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.29, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004105263157894737, |
|
"loss": 1.6962, |
|
"max_norm": 58.38813018798828, |
|
"max_norm/layer0": 58.38813018798828, |
|
"mean_norm": 30.087660908699036, |
|
"mean_norm/layer0": 30.087660908699036, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5687475538160469, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7046499252319336, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0302, |
|
"eval_samples_per_second": 97.072, |
|
"eval_steps_per_second": 1.941, |
|
"step": 22000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.4, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00040526315789473684, |
|
"loss": 1.6954, |
|
"max_norm": 60.607887268066406, |
|
"max_norm/layer0": 60.607887268066406, |
|
"mean_norm": 31.172435641288757, |
|
"mean_norm/layer0": 31.172435641288757, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5706457925636008, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.7018758058547974, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0307, |
|
"eval_samples_per_second": 97.019, |
|
"eval_steps_per_second": 1.94, |
|
"step": 23000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.5, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0004, |
|
"loss": 1.6933, |
|
"max_norm": 62.92869186401367, |
|
"max_norm/layer0": 62.92869186401367, |
|
"mean_norm": 32.24555063247681, |
|
"mean_norm/layer0": 32.24555063247681, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5725440313111546, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.700171947479248, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0287, |
|
"eval_samples_per_second": 97.209, |
|
"eval_steps_per_second": 1.944, |
|
"step": 24000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.61, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00039473684210526315, |
|
"loss": 1.6942, |
|
"max_norm": 65.25504302978516, |
|
"max_norm/layer0": 65.25504302978516, |
|
"mean_norm": 33.31111395359039, |
|
"mean_norm/layer0": 33.31111395359039, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5717221135029354, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6983325481414795, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0305, |
|
"eval_samples_per_second": 97.039, |
|
"eval_steps_per_second": 1.941, |
|
"step": 25000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.71, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00038947368421052633, |
|
"loss": 1.6935, |
|
"max_norm": 67.45101928710938, |
|
"max_norm/layer0": 67.45101928710938, |
|
"mean_norm": 34.36543405056, |
|
"mean_norm/layer0": 34.36543405056, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.573013698630137, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6938215494155884, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.031, |
|
"eval_samples_per_second": 96.993, |
|
"eval_steps_per_second": 1.94, |
|
"step": 26000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.82, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00038421052631578946, |
|
"loss": 1.6928, |
|
"max_norm": 69.7523193359375, |
|
"max_norm/layer0": 69.7523193359375, |
|
"mean_norm": 35.40895915031433, |
|
"mean_norm/layer0": 35.40895915031433, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5718786692759296, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6977686882019043, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0294, |
|
"eval_samples_per_second": 97.146, |
|
"eval_steps_per_second": 1.943, |
|
"step": 27000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 2.92, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00037894736842105265, |
|
"loss": 1.6927, |
|
"max_norm": 71.84566497802734, |
|
"max_norm/layer0": 71.84566497802734, |
|
"mean_norm": 36.44334518909454, |
|
"mean_norm/layer0": 36.44334518909454, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.571545988258317, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6934936046600342, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0309, |
|
"eval_samples_per_second": 97.006, |
|
"eval_steps_per_second": 1.94, |
|
"step": 28000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.02, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003736842105263158, |
|
"loss": 1.6855, |
|
"max_norm": 73.88700866699219, |
|
"max_norm/layer0": 73.88700866699219, |
|
"mean_norm": 37.46485388278961, |
|
"mean_norm/layer0": 37.46485388278961, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5726027397260274, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.69780695438385, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0312, |
|
"eval_samples_per_second": 96.977, |
|
"eval_steps_per_second": 1.94, |
|
"step": 29000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.13, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00036842105263157896, |
|
"loss": 1.6773, |
|
"max_norm": 76.05863189697266, |
|
"max_norm/layer0": 76.05863189697266, |
|
"mean_norm": 38.48086929321289, |
|
"mean_norm/layer0": 38.48086929321289, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5731898238747554, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6950737237930298, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0321, |
|
"eval_samples_per_second": 96.894, |
|
"eval_steps_per_second": 1.938, |
|
"step": 30000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.23, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00036315789473684214, |
|
"loss": 1.6788, |
|
"max_norm": 78.17182922363281, |
|
"max_norm/layer0": 78.17182922363281, |
|
"mean_norm": 39.482818245887756, |
|
"mean_norm/layer0": 39.482818245887756, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5728375733855186, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6925665140151978, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0261, |
|
"eval_samples_per_second": 97.46, |
|
"eval_steps_per_second": 1.949, |
|
"step": 31000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.34, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003578947368421053, |
|
"loss": 1.6813, |
|
"max_norm": 80.1740493774414, |
|
"max_norm/layer0": 80.1740493774414, |
|
"mean_norm": 40.47369468212128, |
|
"mean_norm/layer0": 40.47369468212128, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.572641878669276, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6919567584991455, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0325, |
|
"eval_samples_per_second": 96.851, |
|
"eval_steps_per_second": 1.937, |
|
"step": 32000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.44, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003526315789473684, |
|
"loss": 1.6782, |
|
"max_norm": 82.25338745117188, |
|
"max_norm/layer0": 82.25338745117188, |
|
"mean_norm": 41.45106363296509, |
|
"mean_norm/layer0": 41.45106363296509, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5732681017612524, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6926295757293701, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0268, |
|
"eval_samples_per_second": 97.393, |
|
"eval_steps_per_second": 1.948, |
|
"step": 33000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.55, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003473684210526316, |
|
"loss": 1.6801, |
|
"max_norm": 84.2289047241211, |
|
"max_norm/layer0": 84.2289047241211, |
|
"mean_norm": 42.41705143451691, |
|
"mean_norm/layer0": 42.41705143451691, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5718590998043053, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6893627643585205, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0265, |
|
"eval_samples_per_second": 97.419, |
|
"eval_steps_per_second": 1.948, |
|
"step": 34000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.65, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00034210526315789477, |
|
"loss": 1.6796, |
|
"max_norm": 86.21290588378906, |
|
"max_norm/layer0": 86.21290588378906, |
|
"mean_norm": 43.37204849720001, |
|
"mean_norm/layer0": 43.37204849720001, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5727788649706458, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.688981294631958, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0277, |
|
"eval_samples_per_second": 97.302, |
|
"eval_steps_per_second": 1.946, |
|
"step": 35000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.75, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003368421052631579, |
|
"loss": 1.6768, |
|
"max_norm": 88.06742095947266, |
|
"max_norm/layer0": 88.06742095947266, |
|
"mean_norm": 44.314213514328, |
|
"mean_norm/layer0": 44.314213514328, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5721722113502935, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6881800889968872, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0296, |
|
"eval_samples_per_second": 97.122, |
|
"eval_steps_per_second": 1.942, |
|
"step": 36000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.86, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00033157894736842103, |
|
"loss": 1.6802, |
|
"max_norm": 89.83356475830078, |
|
"max_norm/layer0": 89.83356475830078, |
|
"mean_norm": 45.24382555484772, |
|
"mean_norm/layer0": 45.24382555484772, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.573228962818004, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6871685981750488, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0305, |
|
"eval_samples_per_second": 97.04, |
|
"eval_steps_per_second": 1.941, |
|
"step": 37000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 3.96, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003263157894736842, |
|
"loss": 1.6809, |
|
"max_norm": 91.73099517822266, |
|
"max_norm/layer0": 91.73099517822266, |
|
"mean_norm": 46.16047787666321, |
|
"mean_norm/layer0": 46.16047787666321, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5750097847358121, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.685491681098938, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0263, |
|
"eval_samples_per_second": 97.433, |
|
"eval_steps_per_second": 1.949, |
|
"step": 38000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.07, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003210526315789474, |
|
"loss": 1.6701, |
|
"max_norm": 93.56365966796875, |
|
"max_norm/layer0": 93.56365966796875, |
|
"mean_norm": 47.06712102890015, |
|
"mean_norm/layer0": 47.06712102890015, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5742074363992172, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6885604858398438, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0277, |
|
"eval_samples_per_second": 97.308, |
|
"eval_steps_per_second": 1.946, |
|
"step": 39000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.17, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00031578947368421053, |
|
"loss": 1.6646, |
|
"max_norm": 95.40019226074219, |
|
"max_norm/layer0": 95.40019226074219, |
|
"mean_norm": 47.9597909450531, |
|
"mean_norm/layer0": 47.9597909450531, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5734442270058708, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6889522075653076, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0311, |
|
"eval_samples_per_second": 96.985, |
|
"eval_steps_per_second": 1.94, |
|
"step": 40000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.28, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003105263157894737, |
|
"loss": 1.669, |
|
"max_norm": 97.21604919433594, |
|
"max_norm/layer0": 97.21604919433594, |
|
"mean_norm": 48.839876651763916, |
|
"mean_norm/layer0": 48.839876651763916, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5746771037181996, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6858941316604614, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0321, |
|
"eval_samples_per_second": 96.889, |
|
"eval_steps_per_second": 1.938, |
|
"step": 41000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.38, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00030526315789473684, |
|
"loss": 1.6713, |
|
"max_norm": 99.00255584716797, |
|
"max_norm/layer0": 99.00255584716797, |
|
"mean_norm": 49.7055082321167, |
|
"mean_norm/layer0": 49.7055082321167, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5739726027397261, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.686662197113037, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0316, |
|
"eval_samples_per_second": 96.933, |
|
"eval_steps_per_second": 1.939, |
|
"step": 42000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.48, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0003, |
|
"loss": 1.6693, |
|
"max_norm": 100.78987884521484, |
|
"max_norm/layer0": 100.78987884521484, |
|
"mean_norm": 50.558117628097534, |
|
"mean_norm/layer0": 50.558117628097534, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5749510763209393, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.682096004486084, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0324, |
|
"eval_samples_per_second": 96.862, |
|
"eval_steps_per_second": 1.937, |
|
"step": 43000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.59, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00029473684210526316, |
|
"loss": 1.6693, |
|
"max_norm": 102.48609161376953, |
|
"max_norm/layer0": 102.48609161376953, |
|
"mean_norm": 51.397204637527466, |
|
"mean_norm/layer0": 51.397204637527466, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5747162426614482, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6822019815444946, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0343, |
|
"eval_samples_per_second": 96.681, |
|
"eval_steps_per_second": 1.934, |
|
"step": 44000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.69, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00028947368421052634, |
|
"loss": 1.6692, |
|
"max_norm": 104.09925842285156, |
|
"max_norm/layer0": 104.09925842285156, |
|
"mean_norm": 52.224265336990356, |
|
"mean_norm/layer0": 52.224265336990356, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5745009784735812, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.680064082145691, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0301, |
|
"eval_samples_per_second": 97.078, |
|
"eval_steps_per_second": 1.942, |
|
"step": 45000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.8, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00028421052631578947, |
|
"loss": 1.6703, |
|
"max_norm": 105.74360656738281, |
|
"max_norm/layer0": 105.74360656738281, |
|
"mean_norm": 53.035457372665405, |
|
"mean_norm/layer0": 53.035457372665405, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5761448140900196, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6834497451782227, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.028, |
|
"eval_samples_per_second": 97.278, |
|
"eval_steps_per_second": 1.946, |
|
"step": 46000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 4.9, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0002789473684210526, |
|
"loss": 1.6677, |
|
"max_norm": 107.3390884399414, |
|
"max_norm/layer0": 107.3390884399414, |
|
"mean_norm": 53.83394503593445, |
|
"mean_norm/layer0": 53.83394503593445, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5756360078277887, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.681907296180725, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0321, |
|
"eval_samples_per_second": 96.892, |
|
"eval_steps_per_second": 1.938, |
|
"step": 47000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.01, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00027368421052631584, |
|
"loss": 1.6682, |
|
"max_norm": 108.92868041992188, |
|
"max_norm/layer0": 108.92868041992188, |
|
"mean_norm": 54.61986470222473, |
|
"mean_norm/layer0": 54.61986470222473, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5752250489236791, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6778249740600586, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0308, |
|
"eval_samples_per_second": 97.012, |
|
"eval_steps_per_second": 1.94, |
|
"step": 48000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.11, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00026842105263157897, |
|
"loss": 1.6547, |
|
"max_norm": 110.44733428955078, |
|
"max_norm/layer0": 110.44733428955078, |
|
"mean_norm": 55.39363622665405, |
|
"mean_norm/layer0": 55.39363622665405, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.575146771037182, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6824584007263184, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0301, |
|
"eval_samples_per_second": 97.074, |
|
"eval_steps_per_second": 1.941, |
|
"step": 49000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.21, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0002631578947368421, |
|
"loss": 1.6566, |
|
"max_norm": 112.06961059570312, |
|
"max_norm/layer0": 112.06961059570312, |
|
"mean_norm": 56.14954137802124, |
|
"mean_norm/layer0": 56.14954137802124, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5757729941291585, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6824774742126465, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0319, |
|
"eval_samples_per_second": 96.905, |
|
"eval_steps_per_second": 1.938, |
|
"step": 50000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.32, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0002578947368421053, |
|
"loss": 1.6605, |
|
"max_norm": 113.63825988769531, |
|
"max_norm/layer0": 113.63825988769531, |
|
"mean_norm": 56.89331555366516, |
|
"mean_norm/layer0": 56.89331555366516, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5745988258317025, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6813552379608154, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0311, |
|
"eval_samples_per_second": 96.986, |
|
"eval_steps_per_second": 1.94, |
|
"step": 51000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.42, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0002526315789473684, |
|
"loss": 1.6603, |
|
"max_norm": 115.20358276367188, |
|
"max_norm/layer0": 115.20358276367188, |
|
"mean_norm": 57.622037410736084, |
|
"mean_norm/layer0": 57.622037410736084, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5754598825831703, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6768248081207275, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0402, |
|
"eval_samples_per_second": 96.135, |
|
"eval_steps_per_second": 1.923, |
|
"step": 52000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.53, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0002473684210526316, |
|
"loss": 1.6595, |
|
"max_norm": 116.71245574951172, |
|
"max_norm/layer0": 116.71245574951172, |
|
"mean_norm": 58.335200548172, |
|
"mean_norm/layer0": 58.335200548172, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5753424657534246, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6757440567016602, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0326, |
|
"eval_samples_per_second": 96.84, |
|
"eval_steps_per_second": 1.937, |
|
"step": 53000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.63, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00024210526315789475, |
|
"loss": 1.6603, |
|
"max_norm": 118.23548889160156, |
|
"max_norm/layer0": 118.23548889160156, |
|
"mean_norm": 59.03403639793396, |
|
"mean_norm/layer0": 59.03403639793396, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 5.63, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5738160469667319, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6769322156906128, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0316, |
|
"eval_samples_per_second": 96.941, |
|
"eval_steps_per_second": 1.939, |
|
"step": 54000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.74, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00023684210526315788, |
|
"loss": 1.662, |
|
"max_norm": 119.70069122314453, |
|
"max_norm/layer0": 119.70069122314453, |
|
"mean_norm": 59.720083475112915, |
|
"mean_norm/layer0": 59.720083475112915, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5758708414872798, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6757923364639282, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0457, |
|
"eval_samples_per_second": 95.626, |
|
"eval_steps_per_second": 1.913, |
|
"step": 55000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.84, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00023157894736842107, |
|
"loss": 1.6602, |
|
"max_norm": 121.170654296875, |
|
"max_norm/layer0": 121.170654296875, |
|
"mean_norm": 60.39123606681824, |
|
"mean_norm/layer0": 60.39123606681824, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.57573385518591, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6770671606063843, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0283, |
|
"eval_samples_per_second": 97.247, |
|
"eval_steps_per_second": 1.945, |
|
"step": 56000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 5.94, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00022631578947368422, |
|
"loss": 1.6624, |
|
"max_norm": 122.60064697265625, |
|
"max_norm/layer0": 122.60064697265625, |
|
"mean_norm": 61.04953479766846, |
|
"mean_norm/layer0": 61.04953479766846, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 5.94, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5770254403131115, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6749203205108643, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0377, |
|
"eval_samples_per_second": 96.365, |
|
"eval_steps_per_second": 1.927, |
|
"step": 57000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.05, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00022105263157894735, |
|
"loss": 1.6527, |
|
"max_norm": 123.97573852539062, |
|
"max_norm/layer0": 123.97573852539062, |
|
"mean_norm": 61.69300150871277, |
|
"mean_norm/layer0": 61.69300150871277, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5757925636007828, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6790989637374878, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0297, |
|
"eval_samples_per_second": 97.113, |
|
"eval_steps_per_second": 1.942, |
|
"step": 58000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.15, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00021578947368421054, |
|
"loss": 1.6474, |
|
"max_norm": 125.31166076660156, |
|
"max_norm/layer0": 125.31166076660156, |
|
"mean_norm": 62.322699308395386, |
|
"mean_norm/layer0": 62.322699308395386, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5772602739726027, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.676284909248352, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0311, |
|
"eval_samples_per_second": 96.983, |
|
"eval_steps_per_second": 1.94, |
|
"step": 59000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.26, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00021052631578947367, |
|
"loss": 1.6494, |
|
"max_norm": 126.64249420166016, |
|
"max_norm/layer0": 126.64249420166016, |
|
"mean_norm": 62.93570160865784, |
|
"mean_norm/layer0": 62.93570160865784, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5760861056751467, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6764713525772095, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0321, |
|
"eval_samples_per_second": 96.893, |
|
"eval_steps_per_second": 1.938, |
|
"step": 60000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.36, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00020526315789473685, |
|
"loss": 1.6539, |
|
"max_norm": 127.96533966064453, |
|
"max_norm/layer0": 127.96533966064453, |
|
"mean_norm": 63.53509712219238, |
|
"mean_norm/layer0": 63.53509712219238, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5763600782778865, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6741266250610352, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0379, |
|
"eval_samples_per_second": 96.35, |
|
"eval_steps_per_second": 1.927, |
|
"step": 61000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.47, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6539, |
|
"max_norm": 129.2185516357422, |
|
"max_norm/layer0": 129.2185516357422, |
|
"mean_norm": 64.11901497840881, |
|
"mean_norm/layer0": 64.11901497840881, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5767710371819961, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6752326488494873, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0315, |
|
"eval_samples_per_second": 96.949, |
|
"eval_steps_per_second": 1.939, |
|
"step": 62000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.57, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00019473684210526317, |
|
"loss": 1.6529, |
|
"max_norm": 130.4375, |
|
"max_norm/layer0": 130.4375, |
|
"mean_norm": 64.6885313987732, |
|
"mean_norm/layer0": 64.6885313987732, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5774755381604697, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6737432479858398, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0332, |
|
"eval_samples_per_second": 96.787, |
|
"eval_steps_per_second": 1.936, |
|
"step": 63000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.67, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00018947368421052632, |
|
"loss": 1.6533, |
|
"max_norm": 131.647705078125, |
|
"max_norm/layer0": 131.647705078125, |
|
"mean_norm": 65.24243497848511, |
|
"mean_norm/layer0": 65.24243497848511, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5758317025440313, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6725146770477295, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0301, |
|
"eval_samples_per_second": 97.079, |
|
"eval_steps_per_second": 1.942, |
|
"step": 64000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.78, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00018421052631578948, |
|
"loss": 1.653, |
|
"max_norm": 132.81036376953125, |
|
"max_norm/layer0": 132.81036376953125, |
|
"mean_norm": 65.78296113014221, |
|
"mean_norm/layer0": 65.78296113014221, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5773581213307241, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6722198724746704, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0294, |
|
"eval_samples_per_second": 97.147, |
|
"eval_steps_per_second": 1.943, |
|
"step": 65000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.88, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00017894736842105264, |
|
"loss": 1.6522, |
|
"max_norm": 133.95559692382812, |
|
"max_norm/layer0": 133.95559692382812, |
|
"mean_norm": 66.3084762096405, |
|
"mean_norm/layer0": 66.3084762096405, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5761643835616438, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6726341247558594, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0342, |
|
"eval_samples_per_second": 96.689, |
|
"eval_steps_per_second": 1.934, |
|
"step": 66000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 6.99, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0001736842105263158, |
|
"loss": 1.6528, |
|
"max_norm": 135.03582763671875, |
|
"max_norm/layer0": 135.03582763671875, |
|
"mean_norm": 66.81927680969238, |
|
"mean_norm/layer0": 66.81927680969238, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5767710371819961, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.672642707824707, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0319, |
|
"eval_samples_per_second": 96.909, |
|
"eval_steps_per_second": 1.938, |
|
"step": 67000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.09, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00016842105263157895, |
|
"loss": 1.6439, |
|
"max_norm": 136.08602905273438, |
|
"max_norm/layer0": 136.08602905273438, |
|
"mean_norm": 67.31627178192139, |
|
"mean_norm/layer0": 67.31627178192139, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5770841487279843, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6727759838104248, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0316, |
|
"eval_samples_per_second": 96.933, |
|
"eval_steps_per_second": 1.939, |
|
"step": 68000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.19, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0001631578947368421, |
|
"loss": 1.6403, |
|
"max_norm": 137.1239013671875, |
|
"max_norm/layer0": 137.1239013671875, |
|
"mean_norm": 67.79701733589172, |
|
"mean_norm/layer0": 67.79701733589172, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5757534246575342, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.670316219329834, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0329, |
|
"eval_samples_per_second": 96.812, |
|
"eval_steps_per_second": 1.936, |
|
"step": 69000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.3, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00015789473684210527, |
|
"loss": 1.6447, |
|
"max_norm": 138.13624572753906, |
|
"max_norm/layer0": 138.13624572753906, |
|
"mean_norm": 68.2625687122345, |
|
"mean_norm/layer0": 68.2625687122345, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.57720156555773, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6696677207946777, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0301, |
|
"eval_samples_per_second": 97.082, |
|
"eval_steps_per_second": 1.942, |
|
"step": 70000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.4, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00015263157894736842, |
|
"loss": 1.6458, |
|
"max_norm": 139.11770629882812, |
|
"max_norm/layer0": 139.11770629882812, |
|
"mean_norm": 68.71309423446655, |
|
"mean_norm/layer0": 68.71309423446655, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5776908023483366, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6694140434265137, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0301, |
|
"eval_samples_per_second": 97.075, |
|
"eval_steps_per_second": 1.941, |
|
"step": 71000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.51, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00014736842105263158, |
|
"loss": 1.6447, |
|
"max_norm": 140.086669921875, |
|
"max_norm/layer0": 140.086669921875, |
|
"mean_norm": 69.14835000038147, |
|
"mean_norm/layer0": 69.14835000038147, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5771037181996086, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6716102361679077, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0344, |
|
"eval_samples_per_second": 96.67, |
|
"eval_steps_per_second": 1.933, |
|
"step": 72000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.61, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00014210526315789474, |
|
"loss": 1.6449, |
|
"max_norm": 141.01820373535156, |
|
"max_norm/layer0": 141.01820373535156, |
|
"mean_norm": 69.5690529346466, |
|
"mean_norm/layer0": 69.5690529346466, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.577945205479452, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6680197715759277, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.013, |
|
"eval_samples_per_second": 98.719, |
|
"eval_steps_per_second": 1.974, |
|
"step": 73000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.72, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00013684210526315792, |
|
"loss": 1.6458, |
|
"max_norm": 141.88795471191406, |
|
"max_norm/layer0": 141.88795471191406, |
|
"mean_norm": 69.97455215454102, |
|
"mean_norm/layer0": 69.97455215454102, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.577866927592955, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6682908535003662, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0282, |
|
"eval_samples_per_second": 97.262, |
|
"eval_steps_per_second": 1.945, |
|
"step": 74000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.82, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00013157894736842105, |
|
"loss": 1.6447, |
|
"max_norm": 142.72747802734375, |
|
"max_norm/layer0": 142.72747802734375, |
|
"mean_norm": 70.36574029922485, |
|
"mean_norm/layer0": 70.36574029922485, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 7.82, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5778277886497064, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6680580377578735, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0275, |
|
"eval_samples_per_second": 97.327, |
|
"eval_steps_per_second": 1.947, |
|
"step": 75000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 7.92, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0001263157894736842, |
|
"loss": 1.6451, |
|
"max_norm": 143.5332489013672, |
|
"max_norm/layer0": 143.5332489013672, |
|
"mean_norm": 70.7415566444397, |
|
"mean_norm/layer0": 70.7415566444397, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5780821917808219, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6676955223083496, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.038, |
|
"eval_samples_per_second": 96.341, |
|
"eval_steps_per_second": 1.927, |
|
"step": 76000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.03, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00012105263157894738, |
|
"loss": 1.6418, |
|
"max_norm": 144.31161499023438, |
|
"max_norm/layer0": 144.31161499023438, |
|
"mean_norm": 71.10220861434937, |
|
"mean_norm/layer0": 71.10220861434937, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5789041095890411, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.666453242301941, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.032, |
|
"eval_samples_per_second": 96.901, |
|
"eval_steps_per_second": 1.938, |
|
"step": 77000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.13, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00011578947368421053, |
|
"loss": 1.6361, |
|
"max_norm": 145.06106567382812, |
|
"max_norm/layer0": 145.06106567382812, |
|
"mean_norm": 71.44788241386414, |
|
"mean_norm/layer0": 71.44788241386414, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.13, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.577866927592955, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6684386730194092, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0237, |
|
"eval_samples_per_second": 97.688, |
|
"eval_steps_per_second": 1.954, |
|
"step": 78000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.24, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00011052631578947368, |
|
"loss": 1.636, |
|
"max_norm": 145.7824249267578, |
|
"max_norm/layer0": 145.7824249267578, |
|
"mean_norm": 71.77816247940063, |
|
"mean_norm/layer0": 71.77816247940063, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5785909980430528, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.668695092201233, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0276, |
|
"eval_samples_per_second": 97.318, |
|
"eval_steps_per_second": 1.946, |
|
"step": 79000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.34, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.00010526315789473683, |
|
"loss": 1.6357, |
|
"max_norm": 146.4754180908203, |
|
"max_norm/layer0": 146.4754180908203, |
|
"mean_norm": 72.09290337562561, |
|
"mean_norm/layer0": 72.09290337562561, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.579041095890411, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6670129299163818, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0297, |
|
"eval_samples_per_second": 97.117, |
|
"eval_steps_per_second": 1.942, |
|
"step": 80000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.45, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6379, |
|
"max_norm": 147.1273651123047, |
|
"max_norm/layer0": 147.1273651123047, |
|
"mean_norm": 72.39218544960022, |
|
"mean_norm/layer0": 72.39218544960022, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 8.45, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5787866927592955, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6658258438110352, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0297, |
|
"eval_samples_per_second": 97.119, |
|
"eval_steps_per_second": 1.942, |
|
"step": 81000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.55, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 9.473684210526316e-05, |
|
"loss": 1.6405, |
|
"max_norm": 147.75466918945312, |
|
"max_norm/layer0": 147.75466918945312, |
|
"mean_norm": 72.67654967308044, |
|
"mean_norm/layer0": 72.67654967308044, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5788454011741683, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6660892963409424, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0266, |
|
"eval_samples_per_second": 97.408, |
|
"eval_steps_per_second": 1.948, |
|
"step": 82000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.65, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 8.947368421052632e-05, |
|
"loss": 1.6378, |
|
"max_norm": 148.3541259765625, |
|
"max_norm/layer0": 148.3541259765625, |
|
"mean_norm": 72.94574618339539, |
|
"mean_norm/layer0": 72.94574618339539, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5789236790606653, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6649667024612427, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0319, |
|
"eval_samples_per_second": 96.911, |
|
"eval_steps_per_second": 1.938, |
|
"step": 83000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.76, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 8.421052631578948e-05, |
|
"loss": 1.6386, |
|
"max_norm": 148.9251251220703, |
|
"max_norm/layer0": 148.9251251220703, |
|
"mean_norm": 73.1996808052063, |
|
"mean_norm/layer0": 73.1996808052063, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5784344422700587, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.664962887763977, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0344, |
|
"eval_samples_per_second": 96.67, |
|
"eval_steps_per_second": 1.933, |
|
"step": 84000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.86, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 7.894736842105263e-05, |
|
"loss": 1.638, |
|
"max_norm": 149.44464111328125, |
|
"max_norm/layer0": 149.44464111328125, |
|
"mean_norm": 73.43817734718323, |
|
"mean_norm/layer0": 73.43817734718323, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 8.86, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5784931506849315, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6643970012664795, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0336, |
|
"eval_samples_per_second": 96.753, |
|
"eval_steps_per_second": 1.935, |
|
"step": 85000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 8.97, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 7.368421052631579e-05, |
|
"loss": 1.6374, |
|
"max_norm": 149.93634033203125, |
|
"max_norm/layer0": 149.93634033203125, |
|
"mean_norm": 73.66135931015015, |
|
"mean_norm/layer0": 73.66135931015015, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5777103718199609, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6635217666625977, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0303, |
|
"eval_samples_per_second": 97.056, |
|
"eval_steps_per_second": 1.941, |
|
"step": 86000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.07, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 6.842105263157896e-05, |
|
"loss": 1.6298, |
|
"max_norm": 150.40525817871094, |
|
"max_norm/layer0": 150.40525817871094, |
|
"mean_norm": 73.86943292617798, |
|
"mean_norm/layer0": 73.86943292617798, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 9.07, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5785127201565557, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6646850109100342, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0309, |
|
"eval_samples_per_second": 97.0, |
|
"eval_steps_per_second": 1.94, |
|
"step": 87000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.18, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 6.31578947368421e-05, |
|
"loss": 1.6302, |
|
"max_norm": 150.8313751220703, |
|
"max_norm/layer0": 150.8313751220703, |
|
"mean_norm": 74.06253480911255, |
|
"mean_norm/layer0": 74.06253480911255, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5787475538160469, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.664866328239441, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0347, |
|
"eval_samples_per_second": 96.643, |
|
"eval_steps_per_second": 1.933, |
|
"step": 88000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.28, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 5.789473684210527e-05, |
|
"loss": 1.6315, |
|
"max_norm": 151.22195434570312, |
|
"max_norm/layer0": 151.22195434570312, |
|
"mean_norm": 74.23983907699585, |
|
"mean_norm/layer0": 74.23983907699585, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5782191780821918, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6650762557983398, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0285, |
|
"eval_samples_per_second": 97.227, |
|
"eval_steps_per_second": 1.945, |
|
"step": 89000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.38, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 5.263157894736842e-05, |
|
"loss": 1.631, |
|
"max_norm": 151.57760620117188, |
|
"max_norm/layer0": 151.57760620117188, |
|
"mean_norm": 74.40166926383972, |
|
"mean_norm/layer0": 74.40166926383972, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.38, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.578825831702544, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6636165380477905, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0322, |
|
"eval_samples_per_second": 96.882, |
|
"eval_steps_per_second": 1.938, |
|
"step": 90000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.49, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 4.736842105263158e-05, |
|
"loss": 1.6316, |
|
"max_norm": 151.9007110595703, |
|
"max_norm/layer0": 151.9007110595703, |
|
"mean_norm": 74.54819416999817, |
|
"mean_norm/layer0": 74.54819416999817, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 9.49, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5781604696673189, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6626789569854736, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0209, |
|
"eval_samples_per_second": 97.952, |
|
"eval_steps_per_second": 1.959, |
|
"step": 91000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.59, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 4.210526315789474e-05, |
|
"loss": 1.6286, |
|
"max_norm": 152.19346618652344, |
|
"max_norm/layer0": 152.19346618652344, |
|
"mean_norm": 74.67919540405273, |
|
"mean_norm/layer0": 74.67919540405273, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 9.59, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5783365949119373, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.664610743522644, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0316, |
|
"eval_samples_per_second": 96.938, |
|
"eval_steps_per_second": 1.939, |
|
"step": 92000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.7, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 3.6842105263157895e-05, |
|
"loss": 1.6304, |
|
"max_norm": 152.45083618164062, |
|
"max_norm/layer0": 152.45083618164062, |
|
"mean_norm": 74.79478573799133, |
|
"mean_norm/layer0": 74.79478573799133, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5800587084148728, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6632179021835327, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0258, |
|
"eval_samples_per_second": 97.488, |
|
"eval_steps_per_second": 1.95, |
|
"step": 93000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.8, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 3.157894736842105e-05, |
|
"loss": 1.6298, |
|
"max_norm": 152.67724609375, |
|
"max_norm/layer0": 152.67724609375, |
|
"mean_norm": 74.8951530456543, |
|
"mean_norm/layer0": 74.8951530456543, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5799804305283758, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6622798442840576, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0295, |
|
"eval_samples_per_second": 97.137, |
|
"eval_steps_per_second": 1.943, |
|
"step": 94000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 9.91, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 2.631578947368421e-05, |
|
"loss": 1.6309, |
|
"max_norm": 152.86326599121094, |
|
"max_norm/layer0": 152.86326599121094, |
|
"mean_norm": 74.9801697731018, |
|
"mean_norm/layer0": 74.9801697731018, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 9.91, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5800391389432485, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6619502305984497, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0188, |
|
"eval_samples_per_second": 98.158, |
|
"eval_steps_per_second": 1.963, |
|
"step": 95000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 10.01, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 2.105263157894737e-05, |
|
"loss": 1.6302, |
|
"max_norm": 153.0155029296875, |
|
"max_norm/layer0": 153.0155029296875, |
|
"mean_norm": 75.04964685440063, |
|
"mean_norm/layer0": 75.04964685440063, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 10.01, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5801369863013699, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6601940393447876, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0341, |
|
"eval_samples_per_second": 96.703, |
|
"eval_steps_per_second": 1.934, |
|
"step": 96000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 10.11, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 1.5789473684210526e-05, |
|
"loss": 1.6242, |
|
"max_norm": 153.1354217529297, |
|
"max_norm/layer0": 153.1354217529297, |
|
"mean_norm": 75.10380005836487, |
|
"mean_norm/layer0": 75.10380005836487, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5785518590998043, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6609833240509033, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0284, |
|
"eval_samples_per_second": 97.237, |
|
"eval_steps_per_second": 1.945, |
|
"step": 97000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 10.22, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 1.6258, |
|
"max_norm": 153.22213745117188, |
|
"max_norm/layer0": 153.22213745117188, |
|
"mean_norm": 75.14245867729187, |
|
"mean_norm/layer0": 75.14245867729187, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5794716242661448, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.660490870475769, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0267, |
|
"eval_samples_per_second": 97.402, |
|
"eval_steps_per_second": 1.948, |
|
"step": 98000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 10.32, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 1.6234, |
|
"max_norm": 153.27365112304688, |
|
"max_norm/layer0": 153.27365112304688, |
|
"mean_norm": 75.16563892364502, |
|
"mean_norm/layer0": 75.16563892364502, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5790802348336594, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6604704856872559, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0384, |
|
"eval_samples_per_second": 96.298, |
|
"eval_steps_per_second": 1.926, |
|
"step": 99000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 10.43, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 1.6245, |
|
"max_norm": 153.29054260253906, |
|
"max_norm/layer0": 153.29054260253906, |
|
"mean_norm": 75.17323780059814, |
|
"mean_norm/layer0": 75.17323780059814, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_accuracy": 0.5791389432485323, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_loss": 1.6604058742523193, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_runtime": 1.0339, |
|
"eval_samples_per_second": 96.717, |
|
"eval_steps_per_second": 1.934, |
|
"step": 100000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"epoch": 10.43, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"max_norm": 153.29054260253906, |
|
"max_norm/layer0": 153.29054260253906, |
|
"mean_norm": 75.17323780059814, |
|
"mean_norm/layer0": 75.17323780059814, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"step": 100000, |
|
"total_flos": 3.714781621832909e+17, |
|
"train_loss": 1.6774777018260956, |
|
"train_runtime": 132212.7109, |
|
"train_samples_per_second": 72.61, |
|
"train_steps_per_second": 0.756 |
|
} |
|
], |
|
"max_steps": 100000, |
|
"num_train_epochs": 11, |
|
"total_flos": 3.714781621832909e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|