{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1386,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 4.875,
      "learning_rate": 0.0002,
      "loss": 18.0666,
      "step": 10
    },
    {
      "epoch": 0.04,
      "grad_norm": 8.625,
      "learning_rate": 0.0002,
      "loss": 11.9963,
      "step": 20
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.84375,
      "learning_rate": 0.0002,
      "loss": 6.0667,
      "step": 30
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.375,
      "learning_rate": 0.0002,
      "loss": 2.4769,
      "step": 40
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.109375,
      "learning_rate": 0.0002,
      "loss": 2.2233,
      "step": 50
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.8984375,
      "learning_rate": 0.0002,
      "loss": 2.1739,
      "step": 60
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.375,
      "learning_rate": 0.0002,
      "loss": 2.0811,
      "step": 70
    },
    {
      "epoch": 0.17,
      "grad_norm": 4.0,
      "learning_rate": 0.0002,
      "loss": 1.9928,
      "step": 80
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0002,
      "loss": 1.9649,
      "step": 90
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.7421875,
      "learning_rate": 0.0002,
      "loss": 1.9645,
      "step": 100
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.171875,
      "learning_rate": 0.0002,
      "loss": 1.9812,
      "step": 110
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.5546875,
      "learning_rate": 0.0002,
      "loss": 1.9106,
      "step": 120
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.984375,
      "learning_rate": 0.0002,
      "loss": 1.8943,
      "step": 130
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.3046875,
      "learning_rate": 0.0002,
      "loss": 1.9205,
      "step": 140
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.234375,
      "learning_rate": 0.0002,
      "loss": 1.9445,
      "step": 150
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.0390625,
      "learning_rate": 0.0002,
      "loss": 1.8882,
      "step": 160
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.984375,
      "learning_rate": 0.0002,
      "loss": 1.8806,
      "step": 170
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.46875,
      "learning_rate": 0.0002,
      "loss": 1.8692,
      "step": 180
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.4375,
      "learning_rate": 0.0002,
      "loss": 1.8291,
      "step": 190
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.515625,
      "learning_rate": 0.0002,
      "loss": 1.8069,
      "step": 200
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.671875,
      "learning_rate": 0.0002,
      "loss": 1.8406,
      "step": 210
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.625,
      "learning_rate": 0.0002,
      "loss": 1.8694,
      "step": 220
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0002,
      "loss": 1.821,
      "step": 230
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.03125,
      "learning_rate": 0.0002,
      "loss": 1.8243,
      "step": 240
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1875,
      "learning_rate": 0.0002,
      "loss": 1.7678,
      "step": 250
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.0,
      "learning_rate": 0.0002,
      "loss": 1.8301,
      "step": 260
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.15625,
      "learning_rate": 0.0002,
      "loss": 1.8845,
      "step": 270
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.078125,
      "learning_rate": 0.0002,
      "loss": 1.8901,
      "step": 280
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.015625,
      "learning_rate": 0.0002,
      "loss": 1.8198,
      "step": 290
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.15625,
      "learning_rate": 0.0002,
      "loss": 1.7615,
      "step": 300
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.0703125,
      "learning_rate": 0.0002,
      "loss": 1.8717,
      "step": 310
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.515625,
      "learning_rate": 0.0002,
      "loss": 1.7862,
      "step": 320
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.6796875,
      "learning_rate": 0.0002,
      "loss": 1.8674,
      "step": 330
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.0859375,
      "learning_rate": 0.0002,
      "loss": 1.822,
      "step": 340
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.921875,
      "learning_rate": 0.0002,
      "loss": 1.7834,
      "step": 350
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.4921875,
      "learning_rate": 0.0002,
      "loss": 1.8157,
      "step": 360
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.15625,
      "learning_rate": 0.0002,
      "loss": 1.7164,
      "step": 370
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.875,
      "learning_rate": 0.0002,
      "loss": 1.835,
      "step": 380
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.2109375,
      "learning_rate": 0.0002,
      "loss": 1.7927,
      "step": 390
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.8046875,
      "learning_rate": 0.0002,
      "loss": 1.7038,
      "step": 400
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.234375,
      "learning_rate": 0.0002,
      "loss": 1.7914,
      "step": 410
    },
    {
      "epoch": 0.91,
      "grad_norm": 2.921875,
      "learning_rate": 0.0002,
      "loss": 1.7892,
      "step": 420
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.83984375,
      "learning_rate": 0.0002,
      "loss": 1.7335,
      "step": 430
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.453125,
      "learning_rate": 0.0002,
      "loss": 1.8013,
      "step": 440
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.5078125,
      "learning_rate": 0.0002,
      "loss": 1.7118,
      "step": 450
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0002,
      "loss": 1.8202,
      "step": 460
    },
    {
      "epoch": 1.02,
      "grad_norm": 2.265625,
      "learning_rate": 0.0002,
      "loss": 1.7011,
      "step": 470
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0002,
      "loss": 1.6769,
      "step": 480
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.1015625,
      "learning_rate": 0.0002,
      "loss": 1.6393,
      "step": 490
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.25,
      "learning_rate": 0.0002,
      "loss": 1.6685,
      "step": 500
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.85546875,
      "learning_rate": 0.0002,
      "loss": 1.665,
      "step": 510
    },
    {
      "epoch": 1.13,
      "grad_norm": 2.046875,
      "learning_rate": 0.0002,
      "loss": 1.6446,
      "step": 520
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.421875,
      "learning_rate": 0.0002,
      "loss": 1.5743,
      "step": 530
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.9765625,
      "learning_rate": 0.0002,
      "loss": 1.6622,
      "step": 540
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.125,
      "learning_rate": 0.0002,
      "loss": 1.5741,
      "step": 550
    },
    {
      "epoch": 1.21,
      "grad_norm": 1.1328125,
      "learning_rate": 0.0002,
      "loss": 1.6434,
      "step": 560
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.359375,
      "learning_rate": 0.0002,
      "loss": 1.6343,
      "step": 570
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.1640625,
      "learning_rate": 0.0002,
      "loss": 1.662,
      "step": 580
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.28125,
      "learning_rate": 0.0002,
      "loss": 1.6246,
      "step": 590
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.96875,
      "learning_rate": 0.0002,
      "loss": 1.614,
      "step": 600
    },
    {
      "epoch": 1.32,
      "grad_norm": 2.703125,
      "learning_rate": 0.0002,
      "loss": 1.6817,
      "step": 610
    },
    {
      "epoch": 1.34,
      "grad_norm": 2.203125,
      "learning_rate": 0.0002,
      "loss": 1.6343,
      "step": 620
    },
    {
      "epoch": 1.36,
      "grad_norm": 1.578125,
      "learning_rate": 0.0002,
      "loss": 1.6237,
      "step": 630
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.97265625,
      "learning_rate": 0.0002,
      "loss": 1.6367,
      "step": 640
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.2734375,
      "learning_rate": 0.0002,
      "loss": 1.6304,
      "step": 650
    },
    {
      "epoch": 1.43,
      "grad_norm": 1.09375,
      "learning_rate": 0.0002,
      "loss": 1.5319,
      "step": 660
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.921875,
      "learning_rate": 0.0002,
      "loss": 1.643,
      "step": 670
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.1328125,
      "learning_rate": 0.0002,
      "loss": 1.6247,
      "step": 680
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.6796875,
      "learning_rate": 0.0002,
      "loss": 1.5913,
      "step": 690
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0002,
      "loss": 1.6415,
      "step": 700
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.2890625,
      "learning_rate": 0.0002,
      "loss": 1.636,
      "step": 710
    },
    {
      "epoch": 1.56,
      "grad_norm": 1.28125,
      "learning_rate": 0.0002,
      "loss": 1.6304,
      "step": 720
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.03125,
      "learning_rate": 0.0002,
      "loss": 1.6156,
      "step": 730
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.0859375,
      "learning_rate": 0.0002,
      "loss": 1.5906,
      "step": 740
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.21875,
      "learning_rate": 0.0002,
      "loss": 1.5717,
      "step": 750
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.25,
      "learning_rate": 0.0002,
      "loss": 1.6747,
      "step": 760
    },
    {
      "epoch": 1.67,
      "grad_norm": 1.3203125,
      "learning_rate": 0.0002,
      "loss": 1.6095,
      "step": 770
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.109375,
      "learning_rate": 0.0002,
      "loss": 1.6571,
      "step": 780
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.5,
      "learning_rate": 0.0002,
      "loss": 1.5913,
      "step": 790
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.03125,
      "learning_rate": 0.0002,
      "loss": 1.592,
      "step": 800
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.7578125,
      "learning_rate": 0.0002,
      "loss": 1.5279,
      "step": 810
    },
    {
      "epoch": 1.77,
      "grad_norm": 1.2109375,
      "learning_rate": 0.0002,
      "loss": 1.6599,
      "step": 820
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.1015625,
      "learning_rate": 0.0002,
      "loss": 1.6947,
      "step": 830
    },
    {
      "epoch": 1.82,
      "grad_norm": 1.3046875,
      "learning_rate": 0.0002,
      "loss": 1.608,
      "step": 840
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0002,
      "loss": 1.6716,
      "step": 850
    },
    {
      "epoch": 1.86,
      "grad_norm": 1.4453125,
      "learning_rate": 0.0002,
      "loss": 1.6847,
      "step": 860
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.203125,
      "learning_rate": 0.0002,
      "loss": 1.5283,
      "step": 870
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0002,
      "loss": 1.6639,
      "step": 880
    },
    {
      "epoch": 1.93,
      "grad_norm": 1.0625,
      "learning_rate": 0.0002,
      "loss": 1.6154,
      "step": 890
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0002,
      "loss": 1.6248,
      "step": 900
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.0078125,
      "learning_rate": 0.0002,
      "loss": 1.5525,
      "step": 910
    },
    {
      "epoch": 1.99,
      "grad_norm": 1.8203125,
      "learning_rate": 0.0002,
      "loss": 1.5069,
      "step": 920
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.296875,
      "learning_rate": 0.0002,
      "loss": 1.5104,
      "step": 930
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.1484375,
      "learning_rate": 0.0002,
      "loss": 1.3058,
      "step": 940
    },
    {
      "epoch": 2.06,
      "grad_norm": 1.25,
      "learning_rate": 0.0002,
      "loss": 1.4888,
      "step": 950
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.0234375,
      "learning_rate": 0.0002,
      "loss": 1.3972,
      "step": 960
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.375,
      "learning_rate": 0.0002,
      "loss": 1.4524,
      "step": 970
    },
    {
      "epoch": 2.12,
      "grad_norm": 1.359375,
      "learning_rate": 0.0002,
      "loss": 1.4534,
      "step": 980
    },
    {
      "epoch": 2.14,
      "grad_norm": 1.6484375,
      "learning_rate": 0.0002,
      "loss": 1.4601,
      "step": 990
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.1640625,
      "learning_rate": 0.0002,
      "loss": 1.4352,
      "step": 1000
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.2109375,
      "learning_rate": 0.0002,
      "loss": 1.4183,
      "step": 1010
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.1796875,
      "learning_rate": 0.0002,
      "loss": 1.3654,
      "step": 1020
    },
    {
      "epoch": 2.23,
      "grad_norm": 1.453125,
      "learning_rate": 0.0002,
      "loss": 1.4216,
      "step": 1030
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.3359375,
      "learning_rate": 0.0002,
      "loss": 1.3554,
      "step": 1040
    },
    {
      "epoch": 2.27,
      "grad_norm": 2.5,
      "learning_rate": 0.0002,
      "loss": 1.3716,
      "step": 1050
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.0078125,
      "learning_rate": 0.0002,
      "loss": 1.426,
      "step": 1060
    },
    {
      "epoch": 2.32,
      "grad_norm": 1.203125,
      "learning_rate": 0.0002,
      "loss": 1.4755,
      "step": 1070
    },
    {
      "epoch": 2.34,
      "grad_norm": 1.7578125,
      "learning_rate": 0.0002,
      "loss": 1.3976,
      "step": 1080
    },
    {
      "epoch": 2.36,
      "grad_norm": 1.140625,
      "learning_rate": 0.0002,
      "loss": 1.472,
      "step": 1090
    },
    {
      "epoch": 2.38,
      "grad_norm": 1.28125,
      "learning_rate": 0.0002,
      "loss": 1.461,
      "step": 1100
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.1328125,
      "learning_rate": 0.0002,
      "loss": 1.4307,
      "step": 1110
    },
    {
      "epoch": 2.42,
      "grad_norm": 1.625,
      "learning_rate": 0.0002,
      "loss": 1.4603,
      "step": 1120
    },
    {
      "epoch": 2.45,
      "grad_norm": 2.03125,
      "learning_rate": 0.0002,
      "loss": 1.573,
      "step": 1130
    },
    {
      "epoch": 2.47,
      "grad_norm": 1.984375,
      "learning_rate": 0.0002,
      "loss": 1.3899,
      "step": 1140
    },
    {
      "epoch": 2.49,
      "grad_norm": 1.3984375,
      "learning_rate": 0.0002,
      "loss": 1.463,
      "step": 1150
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.296875,
      "learning_rate": 0.0002,
      "loss": 1.3866,
      "step": 1160
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.125,
      "learning_rate": 0.0002,
      "loss": 1.3927,
      "step": 1170
    },
    {
      "epoch": 2.55,
      "grad_norm": 1.1484375,
      "learning_rate": 0.0002,
      "loss": 1.3956,
      "step": 1180
    },
    {
      "epoch": 2.58,
      "grad_norm": 8.75,
      "learning_rate": 0.0002,
      "loss": 1.335,
      "step": 1190
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.1796875,
      "learning_rate": 0.0002,
      "loss": 1.4117,
      "step": 1200
    },
    {
      "epoch": 2.62,
      "grad_norm": 1.28125,
      "learning_rate": 0.0002,
      "loss": 1.385,
      "step": 1210
    },
    {
      "epoch": 2.64,
      "grad_norm": 1.25,
      "learning_rate": 0.0002,
      "loss": 1.414,
      "step": 1220
    },
    {
      "epoch": 2.66,
      "grad_norm": 1.234375,
      "learning_rate": 0.0002,
      "loss": 1.3831,
      "step": 1230
    },
    {
      "epoch": 2.68,
      "grad_norm": 1.2890625,
      "learning_rate": 0.0002,
      "loss": 1.5191,
      "step": 1240
    },
    {
      "epoch": 2.71,
      "grad_norm": 1.453125,
      "learning_rate": 0.0002,
      "loss": 1.3568,
      "step": 1250
    },
    {
      "epoch": 2.73,
      "grad_norm": 1.5078125,
      "learning_rate": 0.0002,
      "loss": 1.4097,
      "step": 1260
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.1875,
      "learning_rate": 0.0002,
      "loss": 1.4798,
      "step": 1270
    },
    {
      "epoch": 2.77,
      "grad_norm": 1.234375,
      "learning_rate": 0.0002,
      "loss": 1.3602,
      "step": 1280
    },
    {
      "epoch": 2.79,
      "grad_norm": 1.671875,
      "learning_rate": 0.0002,
      "loss": 1.4344,
      "step": 1290
    },
    {
      "epoch": 2.81,
      "grad_norm": 1.40625,
      "learning_rate": 0.0002,
      "loss": 1.4734,
      "step": 1300
    },
    {
      "epoch": 2.84,
      "grad_norm": 1.1796875,
      "learning_rate": 0.0002,
      "loss": 1.4536,
      "step": 1310
    },
    {
      "epoch": 2.86,
      "grad_norm": 1.28125,
      "learning_rate": 0.0002,
      "loss": 1.3822,
      "step": 1320
    },
    {
      "epoch": 2.88,
      "grad_norm": 1.1875,
      "learning_rate": 0.0002,
      "loss": 1.4099,
      "step": 1330
    },
    {
      "epoch": 2.9,
      "grad_norm": 1.09375,
      "learning_rate": 0.0002,
      "loss": 1.4668,
      "step": 1340
    },
    {
      "epoch": 2.92,
      "grad_norm": 1.75,
      "learning_rate": 0.0002,
      "loss": 1.4382,
      "step": 1350
    },
    {
      "epoch": 2.94,
      "grad_norm": 1.8515625,
      "learning_rate": 0.0002,
      "loss": 1.4847,
      "step": 1360
    },
    {
      "epoch": 2.97,
      "grad_norm": 1.3203125,
      "learning_rate": 0.0002,
      "loss": 1.4274,
      "step": 1370
    },
    {
      "epoch": 2.99,
      "grad_norm": 1.9453125,
      "learning_rate": 0.0002,
      "loss": 1.4039,
      "step": 1380
    }
  ],
  "logging_steps": 10,
  "max_steps": 1386,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 3.907934746733445e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}