{ "best_metric": 0.19149594008922577, "best_model_checkpoint": "miner_id_24/checkpoint-1800", "epoch": 0.687613408461465, "eval_steps": 150, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00038200744914525833, "eval_loss": 2.113102436065674, "eval_runtime": 35.511, "eval_samples_per_second": 62.093, "eval_steps_per_second": 15.544, "step": 1 }, { "epoch": 0.003820074491452583, "grad_norm": 0.9833524227142334, "learning_rate": 0.0001, "loss": 1.8845, "step": 10 }, { "epoch": 0.007640148982905166, "grad_norm": 0.6570969223976135, "learning_rate": 0.0001, "loss": 1.5895, "step": 20 }, { "epoch": 0.01146022347435775, "grad_norm": 0.7676651477813721, "learning_rate": 0.0001, "loss": 1.2362, "step": 30 }, { "epoch": 0.015280297965810333, "grad_norm": 0.6993054151535034, "learning_rate": 0.0001, "loss": 1.0434, "step": 40 }, { "epoch": 0.019100372457262916, "grad_norm": 1.1157925128936768, "learning_rate": 0.0001, "loss": 1.1106, "step": 50 }, { "epoch": 0.0229204469487155, "grad_norm": 0.5295521020889282, "learning_rate": 0.0001, "loss": 1.0765, "step": 60 }, { "epoch": 0.026740521440168082, "grad_norm": 0.56684410572052, "learning_rate": 0.0001, "loss": 0.8984, "step": 70 }, { "epoch": 0.030560595931620665, "grad_norm": 0.8060047030448914, "learning_rate": 0.0001, "loss": 0.6913, "step": 80 }, { "epoch": 0.03438067042307325, "grad_norm": 0.5849718451499939, "learning_rate": 0.0001, "loss": 0.7158, "step": 90 }, { "epoch": 0.03820074491452583, "grad_norm": 0.9322509169578552, "learning_rate": 0.0001, "loss": 0.7597, "step": 100 }, { "epoch": 0.04202081940597842, "grad_norm": 0.47825887799263, "learning_rate": 0.0001, "loss": 0.8323, "step": 110 }, { "epoch": 0.045840893897431, "grad_norm": 0.5888822674751282, "learning_rate": 0.0001, "loss": 0.6951, "step": 120 }, { "epoch": 0.049660968388883585, "grad_norm": 0.6937565207481384, "learning_rate": 0.0001, "loss": 0.5557, "step": 130 }, { "epoch": 0.053481042880336165, "grad_norm": 0.5028444528579712, "learning_rate": 0.0001, "loss": 0.5651, "step": 140 }, { "epoch": 0.05730111737178875, "grad_norm": 0.9224625825881958, "learning_rate": 0.0001, "loss": 0.618, "step": 150 }, { "epoch": 0.05730111737178875, "eval_loss": 0.5930253267288208, "eval_runtime": 35.7725, "eval_samples_per_second": 61.64, "eval_steps_per_second": 15.431, "step": 150 }, { "epoch": 0.06112119186324133, "grad_norm": 0.4309185743331909, "learning_rate": 0.0001, "loss": 0.7776, "step": 160 }, { "epoch": 0.06494126635469391, "grad_norm": 0.5074419975280762, "learning_rate": 0.0001, "loss": 0.5761, "step": 170 }, { "epoch": 0.0687613408461465, "grad_norm": 0.47409090399742126, "learning_rate": 0.0001, "loss": 0.4731, "step": 180 }, { "epoch": 0.07258141533759908, "grad_norm": 0.47136247158050537, "learning_rate": 0.0001, "loss": 0.4297, "step": 190 }, { "epoch": 0.07640148982905166, "grad_norm": 1.1784381866455078, "learning_rate": 0.0001, "loss": 0.5363, "step": 200 }, { "epoch": 0.08022156432050424, "grad_norm": 0.5751647353172302, "learning_rate": 0.0001, "loss": 0.6696, "step": 210 }, { "epoch": 0.08404163881195684, "grad_norm": 0.5390653014183044, "learning_rate": 0.0001, "loss": 0.5157, "step": 220 }, { "epoch": 0.08786171330340942, "grad_norm": 0.6373926997184753, "learning_rate": 0.0001, "loss": 0.4518, "step": 230 }, { "epoch": 0.091681787794862, "grad_norm": 0.44602304697036743, "learning_rate": 0.0001, "loss": 0.4025, "step": 240 }, { "epoch": 0.09550186228631459, "grad_norm": 1.693084478378296, "learning_rate": 0.0001, "loss": 0.4405, "step": 250 }, { "epoch": 0.09932193677776717, "grad_norm": 0.5596264600753784, "learning_rate": 0.0001, "loss": 0.5894, "step": 260 }, { "epoch": 0.10314201126921975, "grad_norm": 0.6197372078895569, "learning_rate": 0.0001, "loss": 0.4684, "step": 270 }, { "epoch": 0.10696208576067233, "grad_norm": 0.702965259552002, "learning_rate": 0.0001, "loss": 0.3781, "step": 280 }, { "epoch": 0.11078216025212492, "grad_norm": 0.48642754554748535, "learning_rate": 0.0001, "loss": 0.3107, "step": 290 }, { "epoch": 0.1146022347435775, "grad_norm": 1.4506555795669556, "learning_rate": 0.0001, "loss": 0.3594, "step": 300 }, { "epoch": 0.1146022347435775, "eval_loss": 0.41834208369255066, "eval_runtime": 35.3901, "eval_samples_per_second": 62.305, "eval_steps_per_second": 15.598, "step": 300 }, { "epoch": 0.11842230923503008, "grad_norm": 0.5598428249359131, "learning_rate": 0.0001, "loss": 0.605, "step": 310 }, { "epoch": 0.12224238372648266, "grad_norm": 0.5111796259880066, "learning_rate": 0.0001, "loss": 0.4531, "step": 320 }, { "epoch": 0.12606245821793524, "grad_norm": 0.8029823899269104, "learning_rate": 0.0001, "loss": 0.374, "step": 330 }, { "epoch": 0.12988253270938782, "grad_norm": 0.5488422513008118, "learning_rate": 0.0001, "loss": 0.2664, "step": 340 }, { "epoch": 0.13370260720084043, "grad_norm": 1.4743081331253052, "learning_rate": 0.0001, "loss": 0.3147, "step": 350 }, { "epoch": 0.137522681692293, "grad_norm": 0.5768081545829773, "learning_rate": 0.0001, "loss": 0.481, "step": 360 }, { "epoch": 0.1413427561837456, "grad_norm": 0.7705059051513672, "learning_rate": 0.0001, "loss": 0.4126, "step": 370 }, { "epoch": 0.14516283067519817, "grad_norm": 0.614723265171051, "learning_rate": 0.0001, "loss": 0.3419, "step": 380 }, { "epoch": 0.14898290516665075, "grad_norm": 0.5185005068778992, "learning_rate": 0.0001, "loss": 0.2704, "step": 390 }, { "epoch": 0.15280297965810333, "grad_norm": 1.2747787237167358, "learning_rate": 0.0001, "loss": 0.192, "step": 400 }, { "epoch": 0.1566230541495559, "grad_norm": 0.5729662179946899, "learning_rate": 0.0001, "loss": 0.5662, "step": 410 }, { "epoch": 0.1604431286410085, "grad_norm": 0.5943860411643982, "learning_rate": 0.0001, "loss": 0.4082, "step": 420 }, { "epoch": 0.1642632031324611, "grad_norm": 0.625808835029602, "learning_rate": 0.0001, "loss": 0.3395, "step": 430 }, { "epoch": 0.16808327762391367, "grad_norm": 0.4377366006374359, "learning_rate": 0.0001, "loss": 0.2566, "step": 440 }, { "epoch": 0.17190335211536625, "grad_norm": 1.2369561195373535, "learning_rate": 0.0001, "loss": 0.1672, "step": 450 }, { "epoch": 0.17190335211536625, "eval_loss": 0.3293677866458893, "eval_runtime": 35.1514, "eval_samples_per_second": 62.729, "eval_steps_per_second": 15.704, "step": 450 }, { "epoch": 0.17572342660681883, "grad_norm": 0.6541339755058289, "learning_rate": 0.0001, "loss": 0.503, "step": 460 }, { "epoch": 0.1795435010982714, "grad_norm": 0.5903088450431824, "learning_rate": 0.0001, "loss": 0.3761, "step": 470 }, { "epoch": 0.183363575589724, "grad_norm": 0.7121919989585876, "learning_rate": 0.0001, "loss": 0.3148, "step": 480 }, { "epoch": 0.18718365008117657, "grad_norm": 0.43517816066741943, "learning_rate": 0.0001, "loss": 0.2222, "step": 490 }, { "epoch": 0.19100372457262918, "grad_norm": 1.025693655014038, "learning_rate": 0.0001, "loss": 0.165, "step": 500 }, { "epoch": 0.19482379906408176, "grad_norm": 0.614997386932373, "learning_rate": 0.0001, "loss": 0.4716, "step": 510 }, { "epoch": 0.19864387355553434, "grad_norm": 0.6200920939445496, "learning_rate": 0.0001, "loss": 0.3565, "step": 520 }, { "epoch": 0.20246394804698692, "grad_norm": 0.603585422039032, "learning_rate": 0.0001, "loss": 0.28, "step": 530 }, { "epoch": 0.2062840225384395, "grad_norm": 0.5475538969039917, "learning_rate": 0.0001, "loss": 0.1779, "step": 540 }, { "epoch": 0.21010409702989208, "grad_norm": 0.8219286203384399, "learning_rate": 0.0001, "loss": 0.1212, "step": 550 }, { "epoch": 0.21392417152134466, "grad_norm": 0.5074372887611389, "learning_rate": 0.0001, "loss": 0.4237, "step": 560 }, { "epoch": 0.21774424601279724, "grad_norm": 0.7231194376945496, "learning_rate": 0.0001, "loss": 0.3688, "step": 570 }, { "epoch": 0.22156432050424985, "grad_norm": 0.6664884686470032, "learning_rate": 0.0001, "loss": 0.2747, "step": 580 }, { "epoch": 0.22538439499570243, "grad_norm": 0.5355179905891418, "learning_rate": 0.0001, "loss": 0.1644, "step": 590 }, { "epoch": 0.229204469487155, "grad_norm": 1.553965449333191, "learning_rate": 0.0001, "loss": 0.1921, "step": 600 }, { "epoch": 0.229204469487155, "eval_loss": 0.2835799753665924, "eval_runtime": 34.9833, "eval_samples_per_second": 63.03, "eval_steps_per_second": 15.779, "step": 600 }, { "epoch": 0.23302454397860758, "grad_norm": 0.6290215849876404, "learning_rate": 0.0001, "loss": 0.4529, "step": 610 }, { "epoch": 0.23684461847006016, "grad_norm": 0.653810441493988, "learning_rate": 0.0001, "loss": 0.314, "step": 620 }, { "epoch": 0.24066469296151274, "grad_norm": 0.6023784279823303, "learning_rate": 0.0001, "loss": 0.2705, "step": 630 }, { "epoch": 0.24448476745296532, "grad_norm": 0.4693503677845001, "learning_rate": 0.0001, "loss": 0.2088, "step": 640 }, { "epoch": 0.2483048419444179, "grad_norm": 1.2196199893951416, "learning_rate": 0.0001, "loss": 0.0955, "step": 650 }, { "epoch": 0.2521249164358705, "grad_norm": 0.610658586025238, "learning_rate": 0.0001, "loss": 0.4014, "step": 660 }, { "epoch": 0.2559449909273231, "grad_norm": 0.5513444542884827, "learning_rate": 0.0001, "loss": 0.3118, "step": 670 }, { "epoch": 0.25976506541877564, "grad_norm": 0.763668417930603, "learning_rate": 0.0001, "loss": 0.2552, "step": 680 }, { "epoch": 0.26358513991022825, "grad_norm": 0.5521487593650818, "learning_rate": 0.0001, "loss": 0.1542, "step": 690 }, { "epoch": 0.26740521440168086, "grad_norm": 0.6752519011497498, "learning_rate": 0.0001, "loss": 0.0955, "step": 700 }, { "epoch": 0.2712252888931334, "grad_norm": 0.5218967795372009, "learning_rate": 0.0001, "loss": 0.394, "step": 710 }, { "epoch": 0.275045363384586, "grad_norm": 0.6062911152839661, "learning_rate": 0.0001, "loss": 0.3374, "step": 720 }, { "epoch": 0.27886543787603857, "grad_norm": 0.4981124997138977, "learning_rate": 0.0001, "loss": 0.2798, "step": 730 }, { "epoch": 0.2826855123674912, "grad_norm": 0.6826938986778259, "learning_rate": 0.0001, "loss": 0.1354, "step": 740 }, { "epoch": 0.28650558685894373, "grad_norm": 0.7177520394325256, "learning_rate": 0.0001, "loss": 0.0724, "step": 750 }, { "epoch": 0.28650558685894373, "eval_loss": 0.2601197063922882, "eval_runtime": 35.5967, "eval_samples_per_second": 61.944, "eval_steps_per_second": 15.507, "step": 750 }, { "epoch": 0.29032566135039634, "grad_norm": 0.6504684686660767, "learning_rate": 0.0001, "loss": 0.4082, "step": 760 }, { "epoch": 0.29414573584184894, "grad_norm": 0.5540281534194946, "learning_rate": 0.0001, "loss": 0.3012, "step": 770 }, { "epoch": 0.2979658103333015, "grad_norm": 0.5473525524139404, "learning_rate": 0.0001, "loss": 0.2666, "step": 780 }, { "epoch": 0.3017858848247541, "grad_norm": 0.4220139980316162, "learning_rate": 0.0001, "loss": 0.1665, "step": 790 }, { "epoch": 0.30560595931620665, "grad_norm": 0.4354061484336853, "learning_rate": 0.0001, "loss": 0.0624, "step": 800 }, { "epoch": 0.30942603380765926, "grad_norm": 0.5176580548286438, "learning_rate": 0.0001, "loss": 0.3866, "step": 810 }, { "epoch": 0.3132461082991118, "grad_norm": 0.4800088703632355, "learning_rate": 0.0001, "loss": 0.2939, "step": 820 }, { "epoch": 0.3170661827905644, "grad_norm": 0.5025469064712524, "learning_rate": 0.0001, "loss": 0.2655, "step": 830 }, { "epoch": 0.320886257282017, "grad_norm": 0.48071274161338806, "learning_rate": 0.0001, "loss": 0.1041, "step": 840 }, { "epoch": 0.3247063317734696, "grad_norm": 0.6619635224342346, "learning_rate": 0.0001, "loss": 0.0745, "step": 850 }, { "epoch": 0.3285264062649222, "grad_norm": 0.6299232840538025, "learning_rate": 0.0001, "loss": 0.3734, "step": 860 }, { "epoch": 0.33234648075637474, "grad_norm": 0.5776097178459167, "learning_rate": 0.0001, "loss": 0.2873, "step": 870 }, { "epoch": 0.33616655524782735, "grad_norm": 0.8322681784629822, "learning_rate": 0.0001, "loss": 0.2167, "step": 880 }, { "epoch": 0.3399866297392799, "grad_norm": 0.45472800731658936, "learning_rate": 0.0001, "loss": 0.1204, "step": 890 }, { "epoch": 0.3438067042307325, "grad_norm": 1.1327775716781616, "learning_rate": 0.0001, "loss": 0.1111, "step": 900 }, { "epoch": 0.3438067042307325, "eval_loss": 0.26031652092933655, "eval_runtime": 35.1078, "eval_samples_per_second": 62.807, "eval_steps_per_second": 15.723, "step": 900 }, { "epoch": 0.34762677872218506, "grad_norm": 0.4318682551383972, "learning_rate": 0.0001, "loss": 0.407, "step": 910 }, { "epoch": 0.35144685321363767, "grad_norm": 0.6421833634376526, "learning_rate": 0.0001, "loss": 0.2742, "step": 920 }, { "epoch": 0.3552669277050903, "grad_norm": 0.6705676913261414, "learning_rate": 0.0001, "loss": 0.2662, "step": 930 }, { "epoch": 0.3590870021965428, "grad_norm": 0.35733574628829956, "learning_rate": 0.0001, "loss": 0.1159, "step": 940 }, { "epoch": 0.36290707668799543, "grad_norm": 0.4730457365512848, "learning_rate": 0.0001, "loss": 0.0811, "step": 950 }, { "epoch": 0.366727151179448, "grad_norm": 0.5667772889137268, "learning_rate": 0.0001, "loss": 0.3963, "step": 960 }, { "epoch": 0.3705472256709006, "grad_norm": 0.5279794931411743, "learning_rate": 0.0001, "loss": 0.3038, "step": 970 }, { "epoch": 0.37436730016235314, "grad_norm": 0.6120139360427856, "learning_rate": 0.0001, "loss": 0.246, "step": 980 }, { "epoch": 0.37818737465380575, "grad_norm": 0.2685094177722931, "learning_rate": 0.0001, "loss": 0.1254, "step": 990 }, { "epoch": 0.38200744914525836, "grad_norm": 0.5893439650535583, "learning_rate": 0.0001, "loss": 0.0665, "step": 1000 }, { "epoch": 0.3858275236367109, "grad_norm": 0.5573561787605286, "learning_rate": 0.0001, "loss": 0.3832, "step": 1010 }, { "epoch": 0.3896475981281635, "grad_norm": 0.588116466999054, "learning_rate": 0.0001, "loss": 0.2723, "step": 1020 }, { "epoch": 0.39346767261961607, "grad_norm": 0.5464149117469788, "learning_rate": 0.0001, "loss": 0.2458, "step": 1030 }, { "epoch": 0.3972877471110687, "grad_norm": 0.48240917921066284, "learning_rate": 0.0001, "loss": 0.0873, "step": 1040 }, { "epoch": 0.40110782160252123, "grad_norm": 0.7419745922088623, "learning_rate": 0.0001, "loss": 0.0542, "step": 1050 }, { "epoch": 0.40110782160252123, "eval_loss": 0.24739107489585876, "eval_runtime": 35.1279, "eval_samples_per_second": 62.771, "eval_steps_per_second": 15.714, "step": 1050 }, { "epoch": 0.40492789609397384, "grad_norm": 0.534675121307373, "learning_rate": 0.0001, "loss": 0.4066, "step": 1060 }, { "epoch": 0.4087479705854264, "grad_norm": 0.5710561871528625, "learning_rate": 0.0001, "loss": 0.2579, "step": 1070 }, { "epoch": 0.412568045076879, "grad_norm": 0.5247041583061218, "learning_rate": 0.0001, "loss": 0.2582, "step": 1080 }, { "epoch": 0.4163881195683316, "grad_norm": 0.43229806423187256, "learning_rate": 0.0001, "loss": 0.0625, "step": 1090 }, { "epoch": 0.42020819405978416, "grad_norm": 0.423566073179245, "learning_rate": 0.0001, "loss": 0.0501, "step": 1100 }, { "epoch": 0.42402826855123676, "grad_norm": 0.5252550840377808, "learning_rate": 0.0001, "loss": 0.4026, "step": 1110 }, { "epoch": 0.4278483430426893, "grad_norm": 0.5756693482398987, "learning_rate": 0.0001, "loss": 0.3035, "step": 1120 }, { "epoch": 0.4316684175341419, "grad_norm": 0.49913206696510315, "learning_rate": 0.0001, "loss": 0.2528, "step": 1130 }, { "epoch": 0.4354884920255945, "grad_norm": 0.23130396008491516, "learning_rate": 0.0001, "loss": 0.1748, "step": 1140 }, { "epoch": 0.4393085665170471, "grad_norm": 1.0342235565185547, "learning_rate": 0.0001, "loss": 0.0924, "step": 1150 }, { "epoch": 0.4431286410084997, "grad_norm": 0.46049270033836365, "learning_rate": 0.0001, "loss": 0.3391, "step": 1160 }, { "epoch": 0.44694871549995224, "grad_norm": 0.6771098971366882, "learning_rate": 0.0001, "loss": 0.2691, "step": 1170 }, { "epoch": 0.45076878999140485, "grad_norm": 0.5320389866828918, "learning_rate": 0.0001, "loss": 0.2373, "step": 1180 }, { "epoch": 0.4545888644828574, "grad_norm": 0.15962985157966614, "learning_rate": 0.0001, "loss": 0.0968, "step": 1190 }, { "epoch": 0.45840893897431, "grad_norm": 0.36463648080825806, "learning_rate": 0.0001, "loss": 0.0569, "step": 1200 }, { "epoch": 0.45840893897431, "eval_loss": 0.22201752662658691, "eval_runtime": 34.9762, "eval_samples_per_second": 63.043, "eval_steps_per_second": 15.782, "step": 1200 }, { "epoch": 0.46222901346576256, "grad_norm": 0.5054211020469666, "learning_rate": 0.0001, "loss": 0.3683, "step": 1210 }, { "epoch": 0.46604908795721517, "grad_norm": 0.619006335735321, "learning_rate": 0.0001, "loss": 0.3008, "step": 1220 }, { "epoch": 0.4698691624486678, "grad_norm": 0.49846726655960083, "learning_rate": 0.0001, "loss": 0.2396, "step": 1230 }, { "epoch": 0.47368923694012033, "grad_norm": 0.40756455063819885, "learning_rate": 0.0001, "loss": 0.1549, "step": 1240 }, { "epoch": 0.47750931143157294, "grad_norm": 0.29279765486717224, "learning_rate": 0.0001, "loss": 0.0508, "step": 1250 }, { "epoch": 0.4813293859230255, "grad_norm": 0.5386345982551575, "learning_rate": 0.0001, "loss": 0.3488, "step": 1260 }, { "epoch": 0.4851494604144781, "grad_norm": 0.5454983711242676, "learning_rate": 0.0001, "loss": 0.2472, "step": 1270 }, { "epoch": 0.48896953490593065, "grad_norm": 0.3308366537094116, "learning_rate": 0.0001, "loss": 0.2157, "step": 1280 }, { "epoch": 0.49278960939738325, "grad_norm": 0.5669858455657959, "learning_rate": 0.0001, "loss": 0.0796, "step": 1290 }, { "epoch": 0.4966096838888358, "grad_norm": 0.3885394334793091, "learning_rate": 0.0001, "loss": 0.0664, "step": 1300 }, { "epoch": 0.5004297583802885, "grad_norm": 0.5214186906814575, "learning_rate": 0.0001, "loss": 0.3623, "step": 1310 }, { "epoch": 0.504249832871741, "grad_norm": 0.45481380820274353, "learning_rate": 0.0001, "loss": 0.2668, "step": 1320 }, { "epoch": 0.5080699073631936, "grad_norm": 0.3827633559703827, "learning_rate": 0.0001, "loss": 0.2007, "step": 1330 }, { "epoch": 0.5118899818546462, "grad_norm": 0.2231357842683792, "learning_rate": 0.0001, "loss": 0.0666, "step": 1340 }, { "epoch": 0.5157100563460988, "grad_norm": 0.282650887966156, "learning_rate": 0.0001, "loss": 0.0565, "step": 1350 }, { "epoch": 0.5157100563460988, "eval_loss": 0.21465308964252472, "eval_runtime": 35.9971, "eval_samples_per_second": 61.255, "eval_steps_per_second": 15.335, "step": 1350 }, { "epoch": 0.5195301308375513, "grad_norm": 0.5369262099266052, "learning_rate": 0.0001, "loss": 0.3693, "step": 1360 }, { "epoch": 0.5233502053290039, "grad_norm": 0.6937965154647827, "learning_rate": 0.0001, "loss": 0.2701, "step": 1370 }, { "epoch": 0.5271702798204565, "grad_norm": 0.5891834497451782, "learning_rate": 0.0001, "loss": 0.2333, "step": 1380 }, { "epoch": 0.5309903543119091, "grad_norm": 0.3390977382659912, "learning_rate": 0.0001, "loss": 0.0932, "step": 1390 }, { "epoch": 0.5348104288033617, "grad_norm": 0.406382292509079, "learning_rate": 0.0001, "loss": 0.0673, "step": 1400 }, { "epoch": 0.5386305032948142, "grad_norm": 0.44265902042388916, "learning_rate": 0.0001, "loss": 0.3632, "step": 1410 }, { "epoch": 0.5424505777862668, "grad_norm": 0.6201804280281067, "learning_rate": 0.0001, "loss": 0.2502, "step": 1420 }, { "epoch": 0.5462706522777194, "grad_norm": 0.5941364765167236, "learning_rate": 0.0001, "loss": 0.225, "step": 1430 }, { "epoch": 0.550090726769172, "grad_norm": 0.12663634121418, "learning_rate": 0.0001, "loss": 0.0491, "step": 1440 }, { "epoch": 0.5539108012606245, "grad_norm": 1.1134328842163086, "learning_rate": 0.0001, "loss": 0.05, "step": 1450 }, { "epoch": 0.5577308757520771, "grad_norm": 0.49427467584609985, "learning_rate": 0.0001, "loss": 0.338, "step": 1460 }, { "epoch": 0.5615509502435297, "grad_norm": 0.5699579119682312, "learning_rate": 0.0001, "loss": 0.228, "step": 1470 }, { "epoch": 0.5653710247349824, "grad_norm": 0.4670332670211792, "learning_rate": 0.0001, "loss": 0.2299, "step": 1480 }, { "epoch": 0.569191099226435, "grad_norm": 0.6539635062217712, "learning_rate": 0.0001, "loss": 0.0879, "step": 1490 }, { "epoch": 0.5730111737178875, "grad_norm": 1.049932599067688, "learning_rate": 0.0001, "loss": 0.0449, "step": 1500 }, { "epoch": 0.5730111737178875, "eval_loss": 0.21241089701652527, "eval_runtime": 35.057, "eval_samples_per_second": 62.897, "eval_steps_per_second": 15.746, "step": 1500 }, { "epoch": 0.5768312482093401, "grad_norm": 0.5054765939712524, "learning_rate": 0.0001, "loss": 0.329, "step": 1510 }, { "epoch": 0.5806513227007927, "grad_norm": 0.561852216720581, "learning_rate": 0.0001, "loss": 0.2458, "step": 1520 }, { "epoch": 0.5844713971922453, "grad_norm": 0.47617360949516296, "learning_rate": 0.0001, "loss": 0.2562, "step": 1530 }, { "epoch": 0.5882914716836979, "grad_norm": 0.2953488230705261, "learning_rate": 0.0001, "loss": 0.0782, "step": 1540 }, { "epoch": 0.5921115461751504, "grad_norm": 0.5153560638427734, "learning_rate": 0.0001, "loss": 0.0502, "step": 1550 }, { "epoch": 0.595931620666603, "grad_norm": 0.45585310459136963, "learning_rate": 0.0001, "loss": 0.3414, "step": 1560 }, { "epoch": 0.5997516951580556, "grad_norm": 0.6329523324966431, "learning_rate": 0.0001, "loss": 0.2526, "step": 1570 }, { "epoch": 0.6035717696495082, "grad_norm": 0.3475467562675476, "learning_rate": 0.0001, "loss": 0.2111, "step": 1580 }, { "epoch": 0.6073918441409607, "grad_norm": 0.18510498106479645, "learning_rate": 0.0001, "loss": 0.1268, "step": 1590 }, { "epoch": 0.6112119186324133, "grad_norm": 0.43574023246765137, "learning_rate": 0.0001, "loss": 0.065, "step": 1600 }, { "epoch": 0.6150319931238659, "grad_norm": 0.48621150851249695, "learning_rate": 0.0001, "loss": 0.3797, "step": 1610 }, { "epoch": 0.6188520676153185, "grad_norm": 0.49730178713798523, "learning_rate": 0.0001, "loss": 0.2646, "step": 1620 }, { "epoch": 0.6226721421067711, "grad_norm": 0.6853517293930054, "learning_rate": 0.0001, "loss": 0.2254, "step": 1630 }, { "epoch": 0.6264922165982236, "grad_norm": 0.08808715641498566, "learning_rate": 0.0001, "loss": 0.0805, "step": 1640 }, { "epoch": 0.6303122910896762, "grad_norm": 0.4531610608100891, "learning_rate": 0.0001, "loss": 0.0834, "step": 1650 }, { "epoch": 0.6303122910896762, "eval_loss": 0.21575042605400085, "eval_runtime": 36.2063, "eval_samples_per_second": 60.901, "eval_steps_per_second": 15.246, "step": 1650 }, { "epoch": 0.6341323655811288, "grad_norm": 0.4218103885650635, "learning_rate": 0.0001, "loss": 0.3626, "step": 1660 }, { "epoch": 0.6379524400725815, "grad_norm": 0.359761118888855, "learning_rate": 0.0001, "loss": 0.2254, "step": 1670 }, { "epoch": 0.641772514564034, "grad_norm": 0.5094336271286011, "learning_rate": 0.0001, "loss": 0.2188, "step": 1680 }, { "epoch": 0.6455925890554866, "grad_norm": 0.17791685461997986, "learning_rate": 0.0001, "loss": 0.0839, "step": 1690 }, { "epoch": 0.6494126635469392, "grad_norm": 0.2568763792514801, "learning_rate": 0.0001, "loss": 0.0189, "step": 1700 }, { "epoch": 0.6532327380383918, "grad_norm": 0.4481312036514282, "learning_rate": 0.0001, "loss": 0.3123, "step": 1710 }, { "epoch": 0.6570528125298444, "grad_norm": 0.4391859173774719, "learning_rate": 0.0001, "loss": 0.2328, "step": 1720 }, { "epoch": 0.6608728870212969, "grad_norm": 0.6863099932670593, "learning_rate": 0.0001, "loss": 0.2108, "step": 1730 }, { "epoch": 0.6646929615127495, "grad_norm": 0.19651475548744202, "learning_rate": 0.0001, "loss": 0.0541, "step": 1740 }, { "epoch": 0.6685130360042021, "grad_norm": 0.19612224400043488, "learning_rate": 0.0001, "loss": 0.0642, "step": 1750 }, { "epoch": 0.6723331104956547, "grad_norm": 0.4632919728755951, "learning_rate": 0.0001, "loss": 0.3183, "step": 1760 }, { "epoch": 0.6761531849871073, "grad_norm": 0.623332679271698, "learning_rate": 0.0001, "loss": 0.2384, "step": 1770 }, { "epoch": 0.6799732594785598, "grad_norm": 0.4989670515060425, "learning_rate": 0.0001, "loss": 0.225, "step": 1780 }, { "epoch": 0.6837933339700124, "grad_norm": 0.2829729914665222, "learning_rate": 0.0001, "loss": 0.0445, "step": 1790 }, { "epoch": 0.687613408461465, "grad_norm": 0.18236784636974335, "learning_rate": 0.0001, "loss": 0.0274, "step": 1800 }, { "epoch": 0.687613408461465, "eval_loss": 0.19149594008922577, "eval_runtime": 35.0286, "eval_samples_per_second": 62.949, "eval_steps_per_second": 15.759, "step": 1800 } ], "logging_steps": 10, "max_steps": 1800, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.440952814174208e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }