{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 21460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11649580615097857, "grad_norm": 2.5263092517852783, "learning_rate": 4.883504193849022e-05, "loss": 0.612, "step": 500 }, { "epoch": 0.23299161230195714, "grad_norm": 2.820680856704712, "learning_rate": 4.767008387698043e-05, "loss": 0.4835, "step": 1000 }, { "epoch": 0.3494874184529357, "grad_norm": 2.4191653728485107, "learning_rate": 4.650512581547065e-05, "loss": 0.4403, "step": 1500 }, { "epoch": 0.4659832246039143, "grad_norm": 2.0218257904052734, "learning_rate": 4.534016775396086e-05, "loss": 0.4163, "step": 2000 }, { "epoch": 0.5824790307548928, "grad_norm": 2.193092107772827, "learning_rate": 4.4175209692451076e-05, "loss": 0.4056, "step": 2500 }, { "epoch": 0.6989748369058714, "grad_norm": 2.5563910007476807, "learning_rate": 4.3010251630941286e-05, "loss": 0.3903, "step": 3000 }, { "epoch": 0.8154706430568499, "grad_norm": 2.4142184257507324, "learning_rate": 4.1845293569431504e-05, "loss": 0.3818, "step": 3500 }, { "epoch": 0.9319664492078286, "grad_norm": 2.6781787872314453, "learning_rate": 4.068033550792172e-05, "loss": 0.3752, "step": 4000 }, { "epoch": 1.048462255358807, "grad_norm": 3.981492757797241, "learning_rate": 3.951537744641193e-05, "loss": 0.3407, "step": 4500 }, { "epoch": 1.1649580615097856, "grad_norm": 2.2366013526916504, "learning_rate": 3.835041938490215e-05, "loss": 0.3079, "step": 5000 }, { "epoch": 1.281453867660764, "grad_norm": 2.7982800006866455, "learning_rate": 3.718546132339236e-05, "loss": 0.3003, "step": 5500 }, { "epoch": 1.3979496738117427, "grad_norm": 2.147766590118408, "learning_rate": 3.602050326188258e-05, "loss": 0.3042, "step": 6000 }, { "epoch": 1.5144454799627214, "grad_norm": 3.3207345008850098, "learning_rate": 3.485554520037279e-05, "loss": 0.2994, "step": 6500 }, { "epoch": 1.6309412861136998, "grad_norm": 2.8516502380371094, "learning_rate": 3.3690587138863e-05, "loss": 0.3019, "step": 7000 }, { "epoch": 1.7474370922646785, "grad_norm": 2.4767708778381348, "learning_rate": 3.2525629077353216e-05, "loss": 0.3029, "step": 7500 }, { "epoch": 1.8639328984156571, "grad_norm": 2.9029157161712646, "learning_rate": 3.1360671015843426e-05, "loss": 0.2971, "step": 8000 }, { "epoch": 1.9804287045666356, "grad_norm": 3.0262856483459473, "learning_rate": 3.0195712954333644e-05, "loss": 0.2926, "step": 8500 }, { "epoch": 2.096924510717614, "grad_norm": 2.862046957015991, "learning_rate": 2.9030754892823857e-05, "loss": 0.2322, "step": 9000 }, { "epoch": 2.213420316868593, "grad_norm": 2.759274482727051, "learning_rate": 2.786579683131407e-05, "loss": 0.2295, "step": 9500 }, { "epoch": 2.3299161230195713, "grad_norm": 3.3560988903045654, "learning_rate": 2.670083876980429e-05, "loss": 0.2288, "step": 10000 }, { "epoch": 2.4464119291705497, "grad_norm": 2.7938876152038574, "learning_rate": 2.5535880708294503e-05, "loss": 0.228, "step": 10500 }, { "epoch": 2.562907735321528, "grad_norm": 3.100569486618042, "learning_rate": 2.4370922646784717e-05, "loss": 0.2288, "step": 11000 }, { "epoch": 2.679403541472507, "grad_norm": 2.6765120029449463, "learning_rate": 2.320596458527493e-05, "loss": 0.2307, "step": 11500 }, { "epoch": 2.7958993476234855, "grad_norm": 2.7146239280700684, "learning_rate": 2.2041006523765145e-05, "loss": 0.2275, "step": 12000 }, { "epoch": 2.9123951537744643, "grad_norm": 3.6457717418670654, "learning_rate": 2.087604846225536e-05, "loss": 0.2328, "step": 12500 }, { "epoch": 3.0288909599254428, "grad_norm": 3.6852147579193115, "learning_rate": 1.9711090400745573e-05, "loss": 0.2172, "step": 13000 }, { "epoch": 3.145386766076421, "grad_norm": 3.0440313816070557, "learning_rate": 1.854613233923579e-05, "loss": 0.1749, "step": 13500 }, { "epoch": 3.2618825722273996, "grad_norm": 2.364924192428589, "learning_rate": 1.7381174277726004e-05, "loss": 0.1767, "step": 14000 }, { "epoch": 3.3783783783783785, "grad_norm": 2.6927711963653564, "learning_rate": 1.6216216216216218e-05, "loss": 0.172, "step": 14500 }, { "epoch": 3.494874184529357, "grad_norm": 3.143772602081299, "learning_rate": 1.5051258154706432e-05, "loss": 0.1787, "step": 15000 }, { "epoch": 3.6113699906803354, "grad_norm": 2.1713764667510986, "learning_rate": 1.3886300093196648e-05, "loss": 0.1724, "step": 15500 }, { "epoch": 3.7278657968313142, "grad_norm": 4.549355506896973, "learning_rate": 1.2721342031686858e-05, "loss": 0.1744, "step": 16000 }, { "epoch": 3.8443616029822927, "grad_norm": 1.8248041868209839, "learning_rate": 1.1556383970177074e-05, "loss": 0.1781, "step": 16500 }, { "epoch": 3.960857409133271, "grad_norm": 3.255676507949829, "learning_rate": 1.039142590866729e-05, "loss": 0.1752, "step": 17000 }, { "epoch": 4.0773532152842495, "grad_norm": 4.369929790496826, "learning_rate": 9.226467847157502e-06, "loss": 0.1491, "step": 17500 }, { "epoch": 4.193849021435228, "grad_norm": 3.5270001888275146, "learning_rate": 8.061509785647716e-06, "loss": 0.1355, "step": 18000 }, { "epoch": 4.310344827586207, "grad_norm": 3.747748613357544, "learning_rate": 6.896551724137932e-06, "loss": 0.1402, "step": 18500 }, { "epoch": 4.426840633737186, "grad_norm": 2.4927761554718018, "learning_rate": 5.731593662628146e-06, "loss": 0.1341, "step": 19000 }, { "epoch": 4.543336439888164, "grad_norm": 3.129467010498047, "learning_rate": 4.56663560111836e-06, "loss": 0.1349, "step": 19500 }, { "epoch": 4.659832246039143, "grad_norm": 5.417072772979736, "learning_rate": 3.401677539608574e-06, "loss": 0.1322, "step": 20000 }, { "epoch": 4.776328052190121, "grad_norm": 3.4678492546081543, "learning_rate": 2.2367194780987884e-06, "loss": 0.1402, "step": 20500 }, { "epoch": 4.8928238583410995, "grad_norm": 3.649914026260376, "learning_rate": 1.0717614165890028e-06, "loss": 0.1375, "step": 21000 } ], "logging_steps": 500, "max_steps": 21460, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.096814400558976e+16, "train_batch_size": 128, "trial_name": null, "trial_params": null }