{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.1642619311875695, "eval_steps": 500, "global_step": 19500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05549389567147614, "grad_norm": 5.1054205894470215, "learning_rate": 0.0009815020347761745, "loss": 2.823, "step": 500 }, { "epoch": 0.11098779134295228, "grad_norm": 5.702259540557861, "learning_rate": 0.0009630040695523493, "loss": 2.5297, "step": 1000 }, { "epoch": 0.16648168701442842, "grad_norm": 2.288583278656006, "learning_rate": 0.0009445061043285239, "loss": 2.2922, "step": 1500 }, { "epoch": 0.22197558268590456, "grad_norm": 1.2640436887741089, "learning_rate": 0.0009260081391046985, "loss": 2.2693, "step": 2000 }, { "epoch": 0.27746947835738067, "grad_norm": 2.6587376594543457, "learning_rate": 0.0009075101738808731, "loss": 2.1495, "step": 2500 }, { "epoch": 0.33296337402885684, "grad_norm": 3.5381200313568115, "learning_rate": 0.0008890122086570478, "loss": 2.0572, "step": 3000 }, { "epoch": 0.38845726970033295, "grad_norm": 5.533036708831787, "learning_rate": 0.0008705142434332224, "loss": 2.0025, "step": 3500 }, { "epoch": 0.4439511653718091, "grad_norm": 5.4949541091918945, "learning_rate": 0.000852016278209397, "loss": 1.9598, "step": 4000 }, { "epoch": 0.49944506104328523, "grad_norm": 1.909883737564087, "learning_rate": 0.0008335183129855716, "loss": 1.9381, "step": 4500 }, { "epoch": 0.5549389567147613, "grad_norm": 5.691808223724365, "learning_rate": 0.0008150203477617462, "loss": 1.8509, "step": 5000 }, { "epoch": 0.6104328523862376, "grad_norm": 2.442469358444214, "learning_rate": 0.0007965223825379209, "loss": 1.7988, "step": 5500 }, { "epoch": 0.6659267480577137, "grad_norm": 2.9826149940490723, "learning_rate": 0.0007780244173140955, "loss": 1.7816, "step": 6000 }, { "epoch": 0.7214206437291898, "grad_norm": 2.3049557209014893, "learning_rate": 0.00075952645209027, "loss": 1.7736, "step": 6500 }, { "epoch": 0.7769145394006659, "grad_norm": 2.855045795440674, "learning_rate": 0.0007410284868664448, "loss": 1.7075, "step": 7000 }, { "epoch": 0.832408435072142, "grad_norm": 3.2882728576660156, "learning_rate": 0.0007225305216426194, "loss": 1.701, "step": 7500 }, { "epoch": 0.8879023307436182, "grad_norm": 2.1633858680725098, "learning_rate": 0.0007040325564187939, "loss": 1.6848, "step": 8000 }, { "epoch": 0.9433962264150944, "grad_norm": 2.723276138305664, "learning_rate": 0.0006855345911949685, "loss": 1.6152, "step": 8500 }, { "epoch": 0.9988901220865705, "grad_norm": 2.73237943649292, "learning_rate": 0.0006670366259711433, "loss": 1.5959, "step": 9000 }, { "epoch": 1.0543840177580466, "grad_norm": 2.634521007537842, "learning_rate": 0.0006485386607473178, "loss": 1.4009, "step": 9500 }, { "epoch": 1.1098779134295227, "grad_norm": 1.5136901140213013, "learning_rate": 0.0006300406955234924, "loss": 1.3877, "step": 10000 }, { "epoch": 1.1653718091009988, "grad_norm": 2.1514732837677, "learning_rate": 0.000611542730299667, "loss": 1.3724, "step": 10500 }, { "epoch": 1.220865704772475, "grad_norm": 2.6399548053741455, "learning_rate": 0.0005930447650758417, "loss": 1.335, "step": 11000 }, { "epoch": 1.2763596004439512, "grad_norm": 1.5254985094070435, "learning_rate": 0.0005745467998520163, "loss": 1.3886, "step": 11500 }, { "epoch": 1.3318534961154274, "grad_norm": 1.9089288711547852, "learning_rate": 0.0005560488346281908, "loss": 1.3439, "step": 12000 }, { "epoch": 1.3873473917869035, "grad_norm": 2.2264904975891113, "learning_rate": 0.0005375508694043655, "loss": 1.3081, "step": 12500 }, { "epoch": 1.4428412874583796, "grad_norm": 2.0363707542419434, "learning_rate": 0.0005190529041805402, "loss": 1.3539, "step": 13000 }, { "epoch": 1.4983351831298557, "grad_norm": 1.3880550861358643, "learning_rate": 0.0005005549389567147, "loss": 1.3054, "step": 13500 }, { "epoch": 1.5538290788013318, "grad_norm": 1.9909353256225586, "learning_rate": 0.00048205697373288937, "loss": 1.2664, "step": 14000 }, { "epoch": 1.609322974472808, "grad_norm": 1.8012514114379883, "learning_rate": 0.000463559008509064, "loss": 1.3344, "step": 14500 }, { "epoch": 1.6648168701442843, "grad_norm": 1.5090456008911133, "learning_rate": 0.00044506104328523863, "loss": 1.3274, "step": 15000 }, { "epoch": 1.7203107658157601, "grad_norm": 2.566316843032837, "learning_rate": 0.00042656307806141327, "loss": 1.2629, "step": 15500 }, { "epoch": 1.7758046614872365, "grad_norm": 1.7907458543777466, "learning_rate": 0.00040806511283758784, "loss": 1.3041, "step": 16000 }, { "epoch": 1.8312985571587126, "grad_norm": 1.4483698606491089, "learning_rate": 0.0003895671476137625, "loss": 1.2407, "step": 16500 }, { "epoch": 1.8867924528301887, "grad_norm": 2.474891424179077, "learning_rate": 0.0003710691823899371, "loss": 1.2555, "step": 17000 }, { "epoch": 1.9422863485016648, "grad_norm": 1.0690557956695557, "learning_rate": 0.00035257121716611174, "loss": 1.2407, "step": 17500 }, { "epoch": 1.997780244173141, "grad_norm": 1.2744386196136475, "learning_rate": 0.00033407325194228637, "loss": 1.1843, "step": 18000 }, { "epoch": 2.0532741398446173, "grad_norm": 0.9873028993606567, "learning_rate": 0.000315575286718461, "loss": 0.9438, "step": 18500 }, { "epoch": 2.108768035516093, "grad_norm": 3.0536534786224365, "learning_rate": 0.00029707732149463563, "loss": 0.918, "step": 19000 }, { "epoch": 2.1642619311875695, "grad_norm": 1.4433006048202515, "learning_rate": 0.0002785793562708102, "loss": 0.9162, "step": 19500 } ], "logging_steps": 500, "max_steps": 27030, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.074621752888525e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }