{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1261.8181818181818, "eval_steps": 500, "global_step": 3470, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 36.36363636363637, "grad_norm": 3.9305503368377686, "learning_rate": 0.00019992595626374085, "loss": 3.5508, "step": 100 }, { "epoch": 72.72727272727273, "grad_norm": 7.143374919891357, "learning_rate": 0.00019969786478821292, "loss": 1.0142, "step": 200 }, { "epoch": 109.0909090909091, "grad_norm": 4.481863498687744, "learning_rate": 0.00019932062382607466, "loss": 0.5034, "step": 300 }, { "epoch": 145.45454545454547, "grad_norm": 5.447987079620361, "learning_rate": 0.00019878719501520854, "loss": 0.3536, "step": 400 }, { "epoch": 181.8181818181818, "grad_norm": 6.030304431915283, "learning_rate": 0.00019810144350986773, "loss": 0.2281, "step": 500 }, { "epoch": 218.1818181818182, "grad_norm": 3.293713331222534, "learning_rate": 0.0001972644266891692, "loss": 0.1648, "step": 600 }, { "epoch": 254.54545454545453, "grad_norm": 3.844909191131592, "learning_rate": 0.0001962774351723822, "loss": 0.1083, "step": 700 }, { "epoch": 290.90909090909093, "grad_norm": 6.67781400680542, "learning_rate": 0.00019514199082888708, "loss": 0.0708, "step": 800 }, { "epoch": 327.27272727272725, "grad_norm": 0.4285012483596802, "learning_rate": 0.00019385984443156292, "loss": 0.0539, "step": 900 }, { "epoch": 363.6363636363636, "grad_norm": 9.999969482421875, "learning_rate": 0.00019243297295722252, "loss": 0.0471, "step": 1000 }, { "epoch": 400.0, "grad_norm": 0.11577145010232925, "learning_rate": 0.00019086357653825758, "loss": 0.0361, "step": 1100 }, { "epoch": 436.3636363636364, "grad_norm": 0.04236361011862755, "learning_rate": 0.00018915407507019406, "loss": 0.0202, "step": 1200 }, { "epoch": 472.72727272727275, "grad_norm": 0.09598010778427124, "learning_rate": 0.0001873071044803886, "loss": 0.0204, "step": 1300 }, { "epoch": 509.09090909090907, "grad_norm": 0.04872431233525276, "learning_rate": 0.00018532551266361953, "loss": 0.0127, "step": 1400 }, { "epoch": 545.4545454545455, "grad_norm": 1.3734139204025269, "learning_rate": 0.00018321235509083966, "loss": 0.0125, "step": 1500 }, { "epoch": 581.8181818181819, "grad_norm": 0.058050643652677536, "learning_rate": 0.00018097089009786154, "loss": 0.0109, "step": 1600 }, { "epoch": 618.1818181818181, "grad_norm": 0.2065199464559555, "learning_rate": 0.0001786045738612397, "loss": 0.0054, "step": 1700 }, { "epoch": 654.5454545454545, "grad_norm": 0.011252381838858128, "learning_rate": 0.000176117055069097, "loss": 0.0092, "step": 1800 }, { "epoch": 690.9090909090909, "grad_norm": 0.2677134573459625, "learning_rate": 0.00017351216929511202, "loss": 0.0069, "step": 1900 }, { "epoch": 727.2727272727273, "grad_norm": 0.19302548468112946, "learning_rate": 0.00017079393308434222, "loss": 0.0052, "step": 2000 }, { "epoch": 763.6363636363636, "grad_norm": 0.09640073031187057, "learning_rate": 0.000167966537760003, "loss": 0.0056, "step": 2100 }, { "epoch": 800.0, "grad_norm": 0.05480790510773659, "learning_rate": 0.00016503434296075077, "loss": 0.0041, "step": 2200 }, { "epoch": 836.3636363636364, "grad_norm": 0.10284140706062317, "learning_rate": 0.00016200186991843633, "loss": 0.0026, "step": 2300 }, { "epoch": 872.7272727272727, "grad_norm": 0.03779178485274315, "learning_rate": 0.0001588737944866928, "loss": 0.0077, "step": 2400 }, { "epoch": 909.0909090909091, "grad_norm": 0.040165312588214874, "learning_rate": 0.00015565493993110856, "loss": 0.0068, "step": 2500 }, { "epoch": 945.4545454545455, "grad_norm": 0.0260086078196764, "learning_rate": 0.00015235026949210102, "loss": 0.004, "step": 2600 }, { "epoch": 981.8181818181819, "grad_norm": 0.0010075848549604416, "learning_rate": 0.0001489648787319599, "loss": 0.0029, "step": 2700 }, { "epoch": 1018.1818181818181, "grad_norm": 0.1857517808675766, "learning_rate": 0.000145538952885112, "loss": 0.0025, "step": 2800 }, { "epoch": 1054.5454545454545, "grad_norm": 0.009264905005693436, "learning_rate": 0.00014200857284118066, "loss": 0.0032, "step": 2900 }, { "epoch": 1090.909090909091, "grad_norm": 0.005081487353891134, "learning_rate": 0.00013841341862277026, "loss": 0.0023, "step": 3000 }, { "epoch": 1127.2727272727273, "grad_norm": 0.32677099108695984, "learning_rate": 0.0001347590336971037, "loss": 0.0055, "step": 3100 }, { "epoch": 1163.6363636363637, "grad_norm": 0.014215439558029175, "learning_rate": 0.00013105105286086123, "loss": 0.0005, "step": 3200 }, { "epoch": 1200.0, "grad_norm": 0.015997236594557762, "learning_rate": 0.00012729519355173254, "loss": 0.0009, "step": 3300 }, { "epoch": 1236.3636363636363, "grad_norm": 0.668640673160553, "learning_rate": 0.00012349724703254215, "loss": 0.0024, "step": 3400 } ], "logging_steps": 100, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 4000, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0175353452509184e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }