{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9971883786316776, "eval_steps": 67, "global_step": 266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037488284910965324, "eval_loss": 2.361954689025879, "eval_runtime": 3.5851, "eval_samples_per_second": 125.521, "eval_steps_per_second": 15.899, "step": 1 }, { "epoch": 0.01874414245548266, "grad_norm": 0.1121288314461708, "learning_rate": 1.6666666666666667e-05, "loss": 2.0053, "step": 5 }, { "epoch": 0.03748828491096532, "grad_norm": 0.2451150268316269, "learning_rate": 3.3333333333333335e-05, "loss": 2.2996, "step": 10 }, { "epoch": 0.056232427366447985, "grad_norm": 0.10745533555746078, "learning_rate": 5e-05, "loss": 2.2399, "step": 15 }, { "epoch": 0.07497656982193064, "grad_norm": 0.21655316650867462, "learning_rate": 6.666666666666667e-05, "loss": 2.1764, "step": 20 }, { "epoch": 0.09372071227741331, "grad_norm": 0.8079115152359009, "learning_rate": 8.333333333333334e-05, "loss": 2.6489, "step": 25 }, { "epoch": 0.11246485473289597, "grad_norm": 0.14432325959205627, "learning_rate": 0.0001, "loss": 2.0175, "step": 30 }, { "epoch": 0.13120899718837864, "grad_norm": 0.33471807837486267, "learning_rate": 9.989021978333995e-05, "loss": 2.1727, "step": 35 }, { "epoch": 0.14995313964386128, "grad_norm": 0.18594202399253845, "learning_rate": 9.956136120119858e-05, "loss": 2.1616, "step": 40 }, { "epoch": 0.16869728209934395, "grad_norm": 0.25360578298568726, "learning_rate": 9.901486834023182e-05, "loss": 2.0248, "step": 45 }, { "epoch": 0.18744142455482662, "grad_norm": 0.8163858652114868, "learning_rate": 9.825314096462685e-05, "loss": 2.3624, "step": 50 }, { "epoch": 0.20618556701030927, "grad_norm": 0.21278133988380432, "learning_rate": 9.72795239782369e-05, "loss": 1.9549, "step": 55 }, { "epoch": 0.22492970946579194, "grad_norm": 0.3845430016517639, "learning_rate": 9.609829273641034e-05, "loss": 2.1518, "step": 60 }, { "epoch": 0.2436738519212746, "grad_norm": 0.20262780785560608, "learning_rate": 9.47146342720133e-05, "loss": 2.1145, "step": 65 }, { "epoch": 0.2511715089034677, "eval_loss": 2.1052558422088623, "eval_runtime": 3.565, "eval_samples_per_second": 126.226, "eval_steps_per_second": 15.989, "step": 67 }, { "epoch": 0.2624179943767573, "grad_norm": 0.27742379903793335, "learning_rate": 9.3134624518086e-05, "loss": 1.9983, "step": 70 }, { "epoch": 0.28116213683223995, "grad_norm": 1.399202585220337, "learning_rate": 9.136520162715287e-05, "loss": 2.3956, "step": 75 }, { "epoch": 0.29990627928772257, "grad_norm": 0.2662142813205719, "learning_rate": 8.94141355043471e-05, "loss": 1.8996, "step": 80 }, { "epoch": 0.31865042174320524, "grad_norm": 0.47221991419792175, "learning_rate": 8.728999368813591e-05, "loss": 2.1171, "step": 85 }, { "epoch": 0.3373945641986879, "grad_norm": 0.20478223264217377, "learning_rate": 8.500210372847127e-05, "loss": 2.0381, "step": 90 }, { "epoch": 0.3561387066541706, "grad_norm": 0.3004133105278015, "learning_rate": 8.256051222757188e-05, "loss": 1.9738, "step": 95 }, { "epoch": 0.37488284910965325, "grad_norm": 1.0329713821411133, "learning_rate": 7.997594072319625e-05, "loss": 2.2227, "step": 100 }, { "epoch": 0.3936269915651359, "grad_norm": 0.2629578709602356, "learning_rate": 7.725973860813338e-05, "loss": 1.8634, "step": 105 }, { "epoch": 0.41237113402061853, "grad_norm": 0.46902984380722046, "learning_rate": 7.442383329265062e-05, "loss": 2.0261, "step": 110 }, { "epoch": 0.4311152764761012, "grad_norm": 0.23315371572971344, "learning_rate": 7.14806778287464e-05, "loss": 2.0076, "step": 115 }, { "epoch": 0.4498594189315839, "grad_norm": 0.3503168523311615, "learning_rate": 6.844319622620039e-05, "loss": 1.9312, "step": 120 }, { "epoch": 0.46860356138706655, "grad_norm": 1.228668212890625, "learning_rate": 6.532472670054974e-05, "loss": 2.2572, "step": 125 }, { "epoch": 0.4873477038425492, "grad_norm": 0.27969205379486084, "learning_rate": 6.213896310220139e-05, "loss": 1.8566, "step": 130 }, { "epoch": 0.5023430178069354, "eval_loss": 2.028304100036621, "eval_runtime": 3.6808, "eval_samples_per_second": 122.257, "eval_steps_per_second": 15.486, "step": 134 }, { "epoch": 0.5060918462980318, "grad_norm": 0.6361961364746094, "learning_rate": 5.889989478387753e-05, "loss": 2.02, "step": 135 }, { "epoch": 0.5248359887535146, "grad_norm": 0.24769911170005798, "learning_rate": 5.5621745170448616e-05, "loss": 1.9486, "step": 140 }, { "epoch": 0.5435801312089972, "grad_norm": 0.3790437877178192, "learning_rate": 5.2318909300906926e-05, "loss": 2.012, "step": 145 }, { "epoch": 0.5623242736644799, "grad_norm": 1.3477414846420288, "learning_rate": 4.900589061674649e-05, "loss": 2.1598, "step": 150 }, { "epoch": 0.5810684161199625, "grad_norm": 0.3055458664894104, "learning_rate": 4.569723727432517e-05, "loss": 1.7986, "step": 155 }, { "epoch": 0.5998125585754451, "grad_norm": 0.5476722121238708, "learning_rate": 4.240747826087429e-05, "loss": 2.005, "step": 160 }, { "epoch": 0.6185567010309279, "grad_norm": 0.2658919095993042, "learning_rate": 3.91510595946841e-05, "loss": 1.9485, "step": 165 }, { "epoch": 0.6373008434864105, "grad_norm": 0.4225824177265167, "learning_rate": 3.5942280889623026e-05, "loss": 1.952, "step": 170 }, { "epoch": 0.6560449859418932, "grad_norm": 1.3499469757080078, "learning_rate": 3.27952325625493e-05, "loss": 2.194, "step": 175 }, { "epoch": 0.6747891283973758, "grad_norm": 0.3095148801803589, "learning_rate": 2.9723733959350307e-05, "loss": 1.7729, "step": 180 }, { "epoch": 0.6935332708528584, "grad_norm": 0.6386678218841553, "learning_rate": 2.674127267131131e-05, "loss": 2.0403, "step": 185 }, { "epoch": 0.7122774133083412, "grad_norm": 0.2703355848789215, "learning_rate": 2.3860945308287552e-05, "loss": 1.9099, "step": 190 }, { "epoch": 0.7310215557638238, "grad_norm": 0.38986149430274963, "learning_rate": 2.1095399988757574e-05, "loss": 1.9077, "step": 195 }, { "epoch": 0.7497656982193065, "grad_norm": 1.2410173416137695, "learning_rate": 1.8456780799295886e-05, "loss": 2.1793, "step": 200 }, { "epoch": 0.753514526710403, "eval_loss": 1.9828202724456787, "eval_runtime": 3.6018, "eval_samples_per_second": 124.937, "eval_steps_per_second": 15.825, "step": 201 }, { "epoch": 0.7685098406747891, "grad_norm": 0.3099184036254883, "learning_rate": 1.5956674467354537e-05, "loss": 1.7989, "step": 205 }, { "epoch": 0.7872539831302718, "grad_norm": 0.6507529616355896, "learning_rate": 1.3606059481525296e-05, "loss": 1.9024, "step": 210 }, { "epoch": 0.8059981255857545, "grad_norm": 0.28235751390457153, "learning_rate": 1.1415257882705311e-05, "loss": 1.9202, "step": 215 }, { "epoch": 0.8247422680412371, "grad_norm": 0.4169105887413025, "learning_rate": 9.393889937861694e-06, "loss": 1.9481, "step": 220 }, { "epoch": 0.8434864104967198, "grad_norm": 1.1425191164016724, "learning_rate": 7.550831895431798e-06, "loss": 2.0996, "step": 225 }, { "epoch": 0.8622305529522024, "grad_norm": 0.3197765052318573, "learning_rate": 5.894177007864271e-06, "loss": 1.8399, "step": 230 }, { "epoch": 0.8809746954076851, "grad_norm": 0.6108065843582153, "learning_rate": 4.4311999924586065e-06, "loss": 1.9692, "step": 235 }, { "epoch": 0.8997188378631678, "grad_norm": 0.2768736779689789, "learning_rate": 3.1683250865636114e-06, "loss": 1.9613, "step": 240 }, { "epoch": 0.9184629803186504, "grad_norm": 0.40607261657714844, "learning_rate": 2.1110978374106192e-06, "loss": 1.8941, "step": 245 }, { "epoch": 0.9372071227741331, "grad_norm": 1.1138426065444946, "learning_rate": 1.2641607504584928e-06, "loss": 2.0727, "step": 250 }, { "epoch": 0.9559512652296157, "grad_norm": 0.32683926820755005, "learning_rate": 6.312329031833319e-07, "loss": 1.8131, "step": 255 }, { "epoch": 0.9746954076850984, "grad_norm": 0.6287037134170532, "learning_rate": 2.1509361383330596e-07, "loss": 1.9955, "step": 260 }, { "epoch": 0.993439550140581, "grad_norm": 0.3499762713909149, "learning_rate": 1.7570236862241017e-08, "loss": 2.0133, "step": 265 } ], "logging_steps": 5, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.526691379760333e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }