{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.988458927359131, "eval_steps": 500, "global_step": 552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05431093007467753, "grad_norm": 1.4870964288711548, "learning_rate": 0.00019997351589651408, "loss": 3.4965, "step": 10 }, { "epoch": 0.10862186014935506, "grad_norm": 1.784044861793518, "learning_rate": 0.00019967573081342103, "loss": 2.065, "step": 20 }, { "epoch": 0.1629327902240326, "grad_norm": 0.7305468916893005, "learning_rate": 0.00019904804439875633, "loss": 1.2421, "step": 30 }, { "epoch": 0.2172437202987101, "grad_norm": 0.6995559930801392, "learning_rate": 0.00019809253413499565, "loss": 1.093, "step": 40 }, { "epoch": 0.27155465037338766, "grad_norm": 0.6627448201179504, "learning_rate": 0.00019681236251822273, "loss": 1.0856, "step": 50 }, { "epoch": 0.3258655804480652, "grad_norm": 0.7160666584968567, "learning_rate": 0.00019521176659107142, "loss": 1.013, "step": 60 }, { "epoch": 0.3801765105227427, "grad_norm": 0.6306814551353455, "learning_rate": 0.0001932960439191915, "loss": 1.0374, "step": 70 }, { "epoch": 0.4344874405974202, "grad_norm": 0.7758208513259888, "learning_rate": 0.00019107153505765306, "loss": 0.9474, "step": 80 }, { "epoch": 0.48879837067209775, "grad_norm": 1.2394300699234009, "learning_rate": 0.000188545602565321, "loss": 0.9932, "step": 90 }, { "epoch": 0.5431093007467753, "grad_norm": 0.829031229019165, "learning_rate": 0.0001857266066366567, "loss": 0.9204, "step": 100 }, { "epoch": 0.5974202308214528, "grad_norm": 0.7629134654998779, "learning_rate": 0.0001826238774315995, "loss": 0.9457, "step": 110 }, { "epoch": 0.6517311608961304, "grad_norm": 0.8157823085784912, "learning_rate": 0.00017924768419510904, "loss": 0.8539, "step": 120 }, { "epoch": 0.7060420909708078, "grad_norm": 0.7475631237030029, "learning_rate": 0.0001756092012685749, "loss": 0.82, "step": 130 }, { "epoch": 0.7603530210454854, "grad_norm": 0.6592528223991394, "learning_rate": 0.000171720471105587, "loss": 0.8846, "step": 140 }, { "epoch": 0.814663951120163, "grad_norm": 0.6989027857780457, "learning_rate": 0.00016759436441447545, "loss": 0.8367, "step": 150 }, { "epoch": 0.8689748811948405, "grad_norm": 0.7253873348236084, "learning_rate": 0.00016324453755953773, "loss": 0.8068, "step": 160 }, { "epoch": 0.923285811269518, "grad_norm": 0.7640873193740845, "learning_rate": 0.00015868538736194427, "loss": 0.8169, "step": 170 }, { "epoch": 0.9775967413441955, "grad_norm": 0.7669989466667175, "learning_rate": 0.00015393200344991995, "loss": 0.8355, "step": 180 }, { "epoch": 1.0271554650373387, "grad_norm": 0.7532988786697388, "learning_rate": 0.0001490001183159105, "loss": 0.7339, "step": 190 }, { "epoch": 1.0814663951120163, "grad_norm": 0.7974510192871094, "learning_rate": 0.0001439060552460318, "loss": 0.8186, "step": 200 }, { "epoch": 1.1357773251866938, "grad_norm": 0.9017219543457031, "learning_rate": 0.0001386666742941419, "loss": 0.775, "step": 210 }, { "epoch": 1.1900882552613714, "grad_norm": 0.8205109238624573, "learning_rate": 0.00013329931647934883, "loss": 0.7421, "step": 220 }, { "epoch": 1.2443991853360488, "grad_norm": 0.866692066192627, "learning_rate": 0.0001278217463916453, "loss": 0.7113, "step": 230 }, { "epoch": 1.2987101154107263, "grad_norm": 0.8832337856292725, "learning_rate": 0.00012225209339563145, "loss": 0.7545, "step": 240 }, { "epoch": 1.353021045485404, "grad_norm": 1.0796443223953247, "learning_rate": 0.00011660879162692675, "loss": 0.7085, "step": 250 }, { "epoch": 1.4073319755600815, "grad_norm": 0.9231683015823364, "learning_rate": 0.00011091051897986678, "loss": 0.7168, "step": 260 }, { "epoch": 1.461642905634759, "grad_norm": 0.8881363272666931, "learning_rate": 0.00010517613528842097, "loss": 0.7606, "step": 270 }, { "epoch": 1.5159538357094364, "grad_norm": 0.8930597901344299, "learning_rate": 9.942461990493625e-05, "loss": 0.6926, "step": 280 }, { "epoch": 1.570264765784114, "grad_norm": 1.0270030498504639, "learning_rate": 9.367500888330545e-05, "loss": 0.7571, "step": 290 }, { "epoch": 1.6245756958587916, "grad_norm": 0.8959159255027771, "learning_rate": 8.79463319744677e-05, "loss": 0.7786, "step": 300 }, { "epoch": 1.6788866259334692, "grad_norm": 0.8595919013023376, "learning_rate": 8.225754964277018e-05, "loss": 0.6935, "step": 310 }, { "epoch": 1.7331975560081467, "grad_norm": 0.953175961971283, "learning_rate": 7.662749031165092e-05, "loss": 0.6901, "step": 320 }, { "epoch": 1.787508486082824, "grad_norm": 0.985431969165802, "learning_rate": 7.107478804634325e-05, "loss": 0.7101, "step": 330 }, { "epoch": 1.8418194161575017, "grad_norm": 1.0016827583312988, "learning_rate": 6.561782087985681e-05, "loss": 0.707, "step": 340 }, { "epoch": 1.8961303462321792, "grad_norm": 0.9732582569122314, "learning_rate": 6.02746499863599e-05, "loss": 0.7426, "step": 350 }, { "epoch": 1.9504412763068566, "grad_norm": 0.9253762364387512, "learning_rate": 5.506295990328385e-05, "loss": 0.7273, "step": 360 }, { "epoch": 2.0, "grad_norm": 2.792293071746826, "learning_rate": 5.000000000000002e-05, "loss": 0.7256, "step": 370 }, { "epoch": 2.0543109300746774, "grad_norm": 0.9254827499389648, "learning_rate": 4.510252738679136e-05, "loss": 0.6432, "step": 380 }, { "epoch": 2.108621860149355, "grad_norm": 1.0876941680908203, "learning_rate": 4.038675145307747e-05, "loss": 0.6256, "step": 390 }, { "epoch": 2.1629327902240325, "grad_norm": 0.916249692440033, "learning_rate": 3.5868280218455796e-05, "loss": 0.6442, "step": 400 }, { "epoch": 2.2172437202987103, "grad_norm": 0.9240853190422058, "learning_rate": 3.1562068674124344e-05, "loss": 0.5883, "step": 410 }, { "epoch": 2.2715546503733877, "grad_norm": 1.2008038759231567, "learning_rate": 2.7482369285662378e-05, "loss": 0.6987, "step": 420 }, { "epoch": 2.325865580448065, "grad_norm": 1.2723044157028198, "learning_rate": 2.364268482099218e-05, "loss": 0.708, "step": 430 }, { "epoch": 2.380176510522743, "grad_norm": 0.9695908427238464, "learning_rate": 2.0055723659649904e-05, "loss": 0.6782, "step": 440 }, { "epoch": 2.43448744059742, "grad_norm": 1.044391393661499, "learning_rate": 1.6733357731279377e-05, "loss": 0.5803, "step": 450 }, { "epoch": 2.4887983706720975, "grad_norm": 0.9964624643325806, "learning_rate": 1.368658322256311e-05, "loss": 0.6112, "step": 460 }, { "epoch": 2.5431093007467753, "grad_norm": 1.004639744758606, "learning_rate": 1.0925484182639467e-05, "loss": 0.6322, "step": 470 }, { "epoch": 2.5974202308214527, "grad_norm": 1.1456069946289062, "learning_rate": 8.45919914746337e-06, "loss": 0.5633, "step": 480 }, { "epoch": 2.6517311608961305, "grad_norm": 1.1862763166427612, "learning_rate": 6.2958908935752955e-06, "loss": 0.5859, "step": 490 }, { "epoch": 2.706042090970808, "grad_norm": 1.1233826875686646, "learning_rate": 4.442719421385922e-06, "loss": 0.6147, "step": 500 }, { "epoch": 2.7603530210454856, "grad_norm": 1.0159374475479126, "learning_rate": 2.905818257394799e-06, "loss": 0.5829, "step": 510 }, { "epoch": 2.814663951120163, "grad_norm": 1.053791880607605, "learning_rate": 1.6902741537767609e-06, "loss": 0.5938, "step": 520 }, { "epoch": 2.8689748811948403, "grad_norm": 1.0928566455841064, "learning_rate": 8.00110252525299e-07, "loss": 0.6136, "step": 530 }, { "epoch": 2.923285811269518, "grad_norm": 1.1599104404449463, "learning_rate": 2.382727698752474e-07, "loss": 0.6389, "step": 540 }, { "epoch": 2.9775967413441955, "grad_norm": 1.2020913362503052, "learning_rate": 6.621245075910665e-09, "loss": 0.6719, "step": 550 } ], "logging_steps": 10, "max_steps": 552, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4322859040948224.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }