{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.031181789834736514, "eval_steps": 5, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006236357966947302, "grad_norm": 0.3851119875907898, "learning_rate": 1e-05, "loss": 3.4915, "step": 1 }, { "epoch": 0.0006236357966947302, "eval_loss": 1.7047454118728638, "eval_runtime": 236.7838, "eval_samples_per_second": 5.706, "eval_steps_per_second": 0.714, "step": 1 }, { "epoch": 0.0012472715933894605, "grad_norm": 0.2950498163700104, "learning_rate": 2e-05, "loss": 3.4035, "step": 2 }, { "epoch": 0.0018709073900841909, "grad_norm": 0.34026584029197693, "learning_rate": 3e-05, "loss": 3.3069, "step": 3 }, { "epoch": 0.002494543186778921, "grad_norm": 0.32401928305625916, "learning_rate": 4e-05, "loss": 3.546, "step": 4 }, { "epoch": 0.0031181789834736516, "grad_norm": 0.3200150430202484, "learning_rate": 5e-05, "loss": 3.2917, "step": 5 }, { "epoch": 0.0031181789834736516, "eval_loss": 1.6980758905410767, "eval_runtime": 237.1624, "eval_samples_per_second": 5.697, "eval_steps_per_second": 0.713, "step": 5 }, { "epoch": 0.0037418147801683817, "grad_norm": 0.4819504916667938, "learning_rate": 6e-05, "loss": 3.55, "step": 6 }, { "epoch": 0.004365450576863112, "grad_norm": 0.5356610417366028, "learning_rate": 7e-05, "loss": 3.3637, "step": 7 }, { "epoch": 0.004989086373557842, "grad_norm": 0.4570450484752655, "learning_rate": 8e-05, "loss": 3.4409, "step": 8 }, { "epoch": 0.005612722170252572, "grad_norm": 0.48526719212532043, "learning_rate": 9e-05, "loss": 3.2111, "step": 9 }, { "epoch": 0.006236357966947303, "grad_norm": 0.6282075047492981, "learning_rate": 0.0001, "loss": 3.5479, "step": 10 }, { "epoch": 0.006236357966947303, "eval_loss": 1.6325942277908325, "eval_runtime": 237.073, "eval_samples_per_second": 5.699, "eval_steps_per_second": 0.713, "step": 10 }, { "epoch": 0.006859993763642033, "grad_norm": 0.4209723472595215, "learning_rate": 9.98458666866564e-05, "loss": 3.1233, "step": 11 }, { "epoch": 0.007483629560336763, "grad_norm": 0.33553972840309143, "learning_rate": 9.938441702975689e-05, "loss": 3.2172, "step": 12 }, { "epoch": 0.008107265357031494, "grad_norm": 0.38164424896240234, "learning_rate": 9.861849601988383e-05, "loss": 3.3218, "step": 13 }, { "epoch": 0.008730901153726224, "grad_norm": 0.36970698833465576, "learning_rate": 9.755282581475769e-05, "loss": 3.2089, "step": 14 }, { "epoch": 0.009354536950420954, "grad_norm": 0.4920770823955536, "learning_rate": 9.619397662556435e-05, "loss": 3.1671, "step": 15 }, { "epoch": 0.009354536950420954, "eval_loss": 1.5723668336868286, "eval_runtime": 237.0047, "eval_samples_per_second": 5.7, "eval_steps_per_second": 0.713, "step": 15 }, { "epoch": 0.009978172747115684, "grad_norm": 0.4778711199760437, "learning_rate": 9.45503262094184e-05, "loss": 3.1973, "step": 16 }, { "epoch": 0.010601808543810414, "grad_norm": 0.5049943327903748, "learning_rate": 9.263200821770461e-05, "loss": 3.2375, "step": 17 }, { "epoch": 0.011225444340505144, "grad_norm": 0.4225131869316101, "learning_rate": 9.045084971874738e-05, "loss": 3.2528, "step": 18 }, { "epoch": 0.011849080137199874, "grad_norm": 0.48889443278312683, "learning_rate": 8.802029828000156e-05, "loss": 3.1074, "step": 19 }, { "epoch": 0.012472715933894606, "grad_norm": 0.380058228969574, "learning_rate": 8.535533905932738e-05, "loss": 3.0086, "step": 20 }, { "epoch": 0.012472715933894606, "eval_loss": 1.5312796831130981, "eval_runtime": 237.0132, "eval_samples_per_second": 5.7, "eval_steps_per_second": 0.713, "step": 20 }, { "epoch": 0.013096351730589336, "grad_norm": 0.44404545426368713, "learning_rate": 8.247240241650918e-05, "loss": 3.1387, "step": 21 }, { "epoch": 0.013719987527284067, "grad_norm": 0.4585994482040405, "learning_rate": 7.938926261462366e-05, "loss": 3.0418, "step": 22 }, { "epoch": 0.014343623323978797, "grad_norm": 0.35382935404777527, "learning_rate": 7.612492823579745e-05, "loss": 2.9979, "step": 23 }, { "epoch": 0.014967259120673527, "grad_norm": 0.48911118507385254, "learning_rate": 7.269952498697734e-05, "loss": 3.2132, "step": 24 }, { "epoch": 0.015590894917368257, "grad_norm": 0.43348392844200134, "learning_rate": 6.91341716182545e-05, "loss": 3.0339, "step": 25 }, { "epoch": 0.015590894917368257, "eval_loss": 1.5086687803268433, "eval_runtime": 237.0653, "eval_samples_per_second": 5.699, "eval_steps_per_second": 0.713, "step": 25 }, { "epoch": 0.016214530714062987, "grad_norm": 0.3725155293941498, "learning_rate": 6.545084971874738e-05, "loss": 3.1813, "step": 26 }, { "epoch": 0.01683816651075772, "grad_norm": 0.4585173726081848, "learning_rate": 6.167226819279528e-05, "loss": 2.7768, "step": 27 }, { "epoch": 0.017461802307452447, "grad_norm": 0.4500892460346222, "learning_rate": 5.782172325201155e-05, "loss": 2.8296, "step": 28 }, { "epoch": 0.01808543810414718, "grad_norm": 0.48939481377601624, "learning_rate": 5.392295478639225e-05, "loss": 2.9663, "step": 29 }, { "epoch": 0.018709073900841908, "grad_norm": 0.3543107509613037, "learning_rate": 5e-05, "loss": 3.0134, "step": 30 }, { "epoch": 0.018709073900841908, "eval_loss": 1.4923217296600342, "eval_runtime": 237.0712, "eval_samples_per_second": 5.699, "eval_steps_per_second": 0.713, "step": 30 }, { "epoch": 0.01933270969753664, "grad_norm": 0.4475993514060974, "learning_rate": 4.607704521360776e-05, "loss": 3.0947, "step": 31 }, { "epoch": 0.019956345494231368, "grad_norm": 0.36318323016166687, "learning_rate": 4.2178276747988446e-05, "loss": 2.9932, "step": 32 }, { "epoch": 0.0205799812909261, "grad_norm": 0.4241596758365631, "learning_rate": 3.832773180720475e-05, "loss": 2.9971, "step": 33 }, { "epoch": 0.021203617087620828, "grad_norm": 0.42191532254219055, "learning_rate": 3.4549150281252636e-05, "loss": 2.9246, "step": 34 }, { "epoch": 0.02182725288431556, "grad_norm": 0.39511993527412415, "learning_rate": 3.086582838174551e-05, "loss": 2.7848, "step": 35 }, { "epoch": 0.02182725288431556, "eval_loss": 1.4821107387542725, "eval_runtime": 237.0783, "eval_samples_per_second": 5.699, "eval_steps_per_second": 0.713, "step": 35 }, { "epoch": 0.02245088868101029, "grad_norm": 0.3700943887233734, "learning_rate": 2.7300475013022663e-05, "loss": 2.9213, "step": 36 }, { "epoch": 0.02307452447770502, "grad_norm": 0.5621315836906433, "learning_rate": 2.3875071764202563e-05, "loss": 3.182, "step": 37 }, { "epoch": 0.02369816027439975, "grad_norm": 0.45323172211647034, "learning_rate": 2.061073738537635e-05, "loss": 2.9021, "step": 38 }, { "epoch": 0.02432179607109448, "grad_norm": 0.44154319167137146, "learning_rate": 1.7527597583490822e-05, "loss": 3.1109, "step": 39 }, { "epoch": 0.024945431867789213, "grad_norm": 0.45853954553604126, "learning_rate": 1.4644660940672627e-05, "loss": 3.03, "step": 40 }, { "epoch": 0.024945431867789213, "eval_loss": 1.4768427610397339, "eval_runtime": 237.1138, "eval_samples_per_second": 5.698, "eval_steps_per_second": 0.713, "step": 40 }, { "epoch": 0.02556906766448394, "grad_norm": 0.4484853744506836, "learning_rate": 1.1979701719998453e-05, "loss": 2.7623, "step": 41 }, { "epoch": 0.026192703461178673, "grad_norm": 0.45995089411735535, "learning_rate": 9.549150281252633e-06, "loss": 2.9062, "step": 42 }, { "epoch": 0.0268163392578734, "grad_norm": 0.44017675518989563, "learning_rate": 7.367991782295391e-06, "loss": 2.8865, "step": 43 }, { "epoch": 0.027439975054568133, "grad_norm": 0.49776846170425415, "learning_rate": 5.449673790581611e-06, "loss": 2.9727, "step": 44 }, { "epoch": 0.02806361085126286, "grad_norm": 0.4960048794746399, "learning_rate": 3.8060233744356633e-06, "loss": 3.0449, "step": 45 }, { "epoch": 0.02806361085126286, "eval_loss": 1.4745455980300903, "eval_runtime": 237.1867, "eval_samples_per_second": 5.696, "eval_steps_per_second": 0.713, "step": 45 }, { "epoch": 0.028687246647957593, "grad_norm": 0.40954574942588806, "learning_rate": 2.4471741852423237e-06, "loss": 2.6758, "step": 46 }, { "epoch": 0.029310882444652322, "grad_norm": 0.47597748041152954, "learning_rate": 1.3815039801161721e-06, "loss": 3.0871, "step": 47 }, { "epoch": 0.029934518241347054, "grad_norm": 0.42350029945373535, "learning_rate": 6.15582970243117e-07, "loss": 2.8222, "step": 48 }, { "epoch": 0.030558154038041782, "grad_norm": 0.4435149133205414, "learning_rate": 1.5413331334360182e-07, "loss": 2.9299, "step": 49 }, { "epoch": 0.031181789834736514, "grad_norm": 0.4817396104335785, "learning_rate": 0.0, "loss": 3.0554, "step": 50 }, { "epoch": 0.031181789834736514, "eval_loss": 1.4741227626800537, "eval_runtime": 237.1724, "eval_samples_per_second": 5.696, "eval_steps_per_second": 0.713, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.490439355564032e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }