{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.060175054704595, "eval_steps": 8, "global_step": 86, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0350109409190372, "eval_loss": 10.379219055175781, "eval_runtime": 0.7148, "eval_samples_per_second": 135.698, "eval_steps_per_second": 68.548, "step": 1 }, { "epoch": 0.1050328227571116, "grad_norm": 0.10373959690332413, "learning_rate": 3e-05, "loss": 10.3774, "step": 3 }, { "epoch": 0.2100656455142232, "grad_norm": 0.11316292732954025, "learning_rate": 6e-05, "loss": 10.3758, "step": 6 }, { "epoch": 0.2800875273522976, "eval_loss": 10.377415657043457, "eval_runtime": 0.7201, "eval_samples_per_second": 134.703, "eval_steps_per_second": 68.046, "step": 8 }, { "epoch": 0.3150984682713348, "grad_norm": 0.10614413022994995, "learning_rate": 9e-05, "loss": 10.3741, "step": 9 }, { "epoch": 0.4201312910284464, "grad_norm": 0.11337128281593323, "learning_rate": 0.00012, "loss": 10.3761, "step": 12 }, { "epoch": 0.5251641137855579, "grad_norm": 0.10311492532491684, "learning_rate": 0.00015000000000000001, "loss": 10.3712, "step": 15 }, { "epoch": 0.5601750547045952, "eval_loss": 10.371430397033691, "eval_runtime": 0.7202, "eval_samples_per_second": 134.683, "eval_steps_per_second": 68.036, "step": 16 }, { "epoch": 0.6301969365426696, "grad_norm": 0.12901803851127625, "learning_rate": 0.00018, "loss": 10.3699, "step": 18 }, { "epoch": 0.7352297592997812, "grad_norm": 0.13586793839931488, "learning_rate": 0.0001998867339183008, "loss": 10.366, "step": 21 }, { "epoch": 0.8402625820568927, "grad_norm": 0.18823161721229553, "learning_rate": 0.00019819286972627066, "loss": 10.3635, "step": 24 }, { "epoch": 0.8402625820568927, "eval_loss": 10.358380317687988, "eval_runtime": 0.7207, "eval_samples_per_second": 134.583, "eval_steps_per_second": 67.985, "step": 24 }, { "epoch": 0.9452954048140044, "grad_norm": 0.15795904397964478, "learning_rate": 0.00019450008187146684, "loss": 10.3562, "step": 27 }, { "epoch": 1.0667396061269148, "grad_norm": 0.22537720203399658, "learning_rate": 0.00018888354486549237, "loss": 13.5544, "step": 30 }, { "epoch": 1.136761487964989, "eval_loss": 10.337327003479004, "eval_runtime": 0.7182, "eval_samples_per_second": 135.057, "eval_steps_per_second": 68.225, "step": 32 }, { "epoch": 1.1717724288840263, "grad_norm": 0.27254220843315125, "learning_rate": 0.00018145759520503358, "loss": 10.2022, "step": 33 }, { "epoch": 1.276805251641138, "grad_norm": 0.2869032025337219, "learning_rate": 0.00017237340381050703, "loss": 10.3997, "step": 36 }, { "epoch": 1.3818380743982495, "grad_norm": 0.2603079676628113, "learning_rate": 0.00016181589862206052, "loss": 10.2418, "step": 39 }, { "epoch": 1.4168490153172866, "eval_loss": 10.31363582611084, "eval_runtime": 0.7196, "eval_samples_per_second": 134.788, "eval_steps_per_second": 68.089, "step": 40 }, { "epoch": 1.486870897155361, "grad_norm": 0.1934972107410431, "learning_rate": 0.00015000000000000001, "loss": 10.3704, "step": 42 }, { "epoch": 1.5919037199124726, "grad_norm": 0.19271045923233032, "learning_rate": 0.00013716624556603274, "loss": 10.3807, "step": 45 }, { "epoch": 1.6969365426695844, "grad_norm": 0.14647410809993744, "learning_rate": 0.00012357589355094275, "loss": 10.2948, "step": 48 }, { "epoch": 1.6969365426695844, "eval_loss": 10.299408912658691, "eval_runtime": 0.7215, "eval_samples_per_second": 134.448, "eval_steps_per_second": 67.917, "step": 48 }, { "epoch": 1.8019693654266957, "grad_norm": 0.11578352749347687, "learning_rate": 0.00010950560433041826, "loss": 10.223, "step": 51 }, { "epoch": 1.9070021881838075, "grad_norm": 0.10895143449306488, "learning_rate": 9.524180841762577e-05, "loss": 10.258, "step": 54 }, { "epoch": 1.9770240700218817, "eval_loss": 10.293498039245605, "eval_runtime": 0.719, "eval_samples_per_second": 134.919, "eval_steps_per_second": 68.155, "step": 56 }, { "epoch": 2.0284463894967177, "grad_norm": 0.2121211737394333, "learning_rate": 8.107487556395901e-05, "loss": 13.6014, "step": 57 }, { "epoch": 2.1334792122538295, "grad_norm": 0.11470862478017807, "learning_rate": 6.729320366825784e-05, "loss": 10.2428, "step": 60 }, { "epoch": 2.238512035010941, "grad_norm": 0.0803636685013771, "learning_rate": 5.417734782725896e-05, "loss": 10.1202, "step": 63 }, { "epoch": 2.273522975929978, "eval_loss": 10.290419578552246, "eval_runtime": 0.7218, "eval_samples_per_second": 134.394, "eval_steps_per_second": 67.89, "step": 64 }, { "epoch": 2.3435448577680527, "grad_norm": 0.17679740488529205, "learning_rate": 4.19943090428802e-05, "loss": 10.6347, "step": 66 }, { "epoch": 2.448577680525164, "grad_norm": 0.09998754411935806, "learning_rate": 3.099209885178882e-05, "loss": 10.2055, "step": 69 }, { "epoch": 2.553610503282276, "grad_norm": 0.08292704820632935, "learning_rate": 2.139469052572127e-05, "loss": 10.3454, "step": 72 }, { "epoch": 2.553610503282276, "eval_loss": 10.288853645324707, "eval_runtime": 0.7239, "eval_samples_per_second": 133.994, "eval_steps_per_second": 67.687, "step": 72 }, { "epoch": 2.658643326039387, "grad_norm": 0.1350039690732956, "learning_rate": 1.339745962155613e-05, "loss": 10.279, "step": 75 }, { "epoch": 2.763676148796499, "grad_norm": 0.09172733873128891, "learning_rate": 7.163206698392744e-06, "loss": 10.0236, "step": 78 }, { "epoch": 2.833698030634573, "eval_loss": 10.288338661193848, "eval_runtime": 0.7231, "eval_samples_per_second": 134.151, "eval_steps_per_second": 67.767, "step": 80 }, { "epoch": 2.8687089715536107, "grad_norm": 0.08289742469787598, "learning_rate": 2.818843167645835e-06, "loss": 10.2928, "step": 81 }, { "epoch": 2.973741794310722, "grad_norm": 0.08898758143186569, "learning_rate": 4.5280774269154115e-07, "loss": 10.5421, "step": 84 } ], "logging_steps": 3, "max_steps": 86, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 8, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 36914181439488.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }