{ "best_metric": 2.153470039367676, "best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_recent/mlm_unmasking/fr_mlm_new/childes_mlm_unmasking_context_42/checkpoint-52000", "epoch": 105.4481546572935, "eval_steps": 2000, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.51493848857645, "eval_loss": 5.561293601989746, "eval_runtime": 1.7076, "eval_samples_per_second": 1305.334, "eval_steps_per_second": 81.986, "step": 2000 }, { "epoch": 7.0298769771529, "grad_norm": 1.3643518686294556, "learning_rate": 4e-05, "loss": 6.2537, "step": 4000 }, { "epoch": 7.0298769771529, "eval_loss": 5.49988317489624, "eval_runtime": 1.7023, "eval_samples_per_second": 1309.415, "eval_steps_per_second": 82.242, "step": 4000 }, { "epoch": 10.54481546572935, "eval_loss": 5.408681392669678, "eval_runtime": 1.7195, "eval_samples_per_second": 1296.321, "eval_steps_per_second": 81.42, "step": 6000 }, { "epoch": 14.0597539543058, "grad_norm": 3.336256265640259, "learning_rate": 8e-05, "loss": 5.329, "step": 8000 }, { "epoch": 14.0597539543058, "eval_loss": 4.505647659301758, "eval_runtime": 1.6952, "eval_samples_per_second": 1314.863, "eval_steps_per_second": 82.584, "step": 8000 }, { "epoch": 17.57469244288225, "eval_loss": 3.6001980304718018, "eval_runtime": 1.6952, "eval_samples_per_second": 1314.862, "eval_steps_per_second": 82.584, "step": 10000 }, { "epoch": 21.0896309314587, "grad_norm": 2.967787504196167, "learning_rate": 0.00012, "loss": 3.7571, "step": 12000 }, { "epoch": 21.0896309314587, "eval_loss": 3.1521711349487305, "eval_runtime": 1.6965, "eval_samples_per_second": 1313.906, "eval_steps_per_second": 82.524, "step": 12000 }, { "epoch": 24.604569420035148, "eval_loss": 2.8882946968078613, "eval_runtime": 1.7031, "eval_samples_per_second": 1308.779, "eval_steps_per_second": 82.202, "step": 14000 }, { "epoch": 28.1195079086116, "grad_norm": 2.9542622566223145, "learning_rate": 0.00016, "loss": 2.9513, "step": 16000 }, { "epoch": 28.1195079086116, "eval_loss": 2.7470672130584717, "eval_runtime": 1.7009, "eval_samples_per_second": 1310.516, "eval_steps_per_second": 82.311, "step": 16000 }, { "epoch": 31.63444639718805, "eval_loss": 2.6601271629333496, "eval_runtime": 1.7005, "eval_samples_per_second": 1310.816, "eval_steps_per_second": 82.33, "step": 18000 }, { "epoch": 35.1493848857645, "grad_norm": 2.4665582180023193, "learning_rate": 0.0002, "loss": 2.6047, "step": 20000 }, { "epoch": 35.1493848857645, "eval_loss": 2.523987293243408, "eval_runtime": 1.707, "eval_samples_per_second": 1305.835, "eval_steps_per_second": 82.017, "step": 20000 }, { "epoch": 38.66432337434095, "eval_loss": 2.4842116832733154, "eval_runtime": 1.6956, "eval_samples_per_second": 1314.554, "eval_steps_per_second": 82.565, "step": 22000 }, { "epoch": 42.1792618629174, "grad_norm": 2.4973504543304443, "learning_rate": 0.00024, "loss": 2.3993, "step": 24000 }, { "epoch": 42.1792618629174, "eval_loss": 2.3980119228363037, "eval_runtime": 1.6977, "eval_samples_per_second": 1312.977, "eval_steps_per_second": 82.466, "step": 24000 }, { "epoch": 45.69420035149385, "eval_loss": 2.346266269683838, "eval_runtime": 1.698, "eval_samples_per_second": 1312.731, "eval_steps_per_second": 82.451, "step": 26000 }, { "epoch": 49.209138840070295, "grad_norm": 2.5078186988830566, "learning_rate": 0.00028000000000000003, "loss": 2.2551, "step": 28000 }, { "epoch": 49.209138840070295, "eval_loss": 2.325450897216797, "eval_runtime": 1.708, "eval_samples_per_second": 1305.065, "eval_steps_per_second": 81.969, "step": 28000 }, { "epoch": 52.72407732864675, "eval_loss": 2.3088743686676025, "eval_runtime": 1.6993, "eval_samples_per_second": 1311.726, "eval_steps_per_second": 82.387, "step": 30000 }, { "epoch": 56.2390158172232, "grad_norm": 2.2375969886779785, "learning_rate": 0.00032, "loss": 2.1506, "step": 32000 }, { "epoch": 56.2390158172232, "eval_loss": 2.2676920890808105, "eval_runtime": 1.7068, "eval_samples_per_second": 1305.937, "eval_steps_per_second": 82.024, "step": 32000 }, { "epoch": 59.753954305799645, "eval_loss": 2.2623283863067627, "eval_runtime": 1.6975, "eval_samples_per_second": 1313.122, "eval_steps_per_second": 82.475, "step": 34000 }, { "epoch": 63.2688927943761, "grad_norm": 2.131579875946045, "learning_rate": 0.00035999999999999997, "loss": 2.0777, "step": 36000 }, { "epoch": 63.2688927943761, "eval_loss": 2.2328691482543945, "eval_runtime": 1.705, "eval_samples_per_second": 1307.34, "eval_steps_per_second": 82.112, "step": 36000 }, { "epoch": 66.78383128295255, "eval_loss": 2.205517530441284, "eval_runtime": 1.714, "eval_samples_per_second": 1300.481, "eval_steps_per_second": 81.681, "step": 38000 }, { "epoch": 70.298769771529, "grad_norm": 2.3363709449768066, "learning_rate": 0.0004, "loss": 2.0179, "step": 40000 }, { "epoch": 70.298769771529, "eval_loss": 2.235269069671631, "eval_runtime": 1.6984, "eval_samples_per_second": 1312.396, "eval_steps_per_second": 82.43, "step": 40000 }, { "epoch": 73.81370826010544, "eval_loss": 2.191019058227539, "eval_runtime": 1.6977, "eval_samples_per_second": 1312.964, "eval_steps_per_second": 82.465, "step": 42000 }, { "epoch": 77.3286467486819, "grad_norm": 2.0552377700805664, "learning_rate": 0.00044, "loss": 1.9801, "step": 44000 }, { "epoch": 77.3286467486819, "eval_loss": 2.2010703086853027, "eval_runtime": 1.7107, "eval_samples_per_second": 1302.947, "eval_steps_per_second": 81.836, "step": 44000 }, { "epoch": 80.84358523725835, "eval_loss": 2.1846556663513184, "eval_runtime": 1.6975, "eval_samples_per_second": 1313.128, "eval_steps_per_second": 82.476, "step": 46000 }, { "epoch": 84.3585237258348, "grad_norm": 2.805938720703125, "learning_rate": 0.00048, "loss": 1.9489, "step": 48000 }, { "epoch": 84.3585237258348, "eval_loss": 2.173431873321533, "eval_runtime": 1.7002, "eval_samples_per_second": 1310.995, "eval_steps_per_second": 82.342, "step": 48000 }, { "epoch": 87.87346221441125, "eval_loss": 2.1883482933044434, "eval_runtime": 1.7077, "eval_samples_per_second": 1305.249, "eval_steps_per_second": 81.981, "step": 50000 }, { "epoch": 91.3884007029877, "grad_norm": 1.8798856735229492, "learning_rate": 0.0005200000000000001, "loss": 1.9218, "step": 52000 }, { "epoch": 91.3884007029877, "eval_loss": 2.153470039367676, "eval_runtime": 1.7038, "eval_samples_per_second": 1308.221, "eval_steps_per_second": 82.167, "step": 52000 }, { "epoch": 94.90333919156414, "eval_loss": 2.18257474899292, "eval_runtime": 1.702, "eval_samples_per_second": 1309.626, "eval_steps_per_second": 82.256, "step": 54000 }, { "epoch": 98.41827768014059, "grad_norm": 1.8517287969589233, "learning_rate": 0.0005600000000000001, "loss": 1.9024, "step": 56000 }, { "epoch": 98.41827768014059, "eval_loss": 2.161165714263916, "eval_runtime": 1.7048, "eval_samples_per_second": 1307.45, "eval_steps_per_second": 82.119, "step": 56000 }, { "epoch": 101.93321616871705, "eval_loss": 2.1442034244537354, "eval_runtime": 1.7089, "eval_samples_per_second": 1304.32, "eval_steps_per_second": 81.922, "step": 58000 }, { "epoch": 105.4481546572935, "grad_norm": 1.9462802410125732, "learning_rate": 0.0006, "loss": 1.8923, "step": 60000 }, { "epoch": 105.4481546572935, "eval_loss": 2.194445848464966, "eval_runtime": 1.6951, "eval_samples_per_second": 1314.934, "eval_steps_per_second": 82.589, "step": 60000 } ], "logging_steps": 4000, "max_steps": 400000, "num_input_tokens_seen": 0, "num_train_epochs": 703, "save_steps": 4000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.25132064178176e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }