|
{ |
|
"best_metric": 3.282808780670166, |
|
"best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_recent/mlm_unmasking/fr_mlm_new/wikipedia_30/checkpoint-44000", |
|
"epoch": 51.43191116306254, |
|
"eval_steps": 2000, |
|
"global_step": 44000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.33781414377557, |
|
"eval_loss": 6.900431156158447, |
|
"eval_runtime": 1.1096, |
|
"eval_samples_per_second": 1285.131, |
|
"eval_steps_per_second": 81.109, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.67562828755114, |
|
"grad_norm": 0.7734344005584717, |
|
"learning_rate": 4e-05, |
|
"loss": 7.3814, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.67562828755114, |
|
"eval_loss": 6.925902366638184, |
|
"eval_runtime": 1.0887, |
|
"eval_samples_per_second": 1309.827, |
|
"eval_steps_per_second": 82.668, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.01344243132671, |
|
"eval_loss": 6.909808158874512, |
|
"eval_runtime": 1.0952, |
|
"eval_samples_per_second": 1302.036, |
|
"eval_steps_per_second": 82.176, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 9.35125657510228, |
|
"grad_norm": 1.7065298557281494, |
|
"learning_rate": 8e-05, |
|
"loss": 6.845, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 9.35125657510228, |
|
"eval_loss": 6.668918132781982, |
|
"eval_runtime": 1.0886, |
|
"eval_samples_per_second": 1309.937, |
|
"eval_steps_per_second": 82.675, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 11.689070718877849, |
|
"eval_loss": 5.787526607513428, |
|
"eval_runtime": 1.0937, |
|
"eval_samples_per_second": 1303.854, |
|
"eval_steps_per_second": 82.291, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 14.02688486265342, |
|
"grad_norm": 3.4086945056915283, |
|
"learning_rate": 0.00012, |
|
"loss": 5.8652, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 14.02688486265342, |
|
"eval_loss": 5.192287445068359, |
|
"eval_runtime": 1.0975, |
|
"eval_samples_per_second": 1299.264, |
|
"eval_steps_per_second": 82.001, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 16.36469900642899, |
|
"eval_loss": 4.822643280029297, |
|
"eval_runtime": 1.0919, |
|
"eval_samples_per_second": 1306.028, |
|
"eval_steps_per_second": 82.428, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 18.70251315020456, |
|
"grad_norm": 3.0992887020111084, |
|
"learning_rate": 0.00016, |
|
"loss": 4.8065, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 18.70251315020456, |
|
"eval_loss": 4.534602165222168, |
|
"eval_runtime": 1.089, |
|
"eval_samples_per_second": 1309.442, |
|
"eval_steps_per_second": 82.644, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 21.040327293980127, |
|
"eval_loss": 4.324636459350586, |
|
"eval_runtime": 1.0927, |
|
"eval_samples_per_second": 1305.0, |
|
"eval_steps_per_second": 82.363, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 23.378141437755698, |
|
"grad_norm": 2.9212777614593506, |
|
"learning_rate": 0.0002, |
|
"loss": 4.2309, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 23.378141437755698, |
|
"eval_loss": 4.134767532348633, |
|
"eval_runtime": 1.0879, |
|
"eval_samples_per_second": 1310.811, |
|
"eval_steps_per_second": 82.73, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 25.71595558153127, |
|
"eval_loss": 3.96517276763916, |
|
"eval_runtime": 1.0898, |
|
"eval_samples_per_second": 1308.527, |
|
"eval_steps_per_second": 82.586, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 28.05376972530684, |
|
"grad_norm": 3.0659778118133545, |
|
"learning_rate": 0.00024, |
|
"loss": 3.8185, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 28.05376972530684, |
|
"eval_loss": 3.810825824737549, |
|
"eval_runtime": 1.0939, |
|
"eval_samples_per_second": 1303.648, |
|
"eval_steps_per_second": 82.278, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 30.391583869082407, |
|
"eval_loss": 3.710167646408081, |
|
"eval_runtime": 1.0929, |
|
"eval_samples_per_second": 1304.777, |
|
"eval_steps_per_second": 82.349, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 32.72939801285798, |
|
"grad_norm": 3.134622097015381, |
|
"learning_rate": 0.00028000000000000003, |
|
"loss": 3.5163, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 32.72939801285798, |
|
"eval_loss": 3.6270954608917236, |
|
"eval_runtime": 1.0898, |
|
"eval_samples_per_second": 1308.551, |
|
"eval_steps_per_second": 82.587, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 35.067212156633545, |
|
"eval_loss": 3.534950017929077, |
|
"eval_runtime": 1.0911, |
|
"eval_samples_per_second": 1306.961, |
|
"eval_steps_per_second": 82.487, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 37.40502630040912, |
|
"grad_norm": 3.2240071296691895, |
|
"learning_rate": 0.00032, |
|
"loss": 3.2957, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 37.40502630040912, |
|
"eval_loss": 3.505293607711792, |
|
"eval_runtime": 1.0901, |
|
"eval_samples_per_second": 1308.118, |
|
"eval_steps_per_second": 82.56, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 39.74284044418469, |
|
"eval_loss": 3.414364814758301, |
|
"eval_runtime": 1.1012, |
|
"eval_samples_per_second": 1294.925, |
|
"eval_steps_per_second": 81.727, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 42.080654587960254, |
|
"grad_norm": 3.1253504753112793, |
|
"learning_rate": 0.00035999999999999997, |
|
"loss": 3.1388, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 42.080654587960254, |
|
"eval_loss": 3.363224506378174, |
|
"eval_runtime": 1.1003, |
|
"eval_samples_per_second": 1296.047, |
|
"eval_steps_per_second": 81.798, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 44.41846873173583, |
|
"eval_loss": 3.3095245361328125, |
|
"eval_runtime": 1.1193, |
|
"eval_samples_per_second": 1273.985, |
|
"eval_steps_per_second": 80.406, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 46.756282875511396, |
|
"grad_norm": 2.8377697467803955, |
|
"learning_rate": 0.0004, |
|
"loss": 3.0197, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 46.756282875511396, |
|
"eval_loss": 3.3381097316741943, |
|
"eval_runtime": 1.0907, |
|
"eval_samples_per_second": 1307.437, |
|
"eval_steps_per_second": 82.517, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 49.09409701928697, |
|
"eval_loss": 3.303609609603882, |
|
"eval_runtime": 1.1029, |
|
"eval_samples_per_second": 1292.944, |
|
"eval_steps_per_second": 81.602, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 51.43191116306254, |
|
"grad_norm": 2.7318673133850098, |
|
"learning_rate": 0.00044, |
|
"loss": 2.9398, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 51.43191116306254, |
|
"eval_loss": 3.282808780670166, |
|
"eval_runtime": 1.097, |
|
"eval_samples_per_second": 1299.868, |
|
"eval_steps_per_second": 82.039, |
|
"step": 44000 |
|
} |
|
], |
|
"logging_steps": 4000, |
|
"max_steps": 400000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 468, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.001 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9180204991119360.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|