|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1261.8181818181818, |
|
"eval_steps": 500, |
|
"global_step": 3470, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 36.36363636363637, |
|
"grad_norm": 4.233419895172119, |
|
"learning_rate": 0.00019995452431006844, |
|
"loss": 3.7823, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 72.72727272727273, |
|
"grad_norm": 9.719901084899902, |
|
"learning_rate": 0.00019981048452912364, |
|
"loss": 1.3042, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 109.0909090909091, |
|
"grad_norm": 4.322120666503906, |
|
"learning_rate": 0.00019956794385021442, |
|
"loss": 0.7792, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 145.45454545454547, |
|
"grad_norm": 5.652794361114502, |
|
"learning_rate": 0.0001992271416317086, |
|
"loss": 0.5389, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 181.8181818181818, |
|
"grad_norm": 5.370779514312744, |
|
"learning_rate": 0.00019878841420425023, |
|
"loss": 0.3919, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 218.1818181818182, |
|
"grad_norm": 6.141811847686768, |
|
"learning_rate": 0.00019825219453884207, |
|
"loss": 0.2818, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 254.54545454545453, |
|
"grad_norm": 4.134015083312988, |
|
"learning_rate": 0.0001976190118195553, |
|
"loss": 0.1966, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 290.90909090909093, |
|
"grad_norm": 7.964496612548828, |
|
"learning_rate": 0.0001968894909212887, |
|
"loss": 0.1349, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 327.27272727272725, |
|
"grad_norm": 1.0434647798538208, |
|
"learning_rate": 0.0001960643517930928, |
|
"loss": 0.0931, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 363.6363636363636, |
|
"grad_norm": 4.57155179977417, |
|
"learning_rate": 0.0001951444087476669, |
|
"loss": 0.0594, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 400.0, |
|
"grad_norm": 0.8262210488319397, |
|
"learning_rate": 0.0001941411696251397, |
|
"loss": 0.0473, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 436.3636363636364, |
|
"grad_norm": 2.0997166633605957, |
|
"learning_rate": 0.000193035358728009, |
|
"loss": 0.0226, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 472.72727272727275, |
|
"grad_norm": 8.319884300231934, |
|
"learning_rate": 0.00019183773316410876, |
|
"loss": 0.0183, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 509.09090909090907, |
|
"grad_norm": 0.06114543229341507, |
|
"learning_rate": 0.00019054947484527945, |
|
"loss": 0.0115, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 545.4545454545455, |
|
"grad_norm": 0.19133609533309937, |
|
"learning_rate": 0.00018917185512694839, |
|
"loss": 0.0159, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 581.8181818181819, |
|
"grad_norm": 0.4022941291332245, |
|
"learning_rate": 0.00018770623355345562, |
|
"loss": 0.0143, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 618.1818181818181, |
|
"grad_norm": 3.1849958896636963, |
|
"learning_rate": 0.00018615405651634746, |
|
"loss": 0.0107, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 654.5454545454545, |
|
"grad_norm": 0.0581185445189476, |
|
"learning_rate": 0.00018451685582696297, |
|
"loss": 0.0084, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 690.9090909090909, |
|
"grad_norm": 0.1319303661584854, |
|
"learning_rate": 0.00018279624720472093, |
|
"loss": 0.0109, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 727.2727272727273, |
|
"grad_norm": 3.9375159740448, |
|
"learning_rate": 0.00018099392868259988, |
|
"loss": 0.0053, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 763.6363636363636, |
|
"grad_norm": 0.0051200115121901035, |
|
"learning_rate": 0.00017911167893138466, |
|
"loss": 0.0144, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 800.0, |
|
"grad_norm": 0.0055800010450184345, |
|
"learning_rate": 0.00017715135550433285, |
|
"loss": 0.0076, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 836.3636363636364, |
|
"grad_norm": 0.03737466782331467, |
|
"learning_rate": 0.00017511489300399432, |
|
"loss": 0.0084, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 872.7272727272727, |
|
"grad_norm": 0.06793994456529617, |
|
"learning_rate": 0.0001730043011729918, |
|
"loss": 0.0063, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 909.0909090909091, |
|
"grad_norm": 0.0068795159459114075, |
|
"learning_rate": 0.0001708216629106476, |
|
"loss": 0.0107, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 945.4545454545455, |
|
"grad_norm": 0.5895256996154785, |
|
"learning_rate": 0.0001685691322174136, |
|
"loss": 0.0091, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 981.8181818181819, |
|
"grad_norm": 0.1091279685497284, |
|
"learning_rate": 0.00016624893206913265, |
|
"loss": 0.0024, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1018.1818181818181, |
|
"grad_norm": 0.04799462854862213, |
|
"learning_rate": 0.00016386335222322998, |
|
"loss": 0.0089, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1054.5454545454545, |
|
"grad_norm": 0.011366182006895542, |
|
"learning_rate": 0.00016141474695899894, |
|
"loss": 0.0083, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1090.909090909091, |
|
"grad_norm": 0.010076832957565784, |
|
"learning_rate": 0.00015890553275421163, |
|
"loss": 0.0059, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1127.2727272727273, |
|
"grad_norm": 0.0046064723283052444, |
|
"learning_rate": 0.00015633818590034704, |
|
"loss": 0.0033, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1163.6363636363637, |
|
"grad_norm": 0.09949818253517151, |
|
"learning_rate": 0.00015374173627698756, |
|
"loss": 0.0066, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1200.0, |
|
"grad_norm": 0.009247034788131714, |
|
"learning_rate": 0.00015106629709603283, |
|
"loss": 0.0018, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1236.3636363636363, |
|
"grad_norm": 2.8236868381500244, |
|
"learning_rate": 0.00014834046164515305, |
|
"loss": 0.0046, |
|
"step": 3400 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5000, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0175353452509184e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|