{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0544069640914037,
  "eval_steps": 13,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001088139281828074,
      "grad_norm": 0.1708880215883255,
      "learning_rate": 5e-06,
      "loss": 0.2178,
      "step": 1
    },
    {
      "epoch": 0.001088139281828074,
      "eval_loss": 0.2808469533920288,
      "eval_runtime": 20.7308,
      "eval_samples_per_second": 18.668,
      "eval_steps_per_second": 9.358,
      "step": 1
    },
    {
      "epoch": 0.002176278563656148,
      "grad_norm": 0.18688273429870605,
      "learning_rate": 1e-05,
      "loss": 0.2729,
      "step": 2
    },
    {
      "epoch": 0.003264417845484222,
      "grad_norm": 0.14812062680721283,
      "learning_rate": 1.5e-05,
      "loss": 0.2367,
      "step": 3
    },
    {
      "epoch": 0.004352557127312296,
      "grad_norm": 0.18467950820922852,
      "learning_rate": 2e-05,
      "loss": 0.2042,
      "step": 4
    },
    {
      "epoch": 0.00544069640914037,
      "grad_norm": 0.1507299691438675,
      "learning_rate": 2.5e-05,
      "loss": 0.2751,
      "step": 5
    },
    {
      "epoch": 0.006528835690968444,
      "grad_norm": 0.20624135434627533,
      "learning_rate": 3e-05,
      "loss": 0.2811,
      "step": 6
    },
    {
      "epoch": 0.007616974972796518,
      "grad_norm": 0.2024683952331543,
      "learning_rate": 3.5e-05,
      "loss": 0.2443,
      "step": 7
    },
    {
      "epoch": 0.008705114254624592,
      "grad_norm": 0.16752073168754578,
      "learning_rate": 4e-05,
      "loss": 0.268,
      "step": 8
    },
    {
      "epoch": 0.009793253536452665,
      "grad_norm": 0.17567092180252075,
      "learning_rate": 4.5e-05,
      "loss": 0.2362,
      "step": 9
    },
    {
      "epoch": 0.01088139281828074,
      "grad_norm": 0.15081793069839478,
      "learning_rate": 5e-05,
      "loss": 0.1754,
      "step": 10
    },
    {
      "epoch": 0.011969532100108813,
      "grad_norm": 0.24378319084644318,
      "learning_rate": 4.99229333433282e-05,
      "loss": 0.3136,
      "step": 11
    },
    {
      "epoch": 0.013057671381936888,
      "grad_norm": 0.17457719147205353,
      "learning_rate": 4.9692208514878444e-05,
      "loss": 0.162,
      "step": 12
    },
    {
      "epoch": 0.014145810663764961,
      "grad_norm": 0.16278016567230225,
      "learning_rate": 4.9309248009941914e-05,
      "loss": 0.1994,
      "step": 13
    },
    {
      "epoch": 0.014145810663764961,
      "eval_loss": 0.25625717639923096,
      "eval_runtime": 20.7372,
      "eval_samples_per_second": 18.662,
      "eval_steps_per_second": 9.355,
      "step": 13
    },
    {
      "epoch": 0.015233949945593036,
      "grad_norm": 0.23434151709079742,
      "learning_rate": 4.877641290737884e-05,
      "loss": 0.2542,
      "step": 14
    },
    {
      "epoch": 0.01632208922742111,
      "grad_norm": 0.20760248601436615,
      "learning_rate": 4.8096988312782174e-05,
      "loss": 0.2256,
      "step": 15
    },
    {
      "epoch": 0.017410228509249184,
      "grad_norm": 0.21011245250701904,
      "learning_rate": 4.72751631047092e-05,
      "loss": 0.2241,
      "step": 16
    },
    {
      "epoch": 0.018498367791077257,
      "grad_norm": 0.18041156232357025,
      "learning_rate": 4.6316004108852305e-05,
      "loss": 0.1655,
      "step": 17
    },
    {
      "epoch": 0.01958650707290533,
      "grad_norm": 0.1776050329208374,
      "learning_rate": 4.522542485937369e-05,
      "loss": 0.2103,
      "step": 18
    },
    {
      "epoch": 0.020674646354733407,
      "grad_norm": 0.16318459808826447,
      "learning_rate": 4.401014914000078e-05,
      "loss": 0.1372,
      "step": 19
    },
    {
      "epoch": 0.02176278563656148,
      "grad_norm": 0.18268632888793945,
      "learning_rate": 4.267766952966369e-05,
      "loss": 0.1775,
      "step": 20
    },
    {
      "epoch": 0.022850924918389554,
      "grad_norm": 0.1805269867181778,
      "learning_rate": 4.123620120825459e-05,
      "loss": 0.1497,
      "step": 21
    },
    {
      "epoch": 0.023939064200217627,
      "grad_norm": 0.2117743194103241,
      "learning_rate": 3.969463130731183e-05,
      "loss": 0.1659,
      "step": 22
    },
    {
      "epoch": 0.025027203482045703,
      "grad_norm": 0.20967699587345123,
      "learning_rate": 3.8062464117898724e-05,
      "loss": 0.1668,
      "step": 23
    },
    {
      "epoch": 0.026115342763873776,
      "grad_norm": 0.23078809678554535,
      "learning_rate": 3.634976249348867e-05,
      "loss": 0.1963,
      "step": 24
    },
    {
      "epoch": 0.02720348204570185,
      "grad_norm": 0.27512913942337036,
      "learning_rate": 3.456708580912725e-05,
      "loss": 0.1898,
      "step": 25
    },
    {
      "epoch": 0.028291621327529923,
      "grad_norm": 0.24129854142665863,
      "learning_rate": 3.272542485937369e-05,
      "loss": 0.1633,
      "step": 26
    },
    {
      "epoch": 0.028291621327529923,
      "eval_loss": 0.21075604856014252,
      "eval_runtime": 20.7545,
      "eval_samples_per_second": 18.647,
      "eval_steps_per_second": 9.347,
      "step": 26
    },
    {
      "epoch": 0.029379760609358,
      "grad_norm": 0.19251622259616852,
      "learning_rate": 3.083613409639764e-05,
      "loss": 0.1362,
      "step": 27
    },
    {
      "epoch": 0.030467899891186073,
      "grad_norm": 0.18412667512893677,
      "learning_rate": 2.8910861626005776e-05,
      "loss": 0.1466,
      "step": 28
    },
    {
      "epoch": 0.031556039173014146,
      "grad_norm": 0.28334978222846985,
      "learning_rate": 2.6961477393196126e-05,
      "loss": 0.2359,
      "step": 29
    },
    {
      "epoch": 0.03264417845484222,
      "grad_norm": 0.22527562081813812,
      "learning_rate": 2.5e-05,
      "loss": 0.1213,
      "step": 30
    },
    {
      "epoch": 0.03373231773667029,
      "grad_norm": 0.31526511907577515,
      "learning_rate": 2.303852260680388e-05,
      "loss": 0.2167,
      "step": 31
    },
    {
      "epoch": 0.03482045701849837,
      "grad_norm": 0.14839228987693787,
      "learning_rate": 2.1089138373994223e-05,
      "loss": 0.1137,
      "step": 32
    },
    {
      "epoch": 0.035908596300326445,
      "grad_norm": 0.208391934633255,
      "learning_rate": 1.9163865903602374e-05,
      "loss": 0.1615,
      "step": 33
    },
    {
      "epoch": 0.036996735582154515,
      "grad_norm": 0.1698072850704193,
      "learning_rate": 1.7274575140626318e-05,
      "loss": 0.1691,
      "step": 34
    },
    {
      "epoch": 0.03808487486398259,
      "grad_norm": 0.16231591999530792,
      "learning_rate": 1.5432914190872757e-05,
      "loss": 0.1603,
      "step": 35
    },
    {
      "epoch": 0.03917301414581066,
      "grad_norm": 0.16352759301662445,
      "learning_rate": 1.3650237506511331e-05,
      "loss": 0.1365,
      "step": 36
    },
    {
      "epoch": 0.04026115342763874,
      "grad_norm": 0.2394663244485855,
      "learning_rate": 1.1937535882101281e-05,
      "loss": 0.2083,
      "step": 37
    },
    {
      "epoch": 0.041349292709466814,
      "grad_norm": 0.2228260487318039,
      "learning_rate": 1.0305368692688174e-05,
      "loss": 0.1766,
      "step": 38
    },
    {
      "epoch": 0.042437431991294884,
      "grad_norm": 0.16056722402572632,
      "learning_rate": 8.763798791745411e-06,
      "loss": 0.1458,
      "step": 39
    },
    {
      "epoch": 0.042437431991294884,
      "eval_loss": 0.18522028625011444,
      "eval_runtime": 20.7477,
      "eval_samples_per_second": 18.653,
      "eval_steps_per_second": 9.35,
      "step": 39
    },
    {
      "epoch": 0.04352557127312296,
      "grad_norm": 0.20927105844020844,
      "learning_rate": 7.3223304703363135e-06,
      "loss": 0.1808,
      "step": 40
    },
    {
      "epoch": 0.04461371055495103,
      "grad_norm": 0.19594742357730865,
      "learning_rate": 5.989850859999227e-06,
      "loss": 0.1499,
      "step": 41
    },
    {
      "epoch": 0.04570184983677911,
      "grad_norm": 0.3185279667377472,
      "learning_rate": 4.7745751406263165e-06,
      "loss": 0.2901,
      "step": 42
    },
    {
      "epoch": 0.046789989118607184,
      "grad_norm": 0.19601090252399445,
      "learning_rate": 3.6839958911476957e-06,
      "loss": 0.1651,
      "step": 43
    },
    {
      "epoch": 0.04787812840043525,
      "grad_norm": 0.3092597723007202,
      "learning_rate": 2.7248368952908053e-06,
      "loss": 0.1719,
      "step": 44
    },
    {
      "epoch": 0.04896626768226333,
      "grad_norm": 0.15384280681610107,
      "learning_rate": 1.9030116872178316e-06,
      "loss": 0.13,
      "step": 45
    },
    {
      "epoch": 0.05005440696409141,
      "grad_norm": 0.22593477368354797,
      "learning_rate": 1.2235870926211619e-06,
      "loss": 0.1633,
      "step": 46
    },
    {
      "epoch": 0.051142546245919476,
      "grad_norm": 0.1992155909538269,
      "learning_rate": 6.907519900580861e-07,
      "loss": 0.1572,
      "step": 47
    },
    {
      "epoch": 0.05223068552774755,
      "grad_norm": 0.21265120804309845,
      "learning_rate": 3.077914851215585e-07,
      "loss": 0.1135,
      "step": 48
    },
    {
      "epoch": 0.05331882480957562,
      "grad_norm": 0.16999712586402893,
      "learning_rate": 7.706665667180091e-08,
      "loss": 0.144,
      "step": 49
    },
    {
      "epoch": 0.0544069640914037,
      "grad_norm": 0.2375941425561905,
      "learning_rate": 0.0,
      "loss": 0.1357,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 50,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 13,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7687956234240000.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}