{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 23,
  "global_step": 228,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0043859649122807015,
      "grad_norm": 0.048455942422151566,
      "learning_rate": 5e-06,
      "loss": 2.2365,
      "step": 1
    },
    {
      "epoch": 0.0043859649122807015,
      "eval_loss": 2.2304763793945312,
      "eval_runtime": 216.9254,
      "eval_samples_per_second": 0.369,
      "eval_steps_per_second": 0.369,
      "step": 1
    },
    {
      "epoch": 0.008771929824561403,
      "grad_norm": 0.04205497354269028,
      "learning_rate": 1e-05,
      "loss": 2.2844,
      "step": 2
    },
    {
      "epoch": 0.013157894736842105,
      "grad_norm": 0.040102917701005936,
      "learning_rate": 1.5e-05,
      "loss": 2.3425,
      "step": 3
    },
    {
      "epoch": 0.017543859649122806,
      "grad_norm": 0.03933990001678467,
      "learning_rate": 2e-05,
      "loss": 2.2561,
      "step": 4
    },
    {
      "epoch": 0.021929824561403508,
      "grad_norm": 0.042966291308403015,
      "learning_rate": 2.5e-05,
      "loss": 2.2957,
      "step": 5
    },
    {
      "epoch": 0.02631578947368421,
      "grad_norm": 0.04421572387218475,
      "learning_rate": 3e-05,
      "loss": 2.3794,
      "step": 6
    },
    {
      "epoch": 0.03070175438596491,
      "grad_norm": 0.04840295761823654,
      "learning_rate": 3.5e-05,
      "loss": 2.4057,
      "step": 7
    },
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 0.04756808653473854,
      "learning_rate": 4e-05,
      "loss": 2.2825,
      "step": 8
    },
    {
      "epoch": 0.039473684210526314,
      "grad_norm": 0.04309004917740822,
      "learning_rate": 4.5e-05,
      "loss": 2.031,
      "step": 9
    },
    {
      "epoch": 0.043859649122807015,
      "grad_norm": 0.0425834022462368,
      "learning_rate": 5e-05,
      "loss": 2.3098,
      "step": 10
    },
    {
      "epoch": 0.04824561403508772,
      "grad_norm": 0.03927280381321907,
      "learning_rate": 4.99976636830244e-05,
      "loss": 2.3039,
      "step": 11
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 0.03671770170331001,
      "learning_rate": 4.999065521728665e-05,
      "loss": 2.2635,
      "step": 12
    },
    {
      "epoch": 0.05701754385964912,
      "grad_norm": 0.043874163180589676,
      "learning_rate": 4.9978976058253205e-05,
      "loss": 2.3794,
      "step": 13
    },
    {
      "epoch": 0.06140350877192982,
      "grad_norm": 0.031037230044603348,
      "learning_rate": 4.9962628631365625e-05,
      "loss": 2.3362,
      "step": 14
    },
    {
      "epoch": 0.06578947368421052,
      "grad_norm": 0.033785346895456314,
      "learning_rate": 4.9941616331536875e-05,
      "loss": 2.4863,
      "step": 15
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 0.038742709904909134,
      "learning_rate": 4.991594352244631e-05,
      "loss": 2.3597,
      "step": 16
    },
    {
      "epoch": 0.07456140350877193,
      "grad_norm": 0.04327031224966049,
      "learning_rate": 4.9885615535633464e-05,
      "loss": 2.3703,
      "step": 17
    },
    {
      "epoch": 0.07894736842105263,
      "grad_norm": 0.04216183349490166,
      "learning_rate": 4.9850638669390816e-05,
      "loss": 2.3208,
      "step": 18
    },
    {
      "epoch": 0.08333333333333333,
      "grad_norm": 0.04454097896814346,
      "learning_rate": 4.981102018745582e-05,
      "loss": 2.1064,
      "step": 19
    },
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 0.041186701506376266,
      "learning_rate": 4.976676831750243e-05,
      "loss": 2.1267,
      "step": 20
    },
    {
      "epoch": 0.09210526315789473,
      "grad_norm": 0.033286575227975845,
      "learning_rate": 4.971789224943241e-05,
      "loss": 2.0027,
      "step": 21
    },
    {
      "epoch": 0.09649122807017543,
      "grad_norm": 0.031756095588207245,
      "learning_rate": 4.9664402133466884e-05,
      "loss": 2.2847,
      "step": 22
    },
    {
      "epoch": 0.10087719298245613,
      "grad_norm": 0.03505243733525276,
      "learning_rate": 4.960630907803838e-05,
      "loss": 2.045,
      "step": 23
    },
    {
      "epoch": 0.10087719298245613,
      "eval_loss": 2.201603651046753,
      "eval_runtime": 221.2761,
      "eval_samples_per_second": 0.362,
      "eval_steps_per_second": 0.362,
      "step": 23
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.03322993218898773,
      "learning_rate": 4.954362514748392e-05,
      "loss": 2.2481,
      "step": 24
    },
    {
      "epoch": 0.10964912280701754,
      "grad_norm": 0.029862001538276672,
      "learning_rate": 4.94763633595396e-05,
      "loss": 2.2409,
      "step": 25
    },
    {
      "epoch": 0.11403508771929824,
      "grad_norm": 0.032439861446619034,
      "learning_rate": 4.940453768263715e-05,
      "loss": 2.2443,
      "step": 26
    },
    {
      "epoch": 0.11842105263157894,
      "grad_norm": 0.02963525988161564,
      "learning_rate": 4.9328163033003086e-05,
      "loss": 2.2605,
      "step": 27
    },
    {
      "epoch": 0.12280701754385964,
      "grad_norm": 0.03317798301577568,
      "learning_rate": 4.9247255271560994e-05,
      "loss": 2.2619,
      "step": 28
    },
    {
      "epoch": 0.12719298245614036,
      "grad_norm": 0.03550229221582413,
      "learning_rate": 4.916183120063769e-05,
      "loss": 2.1695,
      "step": 29
    },
    {
      "epoch": 0.13157894736842105,
      "grad_norm": 0.03996920958161354,
      "learning_rate": 4.9071908560473775e-05,
      "loss": 2.1766,
      "step": 30
    },
    {
      "epoch": 0.13596491228070176,
      "grad_norm": 0.032255735248327255,
      "learning_rate": 4.8977506025539554e-05,
      "loss": 2.2302,
      "step": 31
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 0.03136478736996651,
      "learning_rate": 4.8878643200656793e-05,
      "loss": 2.2832,
      "step": 32
    },
    {
      "epoch": 0.14473684210526316,
      "grad_norm": 0.03207270801067352,
      "learning_rate": 4.8775340616927357e-05,
      "loss": 2.3316,
      "step": 33
    },
    {
      "epoch": 0.14912280701754385,
      "grad_norm": 0.031349290162324905,
      "learning_rate": 4.866761972746946e-05,
      "loss": 2.1679,
      "step": 34
    },
    {
      "epoch": 0.15350877192982457,
      "grad_norm": 0.030217550694942474,
      "learning_rate": 4.855550290296248e-05,
      "loss": 2.292,
      "step": 35
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 0.032989416271448135,
      "learning_rate": 4.8439013427001076e-05,
      "loss": 2.2805,
      "step": 36
    },
    {
      "epoch": 0.16228070175438597,
      "grad_norm": 0.03313658758997917,
      "learning_rate": 4.8318175491259945e-05,
      "loss": 2.3498,
      "step": 37
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 0.03528057411313057,
      "learning_rate": 4.8193014190469815e-05,
      "loss": 2.0183,
      "step": 38
    },
    {
      "epoch": 0.17105263157894737,
      "grad_norm": 0.031257182359695435,
      "learning_rate": 4.8063555517205936e-05,
      "loss": 2.2793,
      "step": 39
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 0.0320710726082325,
      "learning_rate": 4.792982635649019e-05,
      "loss": 2.1969,
      "step": 40
    },
    {
      "epoch": 0.17982456140350878,
      "grad_norm": 0.03237050399184227,
      "learning_rate": 4.779185448020774e-05,
      "loss": 2.3167,
      "step": 41
    },
    {
      "epoch": 0.18421052631578946,
      "grad_norm": 0.033470962196588516,
      "learning_rate": 4.7649668541339635e-05,
      "loss": 2.2394,
      "step": 42
    },
    {
      "epoch": 0.18859649122807018,
      "grad_norm": 0.03281046077609062,
      "learning_rate": 4.750329806801234e-05,
      "loss": 2.2975,
      "step": 43
    },
    {
      "epoch": 0.19298245614035087,
      "grad_norm": 0.035084936767816544,
      "learning_rate": 4.735277345736555e-05,
      "loss": 2.1174,
      "step": 44
    },
    {
      "epoch": 0.19736842105263158,
      "grad_norm": 0.0374368391931057,
      "learning_rate": 4.7198125969239533e-05,
      "loss": 2.3721,
      "step": 45
    },
    {
      "epoch": 0.20175438596491227,
      "grad_norm": 0.031239116564393044,
      "learning_rate": 4.703938771968333e-05,
      "loss": 2.2614,
      "step": 46
    },
    {
      "epoch": 0.20175438596491227,
      "eval_loss": 2.186415910720825,
      "eval_runtime": 221.6088,
      "eval_samples_per_second": 0.361,
      "eval_steps_per_second": 0.361,
      "step": 46
    },
    {
      "epoch": 0.20614035087719298,
      "grad_norm": 0.03350961580872536,
      "learning_rate": 4.6876591674285145e-05,
      "loss": 2.3679,
      "step": 47
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.031444985419511795,
      "learning_rate": 4.6709771641326244e-05,
      "loss": 2.2733,
      "step": 48
    },
    {
      "epoch": 0.2149122807017544,
      "grad_norm": 0.03112543374300003,
      "learning_rate": 4.6538962264759954e-05,
      "loss": 2.1792,
      "step": 49
    },
    {
      "epoch": 0.21929824561403508,
      "grad_norm": 0.03221912682056427,
      "learning_rate": 4.636419901701705e-05,
      "loss": 2.2643,
      "step": 50
    },
    {
      "epoch": 0.2236842105263158,
      "grad_norm": 0.031834062188863754,
      "learning_rate": 4.618551819163906e-05,
      "loss": 2.2191,
      "step": 51
    },
    {
      "epoch": 0.22807017543859648,
      "grad_norm": 0.0330105684697628,
      "learning_rate": 4.600295689574114e-05,
      "loss": 2.1179,
      "step": 52
    },
    {
      "epoch": 0.2324561403508772,
      "grad_norm": 0.03336255997419357,
      "learning_rate": 4.581655304230596e-05,
      "loss": 2.1939,
      "step": 53
    },
    {
      "epoch": 0.23684210526315788,
      "grad_norm": 0.034072939306497574,
      "learning_rate": 4.562634534231012e-05,
      "loss": 2.3282,
      "step": 54
    },
    {
      "epoch": 0.2412280701754386,
      "grad_norm": 0.036779358983039856,
      "learning_rate": 4.543237329668504e-05,
      "loss": 2.0628,
      "step": 55
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 0.03245358541607857,
      "learning_rate": 4.5234677188113664e-05,
      "loss": 2.229,
      "step": 56
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.032968707382678986,
      "learning_rate": 4.503329807266484e-05,
      "loss": 2.1244,
      "step": 57
    },
    {
      "epoch": 0.2543859649122807,
      "grad_norm": 0.03246884047985077,
      "learning_rate": 4.482827777126706e-05,
      "loss": 2.3287,
      "step": 58
    },
    {
      "epoch": 0.25877192982456143,
      "grad_norm": 0.03295806050300598,
      "learning_rate": 4.461965886102351e-05,
      "loss": 2.313,
      "step": 59
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 0.03239140287041664,
      "learning_rate": 4.440748466636987e-05,
      "loss": 2.2437,
      "step": 60
    },
    {
      "epoch": 0.2675438596491228,
      "grad_norm": 0.03406047448515892,
      "learning_rate": 4.419179925007705e-05,
      "loss": 2.2797,
      "step": 61
    },
    {
      "epoch": 0.2719298245614035,
      "grad_norm": 0.03304027020931244,
      "learning_rate": 4.397264740410055e-05,
      "loss": 2.2423,
      "step": 62
    },
    {
      "epoch": 0.27631578947368424,
      "grad_norm": 0.035411059856414795,
      "learning_rate": 4.3750074640278414e-05,
      "loss": 2.1787,
      "step": 63
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.03362572193145752,
      "learning_rate": 4.352412718087967e-05,
      "loss": 2.2556,
      "step": 64
    },
    {
      "epoch": 0.2850877192982456,
      "grad_norm": 0.035657159984111786,
      "learning_rate": 4.32948519490052e-05,
      "loss": 2.254,
      "step": 65
    },
    {
      "epoch": 0.2894736842105263,
      "grad_norm": 0.03446522727608681,
      "learning_rate": 4.306229655884312e-05,
      "loss": 2.1582,
      "step": 66
    },
    {
      "epoch": 0.29385964912280704,
      "grad_norm": 0.03329275920987129,
      "learning_rate": 4.282650930578061e-05,
      "loss": 2.0713,
      "step": 67
    },
    {
      "epoch": 0.2982456140350877,
      "grad_norm": 0.036497630178928375,
      "learning_rate": 4.2587539156374295e-05,
      "loss": 2.2137,
      "step": 68
    },
    {
      "epoch": 0.3026315789473684,
      "grad_norm": 0.034867726266384125,
      "learning_rate": 4.234543573818121e-05,
      "loss": 2.3029,
      "step": 69
    },
    {
      "epoch": 0.3026315789473684,
      "eval_loss": 2.177396059036255,
      "eval_runtime": 221.9078,
      "eval_samples_per_second": 0.361,
      "eval_steps_per_second": 0.361,
      "step": 69
    },
    {
      "epoch": 0.30701754385964913,
      "grad_norm": 0.03490519896149635,
      "learning_rate": 4.210024932945254e-05,
      "loss": 2.2167,
      "step": 70
    },
    {
      "epoch": 0.31140350877192985,
      "grad_norm": 0.03391524776816368,
      "learning_rate": 4.1852030848692184e-05,
      "loss": 2.2559,
      "step": 71
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.0409424863755703,
      "learning_rate": 4.160083184408238e-05,
      "loss": 2.1958,
      "step": 72
    },
    {
      "epoch": 0.3201754385964912,
      "grad_norm": 0.034918639808893204,
      "learning_rate": 4.134670448277859e-05,
      "loss": 2.179,
      "step": 73
    },
    {
      "epoch": 0.32456140350877194,
      "grad_norm": 0.035045966506004333,
      "learning_rate": 4.1089701540075746e-05,
      "loss": 2.2114,
      "step": 74
    },
    {
      "epoch": 0.32894736842105265,
      "grad_norm": 0.03757881000638008,
      "learning_rate": 4.0829876388448335e-05,
      "loss": 2.2123,
      "step": 75
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.03646866977214813,
      "learning_rate": 4.056728298646634e-05,
      "loss": 2.2624,
      "step": 76
    },
    {
      "epoch": 0.33771929824561403,
      "grad_norm": 0.037842828780412674,
      "learning_rate": 4.0301975867589556e-05,
      "loss": 2.1933,
      "step": 77
    },
    {
      "epoch": 0.34210526315789475,
      "grad_norm": 0.04188080504536629,
      "learning_rate": 4.0034010128842484e-05,
      "loss": 2.2684,
      "step": 78
    },
    {
      "epoch": 0.34649122807017546,
      "grad_norm": 0.03659341111779213,
      "learning_rate": 3.9763441419372184e-05,
      "loss": 2.3599,
      "step": 79
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.035152070224285126,
      "learning_rate": 3.949032592889144e-05,
      "loss": 2.2093,
      "step": 80
    },
    {
      "epoch": 0.35526315789473684,
      "grad_norm": 0.04070575535297394,
      "learning_rate": 3.9214720376009754e-05,
      "loss": 2.2219,
      "step": 81
    },
    {
      "epoch": 0.35964912280701755,
      "grad_norm": 0.03422649949789047,
      "learning_rate": 3.893668199645438e-05,
      "loss": 2.3257,
      "step": 82
    },
    {
      "epoch": 0.36403508771929827,
      "grad_norm": 0.03752286732196808,
      "learning_rate": 3.865626853118409e-05,
      "loss": 2.3327,
      "step": 83
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 0.03680169954895973,
      "learning_rate": 3.8373538214397895e-05,
      "loss": 2.0797,
      "step": 84
    },
    {
      "epoch": 0.37280701754385964,
      "grad_norm": 0.03573755919933319,
      "learning_rate": 3.808854976144147e-05,
      "loss": 2.3888,
      "step": 85
    },
    {
      "epoch": 0.37719298245614036,
      "grad_norm": 0.03731301426887512,
      "learning_rate": 3.7801362356613505e-05,
      "loss": 2.2343,
      "step": 86
    },
    {
      "epoch": 0.3815789473684211,
      "grad_norm": 0.035398200154304504,
      "learning_rate": 3.751203564087484e-05,
      "loss": 2.3215,
      "step": 87
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 0.03954707086086273,
      "learning_rate": 3.722062969946254e-05,
      "loss": 2.0028,
      "step": 88
    },
    {
      "epoch": 0.39035087719298245,
      "grad_norm": 0.03811151161789894,
      "learning_rate": 3.6927205049412e-05,
      "loss": 2.2696,
      "step": 89
    },
    {
      "epoch": 0.39473684210526316,
      "grad_norm": 0.03738315403461456,
      "learning_rate": 3.663182262698905e-05,
      "loss": 2.2021,
      "step": 90
    },
    {
      "epoch": 0.3991228070175439,
      "grad_norm": 0.03729921206831932,
      "learning_rate": 3.63345437750353e-05,
      "loss": 2.287,
      "step": 91
    },
    {
      "epoch": 0.40350877192982454,
      "grad_norm": 0.037670135498046875,
      "learning_rate": 3.6035430230228806e-05,
      "loss": 2.2515,
      "step": 92
    },
    {
      "epoch": 0.40350877192982454,
      "eval_loss": 2.171975612640381,
      "eval_runtime": 221.318,
      "eval_samples_per_second": 0.361,
      "eval_steps_per_second": 0.361,
      "step": 92
    },
    {
      "epoch": 0.40789473684210525,
      "grad_norm": 0.039739374071359634,
      "learning_rate": 3.573454411026311e-05,
      "loss": 2.4027,
      "step": 93
    },
    {
      "epoch": 0.41228070175438597,
      "grad_norm": 0.036624688655138016,
      "learning_rate": 3.5431947900947086e-05,
      "loss": 2.2523,
      "step": 94
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 0.03838134929537773,
      "learning_rate": 3.512770444322836e-05,
      "loss": 2.0867,
      "step": 95
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.04036020487546921,
      "learning_rate": 3.4821876920142896e-05,
      "loss": 2.0927,
      "step": 96
    },
    {
      "epoch": 0.42543859649122806,
      "grad_norm": 0.03726624324917793,
      "learning_rate": 3.4514528843693726e-05,
      "loss": 2.1346,
      "step": 97
    },
    {
      "epoch": 0.4298245614035088,
      "grad_norm": 0.040338750928640366,
      "learning_rate": 3.4205724041661135e-05,
      "loss": 2.279,
      "step": 98
    },
    {
      "epoch": 0.4342105263157895,
      "grad_norm": 0.03956661745905876,
      "learning_rate": 3.389552664434746e-05,
      "loss": 2.2589,
      "step": 99
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 0.040090542286634445,
      "learning_rate": 3.358400107125892e-05,
      "loss": 2.2241,
      "step": 100
    },
    {
      "epoch": 0.44298245614035087,
      "grad_norm": 0.038386132568120956,
      "learning_rate": 3.327121201772752e-05,
      "loss": 2.1874,
      "step": 101
    },
    {
      "epoch": 0.4473684210526316,
      "grad_norm": 0.03814141824841499,
      "learning_rate": 3.2957224441475506e-05,
      "loss": 2.1831,
      "step": 102
    },
    {
      "epoch": 0.4517543859649123,
      "grad_norm": 0.03928579390048981,
      "learning_rate": 3.264210354912551e-05,
      "loss": 2.2175,
      "step": 103
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 0.04026209935545921,
      "learning_rate": 3.232591478265887e-05,
      "loss": 2.2436,
      "step": 104
    },
    {
      "epoch": 0.4605263157894737,
      "grad_norm": 0.0390322245657444,
      "learning_rate": 3.2008723805825174e-05,
      "loss": 2.1753,
      "step": 105
    },
    {
      "epoch": 0.4649122807017544,
      "grad_norm": 0.038063932210206985,
      "learning_rate": 3.169059649050561e-05,
      "loss": 2.3489,
      "step": 106
    },
    {
      "epoch": 0.4692982456140351,
      "grad_norm": 0.03953162208199501,
      "learning_rate": 3.137159890303329e-05,
      "loss": 2.3278,
      "step": 107
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 0.03997305780649185,
      "learning_rate": 3.1051797290472966e-05,
      "loss": 2.0943,
      "step": 108
    },
    {
      "epoch": 0.4780701754385965,
      "grad_norm": 0.03778674080967903,
      "learning_rate": 3.073125806686343e-05,
      "loss": 2.384,
      "step": 109
    },
    {
      "epoch": 0.4824561403508772,
      "grad_norm": 0.038230303674936295,
      "learning_rate": 3.0410047799425095e-05,
      "loss": 2.2288,
      "step": 110
    },
    {
      "epoch": 0.4868421052631579,
      "grad_norm": 0.03927282989025116,
      "learning_rate": 3.0088233194735756e-05,
      "loss": 2.3347,
      "step": 111
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 0.03646494820713997,
      "learning_rate": 2.9765881084877567e-05,
      "loss": 2.1844,
      "step": 112
    },
    {
      "epoch": 0.4956140350877193,
      "grad_norm": 0.040188323706388474,
      "learning_rate": 2.9443058413557746e-05,
      "loss": 2.2538,
      "step": 113
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.03920644521713257,
      "learning_rate": 2.9119832222206262e-05,
      "loss": 2.2415,
      "step": 114
    },
    {
      "epoch": 0.5043859649122807,
      "grad_norm": 0.037783924490213394,
      "learning_rate": 2.8796269636053147e-05,
      "loss": 2.2141,
      "step": 115
    },
    {
      "epoch": 0.5043859649122807,
      "eval_loss": 2.168910026550293,
      "eval_runtime": 222.2719,
      "eval_samples_per_second": 0.36,
      "eval_steps_per_second": 0.36,
      "step": 115
    },
    {
      "epoch": 0.5087719298245614,
      "grad_norm": 0.041254762560129166,
      "learning_rate": 2.8472437850188416e-05,
      "loss": 2.1322,
      "step": 116
    },
    {
      "epoch": 0.5131578947368421,
      "grad_norm": 0.04002700001001358,
      "learning_rate": 2.8148404115607496e-05,
      "loss": 2.1348,
      "step": 117
    },
    {
      "epoch": 0.5175438596491229,
      "grad_norm": 0.04018218815326691,
      "learning_rate": 2.7824235725245042e-05,
      "loss": 2.1397,
      "step": 118
    },
    {
      "epoch": 0.5219298245614035,
      "grad_norm": 0.03979986160993576,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 2.1661,
      "step": 119
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.03741481155157089,
      "learning_rate": 2.7175764274754967e-05,
      "loss": 2.0629,
      "step": 120
    },
    {
      "epoch": 0.5307017543859649,
      "grad_norm": 0.04032299295067787,
      "learning_rate": 2.685159588439251e-05,
      "loss": 2.2347,
      "step": 121
    },
    {
      "epoch": 0.5350877192982456,
      "grad_norm": 0.042785074561834335,
      "learning_rate": 2.6527562149811586e-05,
      "loss": 2.1792,
      "step": 122
    },
    {
      "epoch": 0.5394736842105263,
      "grad_norm": 0.040340930223464966,
      "learning_rate": 2.6203730363946855e-05,
      "loss": 2.4248,
      "step": 123
    },
    {
      "epoch": 0.543859649122807,
      "grad_norm": 0.04094316065311432,
      "learning_rate": 2.5880167777793746e-05,
      "loss": 2.1152,
      "step": 124
    },
    {
      "epoch": 0.5482456140350878,
      "grad_norm": 0.04053365811705589,
      "learning_rate": 2.5556941586442263e-05,
      "loss": 2.247,
      "step": 125
    },
    {
      "epoch": 0.5526315789473685,
      "grad_norm": 0.04444659873843193,
      "learning_rate": 2.523411891512244e-05,
      "loss": 2.0332,
      "step": 126
    },
    {
      "epoch": 0.5570175438596491,
      "grad_norm": 0.04211907461285591,
      "learning_rate": 2.4911766805264246e-05,
      "loss": 2.2871,
      "step": 127
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.039295535534620285,
      "learning_rate": 2.458995220057491e-05,
      "loss": 2.0771,
      "step": 128
    },
    {
      "epoch": 0.5657894736842105,
      "grad_norm": 0.04139047861099243,
      "learning_rate": 2.426874193313657e-05,
      "loss": 2.3538,
      "step": 129
    },
    {
      "epoch": 0.5701754385964912,
      "grad_norm": 0.04222600907087326,
      "learning_rate": 2.394820270952704e-05,
      "loss": 2.2202,
      "step": 130
    },
    {
      "epoch": 0.5745614035087719,
      "grad_norm": 0.04055177420377731,
      "learning_rate": 2.3628401096966717e-05,
      "loss": 2.1976,
      "step": 131
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 0.040084317326545715,
      "learning_rate": 2.3309403509494393e-05,
      "loss": 2.1939,
      "step": 132
    },
    {
      "epoch": 0.5833333333333334,
      "grad_norm": 0.04429348185658455,
      "learning_rate": 2.2991276194174838e-05,
      "loss": 2.367,
      "step": 133
    },
    {
      "epoch": 0.5877192982456141,
      "grad_norm": 0.04366152733564377,
      "learning_rate": 2.267408521734113e-05,
      "loss": 2.2848,
      "step": 134
    },
    {
      "epoch": 0.5921052631578947,
      "grad_norm": 0.04157907888293266,
      "learning_rate": 2.23578964508745e-05,
      "loss": 2.3166,
      "step": 135
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 0.045395560562610626,
      "learning_rate": 2.2042775558524503e-05,
      "loss": 2.2021,
      "step": 136
    },
    {
      "epoch": 0.6008771929824561,
      "grad_norm": 0.04491296038031578,
      "learning_rate": 2.1728787982272493e-05,
      "loss": 2.1836,
      "step": 137
    },
    {
      "epoch": 0.6052631578947368,
      "grad_norm": 0.04048113152384758,
      "learning_rate": 2.141599892874107e-05,
      "loss": 2.2104,
      "step": 138
    },
    {
      "epoch": 0.6052631578947368,
      "eval_loss": 2.166778087615967,
      "eval_runtime": 219.3317,
      "eval_samples_per_second": 0.365,
      "eval_steps_per_second": 0.365,
      "step": 138
    },
    {
      "epoch": 0.6096491228070176,
      "grad_norm": 0.04249032586812973,
      "learning_rate": 2.1104473355652543e-05,
      "loss": 2.1101,
      "step": 139
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 0.0415317639708519,
      "learning_rate": 2.079427595833887e-05,
      "loss": 2.2474,
      "step": 140
    },
    {
      "epoch": 0.618421052631579,
      "grad_norm": 0.04562680423259735,
      "learning_rate": 2.0485471156306286e-05,
      "loss": 2.278,
      "step": 141
    },
    {
      "epoch": 0.6228070175438597,
      "grad_norm": 0.04500352591276169,
      "learning_rate": 2.017812307985711e-05,
      "loss": 2.1393,
      "step": 142
    },
    {
      "epoch": 0.6271929824561403,
      "grad_norm": 0.03962068632245064,
      "learning_rate": 1.9872295556771652e-05,
      "loss": 2.1795,
      "step": 143
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.044582221657037735,
      "learning_rate": 1.9568052099052912e-05,
      "loss": 2.0925,
      "step": 144
    },
    {
      "epoch": 0.6359649122807017,
      "grad_norm": 0.04261946305632591,
      "learning_rate": 1.9265455889736893e-05,
      "loss": 2.265,
      "step": 145
    },
    {
      "epoch": 0.6403508771929824,
      "grad_norm": 0.04217216372489929,
      "learning_rate": 1.89645697697712e-05,
      "loss": 1.9885,
      "step": 146
    },
    {
      "epoch": 0.6447368421052632,
      "grad_norm": 0.04785408079624176,
      "learning_rate": 1.8665456224964706e-05,
      "loss": 1.947,
      "step": 147
    },
    {
      "epoch": 0.6491228070175439,
      "grad_norm": 0.04387756809592247,
      "learning_rate": 1.8368177373010954e-05,
      "loss": 2.2214,
      "step": 148
    },
    {
      "epoch": 0.6535087719298246,
      "grad_norm": 0.04180494323372841,
      "learning_rate": 1.807279495058801e-05,
      "loss": 2.2836,
      "step": 149
    },
    {
      "epoch": 0.6578947368421053,
      "grad_norm": 0.043545372784137726,
      "learning_rate": 1.7779370300537463e-05,
      "loss": 2.2564,
      "step": 150
    },
    {
      "epoch": 0.6622807017543859,
      "grad_norm": 0.04130139574408531,
      "learning_rate": 1.7487964359125172e-05,
      "loss": 2.2876,
      "step": 151
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.04039989784359932,
      "learning_rate": 1.71986376433865e-05,
      "loss": 2.2178,
      "step": 152
    },
    {
      "epoch": 0.6710526315789473,
      "grad_norm": 0.044490255415439606,
      "learning_rate": 1.6911450238558544e-05,
      "loss": 2.2864,
      "step": 153
    },
    {
      "epoch": 0.6754385964912281,
      "grad_norm": 0.044686999171972275,
      "learning_rate": 1.6626461785602114e-05,
      "loss": 2.2928,
      "step": 154
    },
    {
      "epoch": 0.6798245614035088,
      "grad_norm": 0.04130301997065544,
      "learning_rate": 1.634373146881592e-05,
      "loss": 2.3054,
      "step": 155
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 0.04189175367355347,
      "learning_rate": 1.606331800354563e-05,
      "loss": 2.2446,
      "step": 156
    },
    {
      "epoch": 0.6885964912280702,
      "grad_norm": 0.04377526417374611,
      "learning_rate": 1.578527962399025e-05,
      "loss": 2.3201,
      "step": 157
    },
    {
      "epoch": 0.6929824561403509,
      "grad_norm": 0.043845128268003464,
      "learning_rate": 1.550967407110856e-05,
      "loss": 2.2915,
      "step": 158
    },
    {
      "epoch": 0.6973684210526315,
      "grad_norm": 0.040581248700618744,
      "learning_rate": 1.5236558580627818e-05,
      "loss": 2.2255,
      "step": 159
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.043707672506570816,
      "learning_rate": 1.4965989871157523e-05,
      "loss": 2.1187,
      "step": 160
    },
    {
      "epoch": 0.706140350877193,
      "grad_norm": 0.04184063524007797,
      "learning_rate": 1.4698024132410453e-05,
      "loss": 2.0291,
      "step": 161
    },
    {
      "epoch": 0.706140350877193,
      "eval_loss": 2.165196418762207,
      "eval_runtime": 218.8267,
      "eval_samples_per_second": 0.366,
      "eval_steps_per_second": 0.366,
      "step": 161
    },
    {
      "epoch": 0.7105263157894737,
      "grad_norm": 0.045050691813230515,
      "learning_rate": 1.443271701353367e-05,
      "loss": 2.0026,
      "step": 162
    },
    {
      "epoch": 0.7149122807017544,
      "grad_norm": 0.04285702109336853,
      "learning_rate": 1.4170123611551672e-05,
      "loss": 2.0858,
      "step": 163
    },
    {
      "epoch": 0.7192982456140351,
      "grad_norm": 0.04647679999470711,
      "learning_rate": 1.391029845992426e-05,
      "loss": 2.2815,
      "step": 164
    },
    {
      "epoch": 0.7236842105263158,
      "grad_norm": 0.04479655995965004,
      "learning_rate": 1.3653295517221414e-05,
      "loss": 2.2331,
      "step": 165
    },
    {
      "epoch": 0.7280701754385965,
      "grad_norm": 0.04288294166326523,
      "learning_rate": 1.3399168155917618e-05,
      "loss": 2.2517,
      "step": 166
    },
    {
      "epoch": 0.7324561403508771,
      "grad_norm": 0.040452826768159866,
      "learning_rate": 1.3147969151307832e-05,
      "loss": 2.0543,
      "step": 167
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 0.041480351239442825,
      "learning_rate": 1.2899750670547473e-05,
      "loss": 2.184,
      "step": 168
    },
    {
      "epoch": 0.7412280701754386,
      "grad_norm": 0.04393285885453224,
      "learning_rate": 1.2654564261818803e-05,
      "loss": 2.265,
      "step": 169
    },
    {
      "epoch": 0.7456140350877193,
      "grad_norm": 0.04307292774319649,
      "learning_rate": 1.2412460843625707e-05,
      "loss": 2.0731,
      "step": 170
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.04109564423561096,
      "learning_rate": 1.2173490694219395e-05,
      "loss": 2.3338,
      "step": 171
    },
    {
      "epoch": 0.7543859649122807,
      "grad_norm": 0.04446297138929367,
      "learning_rate": 1.1937703441156884e-05,
      "loss": 2.183,
      "step": 172
    },
    {
      "epoch": 0.7587719298245614,
      "grad_norm": 0.04082484543323517,
      "learning_rate": 1.170514805099481e-05,
      "loss": 2.1772,
      "step": 173
    },
    {
      "epoch": 0.7631578947368421,
      "grad_norm": 0.04336797446012497,
      "learning_rate": 1.1475872819120328e-05,
      "loss": 2.3144,
      "step": 174
    },
    {
      "epoch": 0.7675438596491229,
      "grad_norm": 0.0429406575858593,
      "learning_rate": 1.1249925359721588e-05,
      "loss": 2.1344,
      "step": 175
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 0.04397052899003029,
      "learning_rate": 1.1027352595899456e-05,
      "loss": 2.1452,
      "step": 176
    },
    {
      "epoch": 0.7763157894736842,
      "grad_norm": 0.04313274472951889,
      "learning_rate": 1.0808200749922962e-05,
      "loss": 2.2194,
      "step": 177
    },
    {
      "epoch": 0.7807017543859649,
      "grad_norm": 0.04726535826921463,
      "learning_rate": 1.0592515333630128e-05,
      "loss": 1.9798,
      "step": 178
    },
    {
      "epoch": 0.7850877192982456,
      "grad_norm": 0.04374031722545624,
      "learning_rate": 1.038034113897649e-05,
      "loss": 2.0765,
      "step": 179
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 0.04320162534713745,
      "learning_rate": 1.0171722228732938e-05,
      "loss": 2.1935,
      "step": 180
    },
    {
      "epoch": 0.793859649122807,
      "grad_norm": 0.046704404056072235,
      "learning_rate": 9.966701927335172e-06,
      "loss": 2.18,
      "step": 181
    },
    {
      "epoch": 0.7982456140350878,
      "grad_norm": 0.04343089088797569,
      "learning_rate": 9.765322811886333e-06,
      "loss": 2.1722,
      "step": 182
    },
    {
      "epoch": 0.8026315789473685,
      "grad_norm": 0.04284314066171646,
      "learning_rate": 9.567626703314955e-06,
      "loss": 2.2947,
      "step": 183
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 0.04306797310709953,
      "learning_rate": 9.373654657689884e-06,
      "loss": 2.3129,
      "step": 184
    },
    {
      "epoch": 0.8070175438596491,
      "eval_loss": 2.1641502380371094,
      "eval_runtime": 219.0202,
      "eval_samples_per_second": 0.365,
      "eval_steps_per_second": 0.365,
      "step": 184
    },
    {
      "epoch": 0.8114035087719298,
      "grad_norm": 0.04292619228363037,
      "learning_rate": 9.183446957694048e-06,
      "loss": 2.225,
      "step": 185
    },
    {
      "epoch": 0.8157894736842105,
      "grad_norm": 0.04302488639950752,
      "learning_rate": 8.997043104258856e-06,
      "loss": 2.0675,
      "step": 186
    },
    {
      "epoch": 0.8201754385964912,
      "grad_norm": 0.04218915104866028,
      "learning_rate": 8.814481808360945e-06,
      "loss": 2.1778,
      "step": 187
    },
    {
      "epoch": 0.8245614035087719,
      "grad_norm": 0.04348418116569519,
      "learning_rate": 8.635800982982958e-06,
      "loss": 2.2598,
      "step": 188
    },
    {
      "epoch": 0.8289473684210527,
      "grad_norm": 0.04379533231258392,
      "learning_rate": 8.461037735240047e-06,
      "loss": 2.223,
      "step": 189
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.04641556367278099,
      "learning_rate": 8.290228358673758e-06,
      "loss": 2.1633,
      "step": 190
    },
    {
      "epoch": 0.8377192982456141,
      "grad_norm": 0.04623427614569664,
      "learning_rate": 8.123408325714857e-06,
      "loss": 2.2546,
      "step": 191
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.04348750412464142,
      "learning_rate": 7.960612280316673e-06,
      "loss": 2.2283,
      "step": 192
    },
    {
      "epoch": 0.8464912280701754,
      "grad_norm": 0.04299633204936981,
      "learning_rate": 7.801874030760472e-06,
      "loss": 2.2155,
      "step": 193
    },
    {
      "epoch": 0.8508771929824561,
      "grad_norm": 0.04249183461070061,
      "learning_rate": 7.647226542634454e-06,
      "loss": 2.2647,
      "step": 194
    },
    {
      "epoch": 0.8552631578947368,
      "grad_norm": 0.04467320442199707,
      "learning_rate": 7.49670193198766e-06,
      "loss": 2.3202,
      "step": 195
    },
    {
      "epoch": 0.8596491228070176,
      "grad_norm": 0.04538441821932793,
      "learning_rate": 7.350331458660367e-06,
      "loss": 2.0542,
      "step": 196
    },
    {
      "epoch": 0.8640350877192983,
      "grad_norm": 0.04282210022211075,
      "learning_rate": 7.208145519792266e-06,
      "loss": 2.3344,
      "step": 197
    },
    {
      "epoch": 0.868421052631579,
      "grad_norm": 0.042627353221178055,
      "learning_rate": 7.0701736435098155e-06,
      "loss": 2.3739,
      "step": 198
    },
    {
      "epoch": 0.8728070175438597,
      "grad_norm": 0.04885130748152733,
      "learning_rate": 6.936444482794065e-06,
      "loss": 2.2614,
      "step": 199
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.04192091524600983,
      "learning_rate": 6.806985809530189e-06,
      "loss": 2.0821,
      "step": 200
    },
    {
      "epoch": 0.881578947368421,
      "grad_norm": 0.04542316868901253,
      "learning_rate": 6.6818245087400574e-06,
      "loss": 2.3226,
      "step": 201
    },
    {
      "epoch": 0.8859649122807017,
      "grad_norm": 0.0446937195956707,
      "learning_rate": 6.56098657299893e-06,
      "loss": 2.342,
      "step": 202
    },
    {
      "epoch": 0.8903508771929824,
      "grad_norm": 0.04320209473371506,
      "learning_rate": 6.444497097037532e-06,
      "loss": 2.1945,
      "step": 203
    },
    {
      "epoch": 0.8947368421052632,
      "grad_norm": 0.04684532806277275,
      "learning_rate": 6.332380272530536e-06,
      "loss": 2.2744,
      "step": 204
    },
    {
      "epoch": 0.8991228070175439,
      "grad_norm": 0.04657423868775368,
      "learning_rate": 6.224659383072649e-06,
      "loss": 2.1249,
      "step": 205
    },
    {
      "epoch": 0.9035087719298246,
      "grad_norm": 0.04765097796916962,
      "learning_rate": 6.1213567993432085e-06,
      "loss": 2.1456,
      "step": 206
    },
    {
      "epoch": 0.9078947368421053,
      "grad_norm": 0.047186579555273056,
      "learning_rate": 6.022493974460447e-06,
      "loss": 2.2972,
      "step": 207
    },
    {
      "epoch": 0.9078947368421053,
      "eval_loss": 2.163572311401367,
      "eval_runtime": 218.9467,
      "eval_samples_per_second": 0.365,
      "eval_steps_per_second": 0.365,
      "step": 207
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 0.04384492337703705,
      "learning_rate": 5.928091439526226e-06,
      "loss": 2.2382,
      "step": 208
    },
    {
      "epoch": 0.9166666666666666,
      "grad_norm": 0.044699691236019135,
      "learning_rate": 5.838168799362318e-06,
      "loss": 2.1484,
      "step": 209
    },
    {
      "epoch": 0.9210526315789473,
      "grad_norm": 0.043975383043289185,
      "learning_rate": 5.752744728439006e-06,
      "loss": 2.208,
      "step": 210
    },
    {
      "epoch": 0.9254385964912281,
      "grad_norm": 0.0446508526802063,
      "learning_rate": 5.671836966996916e-06,
      "loss": 2.0749,
      "step": 211
    },
    {
      "epoch": 0.9298245614035088,
      "grad_norm": 0.04373237490653992,
      "learning_rate": 5.595462317362849e-06,
      "loss": 2.372,
      "step": 212
    },
    {
      "epoch": 0.9342105263157895,
      "grad_norm": 0.04464460536837578,
      "learning_rate": 5.523636640460405e-06,
      "loss": 2.2327,
      "step": 213
    },
    {
      "epoch": 0.9385964912280702,
      "grad_norm": 0.05024990811944008,
      "learning_rate": 5.456374852516083e-06,
      "loss": 2.1838,
      "step": 214
    },
    {
      "epoch": 0.9429824561403509,
      "grad_norm": 0.044989317655563354,
      "learning_rate": 5.3936909219616205e-06,
      "loss": 2.373,
      "step": 215
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.04446178302168846,
      "learning_rate": 5.335597866533116e-06,
      "loss": 2.0206,
      "step": 216
    },
    {
      "epoch": 0.9517543859649122,
      "grad_norm": 0.0473959781229496,
      "learning_rate": 5.282107750567588e-06,
      "loss": 2.0744,
      "step": 217
    },
    {
      "epoch": 0.956140350877193,
      "grad_norm": 0.046764299273490906,
      "learning_rate": 5.233231682497572e-06,
      "loss": 2.0287,
      "step": 218
    },
    {
      "epoch": 0.9605263157894737,
      "grad_norm": 0.042999010533094406,
      "learning_rate": 5.1889798125441795e-06,
      "loss": 2.3255,
      "step": 219
    },
    {
      "epoch": 0.9649122807017544,
      "grad_norm": 0.04476455599069595,
      "learning_rate": 5.149361330609188e-06,
      "loss": 2.3038,
      "step": 220
    },
    {
      "epoch": 0.9692982456140351,
      "grad_norm": 0.04403753951191902,
      "learning_rate": 5.114384464366541e-06,
      "loss": 2.2972,
      "step": 221
    },
    {
      "epoch": 0.9736842105263158,
      "grad_norm": 0.04389164224267006,
      "learning_rate": 5.084056477553695e-06,
      "loss": 2.2914,
      "step": 222
    },
    {
      "epoch": 0.9780701754385965,
      "grad_norm": 0.044895585626363754,
      "learning_rate": 5.058383668463131e-06,
      "loss": 2.2486,
      "step": 223
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 0.04405970871448517,
      "learning_rate": 5.0373713686343774e-06,
      "loss": 2.1718,
      "step": 224
    },
    {
      "epoch": 0.9868421052631579,
      "grad_norm": 0.04554829001426697,
      "learning_rate": 5.021023941746794e-06,
      "loss": 2.1137,
      "step": 225
    },
    {
      "epoch": 0.9912280701754386,
      "grad_norm": 0.041737962514162064,
      "learning_rate": 5.009344782713349e-06,
      "loss": 2.2621,
      "step": 226
    },
    {
      "epoch": 0.9956140350877193,
      "grad_norm": 0.04512747749686241,
      "learning_rate": 5.0023363169756045e-06,
      "loss": 2.3551,
      "step": 227
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.04331080988049507,
      "learning_rate": 5e-06,
      "loss": 2.2325,
      "step": 228
    }
  ],
  "logging_steps": 1,
  "max_steps": 228,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 23,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2790575338430136e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}