|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.00832535486825126, |
|
"eval_steps": 9, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 8.325354868251259e-05, |
|
"grad_norm": 0.021569252014160156, |
|
"learning_rate": 1e-05, |
|
"loss": 10.3794, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 8.325354868251259e-05, |
|
"eval_loss": 10.377388000488281, |
|
"eval_runtime": 160.0598, |
|
"eval_samples_per_second": 63.195, |
|
"eval_steps_per_second": 7.903, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00016650709736502518, |
|
"grad_norm": 0.03107944317162037, |
|
"learning_rate": 2e-05, |
|
"loss": 10.3791, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00024976064604753775, |
|
"grad_norm": 0.02209843136370182, |
|
"learning_rate": 3e-05, |
|
"loss": 10.3773, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00033301419473005036, |
|
"grad_norm": 0.02121385745704174, |
|
"learning_rate": 4e-05, |
|
"loss": 10.3788, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00041626774341256296, |
|
"grad_norm": 0.019519681110978127, |
|
"learning_rate": 5e-05, |
|
"loss": 10.3763, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0004995212920950755, |
|
"grad_norm": 0.022146768867969513, |
|
"learning_rate": 6e-05, |
|
"loss": 10.3804, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0005827748407775882, |
|
"grad_norm": 0.024616023525595665, |
|
"learning_rate": 7e-05, |
|
"loss": 10.3777, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0006660283894601007, |
|
"grad_norm": 0.02391134202480316, |
|
"learning_rate": 8e-05, |
|
"loss": 10.3806, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0007492819381426134, |
|
"grad_norm": 0.0267960112541914, |
|
"learning_rate": 9e-05, |
|
"loss": 10.3777, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0007492819381426134, |
|
"eval_loss": 10.377134323120117, |
|
"eval_runtime": 160.2613, |
|
"eval_samples_per_second": 63.116, |
|
"eval_steps_per_second": 7.893, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0008325354868251259, |
|
"grad_norm": 0.02298792079091072, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3777, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0009157890355076385, |
|
"grad_norm": 0.025504613295197487, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 10.3774, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.000999042584190151, |
|
"grad_norm": 0.027433175593614578, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 10.3788, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0010822961328726638, |
|
"grad_norm": 0.02382979728281498, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 10.3777, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0011655496815551763, |
|
"grad_norm": 0.023936374112963676, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 10.3797, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0012488032302376889, |
|
"grad_norm": 0.02442743629217148, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 10.3778, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0013320567789202014, |
|
"grad_norm": 0.019230961799621582, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 10.3759, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.001415310327602714, |
|
"grad_norm": 0.022421833127737045, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 10.3766, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0014985638762852267, |
|
"grad_norm": 0.02025768905878067, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 10.3755, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0014985638762852267, |
|
"eval_loss": 10.376496315002441, |
|
"eval_runtime": 159.6247, |
|
"eval_samples_per_second": 63.367, |
|
"eval_steps_per_second": 7.925, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0015818174249677393, |
|
"grad_norm": 0.024560928344726562, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 10.3762, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0016650709736502518, |
|
"grad_norm": 0.02397759258747101, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 10.3746, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0017483245223327644, |
|
"grad_norm": 0.02246921882033348, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 10.3755, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.001831578071015277, |
|
"grad_norm": 0.02009345218539238, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 10.3756, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0019148316196977897, |
|
"grad_norm": 0.022839196026325226, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 10.3727, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.001998085168380302, |
|
"grad_norm": 0.02412959560751915, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 10.3778, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.002081338717062815, |
|
"grad_norm": 0.027805138379335403, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 10.3754, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0021645922657453276, |
|
"grad_norm": 0.025182532146573067, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 10.3756, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.00224784581442784, |
|
"grad_norm": 0.025572553277015686, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 10.3756, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00224784581442784, |
|
"eval_loss": 10.375847816467285, |
|
"eval_runtime": 149.4247, |
|
"eval_samples_per_second": 67.693, |
|
"eval_steps_per_second": 8.466, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0023310993631103527, |
|
"grad_norm": 0.023483281955122948, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 10.3749, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.002414352911792865, |
|
"grad_norm": 0.023586202412843704, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 10.3769, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0024976064604753778, |
|
"grad_norm": 0.019847828894853592, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 10.3747, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0025808600091578905, |
|
"grad_norm": 0.02112536132335663, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 10.3752, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.002664113557840403, |
|
"grad_norm": 0.020234240218997, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 10.3743, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0027473671065229156, |
|
"grad_norm": 0.022708367556333542, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 10.3754, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.002830620655205428, |
|
"grad_norm": 0.02386791631579399, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 10.3728, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0029138742038879407, |
|
"grad_norm": 0.02285214513540268, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 10.3765, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0029971277525704535, |
|
"grad_norm": 0.023896262049674988, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 10.3748, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0029971277525704535, |
|
"eval_loss": 10.375186920166016, |
|
"eval_runtime": 148.5988, |
|
"eval_samples_per_second": 68.069, |
|
"eval_steps_per_second": 8.513, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.003080381301252966, |
|
"grad_norm": 0.025179745629429817, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 10.376, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0031636348499354786, |
|
"grad_norm": 0.026716183871030807, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 10.3766, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.003246888398617991, |
|
"grad_norm": 0.030639372766017914, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 10.3764, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0033301419473005037, |
|
"grad_norm": 0.02914862520992756, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 10.3735, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0034133954959830164, |
|
"grad_norm": 0.022033508867025375, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 10.3769, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0034966490446655288, |
|
"grad_norm": 0.02572484128177166, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 10.3752, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0035799025933480415, |
|
"grad_norm": 0.023432254791259766, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 10.3747, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.003663156142030554, |
|
"grad_norm": 0.029329584911465645, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 10.3749, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0037464096907130666, |
|
"grad_norm": 0.027913155034184456, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 10.374, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0037464096907130666, |
|
"eval_loss": 10.374531745910645, |
|
"eval_runtime": 148.61, |
|
"eval_samples_per_second": 68.064, |
|
"eval_steps_per_second": 8.512, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0038296632393955794, |
|
"grad_norm": 0.023554999381303787, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 10.3721, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.003912916788078092, |
|
"grad_norm": 0.03028443641960621, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 10.3751, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.003996170336760604, |
|
"grad_norm": 0.02735976316034794, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 10.3768, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.004079423885443117, |
|
"grad_norm": 0.027595043182373047, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 10.3733, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.00416267743412563, |
|
"grad_norm": 0.02975800819694996, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 10.3733, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004245930982808142, |
|
"grad_norm": 0.022592192515730858, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 10.3757, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.004329184531490655, |
|
"grad_norm": 0.02746448665857315, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 10.373, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.004412438080173167, |
|
"grad_norm": 0.034864962100982666, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 10.3755, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.00449569162885568, |
|
"grad_norm": 0.02986881136894226, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 10.3741, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.00449569162885568, |
|
"eval_loss": 10.373912811279297, |
|
"eval_runtime": 148.6352, |
|
"eval_samples_per_second": 68.053, |
|
"eval_steps_per_second": 8.511, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0045789451775381925, |
|
"grad_norm": 0.032523926347494125, |
|
"learning_rate": 5e-05, |
|
"loss": 10.3756, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.004662198726220705, |
|
"grad_norm": 0.029269371181726456, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 10.3737, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.004745452274903218, |
|
"grad_norm": 0.02924845553934574, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 10.3736, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.00482870582358573, |
|
"grad_norm": 0.028468172997236252, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 10.3716, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.004911959372268243, |
|
"grad_norm": 0.02863762155175209, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 10.3734, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0049952129209507555, |
|
"grad_norm": 0.03285225108265877, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 10.3724, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.005078466469633268, |
|
"grad_norm": 0.028849463909864426, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 10.3711, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.005161720018315781, |
|
"grad_norm": 0.03323189169168472, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 10.3739, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.005244973566998293, |
|
"grad_norm": 0.03293667733669281, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 10.3728, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.005244973566998293, |
|
"eval_loss": 10.373388290405273, |
|
"eval_runtime": 148.6153, |
|
"eval_samples_per_second": 68.062, |
|
"eval_steps_per_second": 8.512, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.005328227115680806, |
|
"grad_norm": 0.03562981262803078, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 10.3694, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0054114806643633185, |
|
"grad_norm": 0.02975441701710224, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 10.3768, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.005494734213045831, |
|
"grad_norm": 0.04472412168979645, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 10.3748, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.005577987761728344, |
|
"grad_norm": 0.030767936259508133, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 10.3734, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.005661241310410856, |
|
"grad_norm": 0.030225660651922226, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 10.3752, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.005744494859093369, |
|
"grad_norm": 0.039484333246946335, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 10.373, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.005827748407775881, |
|
"grad_norm": 0.03261690214276314, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 10.3731, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.005911001956458394, |
|
"grad_norm": 0.03445957973599434, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 10.3739, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.005994255505140907, |
|
"grad_norm": 0.03545086085796356, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 10.3708, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.005994255505140907, |
|
"eval_loss": 10.372991561889648, |
|
"eval_runtime": 148.5888, |
|
"eval_samples_per_second": 68.074, |
|
"eval_steps_per_second": 8.513, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.006077509053823419, |
|
"grad_norm": 0.039018601179122925, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 10.3709, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.006160762602505932, |
|
"grad_norm": 0.03845333680510521, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 10.3735, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.006244016151188444, |
|
"grad_norm": 0.029868410900235176, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 10.373, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.006327269699870957, |
|
"grad_norm": 0.03503395617008209, |
|
"learning_rate": 1.6543469682057106e-05, |
|
"loss": 10.3724, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.00641052324855347, |
|
"grad_norm": 0.04458336532115936, |
|
"learning_rate": 1.526708147705013e-05, |
|
"loss": 10.3728, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.006493776797235982, |
|
"grad_norm": 0.03825473040342331, |
|
"learning_rate": 1.4033009983067452e-05, |
|
"loss": 10.3736, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.006577030345918495, |
|
"grad_norm": 0.03983627259731293, |
|
"learning_rate": 1.2842758726130283e-05, |
|
"loss": 10.3732, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.006660283894601007, |
|
"grad_norm": 0.035727862268686295, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 10.3727, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.00674353744328352, |
|
"grad_norm": 0.04354983568191528, |
|
"learning_rate": 1.0599462319663905e-05, |
|
"loss": 10.3733, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.00674353744328352, |
|
"eval_loss": 10.372751235961914, |
|
"eval_runtime": 148.4839, |
|
"eval_samples_per_second": 68.122, |
|
"eval_steps_per_second": 8.519, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.006826790991966033, |
|
"grad_norm": 0.033028386533260345, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 10.3693, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.006910044540648545, |
|
"grad_norm": 0.03836917132139206, |
|
"learning_rate": 8.548121372247918e-06, |
|
"loss": 10.3715, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0069932980893310575, |
|
"grad_norm": 0.036879587918519974, |
|
"learning_rate": 7.597595192178702e-06, |
|
"loss": 10.3762, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.00707655163801357, |
|
"grad_norm": 0.03589490056037903, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 10.3707, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.007159805186696083, |
|
"grad_norm": 0.03812255710363388, |
|
"learning_rate": 5.852620357053651e-06, |
|
"loss": 10.3741, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.007243058735378596, |
|
"grad_norm": 0.041089076548814774, |
|
"learning_rate": 5.060297685041659e-06, |
|
"loss": 10.3723, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.007326312284061108, |
|
"grad_norm": 0.03632598742842674, |
|
"learning_rate": 4.322727117869951e-06, |
|
"loss": 10.3768, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.0074095658327436205, |
|
"grad_norm": 0.03819997236132622, |
|
"learning_rate": 3.6408072716606346e-06, |
|
"loss": 10.369, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.007492819381426133, |
|
"grad_norm": 0.0333368182182312, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 10.3696, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.007492819381426133, |
|
"eval_loss": 10.372652053833008, |
|
"eval_runtime": 148.6025, |
|
"eval_samples_per_second": 68.067, |
|
"eval_steps_per_second": 8.513, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.007576072930108646, |
|
"grad_norm": 0.03456645831465721, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 10.3723, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.007659326478791159, |
|
"grad_norm": 0.04168243333697319, |
|
"learning_rate": 1.9369152030840556e-06, |
|
"loss": 10.3738, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.007742580027473671, |
|
"grad_norm": 0.03744868189096451, |
|
"learning_rate": 1.4852136862001764e-06, |
|
"loss": 10.372, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.007825833576156184, |
|
"grad_norm": 0.03732309117913246, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 10.3731, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.007909087124838697, |
|
"grad_norm": 0.031954191625118256, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 10.3735, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.007992340673521208, |
|
"grad_norm": 0.0394279770553112, |
|
"learning_rate": 4.865965629214819e-07, |
|
"loss": 10.3702, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.008075594222203721, |
|
"grad_norm": 0.04486799240112305, |
|
"learning_rate": 2.7390523158633554e-07, |
|
"loss": 10.3759, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.008158847770886234, |
|
"grad_norm": 0.037964317947626114, |
|
"learning_rate": 1.2179748700879012e-07, |
|
"loss": 10.373, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.008242101319568746, |
|
"grad_norm": 0.03853140026330948, |
|
"learning_rate": 3.04586490452119e-08, |
|
"loss": 10.3717, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.008242101319568746, |
|
"eval_loss": 10.372636795043945, |
|
"eval_runtime": 148.5867, |
|
"eval_samples_per_second": 68.075, |
|
"eval_steps_per_second": 8.514, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.00832535486825126, |
|
"grad_norm": 0.030309082940220833, |
|
"learning_rate": 0.0, |
|
"loss": 10.3732, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 10460489318400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|