{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0012771392081736,
  "eval_steps": 500,
  "global_step": 196,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005108556832694764,
      "grad_norm": 0.06296706199645996,
      "learning_rate": 0.00019998715457999314,
      "loss": 10.3788,
      "step": 1
    },
    {
      "epoch": 0.010217113665389528,
      "grad_norm": 0.06155822426080704,
      "learning_rate": 0.0001999486216200688,
      "loss": 10.3876,
      "step": 2
    },
    {
      "epoch": 0.01532567049808429,
      "grad_norm": 0.04243969917297363,
      "learning_rate": 0.0001998844110196681,
      "loss": 10.383,
      "step": 3
    },
    {
      "epoch": 0.020434227330779056,
      "grad_norm": 0.08324499428272247,
      "learning_rate": 0.00019979453927503364,
      "loss": 10.3854,
      "step": 4
    },
    {
      "epoch": 0.02554278416347382,
      "grad_norm": 0.06429007649421692,
      "learning_rate": 0.00019967902947497156,
      "loss": 10.3783,
      "step": 5
    },
    {
      "epoch": 0.03065134099616858,
      "grad_norm": 0.06722702831029892,
      "learning_rate": 0.00019953791129491983,
      "loss": 10.3771,
      "step": 6
    },
    {
      "epoch": 0.035759897828863345,
      "grad_norm": 0.04376779869198799,
      "learning_rate": 0.00019937122098932428,
      "loss": 10.3768,
      "step": 7
    },
    {
      "epoch": 0.04086845466155811,
      "grad_norm": 0.057710230350494385,
      "learning_rate": 0.0001991790013823246,
      "loss": 10.3731,
      "step": 8
    },
    {
      "epoch": 0.04597701149425287,
      "grad_norm": 0.06998121738433838,
      "learning_rate": 0.00019896130185675261,
      "loss": 10.3783,
      "step": 9
    },
    {
      "epoch": 0.05108556832694764,
      "grad_norm": 0.06011413037776947,
      "learning_rate": 0.00019871817834144504,
      "loss": 10.3773,
      "step": 10
    },
    {
      "epoch": 0.0561941251596424,
      "grad_norm": 0.05903960019350052,
      "learning_rate": 0.00019844969329687527,
      "loss": 10.379,
      "step": 11
    },
    {
      "epoch": 0.06130268199233716,
      "grad_norm": 0.06023373827338219,
      "learning_rate": 0.00019815591569910654,
      "loss": 10.3718,
      "step": 12
    },
    {
      "epoch": 0.06641123882503193,
      "grad_norm": 0.07594680041074753,
      "learning_rate": 0.00019783692102207155,
      "loss": 10.3802,
      "step": 13
    },
    {
      "epoch": 0.07151979565772669,
      "grad_norm": 0.07722143083810806,
      "learning_rate": 0.00019749279121818235,
      "loss": 10.3835,
      "step": 14
    },
    {
      "epoch": 0.07662835249042145,
      "grad_norm": 0.06465964019298553,
      "learning_rate": 0.0001971236146972764,
      "loss": 10.3811,
      "step": 15
    },
    {
      "epoch": 0.08173690932311622,
      "grad_norm": 0.08145278692245483,
      "learning_rate": 0.00019672948630390294,
      "loss": 10.3771,
      "step": 16
    },
    {
      "epoch": 0.08684546615581099,
      "grad_norm": 0.06017107516527176,
      "learning_rate": 0.00019631050729295707,
      "loss": 10.3816,
      "step": 17
    },
    {
      "epoch": 0.09195402298850575,
      "grad_norm": 0.07251092046499252,
      "learning_rate": 0.00019586678530366606,
      "loss": 10.3833,
      "step": 18
    },
    {
      "epoch": 0.0970625798212005,
      "grad_norm": 0.04419001191854477,
      "learning_rate": 0.00019539843433193639,
      "loss": 10.3781,
      "step": 19
    },
    {
      "epoch": 0.10217113665389528,
      "grad_norm": 0.0600612536072731,
      "learning_rate": 0.00019490557470106686,
      "loss": 10.3828,
      "step": 20
    },
    {
      "epoch": 0.10727969348659004,
      "grad_norm": 0.07362791895866394,
      "learning_rate": 0.00019438833303083678,
      "loss": 10.3844,
      "step": 21
    },
    {
      "epoch": 0.1123882503192848,
      "grad_norm": 0.06240883469581604,
      "learning_rate": 0.00019384684220497605,
      "loss": 10.3754,
      "step": 22
    },
    {
      "epoch": 0.11749680715197956,
      "grad_norm": 0.06827075779438019,
      "learning_rate": 0.0001932812413370265,
      "loss": 10.3727,
      "step": 23
    },
    {
      "epoch": 0.12260536398467432,
      "grad_norm": 0.06609390676021576,
      "learning_rate": 0.0001926916757346022,
      "loss": 10.3814,
      "step": 24
    },
    {
      "epoch": 0.1277139208173691,
      "grad_norm": 0.06842462718486786,
      "learning_rate": 0.00019207829686205882,
      "loss": 10.3716,
      "step": 25
    },
    {
      "epoch": 0.13282247765006386,
      "grad_norm": 0.06660367548465729,
      "learning_rate": 0.00019144126230158127,
      "loss": 10.3826,
      "step": 26
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.08344798535108566,
      "learning_rate": 0.00019078073571269922,
      "loss": 10.3783,
      "step": 27
    },
    {
      "epoch": 0.14303959131545338,
      "grad_norm": 0.08380083739757538,
      "learning_rate": 0.0001900968867902419,
      "loss": 10.3732,
      "step": 28
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 0.0946107804775238,
      "learning_rate": 0.00018938989122074197,
      "loss": 10.37,
      "step": 29
    },
    {
      "epoch": 0.1532567049808429,
      "grad_norm": 0.06197074055671692,
      "learning_rate": 0.00018865993063730004,
      "loss": 10.3761,
      "step": 30
    },
    {
      "epoch": 0.1583652618135377,
      "grad_norm": 0.0819283053278923,
      "learning_rate": 0.00018790719257292174,
      "loss": 10.371,
      "step": 31
    },
    {
      "epoch": 0.16347381864623245,
      "grad_norm": 0.09213186800479889,
      "learning_rate": 0.00018713187041233896,
      "loss": 10.3725,
      "step": 32
    },
    {
      "epoch": 0.1685823754789272,
      "grad_norm": 0.08193743228912354,
      "learning_rate": 0.00018633416334232753,
      "loss": 10.3725,
      "step": 33
    },
    {
      "epoch": 0.17369093231162197,
      "grad_norm": 0.07233049720525742,
      "learning_rate": 0.00018551427630053463,
      "loss": 10.374,
      "step": 34
    },
    {
      "epoch": 0.17879948914431673,
      "grad_norm": 0.09028832614421844,
      "learning_rate": 0.00018467241992282843,
      "loss": 10.3732,
      "step": 35
    },
    {
      "epoch": 0.1839080459770115,
      "grad_norm": 0.10519926995038986,
      "learning_rate": 0.00018380881048918405,
      "loss": 10.3769,
      "step": 36
    },
    {
      "epoch": 0.18901660280970625,
      "grad_norm": 0.06255685538053513,
      "learning_rate": 0.0001829236698681195,
      "loss": 10.378,
      "step": 37
    },
    {
      "epoch": 0.194125159642401,
      "grad_norm": 0.08233755081892014,
      "learning_rate": 0.0001820172254596956,
      "loss": 10.3704,
      "step": 38
    },
    {
      "epoch": 0.19923371647509577,
      "grad_norm": 0.05263242870569229,
      "learning_rate": 0.0001810897101370951,
      "loss": 10.3834,
      "step": 39
    },
    {
      "epoch": 0.20434227330779056,
      "grad_norm": 0.1387779265642166,
      "learning_rate": 0.00018014136218679567,
      "loss": 10.3707,
      "step": 40
    },
    {
      "epoch": 0.20945083014048532,
      "grad_norm": 0.08574540168046951,
      "learning_rate": 0.000179172425247352,
      "loss": 10.377,
      "step": 41
    },
    {
      "epoch": 0.21455938697318008,
      "grad_norm": 0.06211940944194794,
      "learning_rate": 0.000178183148246803,
      "loss": 10.3647,
      "step": 42
    },
    {
      "epoch": 0.21966794380587484,
      "grad_norm": 0.09258867800235748,
      "learning_rate": 0.00017717378533872017,
      "loss": 10.3722,
      "step": 43
    },
    {
      "epoch": 0.2247765006385696,
      "grad_norm": 0.08623415976762772,
      "learning_rate": 0.00017614459583691346,
      "loss": 10.3754,
      "step": 44
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 0.1082397997379303,
      "learning_rate": 0.00017509584414881113,
      "loss": 10.3736,
      "step": 45
    },
    {
      "epoch": 0.23499361430395913,
      "grad_norm": 0.0896887555718422,
      "learning_rate": 0.00017402779970753155,
      "loss": 10.3732,
      "step": 46
    },
    {
      "epoch": 0.24010217113665389,
      "grad_norm": 0.09336112439632416,
      "learning_rate": 0.00017294073690266344,
      "loss": 10.373,
      "step": 47
    },
    {
      "epoch": 0.24521072796934865,
      "grad_norm": 0.08002498000860214,
      "learning_rate": 0.00017183493500977278,
      "loss": 10.3689,
      "step": 48
    },
    {
      "epoch": 0.2503192848020434,
      "grad_norm": 0.07161203771829605,
      "learning_rate": 0.00017071067811865476,
      "loss": 10.3702,
      "step": 49
    },
    {
      "epoch": 0.2554278416347382,
      "grad_norm": 0.121755450963974,
      "learning_rate": 0.00016956825506034867,
      "loss": 10.3821,
      "step": 50
    },
    {
      "epoch": 0.26053639846743293,
      "grad_norm": 0.08657146245241165,
      "learning_rate": 0.00016840795933293463,
      "loss": 10.3803,
      "step": 51
    },
    {
      "epoch": 0.2656449553001277,
      "grad_norm": 0.07844258099794388,
      "learning_rate": 0.0001672300890261317,
      "loss": 10.3719,
      "step": 52
    },
    {
      "epoch": 0.2707535121328225,
      "grad_norm": 0.08680905401706696,
      "learning_rate": 0.00016603494674471593,
      "loss": 10.3649,
      "step": 53
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.10259506851434708,
      "learning_rate": 0.00016482283953077887,
      "loss": 10.3725,
      "step": 54
    },
    {
      "epoch": 0.280970625798212,
      "grad_norm": 0.08116184920072556,
      "learning_rate": 0.00016359407878484552,
      "loss": 10.3681,
      "step": 55
    },
    {
      "epoch": 0.28607918263090676,
      "grad_norm": 0.06668803840875626,
      "learning_rate": 0.00016234898018587337,
      "loss": 10.3731,
      "step": 56
    },
    {
      "epoch": 0.29118773946360155,
      "grad_norm": 0.08566579967737198,
      "learning_rate": 0.00016108786361015143,
      "loss": 10.37,
      "step": 57
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.11228417605161667,
      "learning_rate": 0.00015981105304912162,
      "loss": 10.3716,
      "step": 58
    },
    {
      "epoch": 0.30140485312899107,
      "grad_norm": 0.07987914979457855,
      "learning_rate": 0.00015851887652614237,
      "loss": 10.3712,
      "step": 59
    },
    {
      "epoch": 0.3065134099616858,
      "grad_norm": 0.1201016902923584,
      "learning_rate": 0.00015721166601221698,
      "loss": 10.3647,
      "step": 60
    },
    {
      "epoch": 0.3116219667943806,
      "grad_norm": 0.08329416066408157,
      "learning_rate": 0.00015588975734070717,
      "loss": 10.3792,
      "step": 61
    },
    {
      "epoch": 0.3167305236270754,
      "grad_norm": 0.10588457435369492,
      "learning_rate": 0.00015455349012105486,
      "loss": 10.3688,
      "step": 62
    },
    {
      "epoch": 0.3218390804597701,
      "grad_norm": 0.12081307172775269,
      "learning_rate": 0.00015320320765153367,
      "loss": 10.3642,
      "step": 63
    },
    {
      "epoch": 0.3269476372924649,
      "grad_norm": 0.08195498585700989,
      "learning_rate": 0.00015183925683105254,
      "loss": 10.3682,
      "step": 64
    },
    {
      "epoch": 0.33205619412515963,
      "grad_norm": 0.06908663362264633,
      "learning_rate": 0.0001504619880700346,
      "loss": 10.367,
      "step": 65
    },
    {
      "epoch": 0.3371647509578544,
      "grad_norm": 0.09832872450351715,
      "learning_rate": 0.0001490717552003938,
      "loss": 10.377,
      "step": 66
    },
    {
      "epoch": 0.34227330779054915,
      "grad_norm": 0.0903470367193222,
      "learning_rate": 0.00014766891538463254,
      "loss": 10.3772,
      "step": 67
    },
    {
      "epoch": 0.34738186462324394,
      "grad_norm": 0.09079986065626144,
      "learning_rate": 0.00014625382902408356,
      "loss": 10.366,
      "step": 68
    },
    {
      "epoch": 0.3524904214559387,
      "grad_norm": 0.05147509649395943,
      "learning_rate": 0.0001448268596663197,
      "loss": 10.3652,
      "step": 69
    },
    {
      "epoch": 0.35759897828863346,
      "grad_norm": 0.14060315489768982,
      "learning_rate": 0.00014338837391175582,
      "loss": 10.3629,
      "step": 70
    },
    {
      "epoch": 0.36270753512132825,
      "grad_norm": 0.11259511113166809,
      "learning_rate": 0.0001419387413194657,
      "loss": 10.3691,
      "step": 71
    },
    {
      "epoch": 0.367816091954023,
      "grad_norm": 0.09399597346782684,
      "learning_rate": 0.00014047833431223938,
      "loss": 10.3617,
      "step": 72
    },
    {
      "epoch": 0.37292464878671777,
      "grad_norm": 0.11537044495344162,
      "learning_rate": 0.00013900752808090468,
      "loss": 10.366,
      "step": 73
    },
    {
      "epoch": 0.3780332056194125,
      "grad_norm": 0.11738992482423782,
      "learning_rate": 0.00013752670048793744,
      "loss": 10.3471,
      "step": 74
    },
    {
      "epoch": 0.3831417624521073,
      "grad_norm": 0.11229626834392548,
      "learning_rate": 0.00013603623197038536,
      "loss": 10.3645,
      "step": 75
    },
    {
      "epoch": 0.388250319284802,
      "grad_norm": 0.14596045017242432,
      "learning_rate": 0.00013453650544213076,
      "loss": 10.3751,
      "step": 76
    },
    {
      "epoch": 0.3933588761174968,
      "grad_norm": 0.06448373198509216,
      "learning_rate": 0.00013302790619551674,
      "loss": 10.3572,
      "step": 77
    },
    {
      "epoch": 0.39846743295019155,
      "grad_norm": 0.11809150874614716,
      "learning_rate": 0.0001315108218023621,
      "loss": 10.3695,
      "step": 78
    },
    {
      "epoch": 0.40357598978288634,
      "grad_norm": 0.08517828583717346,
      "learning_rate": 0.00012998564201439116,
      "loss": 10.3629,
      "step": 79
    },
    {
      "epoch": 0.4086845466155811,
      "grad_norm": 0.12408934533596039,
      "learning_rate": 0.00012845275866310324,
      "loss": 10.3607,
      "step": 80
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.12312637269496918,
      "learning_rate": 0.00012691256555910768,
      "loss": 10.3654,
      "step": 81
    },
    {
      "epoch": 0.41890166028097064,
      "grad_norm": 0.12126640975475311,
      "learning_rate": 0.00012536545839095074,
      "loss": 10.3617,
      "step": 82
    },
    {
      "epoch": 0.4240102171136654,
      "grad_norm": 0.15894931554794312,
      "learning_rate": 0.00012381183462345982,
      "loss": 10.3564,
      "step": 83
    },
    {
      "epoch": 0.42911877394636017,
      "grad_norm": 0.07874782383441925,
      "learning_rate": 0.00012225209339563145,
      "loss": 10.3653,
      "step": 84
    },
    {
      "epoch": 0.4342273307790549,
      "grad_norm": 0.12640683352947235,
      "learning_rate": 0.00012068663541808909,
      "loss": 10.3631,
      "step": 85
    },
    {
      "epoch": 0.4393358876117497,
      "grad_norm": 0.10024367272853851,
      "learning_rate": 0.00011911586287013725,
      "loss": 10.3672,
      "step": 86
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.12512411177158356,
      "learning_rate": 0.00011754017929643817,
      "loss": 10.3611,
      "step": 87
    },
    {
      "epoch": 0.4495530012771392,
      "grad_norm": 0.1329926997423172,
      "learning_rate": 0.00011595998950333793,
      "loss": 10.3591,
      "step": 88
    },
    {
      "epoch": 0.454661558109834,
      "grad_norm": 0.11963527649641037,
      "learning_rate": 0.00011437569945486819,
      "loss": 10.365,
      "step": 89
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 0.11451319605112076,
      "learning_rate": 0.00011278771616845061,
      "loss": 10.3549,
      "step": 90
    },
    {
      "epoch": 0.4648786717752235,
      "grad_norm": 0.09816919267177582,
      "learning_rate": 0.00011119644761033078,
      "loss": 10.3577,
      "step": 91
    },
    {
      "epoch": 0.46998722860791825,
      "grad_norm": 0.15458016097545624,
      "learning_rate": 0.00010960230259076818,
      "loss": 10.3668,
      "step": 92
    },
    {
      "epoch": 0.47509578544061304,
      "grad_norm": 0.1010480597615242,
      "learning_rate": 0.00010800569065900933,
      "loss": 10.3592,
      "step": 93
    },
    {
      "epoch": 0.48020434227330777,
      "grad_norm": 0.11193486303091049,
      "learning_rate": 0.0001064070219980713,
      "loss": 10.3568,
      "step": 94
    },
    {
      "epoch": 0.48531289910600256,
      "grad_norm": 0.10354366153478622,
      "learning_rate": 0.00010480670731936208,
      "loss": 10.3509,
      "step": 95
    },
    {
      "epoch": 0.4904214559386973,
      "grad_norm": 0.15040072798728943,
      "learning_rate": 0.00010320515775716555,
      "loss": 10.3566,
      "step": 96
    },
    {
      "epoch": 0.4955300127713921,
      "grad_norm": 0.10507191717624664,
      "learning_rate": 0.0001016027847630174,
      "loss": 10.3547,
      "step": 97
    },
    {
      "epoch": 0.5006385696040868,
      "grad_norm": 0.11989340931177139,
      "learning_rate": 0.0001,
      "loss": 10.3656,
      "step": 98
    },
    {
      "epoch": 0.5057471264367817,
      "grad_norm": 0.12597250938415527,
      "learning_rate": 9.839721523698264e-05,
      "loss": 10.3657,
      "step": 99
    },
    {
      "epoch": 0.5108556832694764,
      "grad_norm": 0.16077381372451782,
      "learning_rate": 9.679484224283449e-05,
      "loss": 10.3527,
      "step": 100
    },
    {
      "epoch": 0.5159642401021711,
      "grad_norm": 0.12048140168190002,
      "learning_rate": 9.519329268063795e-05,
      "loss": 10.3627,
      "step": 101
    },
    {
      "epoch": 0.5210727969348659,
      "grad_norm": 0.10793527215719223,
      "learning_rate": 9.359297800192872e-05,
      "loss": 10.3508,
      "step": 102
    },
    {
      "epoch": 0.5261813537675607,
      "grad_norm": 0.070986308157444,
      "learning_rate": 9.199430934099068e-05,
      "loss": 10.3545,
      "step": 103
    },
    {
      "epoch": 0.5312899106002554,
      "grad_norm": 0.08290573209524155,
      "learning_rate": 9.039769740923183e-05,
      "loss": 10.3568,
      "step": 104
    },
    {
      "epoch": 0.5363984674329502,
      "grad_norm": 0.09891117364168167,
      "learning_rate": 8.880355238966923e-05,
      "loss": 10.3572,
      "step": 105
    },
    {
      "epoch": 0.541507024265645,
      "grad_norm": 0.10156704485416412,
      "learning_rate": 8.721228383154939e-05,
      "loss": 10.3588,
      "step": 106
    },
    {
      "epoch": 0.5466155810983397,
      "grad_norm": 0.10102825611829758,
      "learning_rate": 8.562430054513184e-05,
      "loss": 10.3583,
      "step": 107
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.10080267488956451,
      "learning_rate": 8.404001049666211e-05,
      "loss": 10.365,
      "step": 108
    },
    {
      "epoch": 0.5568326947637292,
      "grad_norm": 0.08475471287965775,
      "learning_rate": 8.245982070356185e-05,
      "loss": 10.3606,
      "step": 109
    },
    {
      "epoch": 0.561941251596424,
      "grad_norm": 0.13166114687919617,
      "learning_rate": 8.08841371298628e-05,
      "loss": 10.3538,
      "step": 110
    },
    {
      "epoch": 0.5670498084291188,
      "grad_norm": 0.1031697690486908,
      "learning_rate": 7.931336458191092e-05,
      "loss": 10.3672,
      "step": 111
    },
    {
      "epoch": 0.5721583652618135,
      "grad_norm": 0.08675719797611237,
      "learning_rate": 7.774790660436858e-05,
      "loss": 10.3575,
      "step": 112
    },
    {
      "epoch": 0.5772669220945083,
      "grad_norm": 0.10204310715198517,
      "learning_rate": 7.618816537654018e-05,
      "loss": 10.3607,
      "step": 113
    },
    {
      "epoch": 0.5823754789272031,
      "grad_norm": 0.12430866807699203,
      "learning_rate": 7.463454160904928e-05,
      "loss": 10.3586,
      "step": 114
    },
    {
      "epoch": 0.5874840357598978,
      "grad_norm": 0.09749015420675278,
      "learning_rate": 7.308743444089232e-05,
      "loss": 10.3493,
      "step": 115
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.11196263879537582,
      "learning_rate": 7.154724133689677e-05,
      "loss": 10.3479,
      "step": 116
    },
    {
      "epoch": 0.5977011494252874,
      "grad_norm": 0.1198868602514267,
      "learning_rate": 7.001435798560883e-05,
      "loss": 10.349,
      "step": 117
    },
    {
      "epoch": 0.6028097062579821,
      "grad_norm": 0.10104350000619888,
      "learning_rate": 6.848917819763793e-05,
      "loss": 10.353,
      "step": 118
    },
    {
      "epoch": 0.6079182630906769,
      "grad_norm": 0.09838002175092697,
      "learning_rate": 6.697209380448333e-05,
      "loss": 10.3552,
      "step": 119
    },
    {
      "epoch": 0.6130268199233716,
      "grad_norm": 0.12079446762800217,
      "learning_rate": 6.546349455786926e-05,
      "loss": 10.3541,
      "step": 120
    },
    {
      "epoch": 0.6181353767560664,
      "grad_norm": 0.07775072753429413,
      "learning_rate": 6.396376802961468e-05,
      "loss": 10.3623,
      "step": 121
    },
    {
      "epoch": 0.6232439335887612,
      "grad_norm": 0.10357434302568436,
      "learning_rate": 6.24732995120626e-05,
      "loss": 10.3578,
      "step": 122
    },
    {
      "epoch": 0.6283524904214559,
      "grad_norm": 0.11605259031057358,
      "learning_rate": 6.0992471919095315e-05,
      "loss": 10.3513,
      "step": 123
    },
    {
      "epoch": 0.6334610472541508,
      "grad_norm": 0.0947057381272316,
      "learning_rate": 5.952166568776062e-05,
      "loss": 10.3484,
      "step": 124
    },
    {
      "epoch": 0.6385696040868455,
      "grad_norm": 0.09507757425308228,
      "learning_rate": 5.806125868053433e-05,
      "loss": 10.3521,
      "step": 125
    },
    {
      "epoch": 0.6436781609195402,
      "grad_norm": 0.09619959443807602,
      "learning_rate": 5.6611626088244194e-05,
      "loss": 10.3583,
      "step": 126
    },
    {
      "epoch": 0.648786717752235,
      "grad_norm": 0.0792725682258606,
      "learning_rate": 5.5173140333680306e-05,
      "loss": 10.354,
      "step": 127
    },
    {
      "epoch": 0.6538952745849298,
      "grad_norm": 0.105789415538311,
      "learning_rate": 5.37461709759165e-05,
      "loss": 10.3583,
      "step": 128
    },
    {
      "epoch": 0.6590038314176245,
      "grad_norm": 0.10440662503242493,
      "learning_rate": 5.2331084615367485e-05,
      "loss": 10.3523,
      "step": 129
    },
    {
      "epoch": 0.6641123882503193,
      "grad_norm": 0.10326969623565674,
      "learning_rate": 5.092824479960625e-05,
      "loss": 10.3574,
      "step": 130
    },
    {
      "epoch": 0.669220945083014,
      "grad_norm": 0.08976173400878906,
      "learning_rate": 4.953801192996543e-05,
      "loss": 10.3548,
      "step": 131
    },
    {
      "epoch": 0.6743295019157088,
      "grad_norm": 0.09830819070339203,
      "learning_rate": 4.8160743168947496e-05,
      "loss": 10.3497,
      "step": 132
    },
    {
      "epoch": 0.6794380587484036,
      "grad_norm": 0.12448275089263916,
      "learning_rate": 4.6796792348466356e-05,
      "loss": 10.3661,
      "step": 133
    },
    {
      "epoch": 0.6845466155810983,
      "grad_norm": 0.1166510358452797,
      "learning_rate": 4.544650987894514e-05,
      "loss": 10.3572,
      "step": 134
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.11028123646974564,
      "learning_rate": 4.4110242659292836e-05,
      "loss": 10.3607,
      "step": 135
    },
    {
      "epoch": 0.6947637292464879,
      "grad_norm": 0.14801649749279022,
      "learning_rate": 4.278833398778306e-05,
      "loss": 10.3544,
      "step": 136
    },
    {
      "epoch": 0.6998722860791826,
      "grad_norm": 0.10162738710641861,
      "learning_rate": 4.148112347385762e-05,
      "loss": 10.3514,
      "step": 137
    },
    {
      "epoch": 0.7049808429118773,
      "grad_norm": 0.08851826190948486,
      "learning_rate": 4.0188946950878404e-05,
      "loss": 10.3545,
      "step": 138
    },
    {
      "epoch": 0.7100893997445722,
      "grad_norm": 0.0674971416592598,
      "learning_rate": 3.8912136389848576e-05,
      "loss": 10.3527,
      "step": 139
    },
    {
      "epoch": 0.7151979565772669,
      "grad_norm": 0.07837618142366409,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 10.3495,
      "step": 140
    },
    {
      "epoch": 0.7203065134099617,
      "grad_norm": 0.13337913155555725,
      "learning_rate": 3.6405921215154494e-05,
      "loss": 10.3601,
      "step": 141
    },
    {
      "epoch": 0.7254150702426565,
      "grad_norm": 0.1068163514137268,
      "learning_rate": 3.517716046922118e-05,
      "loss": 10.3553,
      "step": 142
    },
    {
      "epoch": 0.7305236270753512,
      "grad_norm": 0.09636684507131577,
      "learning_rate": 3.3965053255284084e-05,
      "loss": 10.3609,
      "step": 143
    },
    {
      "epoch": 0.735632183908046,
      "grad_norm": 0.10259624570608139,
      "learning_rate": 3.276991097386831e-05,
      "loss": 10.3514,
      "step": 144
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.08181837946176529,
      "learning_rate": 3.159204066706539e-05,
      "loss": 10.3508,
      "step": 145
    },
    {
      "epoch": 0.7458492975734355,
      "grad_norm": 0.10370145738124847,
      "learning_rate": 3.0431744939651364e-05,
      "loss": 10.3518,
      "step": 146
    },
    {
      "epoch": 0.7509578544061303,
      "grad_norm": 0.11586267501115799,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 10.3494,
      "step": 147
    },
    {
      "epoch": 0.756066411238825,
      "grad_norm": 0.07326210290193558,
      "learning_rate": 2.8165064990227252e-05,
      "loss": 10.3618,
      "step": 148
    },
    {
      "epoch": 0.7611749680715197,
      "grad_norm": 0.07508935779333115,
      "learning_rate": 2.7059263097336597e-05,
      "loss": 10.354,
      "step": 149
    },
    {
      "epoch": 0.7662835249042146,
      "grad_norm": 0.09258796274662018,
      "learning_rate": 2.5972200292468464e-05,
      "loss": 10.3528,
      "step": 150
    },
    {
      "epoch": 0.7713920817369093,
      "grad_norm": 0.09141936898231506,
      "learning_rate": 2.4904155851188872e-05,
      "loss": 10.3475,
      "step": 151
    },
    {
      "epoch": 0.776500638569604,
      "grad_norm": 0.06457791477441788,
      "learning_rate": 2.3855404163086558e-05,
      "loss": 10.3594,
      "step": 152
    },
    {
      "epoch": 0.7816091954022989,
      "grad_norm": 0.0867636650800705,
      "learning_rate": 2.282621466127982e-05,
      "loss": 10.357,
      "step": 153
    },
    {
      "epoch": 0.7867177522349936,
      "grad_norm": 0.07386265695095062,
      "learning_rate": 2.181685175319702e-05,
      "loss": 10.3601,
      "step": 154
    },
    {
      "epoch": 0.7918263090676884,
      "grad_norm": 0.09301372617483139,
      "learning_rate": 2.0827574752648038e-05,
      "loss": 10.3521,
      "step": 155
    },
    {
      "epoch": 0.7969348659003831,
      "grad_norm": 0.09825734794139862,
      "learning_rate": 1.985863781320435e-05,
      "loss": 10.3518,
      "step": 156
    },
    {
      "epoch": 0.8020434227330779,
      "grad_norm": 0.09146848320960999,
      "learning_rate": 1.891028986290492e-05,
      "loss": 10.3543,
      "step": 157
    },
    {
      "epoch": 0.8071519795657727,
      "grad_norm": 0.10811572521924973,
      "learning_rate": 1.7982774540304403e-05,
      "loss": 10.3486,
      "step": 158
    },
    {
      "epoch": 0.8122605363984674,
      "grad_norm": 0.10702500492334366,
      "learning_rate": 1.7076330131880526e-05,
      "loss": 10.3506,
      "step": 159
    },
    {
      "epoch": 0.8173690932311622,
      "grad_norm": 0.09714721143245697,
      "learning_rate": 1.619118951081594e-05,
      "loss": 10.3542,
      "step": 160
    },
    {
      "epoch": 0.822477650063857,
      "grad_norm": 0.09883596003055573,
      "learning_rate": 1.5327580077171587e-05,
      "loss": 10.355,
      "step": 161
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.07649561762809753,
      "learning_rate": 1.4485723699465392e-05,
      "loss": 10.3488,
      "step": 162
    },
    {
      "epoch": 0.8326947637292464,
      "grad_norm": 0.0714419037103653,
      "learning_rate": 1.3665836657672493e-05,
      "loss": 10.3463,
      "step": 163
    },
    {
      "epoch": 0.8378033205619413,
      "grad_norm": 0.09287029504776001,
      "learning_rate": 1.286812958766106e-05,
      "loss": 10.3506,
      "step": 164
    },
    {
      "epoch": 0.842911877394636,
      "grad_norm": 0.0882708728313446,
      "learning_rate": 1.2092807427078279e-05,
      "loss": 10.3538,
      "step": 165
    },
    {
      "epoch": 0.8480204342273308,
      "grad_norm": 0.0939875915646553,
      "learning_rate": 1.134006936269999e-05,
      "loss": 10.3498,
      "step": 166
    },
    {
      "epoch": 0.8531289910600255,
      "grad_norm": 0.10232677310705185,
      "learning_rate": 1.0610108779258044e-05,
      "loss": 10.3573,
      "step": 167
    },
    {
      "epoch": 0.8582375478927203,
      "grad_norm": 0.09528540074825287,
      "learning_rate": 9.903113209758096e-06,
      "loss": 10.3558,
      "step": 168
    },
    {
      "epoch": 0.8633461047254151,
      "grad_norm": 0.08980326354503632,
      "learning_rate": 9.219264287300799e-06,
      "loss": 10.3564,
      "step": 169
    },
    {
      "epoch": 0.8684546615581098,
      "grad_norm": 0.0988466814160347,
      "learning_rate": 8.558737698418761e-06,
      "loss": 10.3518,
      "step": 170
    },
    {
      "epoch": 0.8735632183908046,
      "grad_norm": 0.09410892426967621,
      "learning_rate": 7.921703137941173e-06,
      "loss": 10.3501,
      "step": 171
    },
    {
      "epoch": 0.8786717752234994,
      "grad_norm": 0.0989941656589508,
      "learning_rate": 7.308324265397836e-06,
      "loss": 10.3563,
      "step": 172
    },
    {
      "epoch": 0.8837803320561941,
      "grad_norm": 0.12267417460680008,
      "learning_rate": 6.718758662973523e-06,
      "loss": 10.3536,
      "step": 173
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.09049776196479797,
      "learning_rate": 6.153157795023956e-06,
      "loss": 10.3524,
      "step": 174
    },
    {
      "epoch": 0.8939974457215837,
      "grad_norm": 0.10820028930902481,
      "learning_rate": 5.611666969163243e-06,
      "loss": 10.3631,
      "step": 175
    },
    {
      "epoch": 0.8991060025542784,
      "grad_norm": 0.08953528106212616,
      "learning_rate": 5.094425298933136e-06,
      "loss": 10.3467,
      "step": 176
    },
    {
      "epoch": 0.9042145593869731,
      "grad_norm": 0.12113954871892929,
      "learning_rate": 4.601565668063623e-06,
      "loss": 10.3563,
      "step": 177
    },
    {
      "epoch": 0.909323116219668,
      "grad_norm": 0.08720055967569351,
      "learning_rate": 4.133214696333942e-06,
      "loss": 10.356,
      "step": 178
    },
    {
      "epoch": 0.9144316730523627,
      "grad_norm": 0.09181863814592361,
      "learning_rate": 3.689492707042974e-06,
      "loss": 10.3552,
      "step": 179
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 0.09627222269773483,
      "learning_rate": 3.270513696097055e-06,
      "loss": 10.3541,
      "step": 180
    },
    {
      "epoch": 0.9246487867177522,
      "grad_norm": 0.11901070922613144,
      "learning_rate": 2.876385302723628e-06,
      "loss": 10.3597,
      "step": 181
    },
    {
      "epoch": 0.929757343550447,
      "grad_norm": 0.0824970006942749,
      "learning_rate": 2.5072087818176382e-06,
      "loss": 10.3537,
      "step": 182
    },
    {
      "epoch": 0.9348659003831418,
      "grad_norm": 0.10875054448843002,
      "learning_rate": 2.1630789779284675e-06,
      "loss": 10.3465,
      "step": 183
    },
    {
      "epoch": 0.9399744572158365,
      "grad_norm": 0.08391505479812622,
      "learning_rate": 1.8440843008934561e-06,
      "loss": 10.3537,
      "step": 184
    },
    {
      "epoch": 0.9450830140485313,
      "grad_norm": 0.0997113361954689,
      "learning_rate": 1.5503067031247598e-06,
      "loss": 10.3528,
      "step": 185
    },
    {
      "epoch": 0.9501915708812261,
      "grad_norm": 0.1007162481546402,
      "learning_rate": 1.2818216585549825e-06,
      "loss": 10.3498,
      "step": 186
    },
    {
      "epoch": 0.9553001277139208,
      "grad_norm": 0.0825323835015297,
      "learning_rate": 1.0386981432474074e-06,
      "loss": 10.3464,
      "step": 187
    },
    {
      "epoch": 0.9604086845466155,
      "grad_norm": 0.07373323291540146,
      "learning_rate": 8.209986176753948e-07,
      "loss": 10.3546,
      "step": 188
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.10504812747240067,
      "learning_rate": 6.287790106757396e-07,
      "loss": 10.3522,
      "step": 189
    },
    {
      "epoch": 0.9706257982120051,
      "grad_norm": 0.07949844747781754,
      "learning_rate": 4.62088705080177e-07,
      "loss": 10.3554,
      "step": 190
    },
    {
      "epoch": 0.9757343550446999,
      "grad_norm": 0.09091745316982269,
      "learning_rate": 3.2097052502843007e-07,
      "loss": 10.3477,
      "step": 191
    },
    {
      "epoch": 0.9808429118773946,
      "grad_norm": 0.11523844301700592,
      "learning_rate": 2.054607249663665e-07,
      "loss": 10.3526,
      "step": 192
    },
    {
      "epoch": 0.9859514687100894,
      "grad_norm": 0.09753245860338211,
      "learning_rate": 1.1558898033191546e-07,
      "loss": 10.3566,
      "step": 193
    },
    {
      "epoch": 0.9910600255427842,
      "grad_norm": 0.08241667598485947,
      "learning_rate": 5.137837993121064e-08,
      "loss": 10.3576,
      "step": 194
    },
    {
      "epoch": 0.9961685823754789,
      "grad_norm": 0.08668815344572067,
      "learning_rate": 1.2845420006879494e-08,
      "loss": 10.3612,
      "step": 195
    },
    {
      "epoch": 0.9961685823754789,
      "eval_loss": 10.354520797729492,
      "eval_runtime": 0.2879,
      "eval_samples_per_second": 145.877,
      "eval_steps_per_second": 145.877,
      "step": 195
    },
    {
      "epoch": 1.0012771392081736,
      "grad_norm": 0.08981695771217346,
      "learning_rate": 0.0,
      "loss": 12.4365,
      "step": 196
    },
    {
      "epoch": 1.0012771392081736,
      "eval_loss": 10.354520797729492,
      "eval_runtime": 0.2791,
      "eval_samples_per_second": 150.484,
      "eval_steps_per_second": 150.484,
      "step": 196
    }
  ],
  "logging_steps": 1,
  "max_steps": 196,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2534457999360.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}