{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.986175115207373,
  "eval_steps": 500,
  "global_step": 216,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013824884792626729,
      "grad_norm": 7.633559049394886,
      "learning_rate": 4.5454545454545457e-07,
      "loss": 0.8764,
      "step": 1
    },
    {
      "epoch": 0.027649769585253458,
      "grad_norm": 7.6469224678917,
      "learning_rate": 9.090909090909091e-07,
      "loss": 0.8712,
      "step": 2
    },
    {
      "epoch": 0.041474654377880185,
      "grad_norm": 7.5089460233893295,
      "learning_rate": 1.3636363636363636e-06,
      "loss": 0.8804,
      "step": 3
    },
    {
      "epoch": 0.055299539170506916,
      "grad_norm": 7.691760047071555,
      "learning_rate": 1.8181818181818183e-06,
      "loss": 0.8775,
      "step": 4
    },
    {
      "epoch": 0.06912442396313365,
      "grad_norm": 7.294963661060089,
      "learning_rate": 2.2727272727272728e-06,
      "loss": 0.8709,
      "step": 5
    },
    {
      "epoch": 0.08294930875576037,
      "grad_norm": 5.443182880842741,
      "learning_rate": 2.7272727272727272e-06,
      "loss": 0.7907,
      "step": 6
    },
    {
      "epoch": 0.0967741935483871,
      "grad_norm": 5.047617387545844,
      "learning_rate": 3.181818181818182e-06,
      "loss": 0.7836,
      "step": 7
    },
    {
      "epoch": 0.11059907834101383,
      "grad_norm": 3.2253892674265594,
      "learning_rate": 3.6363636363636366e-06,
      "loss": 0.7531,
      "step": 8
    },
    {
      "epoch": 0.12442396313364056,
      "grad_norm": 2.8131888920599657,
      "learning_rate": 4.0909090909090915e-06,
      "loss": 0.7417,
      "step": 9
    },
    {
      "epoch": 0.1382488479262673,
      "grad_norm": 3.6977429779435815,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.723,
      "step": 10
    },
    {
      "epoch": 0.15207373271889402,
      "grad_norm": 4.281498675575116,
      "learning_rate": 5e-06,
      "loss": 0.7246,
      "step": 11
    },
    {
      "epoch": 0.16589861751152074,
      "grad_norm": 3.6996387465448373,
      "learning_rate": 5.4545454545454545e-06,
      "loss": 0.6595,
      "step": 12
    },
    {
      "epoch": 0.17972350230414746,
      "grad_norm": 3.78830344374056,
      "learning_rate": 5.90909090909091e-06,
      "loss": 0.6694,
      "step": 13
    },
    {
      "epoch": 0.1935483870967742,
      "grad_norm": 3.3490463351493713,
      "learning_rate": 6.363636363636364e-06,
      "loss": 0.6227,
      "step": 14
    },
    {
      "epoch": 0.2073732718894009,
      "grad_norm": 3.374405746489098,
      "learning_rate": 6.818181818181818e-06,
      "loss": 0.6273,
      "step": 15
    },
    {
      "epoch": 0.22119815668202766,
      "grad_norm": 2.4954014740266195,
      "learning_rate": 7.272727272727273e-06,
      "loss": 0.6062,
      "step": 16
    },
    {
      "epoch": 0.2350230414746544,
      "grad_norm": 1.8239008983436202,
      "learning_rate": 7.727272727272727e-06,
      "loss": 0.593,
      "step": 17
    },
    {
      "epoch": 0.2488479262672811,
      "grad_norm": 1.635875724467323,
      "learning_rate": 8.181818181818183e-06,
      "loss": 0.5959,
      "step": 18
    },
    {
      "epoch": 0.2626728110599078,
      "grad_norm": 2.0334716933077166,
      "learning_rate": 8.636363636363637e-06,
      "loss": 0.5672,
      "step": 19
    },
    {
      "epoch": 0.2764976958525346,
      "grad_norm": 1.9168703834196266,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.5729,
      "step": 20
    },
    {
      "epoch": 0.2903225806451613,
      "grad_norm": 1.4466966621915673,
      "learning_rate": 9.545454545454547e-06,
      "loss": 0.5534,
      "step": 21
    },
    {
      "epoch": 0.30414746543778803,
      "grad_norm": 1.0267896741626927,
      "learning_rate": 1e-05,
      "loss": 0.5628,
      "step": 22
    },
    {
      "epoch": 0.31797235023041476,
      "grad_norm": 1.262616482011545,
      "learning_rate": 9.999344418328161e-06,
      "loss": 0.5572,
      "step": 23
    },
    {
      "epoch": 0.3317972350230415,
      "grad_norm": 1.0906634918414906,
      "learning_rate": 9.997377845227577e-06,
      "loss": 0.5146,
      "step": 24
    },
    {
      "epoch": 0.3456221198156682,
      "grad_norm": 1.0415845965231307,
      "learning_rate": 9.994100796397954e-06,
      "loss": 0.4973,
      "step": 25
    },
    {
      "epoch": 0.35944700460829493,
      "grad_norm": 0.815227789570975,
      "learning_rate": 9.98951413118856e-06,
      "loss": 0.5218,
      "step": 26
    },
    {
      "epoch": 0.37327188940092165,
      "grad_norm": 0.8903243919241076,
      "learning_rate": 9.983619052372847e-06,
      "loss": 0.5167,
      "step": 27
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 1.0991055385660755,
      "learning_rate": 9.97641710583307e-06,
      "loss": 0.5003,
      "step": 28
    },
    {
      "epoch": 0.4009216589861751,
      "grad_norm": 0.9348867416996314,
      "learning_rate": 9.96791018015489e-06,
      "loss": 0.5078,
      "step": 29
    },
    {
      "epoch": 0.4147465437788018,
      "grad_norm": 0.7138578554042776,
      "learning_rate": 9.958100506132127e-06,
      "loss": 0.5075,
      "step": 30
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.8349505675026924,
      "learning_rate": 9.946990656181782e-06,
      "loss": 0.511,
      "step": 31
    },
    {
      "epoch": 0.4423963133640553,
      "grad_norm": 1.052621697155694,
      "learning_rate": 9.934583543669454e-06,
      "loss": 0.4824,
      "step": 32
    },
    {
      "epoch": 0.45622119815668205,
      "grad_norm": 0.7618516629853732,
      "learning_rate": 9.920882422145372e-06,
      "loss": 0.4842,
      "step": 33
    },
    {
      "epoch": 0.4700460829493088,
      "grad_norm": 0.8523130797451687,
      "learning_rate": 9.905890884491196e-06,
      "loss": 0.4843,
      "step": 34
    },
    {
      "epoch": 0.4838709677419355,
      "grad_norm": 0.7854821313979271,
      "learning_rate": 9.889612861977855e-06,
      "loss": 0.4893,
      "step": 35
    },
    {
      "epoch": 0.4976958525345622,
      "grad_norm": 0.9098881831155701,
      "learning_rate": 9.872052623234632e-06,
      "loss": 0.5024,
      "step": 36
    },
    {
      "epoch": 0.511520737327189,
      "grad_norm": 0.9133145917535354,
      "learning_rate": 9.853214773129796e-06,
      "loss": 0.4828,
      "step": 37
    },
    {
      "epoch": 0.5253456221198156,
      "grad_norm": 0.7771673181178608,
      "learning_rate": 9.833104251563058e-06,
      "loss": 0.478,
      "step": 38
    },
    {
      "epoch": 0.5391705069124424,
      "grad_norm": 0.6686414859648492,
      "learning_rate": 9.811726332170153e-06,
      "loss": 0.4945,
      "step": 39
    },
    {
      "epoch": 0.5529953917050692,
      "grad_norm": 0.7697756564970056,
      "learning_rate": 9.789086620939936e-06,
      "loss": 0.4704,
      "step": 40
    },
    {
      "epoch": 0.5668202764976958,
      "grad_norm": 0.8065645041682494,
      "learning_rate": 9.765191054744305e-06,
      "loss": 0.4512,
      "step": 41
    },
    {
      "epoch": 0.5806451612903226,
      "grad_norm": 0.7428419868914332,
      "learning_rate": 9.740045899781353e-06,
      "loss": 0.4631,
      "step": 42
    },
    {
      "epoch": 0.5944700460829493,
      "grad_norm": 0.7370318617715542,
      "learning_rate": 9.713657749932172e-06,
      "loss": 0.4834,
      "step": 43
    },
    {
      "epoch": 0.6082949308755761,
      "grad_norm": 0.5647708496508699,
      "learning_rate": 9.68603352503172e-06,
      "loss": 0.4774,
      "step": 44
    },
    {
      "epoch": 0.6221198156682027,
      "grad_norm": 0.6790318592831586,
      "learning_rate": 9.657180469054213e-06,
      "loss": 0.4964,
      "step": 45
    },
    {
      "epoch": 0.6359447004608295,
      "grad_norm": 0.6728496708051098,
      "learning_rate": 9.627106148213521e-06,
      "loss": 0.5024,
      "step": 46
    },
    {
      "epoch": 0.6497695852534562,
      "grad_norm": 0.6158261913286127,
      "learning_rate": 9.595818448979061e-06,
      "loss": 0.4656,
      "step": 47
    },
    {
      "epoch": 0.663594470046083,
      "grad_norm": 0.6396258914140209,
      "learning_rate": 9.563325576007702e-06,
      "loss": 0.4593,
      "step": 48
    },
    {
      "epoch": 0.6774193548387096,
      "grad_norm": 0.55574623840064,
      "learning_rate": 9.529636049992235e-06,
      "loss": 0.4848,
      "step": 49
    },
    {
      "epoch": 0.6912442396313364,
      "grad_norm": 0.5960107153575481,
      "learning_rate": 9.494758705426978e-06,
      "loss": 0.4606,
      "step": 50
    },
    {
      "epoch": 0.7050691244239631,
      "grad_norm": 0.6187132785014103,
      "learning_rate": 9.458702688291072e-06,
      "loss": 0.4706,
      "step": 51
    },
    {
      "epoch": 0.7188940092165899,
      "grad_norm": 0.596391894724299,
      "learning_rate": 9.421477453650118e-06,
      "loss": 0.4552,
      "step": 52
    },
    {
      "epoch": 0.7327188940092166,
      "grad_norm": 0.5828525328792538,
      "learning_rate": 9.38309276317674e-06,
      "loss": 0.4419,
      "step": 53
    },
    {
      "epoch": 0.7465437788018433,
      "grad_norm": 0.5574488192167764,
      "learning_rate": 9.343558682590757e-06,
      "loss": 0.4482,
      "step": 54
    },
    {
      "epoch": 0.7603686635944701,
      "grad_norm": 0.5810226590826171,
      "learning_rate": 9.302885579019626e-06,
      "loss": 0.4574,
      "step": 55
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 0.6373464198147619,
      "learning_rate": 9.261084118279846e-06,
      "loss": 0.5021,
      "step": 56
    },
    {
      "epoch": 0.7880184331797235,
      "grad_norm": 0.6473035563889259,
      "learning_rate": 9.218165262080024e-06,
      "loss": 0.4688,
      "step": 57
    },
    {
      "epoch": 0.8018433179723502,
      "grad_norm": 0.5496475133999553,
      "learning_rate": 9.174140265146356e-06,
      "loss": 0.4614,
      "step": 58
    },
    {
      "epoch": 0.815668202764977,
      "grad_norm": 0.5633672236656848,
      "learning_rate": 9.129020672271283e-06,
      "loss": 0.4631,
      "step": 59
    },
    {
      "epoch": 0.8294930875576036,
      "grad_norm": 0.5996514489934792,
      "learning_rate": 9.082818315286054e-06,
      "loss": 0.4709,
      "step": 60
    },
    {
      "epoch": 0.8433179723502304,
      "grad_norm": 0.5800489851592833,
      "learning_rate": 9.035545309958048e-06,
      "loss": 0.4529,
      "step": 61
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.5728565615351726,
      "learning_rate": 8.987214052813605e-06,
      "loss": 0.4689,
      "step": 62
    },
    {
      "epoch": 0.8709677419354839,
      "grad_norm": 0.635495629930791,
      "learning_rate": 8.937837217887273e-06,
      "loss": 0.4577,
      "step": 63
    },
    {
      "epoch": 0.8847926267281107,
      "grad_norm": 0.6832859023985155,
      "learning_rate": 8.887427753398249e-06,
      "loss": 0.4551,
      "step": 64
    },
    {
      "epoch": 0.8986175115207373,
      "grad_norm": 0.6250704218728608,
      "learning_rate": 8.83599887835493e-06,
      "loss": 0.4412,
      "step": 65
    },
    {
      "epoch": 0.9124423963133641,
      "grad_norm": 0.4941669349016631,
      "learning_rate": 8.783564079088478e-06,
      "loss": 0.452,
      "step": 66
    },
    {
      "epoch": 0.9262672811059908,
      "grad_norm": 0.6418218067831372,
      "learning_rate": 8.730137105716231e-06,
      "loss": 0.4374,
      "step": 67
    },
    {
      "epoch": 0.9400921658986175,
      "grad_norm": 0.4944995367016919,
      "learning_rate": 8.675731968536004e-06,
      "loss": 0.4538,
      "step": 68
    },
    {
      "epoch": 0.9539170506912442,
      "grad_norm": 0.5351112267428982,
      "learning_rate": 8.620362934352109e-06,
      "loss": 0.456,
      "step": 69
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 0.550190483341697,
      "learning_rate": 8.564044522734147e-06,
      "loss": 0.4696,
      "step": 70
    },
    {
      "epoch": 0.9815668202764977,
      "grad_norm": 0.5416241347883733,
      "learning_rate": 8.506791502209497e-06,
      "loss": 0.4351,
      "step": 71
    },
    {
      "epoch": 0.9953917050691244,
      "grad_norm": 0.5646211850916225,
      "learning_rate": 8.448618886390523e-06,
      "loss": 0.4395,
      "step": 72
    },
    {
      "epoch": 1.0092165898617511,
      "grad_norm": 1.0497545314797017,
      "learning_rate": 8.389541930037516e-06,
      "loss": 0.7044,
      "step": 73
    },
    {
      "epoch": 1.023041474654378,
      "grad_norm": 0.6050451609110485,
      "learning_rate": 8.329576125058406e-06,
      "loss": 0.4019,
      "step": 74
    },
    {
      "epoch": 1.0368663594470047,
      "grad_norm": 0.668170300230099,
      "learning_rate": 8.268737196446264e-06,
      "loss": 0.4301,
      "step": 75
    },
    {
      "epoch": 1.0506912442396312,
      "grad_norm": 0.5614144997152238,
      "learning_rate": 8.207041098155701e-06,
      "loss": 0.3802,
      "step": 76
    },
    {
      "epoch": 1.064516129032258,
      "grad_norm": 0.6517654191953809,
      "learning_rate": 8.144504008919224e-06,
      "loss": 0.4079,
      "step": 77
    },
    {
      "epoch": 1.0783410138248848,
      "grad_norm": 0.5437003169084378,
      "learning_rate": 8.081142328004638e-06,
      "loss": 0.3725,
      "step": 78
    },
    {
      "epoch": 1.0921658986175116,
      "grad_norm": 0.6545122792430208,
      "learning_rate": 8.016972670914624e-06,
      "loss": 0.4282,
      "step": 79
    },
    {
      "epoch": 1.1059907834101383,
      "grad_norm": 0.6048618596815725,
      "learning_rate": 7.952011865029614e-06,
      "loss": 0.4037,
      "step": 80
    },
    {
      "epoch": 1.119815668202765,
      "grad_norm": 0.6691394124559549,
      "learning_rate": 7.886276945195098e-06,
      "loss": 0.3643,
      "step": 81
    },
    {
      "epoch": 1.1336405529953917,
      "grad_norm": 0.7270083026506138,
      "learning_rate": 7.819785149254534e-06,
      "loss": 0.4131,
      "step": 82
    },
    {
      "epoch": 1.1474654377880185,
      "grad_norm": 0.5582455458120927,
      "learning_rate": 7.752553913529019e-06,
      "loss": 0.368,
      "step": 83
    },
    {
      "epoch": 1.1612903225806452,
      "grad_norm": 0.6782237172690554,
      "learning_rate": 7.68460086824492e-06,
      "loss": 0.4308,
      "step": 84
    },
    {
      "epoch": 1.1751152073732718,
      "grad_norm": 0.6173590398850847,
      "learning_rate": 7.61594383291065e-06,
      "loss": 0.3943,
      "step": 85
    },
    {
      "epoch": 1.1889400921658986,
      "grad_norm": 0.5909172461024365,
      "learning_rate": 7.546600811643816e-06,
      "loss": 0.4211,
      "step": 86
    },
    {
      "epoch": 1.2027649769585254,
      "grad_norm": 0.5670563053696445,
      "learning_rate": 7.476589988449939e-06,
      "loss": 0.3704,
      "step": 87
    },
    {
      "epoch": 1.2165898617511521,
      "grad_norm": 0.5698865538843494,
      "learning_rate": 7.405929722454026e-06,
      "loss": 0.3912,
      "step": 88
    },
    {
      "epoch": 1.230414746543779,
      "grad_norm": 0.5331376655462895,
      "learning_rate": 7.334638543086203e-06,
      "loss": 0.3836,
      "step": 89
    },
    {
      "epoch": 1.2442396313364055,
      "grad_norm": 0.6325980757371303,
      "learning_rate": 7.262735145222696e-06,
      "loss": 0.3915,
      "step": 90
    },
    {
      "epoch": 1.2580645161290323,
      "grad_norm": 0.5952012132281536,
      "learning_rate": 7.190238384283413e-06,
      "loss": 0.4005,
      "step": 91
    },
    {
      "epoch": 1.271889400921659,
      "grad_norm": 0.5581313424613799,
      "learning_rate": 7.117167271287453e-06,
      "loss": 0.3991,
      "step": 92
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.5043175284762973,
      "learning_rate": 7.043540967867782e-06,
      "loss": 0.3643,
      "step": 93
    },
    {
      "epoch": 1.2995391705069124,
      "grad_norm": 0.5598625221287372,
      "learning_rate": 6.969378781246436e-06,
      "loss": 0.4334,
      "step": 94
    },
    {
      "epoch": 1.3133640552995391,
      "grad_norm": 0.5503414372922475,
      "learning_rate": 6.894700159171535e-06,
      "loss": 0.415,
      "step": 95
    },
    {
      "epoch": 1.327188940092166,
      "grad_norm": 0.5592853069996093,
      "learning_rate": 6.819524684817439e-06,
      "loss": 0.407,
      "step": 96
    },
    {
      "epoch": 1.3410138248847927,
      "grad_norm": 0.5431739142566777,
      "learning_rate": 6.743872071649411e-06,
      "loss": 0.401,
      "step": 97
    },
    {
      "epoch": 1.3548387096774195,
      "grad_norm": 0.5168119415119301,
      "learning_rate": 6.667762158254104e-06,
      "loss": 0.3713,
      "step": 98
    },
    {
      "epoch": 1.368663594470046,
      "grad_norm": 0.6207479541491344,
      "learning_rate": 6.591214903137221e-06,
      "loss": 0.389,
      "step": 99
    },
    {
      "epoch": 1.3824884792626728,
      "grad_norm": 0.5364662180025893,
      "learning_rate": 6.514250379489754e-06,
      "loss": 0.3954,
      "step": 100
    },
    {
      "epoch": 1.3963133640552996,
      "grad_norm": 0.5066869500264701,
      "learning_rate": 6.436888769924142e-06,
      "loss": 0.3939,
      "step": 101
    },
    {
      "epoch": 1.4101382488479262,
      "grad_norm": 0.5780367268696035,
      "learning_rate": 6.3591503611817155e-06,
      "loss": 0.4076,
      "step": 102
    },
    {
      "epoch": 1.423963133640553,
      "grad_norm": 0.5070490336891309,
      "learning_rate": 6.281055538812861e-06,
      "loss": 0.3604,
      "step": 103
    },
    {
      "epoch": 1.4377880184331797,
      "grad_norm": 0.612079276266504,
      "learning_rate": 6.202624781831269e-06,
      "loss": 0.4067,
      "step": 104
    },
    {
      "epoch": 1.4516129032258065,
      "grad_norm": 0.5734702647572101,
      "learning_rate": 6.123878657343648e-06,
      "loss": 0.4255,
      "step": 105
    },
    {
      "epoch": 1.4654377880184333,
      "grad_norm": 0.5354043563850319,
      "learning_rate": 6.044837815156377e-06,
      "loss": 0.4074,
      "step": 106
    },
    {
      "epoch": 1.4792626728110598,
      "grad_norm": 0.5791707978034125,
      "learning_rate": 5.965522982360441e-06,
      "loss": 0.3895,
      "step": 107
    },
    {
      "epoch": 1.4930875576036866,
      "grad_norm": 0.5703922296796244,
      "learning_rate": 5.885954957896115e-06,
      "loss": 0.4086,
      "step": 108
    },
    {
      "epoch": 1.5069124423963134,
      "grad_norm": 0.5065784500919478,
      "learning_rate": 5.806154607098799e-06,
      "loss": 0.3693,
      "step": 109
    },
    {
      "epoch": 1.52073732718894,
      "grad_norm": 0.556653840063644,
      "learning_rate": 5.726142856227453e-06,
      "loss": 0.4299,
      "step": 110
    },
    {
      "epoch": 1.5345622119815667,
      "grad_norm": 0.5474174976443692,
      "learning_rate": 5.645940686977033e-06,
      "loss": 0.4148,
      "step": 111
    },
    {
      "epoch": 1.5483870967741935,
      "grad_norm": 0.4806025731120033,
      "learning_rate": 5.5655691309764225e-06,
      "loss": 0.3924,
      "step": 112
    },
    {
      "epoch": 1.5622119815668203,
      "grad_norm": 0.5324709526786499,
      "learning_rate": 5.485049264273241e-06,
      "loss": 0.3789,
      "step": 113
    },
    {
      "epoch": 1.576036866359447,
      "grad_norm": 0.4659634896990746,
      "learning_rate": 5.404402201807022e-06,
      "loss": 0.3609,
      "step": 114
    },
    {
      "epoch": 1.5898617511520738,
      "grad_norm": 0.5808920912192916,
      "learning_rate": 5.323649091872179e-06,
      "loss": 0.4271,
      "step": 115
    },
    {
      "epoch": 1.6036866359447006,
      "grad_norm": 0.5993918356229727,
      "learning_rate": 5.242811110572243e-06,
      "loss": 0.3989,
      "step": 116
    },
    {
      "epoch": 1.6175115207373272,
      "grad_norm": 0.5109808802516896,
      "learning_rate": 5.161909456266781e-06,
      "loss": 0.3814,
      "step": 117
    },
    {
      "epoch": 1.631336405529954,
      "grad_norm": 0.5423585025364462,
      "learning_rate": 5.080965344012509e-06,
      "loss": 0.3985,
      "step": 118
    },
    {
      "epoch": 1.6451612903225805,
      "grad_norm": 0.5004581821839913,
      "learning_rate": 5e-06,
      "loss": 0.3523,
      "step": 119
    },
    {
      "epoch": 1.6589861751152073,
      "grad_norm": 0.555371258631358,
      "learning_rate": 4.919034655987493e-06,
      "loss": 0.3977,
      "step": 120
    },
    {
      "epoch": 1.672811059907834,
      "grad_norm": 0.6223674774033777,
      "learning_rate": 4.838090543733222e-06,
      "loss": 0.4122,
      "step": 121
    },
    {
      "epoch": 1.6866359447004609,
      "grad_norm": 0.49317572687267214,
      "learning_rate": 4.757188889427761e-06,
      "loss": 0.3936,
      "step": 122
    },
    {
      "epoch": 1.7004608294930876,
      "grad_norm": 0.5116997857144775,
      "learning_rate": 4.6763509081278215e-06,
      "loss": 0.3812,
      "step": 123
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.5050661891764184,
      "learning_rate": 4.59559779819298e-06,
      "loss": 0.3744,
      "step": 124
    },
    {
      "epoch": 1.728110599078341,
      "grad_norm": 0.5660294368372862,
      "learning_rate": 4.51495073572676e-06,
      "loss": 0.4133,
      "step": 125
    },
    {
      "epoch": 1.7419354838709677,
      "grad_norm": 0.5414753484738278,
      "learning_rate": 4.434430869023579e-06,
      "loss": 0.4059,
      "step": 126
    },
    {
      "epoch": 1.7557603686635943,
      "grad_norm": 0.5274523827704818,
      "learning_rate": 4.3540593130229695e-06,
      "loss": 0.4058,
      "step": 127
    },
    {
      "epoch": 1.769585253456221,
      "grad_norm": 0.48723665989341286,
      "learning_rate": 4.27385714377255e-06,
      "loss": 0.4261,
      "step": 128
    },
    {
      "epoch": 1.7834101382488479,
      "grad_norm": 0.5557410469617062,
      "learning_rate": 4.1938453929012014e-06,
      "loss": 0.3799,
      "step": 129
    },
    {
      "epoch": 1.7972350230414746,
      "grad_norm": 0.5628455615899114,
      "learning_rate": 4.1140450421038865e-06,
      "loss": 0.4027,
      "step": 130
    },
    {
      "epoch": 1.8110599078341014,
      "grad_norm": 0.5407292854180369,
      "learning_rate": 4.034477017639561e-06,
      "loss": 0.3414,
      "step": 131
    },
    {
      "epoch": 1.8248847926267282,
      "grad_norm": 0.5110854563369478,
      "learning_rate": 3.955162184843625e-06,
      "loss": 0.4054,
      "step": 132
    },
    {
      "epoch": 1.838709677419355,
      "grad_norm": 0.573230524328855,
      "learning_rate": 3.8761213426563546e-06,
      "loss": 0.4111,
      "step": 133
    },
    {
      "epoch": 1.8525345622119815,
      "grad_norm": 0.5266763198269163,
      "learning_rate": 3.7973752181687336e-06,
      "loss": 0.3725,
      "step": 134
    },
    {
      "epoch": 1.8663594470046083,
      "grad_norm": 0.5053510634901648,
      "learning_rate": 3.7189444611871383e-06,
      "loss": 0.3989,
      "step": 135
    },
    {
      "epoch": 1.8801843317972349,
      "grad_norm": 0.5767131796506803,
      "learning_rate": 3.6408496388182857e-06,
      "loss": 0.456,
      "step": 136
    },
    {
      "epoch": 1.8940092165898617,
      "grad_norm": 0.4875050911999597,
      "learning_rate": 3.5631112300758595e-06,
      "loss": 0.3442,
      "step": 137
    },
    {
      "epoch": 1.9078341013824884,
      "grad_norm": 0.5145298101038916,
      "learning_rate": 3.4857496205102475e-06,
      "loss": 0.4418,
      "step": 138
    },
    {
      "epoch": 1.9216589861751152,
      "grad_norm": 0.46196122478440654,
      "learning_rate": 3.4087850968627823e-06,
      "loss": 0.3882,
      "step": 139
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 0.4906484173808876,
      "learning_rate": 3.3322378417458985e-06,
      "loss": 0.3947,
      "step": 140
    },
    {
      "epoch": 1.9493087557603688,
      "grad_norm": 0.4989650080684894,
      "learning_rate": 3.2561279283505888e-06,
      "loss": 0.3399,
      "step": 141
    },
    {
      "epoch": 1.9631336405529956,
      "grad_norm": 0.46237620183821815,
      "learning_rate": 3.180475315182563e-06,
      "loss": 0.405,
      "step": 142
    },
    {
      "epoch": 1.976958525345622,
      "grad_norm": 0.4637781556805509,
      "learning_rate": 3.1052998408284664e-06,
      "loss": 0.4145,
      "step": 143
    },
    {
      "epoch": 1.9907834101382489,
      "grad_norm": 0.4775120449168806,
      "learning_rate": 3.0306212187535653e-06,
      "loss": 0.3988,
      "step": 144
    },
    {
      "epoch": 2.0046082949308754,
      "grad_norm": 0.9379207114350527,
      "learning_rate": 2.9564590321322206e-06,
      "loss": 0.6161,
      "step": 145
    },
    {
      "epoch": 2.0184331797235022,
      "grad_norm": 0.4763284552212227,
      "learning_rate": 2.882832728712551e-06,
      "loss": 0.3328,
      "step": 146
    },
    {
      "epoch": 2.032258064516129,
      "grad_norm": 0.5068219788042055,
      "learning_rate": 2.8097616157165886e-06,
      "loss": 0.3514,
      "step": 147
    },
    {
      "epoch": 2.046082949308756,
      "grad_norm": 0.5004585471417805,
      "learning_rate": 2.7372648547773063e-06,
      "loss": 0.3663,
      "step": 148
    },
    {
      "epoch": 2.0599078341013826,
      "grad_norm": 0.48776506523697694,
      "learning_rate": 2.665361456913797e-06,
      "loss": 0.3557,
      "step": 149
    },
    {
      "epoch": 2.0737327188940093,
      "grad_norm": 0.4565469138842312,
      "learning_rate": 2.594070277545975e-06,
      "loss": 0.3325,
      "step": 150
    },
    {
      "epoch": 2.087557603686636,
      "grad_norm": 0.5109481857152307,
      "learning_rate": 2.5234100115500643e-06,
      "loss": 0.3723,
      "step": 151
    },
    {
      "epoch": 2.1013824884792625,
      "grad_norm": 0.503915544983321,
      "learning_rate": 2.4533991883561868e-06,
      "loss": 0.3454,
      "step": 152
    },
    {
      "epoch": 2.1152073732718892,
      "grad_norm": 0.4920495646406938,
      "learning_rate": 2.38405616708935e-06,
      "loss": 0.3491,
      "step": 153
    },
    {
      "epoch": 2.129032258064516,
      "grad_norm": 0.46249250593080354,
      "learning_rate": 2.315399131755081e-06,
      "loss": 0.3714,
      "step": 154
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.4756209372804288,
      "learning_rate": 2.2474460864709825e-06,
      "loss": 0.341,
      "step": 155
    },
    {
      "epoch": 2.1566820276497696,
      "grad_norm": 0.49448121204775364,
      "learning_rate": 2.1802148507454675e-06,
      "loss": 0.3362,
      "step": 156
    },
    {
      "epoch": 2.1705069124423964,
      "grad_norm": 0.49087318600689567,
      "learning_rate": 2.1137230548049042e-06,
      "loss": 0.3712,
      "step": 157
    },
    {
      "epoch": 2.184331797235023,
      "grad_norm": 0.4394130757578322,
      "learning_rate": 2.0479881349703885e-06,
      "loss": 0.3289,
      "step": 158
    },
    {
      "epoch": 2.19815668202765,
      "grad_norm": 0.49665520195966556,
      "learning_rate": 1.983027329085377e-06,
      "loss": 0.3648,
      "step": 159
    },
    {
      "epoch": 2.2119815668202767,
      "grad_norm": 0.4113878764187301,
      "learning_rate": 1.9188576719953635e-06,
      "loss": 0.2849,
      "step": 160
    },
    {
      "epoch": 2.225806451612903,
      "grad_norm": 0.4999778833865041,
      "learning_rate": 1.8554959910807773e-06,
      "loss": 0.3695,
      "step": 161
    },
    {
      "epoch": 2.23963133640553,
      "grad_norm": 0.5189860690853264,
      "learning_rate": 1.7929589018443016e-06,
      "loss": 0.3504,
      "step": 162
    },
    {
      "epoch": 2.2534562211981566,
      "grad_norm": 0.48043530531701206,
      "learning_rate": 1.7312628035537388e-06,
      "loss": 0.332,
      "step": 163
    },
    {
      "epoch": 2.2672811059907834,
      "grad_norm": 0.5324429864403901,
      "learning_rate": 1.6704238749415958e-06,
      "loss": 0.3387,
      "step": 164
    },
    {
      "epoch": 2.28110599078341,
      "grad_norm": 0.5029628675101937,
      "learning_rate": 1.6104580699624839e-06,
      "loss": 0.3592,
      "step": 165
    },
    {
      "epoch": 2.294930875576037,
      "grad_norm": 0.45804507081769336,
      "learning_rate": 1.5513811136094786e-06,
      "loss": 0.3063,
      "step": 166
    },
    {
      "epoch": 2.3087557603686637,
      "grad_norm": 0.4647295250952741,
      "learning_rate": 1.4932084977905043e-06,
      "loss": 0.3254,
      "step": 167
    },
    {
      "epoch": 2.3225806451612905,
      "grad_norm": 0.46453643921934695,
      "learning_rate": 1.4359554772658551e-06,
      "loss": 0.3774,
      "step": 168
    },
    {
      "epoch": 2.3364055299539173,
      "grad_norm": 0.45343468728467234,
      "learning_rate": 1.3796370656478936e-06,
      "loss": 0.3325,
      "step": 169
    },
    {
      "epoch": 2.3502304147465436,
      "grad_norm": 0.4732479977172983,
      "learning_rate": 1.3242680314639995e-06,
      "loss": 0.3373,
      "step": 170
    },
    {
      "epoch": 2.3640552995391704,
      "grad_norm": 0.47208675924851334,
      "learning_rate": 1.2698628942837698e-06,
      "loss": 0.3427,
      "step": 171
    },
    {
      "epoch": 2.377880184331797,
      "grad_norm": 0.4454283369984702,
      "learning_rate": 1.2164359209115235e-06,
      "loss": 0.3257,
      "step": 172
    },
    {
      "epoch": 2.391705069124424,
      "grad_norm": 0.49273795207826715,
      "learning_rate": 1.164001121645069e-06,
      "loss": 0.3772,
      "step": 173
    },
    {
      "epoch": 2.4055299539170507,
      "grad_norm": 0.46302965066920987,
      "learning_rate": 1.1125722466017547e-06,
      "loss": 0.326,
      "step": 174
    },
    {
      "epoch": 2.4193548387096775,
      "grad_norm": 0.44068332388009435,
      "learning_rate": 1.062162782112729e-06,
      "loss": 0.3947,
      "step": 175
    },
    {
      "epoch": 2.4331797235023043,
      "grad_norm": 0.41223830858735033,
      "learning_rate": 1.012785947186397e-06,
      "loss": 0.3027,
      "step": 176
    },
    {
      "epoch": 2.447004608294931,
      "grad_norm": 0.44346734726903453,
      "learning_rate": 9.644546900419533e-07,
      "loss": 0.3306,
      "step": 177
    },
    {
      "epoch": 2.460829493087558,
      "grad_norm": 0.45992313059173967,
      "learning_rate": 9.171816847139447e-07,
      "loss": 0.3596,
      "step": 178
    },
    {
      "epoch": 2.474654377880184,
      "grad_norm": 0.4558549067269223,
      "learning_rate": 8.709793277287182e-07,
      "loss": 0.3684,
      "step": 179
    },
    {
      "epoch": 2.488479262672811,
      "grad_norm": 0.41803380966992537,
      "learning_rate": 8.258597348536452e-07,
      "loss": 0.32,
      "step": 180
    },
    {
      "epoch": 2.5023041474654377,
      "grad_norm": 0.42403429172582247,
      "learning_rate": 7.818347379199781e-07,
      "loss": 0.3273,
      "step": 181
    },
    {
      "epoch": 2.5161290322580645,
      "grad_norm": 0.46169239465826656,
      "learning_rate": 7.389158817201541e-07,
      "loss": 0.3313,
      "step": 182
    },
    {
      "epoch": 2.5299539170506913,
      "grad_norm": 0.49336488496655645,
      "learning_rate": 6.971144209803738e-07,
      "loss": 0.369,
      "step": 183
    },
    {
      "epoch": 2.543778801843318,
      "grad_norm": 0.47648081907983714,
      "learning_rate": 6.564413174092443e-07,
      "loss": 0.3464,
      "step": 184
    },
    {
      "epoch": 2.557603686635945,
      "grad_norm": 0.4151223124023637,
      "learning_rate": 6.16907236823262e-07,
      "loss": 0.3334,
      "step": 185
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.4214581402580324,
      "learning_rate": 5.785225463498828e-07,
      "loss": 0.3336,
      "step": 186
    },
    {
      "epoch": 2.5852534562211984,
      "grad_norm": 0.4404488936158401,
      "learning_rate": 5.412973117089288e-07,
      "loss": 0.357,
      "step": 187
    },
    {
      "epoch": 2.5990783410138247,
      "grad_norm": 0.4694197953797554,
      "learning_rate": 5.05241294573024e-07,
      "loss": 0.3389,
      "step": 188
    },
    {
      "epoch": 2.6129032258064515,
      "grad_norm": 0.43224485913576827,
      "learning_rate": 4.7036395000776556e-07,
      "loss": 0.3816,
      "step": 189
    },
    {
      "epoch": 2.6267281105990783,
      "grad_norm": 0.46915895400007485,
      "learning_rate": 4.3667442399229985e-07,
      "loss": 0.3212,
      "step": 190
    },
    {
      "epoch": 2.640552995391705,
      "grad_norm": 0.41455265232398864,
      "learning_rate": 4.041815510209396e-07,
      "loss": 0.349,
      "step": 191
    },
    {
      "epoch": 2.654377880184332,
      "grad_norm": 0.4138588542457796,
      "learning_rate": 3.728938517864794e-07,
      "loss": 0.3155,
      "step": 192
    },
    {
      "epoch": 2.6682027649769586,
      "grad_norm": 0.44949477525662346,
      "learning_rate": 3.4281953094578877e-07,
      "loss": 0.3608,
      "step": 193
    },
    {
      "epoch": 2.6820276497695854,
      "grad_norm": 0.45077480983651,
      "learning_rate": 3.1396647496828245e-07,
      "loss": 0.366,
      "step": 194
    },
    {
      "epoch": 2.6958525345622117,
      "grad_norm": 0.43977571760655004,
      "learning_rate": 2.8634225006782867e-07,
      "loss": 0.3447,
      "step": 195
    },
    {
      "epoch": 2.709677419354839,
      "grad_norm": 0.4531801768047782,
      "learning_rate": 2.599541002186479e-07,
      "loss": 0.3791,
      "step": 196
    },
    {
      "epoch": 2.7235023041474653,
      "grad_norm": 0.40108250009956564,
      "learning_rate": 2.3480894525569564e-07,
      "loss": 0.303,
      "step": 197
    },
    {
      "epoch": 2.737327188940092,
      "grad_norm": 0.40980648885083565,
      "learning_rate": 2.109133790600648e-07,
      "loss": 0.3427,
      "step": 198
    },
    {
      "epoch": 2.751152073732719,
      "grad_norm": 0.41297927531361595,
      "learning_rate": 1.8827366782984913e-07,
      "loss": 0.325,
      "step": 199
    },
    {
      "epoch": 2.7649769585253456,
      "grad_norm": 0.429935038739435,
      "learning_rate": 1.6689574843694433e-07,
      "loss": 0.3852,
      "step": 200
    },
    {
      "epoch": 2.7788018433179724,
      "grad_norm": 0.4020104101913433,
      "learning_rate": 1.4678522687020414e-07,
      "loss": 0.3513,
      "step": 201
    },
    {
      "epoch": 2.792626728110599,
      "grad_norm": 0.39219309859175616,
      "learning_rate": 1.2794737676536993e-07,
      "loss": 0.331,
      "step": 202
    },
    {
      "epoch": 2.806451612903226,
      "grad_norm": 0.4097391288745776,
      "learning_rate": 1.1038713802214718e-07,
      "loss": 0.3411,
      "step": 203
    },
    {
      "epoch": 2.8202764976958523,
      "grad_norm": 0.4309769162405172,
      "learning_rate": 9.410911550880474e-08,
      "loss": 0.3244,
      "step": 204
    },
    {
      "epoch": 2.8341013824884795,
      "grad_norm": 0.4449894760218027,
      "learning_rate": 7.911757785462882e-08,
      "loss": 0.3417,
      "step": 205
    },
    {
      "epoch": 2.847926267281106,
      "grad_norm": 0.4303644153909974,
      "learning_rate": 6.54164563305465e-08,
      "loss": 0.3391,
      "step": 206
    },
    {
      "epoch": 2.8617511520737327,
      "grad_norm": 0.42194746880922407,
      "learning_rate": 5.3009343818219985e-08,
      "loss": 0.3707,
      "step": 207
    },
    {
      "epoch": 2.8755760368663594,
      "grad_norm": 0.4196594211589896,
      "learning_rate": 4.189949386787462e-08,
      "loss": 0.351,
      "step": 208
    },
    {
      "epoch": 2.889400921658986,
      "grad_norm": 0.40539018280598516,
      "learning_rate": 3.2089819845111946e-08,
      "loss": 0.3168,
      "step": 209
    },
    {
      "epoch": 2.903225806451613,
      "grad_norm": 0.4658694331536401,
      "learning_rate": 2.358289416693027e-08,
      "loss": 0.4198,
      "step": 210
    },
    {
      "epoch": 2.9170506912442398,
      "grad_norm": 0.41736837046748076,
      "learning_rate": 1.6380947627153143e-08,
      "loss": 0.3174,
      "step": 211
    },
    {
      "epoch": 2.9308755760368665,
      "grad_norm": 0.42675226026620394,
      "learning_rate": 1.0485868811441757e-08,
      "loss": 0.3538,
      "step": 212
    },
    {
      "epoch": 2.944700460829493,
      "grad_norm": 0.4160647077774905,
      "learning_rate": 5.899203602046655e-09,
      "loss": 0.3474,
      "step": 213
    },
    {
      "epoch": 2.9585253456221197,
      "grad_norm": 0.40018027901405295,
      "learning_rate": 2.6221547724253337e-09,
      "loss": 0.319,
      "step": 214
    },
    {
      "epoch": 2.9723502304147464,
      "grad_norm": 0.4334075522903773,
      "learning_rate": 6.555816718389896e-10,
      "loss": 0.3884,
      "step": 215
    },
    {
      "epoch": 2.986175115207373,
      "grad_norm": 0.4031378328194389,
      "learning_rate": 0.0,
      "loss": 0.331,
      "step": 216
    },
    {
      "epoch": 2.986175115207373,
      "step": 216,
      "total_flos": 130743839866880.0,
      "train_loss": 0.4320466773653472,
      "train_runtime": 2638.4795,
      "train_samples_per_second": 7.869,
      "train_steps_per_second": 0.082
    }
  ],
  "logging_steps": 1,
  "max_steps": 216,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 130743839866880.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}