{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5,
"eval_steps": 500,
"global_step": 256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001953125,
"grad_norm": 2.2842363876082494,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.7076,
"step": 1
},
{
"epoch": 0.00390625,
"grad_norm": 2.317015212334916,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.6296,
"step": 2
},
{
"epoch": 0.005859375,
"grad_norm": 2.0835939653262883,
"learning_rate": 3e-06,
"loss": 1.5593,
"step": 3
},
{
"epoch": 0.0078125,
"grad_norm": 2.1357657121975797,
"learning_rate": 4.000000000000001e-06,
"loss": 1.6713,
"step": 4
},
{
"epoch": 0.009765625,
"grad_norm": 2.0362735997756847,
"learning_rate": 5e-06,
"loss": 1.5327,
"step": 5
},
{
"epoch": 0.01171875,
"grad_norm": 2.1597413317388523,
"learning_rate": 6e-06,
"loss": 1.6435,
"step": 6
},
{
"epoch": 0.013671875,
"grad_norm": 2.1354234831872616,
"learning_rate": 7e-06,
"loss": 1.539,
"step": 7
},
{
"epoch": 0.015625,
"grad_norm": 2.0222980997885682,
"learning_rate": 8.000000000000001e-06,
"loss": 1.491,
"step": 8
},
{
"epoch": 0.017578125,
"grad_norm": 1.8336578914749888,
"learning_rate": 9e-06,
"loss": 1.567,
"step": 9
},
{
"epoch": 0.01953125,
"grad_norm": 1.7535364548043673,
"learning_rate": 1e-05,
"loss": 1.5181,
"step": 10
},
{
"epoch": 0.021484375,
"grad_norm": 1.348232072077207,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.4633,
"step": 11
},
{
"epoch": 0.0234375,
"grad_norm": 1.079057032053978,
"learning_rate": 1.2e-05,
"loss": 1.36,
"step": 12
},
{
"epoch": 0.025390625,
"grad_norm": 0.7143765277543237,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.3195,
"step": 13
},
{
"epoch": 0.02734375,
"grad_norm": 0.8120880164824964,
"learning_rate": 1.4e-05,
"loss": 1.3469,
"step": 14
},
{
"epoch": 0.029296875,
"grad_norm": 0.6746494578904082,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.3626,
"step": 15
},
{
"epoch": 0.03125,
"grad_norm": 0.9663545707089416,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.2772,
"step": 16
},
{
"epoch": 0.033203125,
"grad_norm": 0.961439588523319,
"learning_rate": 1.7e-05,
"loss": 1.2911,
"step": 17
},
{
"epoch": 0.03515625,
"grad_norm": 1.1738444068957379,
"learning_rate": 1.8e-05,
"loss": 1.3346,
"step": 18
},
{
"epoch": 0.037109375,
"grad_norm": 1.2332387671295317,
"learning_rate": 1.9e-05,
"loss": 1.3761,
"step": 19
},
{
"epoch": 0.0390625,
"grad_norm": 1.268714744941341,
"learning_rate": 2e-05,
"loss": 1.3042,
"step": 20
},
{
"epoch": 0.041015625,
"grad_norm": 1.078415802927275,
"learning_rate": 2.1000000000000002e-05,
"loss": 1.2102,
"step": 21
},
{
"epoch": 0.04296875,
"grad_norm": 1.330999136602917,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.2755,
"step": 22
},
{
"epoch": 0.044921875,
"grad_norm": 0.7130882289363479,
"learning_rate": 2.3e-05,
"loss": 1.1706,
"step": 23
},
{
"epoch": 0.046875,
"grad_norm": 0.5729960230193528,
"learning_rate": 2.4e-05,
"loss": 1.3215,
"step": 24
},
{
"epoch": 0.048828125,
"grad_norm": 0.6125271472968751,
"learning_rate": 2.5e-05,
"loss": 1.3213,
"step": 25
},
{
"epoch": 0.05078125,
"grad_norm": 0.6108864130655043,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.2865,
"step": 26
},
{
"epoch": 0.052734375,
"grad_norm": 0.6479528408256864,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.3383,
"step": 27
},
{
"epoch": 0.0546875,
"grad_norm": 0.8412108818700305,
"learning_rate": 2.8e-05,
"loss": 1.2763,
"step": 28
},
{
"epoch": 0.056640625,
"grad_norm": 0.8629612077288169,
"learning_rate": 2.9e-05,
"loss": 1.3045,
"step": 29
},
{
"epoch": 0.05859375,
"grad_norm": 0.7600858737745863,
"learning_rate": 3.0000000000000004e-05,
"loss": 1.2352,
"step": 30
},
{
"epoch": 0.060546875,
"grad_norm": 0.7130629485255873,
"learning_rate": 3.1e-05,
"loss": 1.2299,
"step": 31
},
{
"epoch": 0.0625,
"grad_norm": 0.5912964724458128,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.2234,
"step": 32
},
{
"epoch": 0.064453125,
"grad_norm": 0.5368820032381596,
"learning_rate": 3.3e-05,
"loss": 1.1934,
"step": 33
},
{
"epoch": 0.06640625,
"grad_norm": 0.5570421986755116,
"learning_rate": 3.4e-05,
"loss": 1.2581,
"step": 34
},
{
"epoch": 0.068359375,
"grad_norm": 0.46598864760360764,
"learning_rate": 3.5000000000000004e-05,
"loss": 1.2535,
"step": 35
},
{
"epoch": 0.0703125,
"grad_norm": 0.6392299897042107,
"learning_rate": 3.6e-05,
"loss": 1.2331,
"step": 36
},
{
"epoch": 0.072265625,
"grad_norm": 0.49983937474417145,
"learning_rate": 3.7000000000000005e-05,
"loss": 1.2432,
"step": 37
},
{
"epoch": 0.07421875,
"grad_norm": 0.652858138736506,
"learning_rate": 3.8e-05,
"loss": 1.2759,
"step": 38
},
{
"epoch": 0.076171875,
"grad_norm": 0.5926189930170476,
"learning_rate": 3.9e-05,
"loss": 1.3016,
"step": 39
},
{
"epoch": 0.078125,
"grad_norm": 0.6646763351870284,
"learning_rate": 4e-05,
"loss": 1.344,
"step": 40
},
{
"epoch": 0.080078125,
"grad_norm": 0.6228429864196855,
"learning_rate": 3.99998980683206e-05,
"loss": 1.2794,
"step": 41
},
{
"epoch": 0.08203125,
"grad_norm": 0.5633101870154669,
"learning_rate": 3.9999592274321385e-05,
"loss": 1.2931,
"step": 42
},
{
"epoch": 0.083984375,
"grad_norm": 0.6866774046182069,
"learning_rate": 3.999908262111937e-05,
"loss": 1.2647,
"step": 43
},
{
"epoch": 0.0859375,
"grad_norm": 0.5312790576505163,
"learning_rate": 3.9998369113909555e-05,
"loss": 1.2255,
"step": 44
},
{
"epoch": 0.087890625,
"grad_norm": 0.5694229658922494,
"learning_rate": 3.999745175996481e-05,
"loss": 1.3104,
"step": 45
},
{
"epoch": 0.08984375,
"grad_norm": 0.5068013674566277,
"learning_rate": 3.999633056863589e-05,
"loss": 1.1771,
"step": 46
},
{
"epoch": 0.091796875,
"grad_norm": 0.5428027277075501,
"learning_rate": 3.999500555135129e-05,
"loss": 1.3508,
"step": 47
},
{
"epoch": 0.09375,
"grad_norm": 0.4792441915562371,
"learning_rate": 3.999347672161713e-05,
"loss": 1.1144,
"step": 48
},
{
"epoch": 0.095703125,
"grad_norm": 0.5033945174929487,
"learning_rate": 3.999174409501703e-05,
"loss": 1.1474,
"step": 49
},
{
"epoch": 0.09765625,
"grad_norm": 0.5609150975698594,
"learning_rate": 3.9989807689211946e-05,
"loss": 1.2558,
"step": 50
},
{
"epoch": 0.099609375,
"grad_norm": 0.5558707293914855,
"learning_rate": 3.998766752393998e-05,
"loss": 1.1411,
"step": 51
},
{
"epoch": 0.1015625,
"grad_norm": 0.4429585853749615,
"learning_rate": 3.99853236210162e-05,
"loss": 1.1715,
"step": 52
},
{
"epoch": 0.103515625,
"grad_norm": 0.5064052852591816,
"learning_rate": 3.998277600433241e-05,
"loss": 1.2018,
"step": 53
},
{
"epoch": 0.10546875,
"grad_norm": 0.526020419983389,
"learning_rate": 3.998002469985688e-05,
"loss": 1.1164,
"step": 54
},
{
"epoch": 0.107421875,
"grad_norm": 0.504222879676158,
"learning_rate": 3.997706973563413e-05,
"loss": 1.191,
"step": 55
},
{
"epoch": 0.109375,
"grad_norm": 0.5614145336635687,
"learning_rate": 3.9973911141784605e-05,
"loss": 1.3011,
"step": 56
},
{
"epoch": 0.111328125,
"grad_norm": 0.4391770801146478,
"learning_rate": 3.997054895050437e-05,
"loss": 1.2535,
"step": 57
},
{
"epoch": 0.11328125,
"grad_norm": 0.5583307267784473,
"learning_rate": 3.996698319606482e-05,
"loss": 1.153,
"step": 58
},
{
"epoch": 0.115234375,
"grad_norm": 0.4576133947689655,
"learning_rate": 3.996321391481229e-05,
"loss": 1.1564,
"step": 59
},
{
"epoch": 0.1171875,
"grad_norm": 0.41970646962377184,
"learning_rate": 3.995924114516769e-05,
"loss": 1.1935,
"step": 60
},
{
"epoch": 0.119140625,
"grad_norm": 0.44805324266797203,
"learning_rate": 3.995506492762613e-05,
"loss": 1.1339,
"step": 61
},
{
"epoch": 0.12109375,
"grad_norm": 0.5208068893189155,
"learning_rate": 3.9950685304756494e-05,
"loss": 1.2092,
"step": 62
},
{
"epoch": 0.123046875,
"grad_norm": 0.44195618774115664,
"learning_rate": 3.994610232120101e-05,
"loss": 1.1292,
"step": 63
},
{
"epoch": 0.125,
"grad_norm": 0.4514887790554273,
"learning_rate": 3.994131602367481e-05,
"loss": 1.1658,
"step": 64
},
{
"epoch": 0.126953125,
"grad_norm": 0.5908686231033371,
"learning_rate": 3.9936326460965423e-05,
"loss": 1.2076,
"step": 65
},
{
"epoch": 0.12890625,
"grad_norm": 0.46799815417666174,
"learning_rate": 3.99311336839323e-05,
"loss": 1.1889,
"step": 66
},
{
"epoch": 0.130859375,
"grad_norm": 0.45939729407525115,
"learning_rate": 3.992573774550629e-05,
"loss": 1.1704,
"step": 67
},
{
"epoch": 0.1328125,
"grad_norm": 0.4142175477343616,
"learning_rate": 3.9920138700689095e-05,
"loss": 1.1848,
"step": 68
},
{
"epoch": 0.134765625,
"grad_norm": 0.37685838553537837,
"learning_rate": 3.991433660655273e-05,
"loss": 1.1041,
"step": 69
},
{
"epoch": 0.13671875,
"grad_norm": 0.39832807246827023,
"learning_rate": 3.99083315222389e-05,
"loss": 1.2002,
"step": 70
},
{
"epoch": 0.138671875,
"grad_norm": 0.43218323629933336,
"learning_rate": 3.990212350895845e-05,
"loss": 1.1487,
"step": 71
},
{
"epoch": 0.140625,
"grad_norm": 0.43302460007599547,
"learning_rate": 3.98957126299907e-05,
"loss": 1.1638,
"step": 72
},
{
"epoch": 0.142578125,
"grad_norm": 0.41150363252077565,
"learning_rate": 3.988909895068281e-05,
"loss": 1.1353,
"step": 73
},
{
"epoch": 0.14453125,
"grad_norm": 0.4362254605938381,
"learning_rate": 3.988228253844913e-05,
"loss": 1.2202,
"step": 74
},
{
"epoch": 0.146484375,
"grad_norm": 0.4696684841153936,
"learning_rate": 3.987526346277049e-05,
"loss": 1.1722,
"step": 75
},
{
"epoch": 0.1484375,
"grad_norm": 0.42274900639715757,
"learning_rate": 3.9868041795193505e-05,
"loss": 1.179,
"step": 76
},
{
"epoch": 0.150390625,
"grad_norm": 0.47381294364503707,
"learning_rate": 3.9860617609329856e-05,
"loss": 1.1978,
"step": 77
},
{
"epoch": 0.15234375,
"grad_norm": 0.448192967722078,
"learning_rate": 3.9852990980855505e-05,
"loss": 1.2042,
"step": 78
},
{
"epoch": 0.154296875,
"grad_norm": 0.388483486919693,
"learning_rate": 3.984516198750997e-05,
"loss": 1.148,
"step": 79
},
{
"epoch": 0.15625,
"grad_norm": 0.4057112657252388,
"learning_rate": 3.9837130709095475e-05,
"loss": 1.1267,
"step": 80
},
{
"epoch": 0.158203125,
"grad_norm": 0.5111257616377479,
"learning_rate": 3.982889722747621e-05,
"loss": 1.1992,
"step": 81
},
{
"epoch": 0.16015625,
"grad_norm": 0.42800919524357695,
"learning_rate": 3.9820461626577426e-05,
"loss": 1.2214,
"step": 82
},
{
"epoch": 0.162109375,
"grad_norm": 0.6604320971658805,
"learning_rate": 3.981182399238462e-05,
"loss": 1.1046,
"step": 83
},
{
"epoch": 0.1640625,
"grad_norm": 0.4650529995861808,
"learning_rate": 3.980298441294265e-05,
"loss": 1.1485,
"step": 84
},
{
"epoch": 0.166015625,
"grad_norm": 0.8247014006092652,
"learning_rate": 3.9793942978354835e-05,
"loss": 1.2345,
"step": 85
},
{
"epoch": 0.16796875,
"grad_norm": 0.5111463246016623,
"learning_rate": 3.978469978078203e-05,
"loss": 1.1406,
"step": 86
},
{
"epoch": 0.169921875,
"grad_norm": 0.3980549366997817,
"learning_rate": 3.977525491444171e-05,
"loss": 1.138,
"step": 87
},
{
"epoch": 0.171875,
"grad_norm": 0.4500013345653544,
"learning_rate": 3.976560847560697e-05,
"loss": 1.1803,
"step": 88
},
{
"epoch": 0.173828125,
"grad_norm": 0.6144879263096161,
"learning_rate": 3.975576056260559e-05,
"loss": 1.376,
"step": 89
},
{
"epoch": 0.17578125,
"grad_norm": 0.45250166677505255,
"learning_rate": 3.974571127581901e-05,
"loss": 1.2616,
"step": 90
},
{
"epoch": 0.177734375,
"grad_norm": 0.7260361194779941,
"learning_rate": 3.973546071768128e-05,
"loss": 1.207,
"step": 91
},
{
"epoch": 0.1796875,
"grad_norm": 0.40590569325939646,
"learning_rate": 3.972500899267807e-05,
"loss": 1.1857,
"step": 92
},
{
"epoch": 0.181640625,
"grad_norm": 0.7059204956983739,
"learning_rate": 3.971435620734557e-05,
"loss": 1.1629,
"step": 93
},
{
"epoch": 0.18359375,
"grad_norm": 0.4166494769492577,
"learning_rate": 3.97035024702694e-05,
"loss": 1.2105,
"step": 94
},
{
"epoch": 0.185546875,
"grad_norm": 0.4708428232528331,
"learning_rate": 3.969244789208354e-05,
"loss": 1.2074,
"step": 95
},
{
"epoch": 0.1875,
"grad_norm": 0.46187395897944283,
"learning_rate": 3.9681192585469146e-05,
"loss": 1.2411,
"step": 96
},
{
"epoch": 0.189453125,
"grad_norm": 0.40887786827875044,
"learning_rate": 3.9669736665153455e-05,
"loss": 1.181,
"step": 97
},
{
"epoch": 0.19140625,
"grad_norm": 0.5783677933870661,
"learning_rate": 3.96580802479086e-05,
"loss": 1.2412,
"step": 98
},
{
"epoch": 0.193359375,
"grad_norm": 0.46098155681455955,
"learning_rate": 3.9646223452550374e-05,
"loss": 1.0478,
"step": 99
},
{
"epoch": 0.1953125,
"grad_norm": 0.4421189367731534,
"learning_rate": 3.9634166399937104e-05,
"loss": 1.1528,
"step": 100
},
{
"epoch": 0.197265625,
"grad_norm": 0.44208897843282735,
"learning_rate": 3.962190921296834e-05,
"loss": 1.1294,
"step": 101
},
{
"epoch": 0.19921875,
"grad_norm": 0.41115810620405063,
"learning_rate": 3.9609452016583654e-05,
"loss": 1.0787,
"step": 102
},
{
"epoch": 0.201171875,
"grad_norm": 0.4592703963732682,
"learning_rate": 3.959679493776134e-05,
"loss": 1.2084,
"step": 103
},
{
"epoch": 0.203125,
"grad_norm": 0.46514364761525706,
"learning_rate": 3.9583938105517127e-05,
"loss": 1.169,
"step": 104
},
{
"epoch": 0.205078125,
"grad_norm": 0.5044144386089332,
"learning_rate": 3.957088165090287e-05,
"loss": 1.121,
"step": 105
},
{
"epoch": 0.20703125,
"grad_norm": 0.4160320267546915,
"learning_rate": 3.9557625707005185e-05,
"loss": 1.1133,
"step": 106
},
{
"epoch": 0.208984375,
"grad_norm": 0.46611013560363507,
"learning_rate": 3.954417040894416e-05,
"loss": 1.0846,
"step": 107
},
{
"epoch": 0.2109375,
"grad_norm": 0.494489354902747,
"learning_rate": 3.953051589387189e-05,
"loss": 1.1762,
"step": 108
},
{
"epoch": 0.212890625,
"grad_norm": 0.4226200871032249,
"learning_rate": 3.951666230097115e-05,
"loss": 1.0346,
"step": 109
},
{
"epoch": 0.21484375,
"grad_norm": 0.4032354878018358,
"learning_rate": 3.9502609771453934e-05,
"loss": 1.1223,
"step": 110
},
{
"epoch": 0.216796875,
"grad_norm": 0.4148468151686513,
"learning_rate": 3.948835844856004e-05,
"loss": 1.1581,
"step": 111
},
{
"epoch": 0.21875,
"grad_norm": 0.4655201875464092,
"learning_rate": 3.947390847755559e-05,
"loss": 1.141,
"step": 112
},
{
"epoch": 0.220703125,
"grad_norm": 0.44131202754652804,
"learning_rate": 3.945926000573156e-05,
"loss": 1.228,
"step": 113
},
{
"epoch": 0.22265625,
"grad_norm": 0.4878464713519324,
"learning_rate": 3.94444131824023e-05,
"loss": 1.2023,
"step": 114
},
{
"epoch": 0.224609375,
"grad_norm": 0.4433704308856408,
"learning_rate": 3.942936815890396e-05,
"loss": 1.2479,
"step": 115
},
{
"epoch": 0.2265625,
"grad_norm": 0.4848454824446459,
"learning_rate": 3.941412508859299e-05,
"loss": 1.1269,
"step": 116
},
{
"epoch": 0.228515625,
"grad_norm": 0.419630467357436,
"learning_rate": 3.939868412684458e-05,
"loss": 1.1806,
"step": 117
},
{
"epoch": 0.23046875,
"grad_norm": 0.39683375502836515,
"learning_rate": 3.938304543105104e-05,
"loss": 1.1054,
"step": 118
},
{
"epoch": 0.232421875,
"grad_norm": 0.4832371787668091,
"learning_rate": 3.936720916062022e-05,
"loss": 1.1174,
"step": 119
},
{
"epoch": 0.234375,
"grad_norm": 0.5986867637436046,
"learning_rate": 3.935117547697387e-05,
"loss": 1.1791,
"step": 120
},
{
"epoch": 0.236328125,
"grad_norm": 0.4150490343483682,
"learning_rate": 3.933494454354605e-05,
"loss": 1.2129,
"step": 121
},
{
"epoch": 0.23828125,
"grad_norm": 0.4215588087170942,
"learning_rate": 3.931851652578137e-05,
"loss": 1.1414,
"step": 122
},
{
"epoch": 0.240234375,
"grad_norm": 0.42515318009071157,
"learning_rate": 3.9301891591133377e-05,
"loss": 1.0854,
"step": 123
},
{
"epoch": 0.2421875,
"grad_norm": 0.4488701042494301,
"learning_rate": 3.928506990906282e-05,
"loss": 1.0725,
"step": 124
},
{
"epoch": 0.244140625,
"grad_norm": 0.41531581194897543,
"learning_rate": 3.9268051651035944e-05,
"loss": 1.0746,
"step": 125
},
{
"epoch": 0.24609375,
"grad_norm": 0.46204021714125687,
"learning_rate": 3.9250836990522685e-05,
"loss": 1.2164,
"step": 126
},
{
"epoch": 0.248046875,
"grad_norm": 0.6677384727690392,
"learning_rate": 3.923342610299499e-05,
"loss": 1.1834,
"step": 127
},
{
"epoch": 0.25,
"grad_norm": 0.4961785465516465,
"learning_rate": 3.9215819165924956e-05,
"loss": 1.2178,
"step": 128
},
{
"epoch": 0.251953125,
"grad_norm": 0.4651476735438144,
"learning_rate": 3.919801635878305e-05,
"loss": 1.1005,
"step": 129
},
{
"epoch": 0.25390625,
"grad_norm": 0.49434332973849215,
"learning_rate": 3.918001786303627e-05,
"loss": 1.1922,
"step": 130
},
{
"epoch": 0.255859375,
"grad_norm": 0.45671514667179935,
"learning_rate": 3.9161823862146297e-05,
"loss": 1.0617,
"step": 131
},
{
"epoch": 0.2578125,
"grad_norm": 0.49674226929417115,
"learning_rate": 3.9143434541567654e-05,
"loss": 1.2203,
"step": 132
},
{
"epoch": 0.259765625,
"grad_norm": 0.5208683235687923,
"learning_rate": 3.912485008874577e-05,
"loss": 1.1587,
"step": 133
},
{
"epoch": 0.26171875,
"grad_norm": 0.517022288962491,
"learning_rate": 3.9106070693115087e-05,
"loss": 1.1427,
"step": 134
},
{
"epoch": 0.263671875,
"grad_norm": 0.38942661826422087,
"learning_rate": 3.908709654609715e-05,
"loss": 1.0629,
"step": 135
},
{
"epoch": 0.265625,
"grad_norm": 0.4564236281556844,
"learning_rate": 3.9067927841098614e-05,
"loss": 1.0919,
"step": 136
},
{
"epoch": 0.267578125,
"grad_norm": 0.4929559987928741,
"learning_rate": 3.9048564773509314e-05,
"loss": 1.1502,
"step": 137
},
{
"epoch": 0.26953125,
"grad_norm": 0.48513251932309925,
"learning_rate": 3.902900754070025e-05,
"loss": 1.1158,
"step": 138
},
{
"epoch": 0.271484375,
"grad_norm": 0.5349569441078609,
"learning_rate": 3.900925634202158e-05,
"loss": 1.1279,
"step": 139
},
{
"epoch": 0.2734375,
"grad_norm": 0.47177459620840684,
"learning_rate": 3.898931137880059e-05,
"loss": 1.1595,
"step": 140
},
{
"epoch": 0.275390625,
"grad_norm": 0.4904546697998669,
"learning_rate": 3.896917285433964e-05,
"loss": 1.2615,
"step": 141
},
{
"epoch": 0.27734375,
"grad_norm": 0.5768180408665089,
"learning_rate": 3.894884097391409e-05,
"loss": 1.1688,
"step": 142
},
{
"epoch": 0.279296875,
"grad_norm": 0.4362108519904031,
"learning_rate": 3.892831594477021e-05,
"loss": 1.0983,
"step": 143
},
{
"epoch": 0.28125,
"grad_norm": 0.4570710320413065,
"learning_rate": 3.890759797612307e-05,
"loss": 1.3706,
"step": 144
},
{
"epoch": 0.283203125,
"grad_norm": 0.4465318663671251,
"learning_rate": 3.888668727915441e-05,
"loss": 1.1377,
"step": 145
},
{
"epoch": 0.28515625,
"grad_norm": 0.5047852656660148,
"learning_rate": 3.886558406701046e-05,
"loss": 1.0747,
"step": 146
},
{
"epoch": 0.287109375,
"grad_norm": 0.4412295789497703,
"learning_rate": 3.884428855479983e-05,
"loss": 1.1261,
"step": 147
},
{
"epoch": 0.2890625,
"grad_norm": 0.4476476539228374,
"learning_rate": 3.8822800959591236e-05,
"loss": 1.1769,
"step": 148
},
{
"epoch": 0.291015625,
"grad_norm": 0.45924117326794117,
"learning_rate": 3.880112150041134e-05,
"loss": 1.1564,
"step": 149
},
{
"epoch": 0.29296875,
"grad_norm": 0.43931168833110684,
"learning_rate": 3.877925039824253e-05,
"loss": 1.1682,
"step": 150
},
{
"epoch": 0.294921875,
"grad_norm": 0.5438637955362605,
"learning_rate": 3.8757187876020603e-05,
"loss": 1.1448,
"step": 151
},
{
"epoch": 0.296875,
"grad_norm": 0.42928963297461137,
"learning_rate": 3.873493415863256e-05,
"loss": 1.2078,
"step": 152
},
{
"epoch": 0.298828125,
"grad_norm": 0.4381709802123583,
"learning_rate": 3.8712489472914286e-05,
"loss": 1.0604,
"step": 153
},
{
"epoch": 0.30078125,
"grad_norm": 0.4988490117613772,
"learning_rate": 3.8689854047648224e-05,
"loss": 1.1424,
"step": 154
},
{
"epoch": 0.302734375,
"grad_norm": 0.4257038437137218,
"learning_rate": 3.866702811356107e-05,
"loss": 1.0955,
"step": 155
},
{
"epoch": 0.3046875,
"grad_norm": 0.4893472968930594,
"learning_rate": 3.86440119033214e-05,
"loss": 1.1854,
"step": 156
},
{
"epoch": 0.306640625,
"grad_norm": 0.5731240348991923,
"learning_rate": 3.862080565153731e-05,
"loss": 1.2505,
"step": 157
},
{
"epoch": 0.30859375,
"grad_norm": 0.4594995644663965,
"learning_rate": 3.8597409594754025e-05,
"loss": 1.1047,
"step": 158
},
{
"epoch": 0.310546875,
"grad_norm": 0.3898970756217597,
"learning_rate": 3.857382397145148e-05,
"loss": 1.1728,
"step": 159
},
{
"epoch": 0.3125,
"grad_norm": 0.5165759238716673,
"learning_rate": 3.85500490220419e-05,
"loss": 1.1232,
"step": 160
},
{
"epoch": 0.314453125,
"grad_norm": 0.42169317869735606,
"learning_rate": 3.852608498886732e-05,
"loss": 1.1087,
"step": 161
},
{
"epoch": 0.31640625,
"grad_norm": 0.4831766592421198,
"learning_rate": 3.850193211619718e-05,
"loss": 1.0902,
"step": 162
},
{
"epoch": 0.318359375,
"grad_norm": 0.5168422003190449,
"learning_rate": 3.8477590650225735e-05,
"loss": 1.1979,
"step": 163
},
{
"epoch": 0.3203125,
"grad_norm": 0.44267326014624,
"learning_rate": 3.845306083906967e-05,
"loss": 1.1311,
"step": 164
},
{
"epoch": 0.322265625,
"grad_norm": 0.42634229457641887,
"learning_rate": 3.842834293276545e-05,
"loss": 1.1729,
"step": 165
},
{
"epoch": 0.32421875,
"grad_norm": 0.40628491116146026,
"learning_rate": 3.8403437183266834e-05,
"loss": 1.0984,
"step": 166
},
{
"epoch": 0.326171875,
"grad_norm": 0.4159045672550255,
"learning_rate": 3.8378343844442344e-05,
"loss": 1.1731,
"step": 167
},
{
"epoch": 0.328125,
"grad_norm": 0.5968785135150301,
"learning_rate": 3.8353063172072564e-05,
"loss": 1.0247,
"step": 168
},
{
"epoch": 0.330078125,
"grad_norm": 0.4649591605790638,
"learning_rate": 3.8327595423847645e-05,
"loss": 1.139,
"step": 169
},
{
"epoch": 0.33203125,
"grad_norm": 0.48079030109724175,
"learning_rate": 3.830194085936463e-05,
"loss": 1.1268,
"step": 170
},
{
"epoch": 0.333984375,
"grad_norm": 0.46348618416181137,
"learning_rate": 3.8276099740124794e-05,
"loss": 1.2004,
"step": 171
},
{
"epoch": 0.3359375,
"grad_norm": 0.4832617358199499,
"learning_rate": 3.8250072329531004e-05,
"loss": 1.0743,
"step": 172
},
{
"epoch": 0.337890625,
"grad_norm": 0.4420229534375586,
"learning_rate": 3.822385889288503e-05,
"loss": 1.141,
"step": 173
},
{
"epoch": 0.33984375,
"grad_norm": 0.39752191495545935,
"learning_rate": 3.819745969738484e-05,
"loss": 1.0972,
"step": 174
},
{
"epoch": 0.341796875,
"grad_norm": 0.4411421700040708,
"learning_rate": 3.817087501212185e-05,
"loss": 1.0233,
"step": 175
},
{
"epoch": 0.34375,
"grad_norm": 0.4017237336736879,
"learning_rate": 3.8144105108078246e-05,
"loss": 1.1563,
"step": 176
},
{
"epoch": 0.345703125,
"grad_norm": 0.686922962042273,
"learning_rate": 3.8117150258124134e-05,
"loss": 1.147,
"step": 177
},
{
"epoch": 0.34765625,
"grad_norm": 0.4294357539370898,
"learning_rate": 3.8090010737014836e-05,
"loss": 1.1116,
"step": 178
},
{
"epoch": 0.349609375,
"grad_norm": 0.41962832297995667,
"learning_rate": 3.806268682138805e-05,
"loss": 1.0827,
"step": 179
},
{
"epoch": 0.3515625,
"grad_norm": 0.4413195950046206,
"learning_rate": 3.803517878976103e-05,
"loss": 1.0814,
"step": 180
},
{
"epoch": 0.353515625,
"grad_norm": 0.45365068157119814,
"learning_rate": 3.8007486922527774e-05,
"loss": 1.0599,
"step": 181
},
{
"epoch": 0.35546875,
"grad_norm": 0.5286445380979327,
"learning_rate": 3.7979611501956124e-05,
"loss": 1.2251,
"step": 182
},
{
"epoch": 0.357421875,
"grad_norm": 0.38599209970455534,
"learning_rate": 3.795155281218493e-05,
"loss": 1.1676,
"step": 183
},
{
"epoch": 0.359375,
"grad_norm": 0.44025531979392435,
"learning_rate": 3.7923311139221114e-05,
"loss": 1.0514,
"step": 184
},
{
"epoch": 0.361328125,
"grad_norm": 0.42167205583593925,
"learning_rate": 3.789488677093681e-05,
"loss": 1.1002,
"step": 185
},
{
"epoch": 0.36328125,
"grad_norm": 0.4466402130651366,
"learning_rate": 3.786627999706638e-05,
"loss": 1.1013,
"step": 186
},
{
"epoch": 0.365234375,
"grad_norm": 0.496760952886551,
"learning_rate": 3.783749110920345e-05,
"loss": 1.1465,
"step": 187
},
{
"epoch": 0.3671875,
"grad_norm": 0.4367613213432748,
"learning_rate": 3.780852040079802e-05,
"loss": 1.0657,
"step": 188
},
{
"epoch": 0.369140625,
"grad_norm": 0.41447069424638583,
"learning_rate": 3.777936816715336e-05,
"loss": 1.116,
"step": 189
},
{
"epoch": 0.37109375,
"grad_norm": 0.4361134375016492,
"learning_rate": 3.7750034705423095e-05,
"loss": 1.2767,
"step": 190
},
{
"epoch": 0.373046875,
"grad_norm": 0.4066150259484398,
"learning_rate": 3.772052031460812e-05,
"loss": 1.0785,
"step": 191
},
{
"epoch": 0.375,
"grad_norm": 0.40407841923262816,
"learning_rate": 3.769082529555359e-05,
"loss": 1.1644,
"step": 192
},
{
"epoch": 0.376953125,
"grad_norm": 0.44561296429853814,
"learning_rate": 3.766094995094581e-05,
"loss": 1.0663,
"step": 193
},
{
"epoch": 0.37890625,
"grad_norm": 0.5352430776738828,
"learning_rate": 3.7630894585309195e-05,
"loss": 1.0209,
"step": 194
},
{
"epoch": 0.380859375,
"grad_norm": 0.43636357529723163,
"learning_rate": 3.7600659505003125e-05,
"loss": 1.0621,
"step": 195
},
{
"epoch": 0.3828125,
"grad_norm": 0.4264879021475797,
"learning_rate": 3.757024501821885e-05,
"loss": 1.1336,
"step": 196
},
{
"epoch": 0.384765625,
"grad_norm": 0.3873402520476977,
"learning_rate": 3.753965143497635e-05,
"loss": 1.1378,
"step": 197
},
{
"epoch": 0.38671875,
"grad_norm": 0.40092066811193233,
"learning_rate": 3.750887906712115e-05,
"loss": 1.0685,
"step": 198
},
{
"epoch": 0.388671875,
"grad_norm": 0.43572366333630774,
"learning_rate": 3.747792822832117e-05,
"loss": 1.1723,
"step": 199
},
{
"epoch": 0.390625,
"grad_norm": 0.37730662296410905,
"learning_rate": 3.744679923406351e-05,
"loss": 1.0823,
"step": 200
},
{
"epoch": 0.392578125,
"grad_norm": 0.4578098403628755,
"learning_rate": 3.741549240165122e-05,
"loss": 1.1354,
"step": 201
},
{
"epoch": 0.39453125,
"grad_norm": 0.4402925550279655,
"learning_rate": 3.738400805020011e-05,
"loss": 1.0921,
"step": 202
},
{
"epoch": 0.396484375,
"grad_norm": 0.3814506298253285,
"learning_rate": 3.7352346500635466e-05,
"loss": 1.0813,
"step": 203
},
{
"epoch": 0.3984375,
"grad_norm": 0.5352313284178145,
"learning_rate": 3.732050807568878e-05,
"loss": 1.2286,
"step": 204
},
{
"epoch": 0.400390625,
"grad_norm": 0.4394941726895711,
"learning_rate": 3.728849309989445e-05,
"loss": 1.1362,
"step": 205
},
{
"epoch": 0.40234375,
"grad_norm": 0.40009193940161264,
"learning_rate": 3.7256301899586524e-05,
"loss": 1.014,
"step": 206
},
{
"epoch": 0.404296875,
"grad_norm": 0.4093033957375515,
"learning_rate": 3.7223934802895294e-05,
"loss": 1.0731,
"step": 207
},
{
"epoch": 0.40625,
"grad_norm": 0.47801078784248796,
"learning_rate": 3.719139213974403e-05,
"loss": 1.2081,
"step": 208
},
{
"epoch": 0.408203125,
"grad_norm": 0.5965083454407833,
"learning_rate": 3.715867424184554e-05,
"loss": 1.1495,
"step": 209
},
{
"epoch": 0.41015625,
"grad_norm": 0.43672026913516004,
"learning_rate": 3.712578144269887e-05,
"loss": 1.1201,
"step": 210
},
{
"epoch": 0.412109375,
"grad_norm": 0.5253144641112631,
"learning_rate": 3.7092714077585836e-05,
"loss": 1.2268,
"step": 211
},
{
"epoch": 0.4140625,
"grad_norm": 0.4738073414405108,
"learning_rate": 3.705947248356765e-05,
"loss": 1.1188,
"step": 212
},
{
"epoch": 0.416015625,
"grad_norm": 0.4477140058126639,
"learning_rate": 3.7026056999481464e-05,
"loss": 1.0571,
"step": 213
},
{
"epoch": 0.41796875,
"grad_norm": 0.4471574730565842,
"learning_rate": 3.699246796593692e-05,
"loss": 1.0847,
"step": 214
},
{
"epoch": 0.419921875,
"grad_norm": 0.41405988952981876,
"learning_rate": 3.6958705725312655e-05,
"loss": 1.1401,
"step": 215
},
{
"epoch": 0.421875,
"grad_norm": 0.49370245896699827,
"learning_rate": 3.692477062175289e-05,
"loss": 1.0703,
"step": 216
},
{
"epoch": 0.423828125,
"grad_norm": 0.4406399072344879,
"learning_rate": 3.689066300116381e-05,
"loss": 1.1793,
"step": 217
},
{
"epoch": 0.42578125,
"grad_norm": 0.43483619180179833,
"learning_rate": 3.6856383211210134e-05,
"loss": 1.1305,
"step": 218
},
{
"epoch": 0.427734375,
"grad_norm": 0.43256055966703133,
"learning_rate": 3.682193160131152e-05,
"loss": 1.0943,
"step": 219
},
{
"epoch": 0.4296875,
"grad_norm": 0.5598257236379292,
"learning_rate": 3.678730852263901e-05,
"loss": 1.2309,
"step": 220
},
{
"epoch": 0.431640625,
"grad_norm": 0.39045352547405415,
"learning_rate": 3.675251432811144e-05,
"loss": 1.0047,
"step": 221
},
{
"epoch": 0.43359375,
"grad_norm": 0.44912102512870905,
"learning_rate": 3.671754937239191e-05,
"loss": 1.1087,
"step": 222
},
{
"epoch": 0.435546875,
"grad_norm": 0.4174420596478436,
"learning_rate": 3.668241401188407e-05,
"loss": 1.0313,
"step": 223
},
{
"epoch": 0.4375,
"grad_norm": 0.36458359932139156,
"learning_rate": 3.6647108604728546e-05,
"loss": 0.9782,
"step": 224
},
{
"epoch": 0.439453125,
"grad_norm": 0.4419635662052487,
"learning_rate": 3.661163351079929e-05,
"loss": 1.1076,
"step": 225
},
{
"epoch": 0.44140625,
"grad_norm": 0.4537093691655119,
"learning_rate": 3.6575989091699895e-05,
"loss": 1.1265,
"step": 226
},
{
"epoch": 0.443359375,
"grad_norm": 0.4515222234083662,
"learning_rate": 3.65401757107599e-05,
"loss": 1.124,
"step": 227
},
{
"epoch": 0.4453125,
"grad_norm": 0.4509933735945529,
"learning_rate": 3.650419373303112e-05,
"loss": 1.2212,
"step": 228
},
{
"epoch": 0.447265625,
"grad_norm": 0.39315970041656184,
"learning_rate": 3.646804352528389e-05,
"loss": 1.1003,
"step": 229
},
{
"epoch": 0.44921875,
"grad_norm": 0.583897939706095,
"learning_rate": 3.643172545600336e-05,
"loss": 1.0984,
"step": 230
},
{
"epoch": 0.451171875,
"grad_norm": 0.5164803615434137,
"learning_rate": 3.63952398953857e-05,
"loss": 1.0738,
"step": 231
},
{
"epoch": 0.453125,
"grad_norm": 0.4070265753872102,
"learning_rate": 3.6358587215334355e-05,
"loss": 1.034,
"step": 232
},
{
"epoch": 0.455078125,
"grad_norm": 0.4101472350679783,
"learning_rate": 3.632176778945626e-05,
"loss": 1.1234,
"step": 233
},
{
"epoch": 0.45703125,
"grad_norm": 0.410956088362877,
"learning_rate": 3.628478199305799e-05,
"loss": 1.1062,
"step": 234
},
{
"epoch": 0.458984375,
"grad_norm": 0.42181972355385416,
"learning_rate": 3.624763020314199e-05,
"loss": 1.1848,
"step": 235
},
{
"epoch": 0.4609375,
"grad_norm": 0.4069735981570203,
"learning_rate": 3.62103127984027e-05,
"loss": 1.1203,
"step": 236
},
{
"epoch": 0.462890625,
"grad_norm": 0.4142934678480609,
"learning_rate": 3.617283015922268e-05,
"loss": 1.1044,
"step": 237
},
{
"epoch": 0.46484375,
"grad_norm": 0.4697374307040272,
"learning_rate": 3.6135182667668764e-05,
"loss": 1.1947,
"step": 238
},
{
"epoch": 0.466796875,
"grad_norm": 0.3985058819632944,
"learning_rate": 3.6097370707488175e-05,
"loss": 1.0906,
"step": 239
},
{
"epoch": 0.46875,
"grad_norm": 0.40215610602620183,
"learning_rate": 3.6059394664104554e-05,
"loss": 1.1607,
"step": 240
},
{
"epoch": 0.470703125,
"grad_norm": 0.3985665062059567,
"learning_rate": 3.60212549246141e-05,
"loss": 1.0787,
"step": 241
},
{
"epoch": 0.47265625,
"grad_norm": 0.43711415007382576,
"learning_rate": 3.598295187778158e-05,
"loss": 1.1554,
"step": 242
},
{
"epoch": 0.474609375,
"grad_norm": 0.4382023321095773,
"learning_rate": 3.5944485914036384e-05,
"loss": 1.0126,
"step": 243
},
{
"epoch": 0.4765625,
"grad_norm": 0.37488265505774904,
"learning_rate": 3.590585742546853e-05,
"loss": 1.1054,
"step": 244
},
{
"epoch": 0.478515625,
"grad_norm": 0.40930451172856447,
"learning_rate": 3.586706680582471e-05,
"loss": 1.0321,
"step": 245
},
{
"epoch": 0.48046875,
"grad_norm": 0.5059310227059168,
"learning_rate": 3.5828114450504205e-05,
"loss": 1.1239,
"step": 246
},
{
"epoch": 0.482421875,
"grad_norm": 0.45898297435796365,
"learning_rate": 3.5789000756554927e-05,
"loss": 1.0467,
"step": 247
},
{
"epoch": 0.484375,
"grad_norm": 0.42551550838444063,
"learning_rate": 3.5749726122669316e-05,
"loss": 1.051,
"step": 248
},
{
"epoch": 0.486328125,
"grad_norm": 0.4451344613451106,
"learning_rate": 3.5710290949180325e-05,
"loss": 1.1036,
"step": 249
},
{
"epoch": 0.48828125,
"grad_norm": 0.43151805025113255,
"learning_rate": 3.5670695638057285e-05,
"loss": 1.1906,
"step": 250
},
{
"epoch": 0.490234375,
"grad_norm": 0.492114391902568,
"learning_rate": 3.563094059290186e-05,
"loss": 1.1629,
"step": 251
},
{
"epoch": 0.4921875,
"grad_norm": 0.4144331093915329,
"learning_rate": 3.5591026218943905e-05,
"loss": 1.1485,
"step": 252
},
{
"epoch": 0.494140625,
"grad_norm": 0.4201461662795515,
"learning_rate": 3.5550952923037337e-05,
"loss": 1.1451,
"step": 253
},
{
"epoch": 0.49609375,
"grad_norm": 0.41132936789582963,
"learning_rate": 3.551072111365598e-05,
"loss": 1.1216,
"step": 254
},
{
"epoch": 0.498046875,
"grad_norm": 0.40892606177310264,
"learning_rate": 3.547033120088943e-05,
"loss": 1.0282,
"step": 255
},
{
"epoch": 0.5,
"grad_norm": 0.39721649148962185,
"learning_rate": 3.5429783596438864e-05,
"loss": 1.113,
"step": 256
}
],
"logging_steps": 1,
"max_steps": 1024,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 256,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 531064116215808.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}