{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001953125, "grad_norm": 2.2842363876082494, "learning_rate": 1.0000000000000002e-06, "loss": 1.7076, "step": 1 }, { "epoch": 0.00390625, "grad_norm": 2.317015212334916, "learning_rate": 2.0000000000000003e-06, "loss": 1.6296, "step": 2 }, { "epoch": 0.005859375, "grad_norm": 2.0835939653262883, "learning_rate": 3e-06, "loss": 1.5593, "step": 3 }, { "epoch": 0.0078125, "grad_norm": 2.1357657121975797, "learning_rate": 4.000000000000001e-06, "loss": 1.6713, "step": 4 }, { "epoch": 0.009765625, "grad_norm": 2.0362735997756847, "learning_rate": 5e-06, "loss": 1.5327, "step": 5 }, { "epoch": 0.01171875, "grad_norm": 2.1597413317388523, "learning_rate": 6e-06, "loss": 1.6435, "step": 6 }, { "epoch": 0.013671875, "grad_norm": 2.1354234831872616, "learning_rate": 7e-06, "loss": 1.539, "step": 7 }, { "epoch": 0.015625, "grad_norm": 2.0222980997885682, "learning_rate": 8.000000000000001e-06, "loss": 1.491, "step": 8 }, { "epoch": 0.017578125, "grad_norm": 1.8336578914749888, "learning_rate": 9e-06, "loss": 1.567, "step": 9 }, { "epoch": 0.01953125, "grad_norm": 1.7535364548043673, "learning_rate": 1e-05, "loss": 1.5181, "step": 10 }, { "epoch": 0.021484375, "grad_norm": 1.348232072077207, "learning_rate": 1.1000000000000001e-05, "loss": 1.4633, "step": 11 }, { "epoch": 0.0234375, "grad_norm": 1.079057032053978, "learning_rate": 1.2e-05, "loss": 1.36, "step": 12 }, { "epoch": 0.025390625, "grad_norm": 0.7143765277543237, "learning_rate": 1.3000000000000001e-05, "loss": 1.3195, "step": 13 }, { "epoch": 0.02734375, "grad_norm": 0.8120880164824964, "learning_rate": 1.4e-05, "loss": 1.3469, "step": 14 }, { "epoch": 0.029296875, "grad_norm": 0.6746494578904082, "learning_rate": 1.5000000000000002e-05, "loss": 1.3626, "step": 15 }, { "epoch": 0.03125, "grad_norm": 0.9663545707089416, "learning_rate": 1.6000000000000003e-05, "loss": 1.2772, "step": 16 }, { "epoch": 0.033203125, "grad_norm": 0.961439588523319, "learning_rate": 1.7e-05, "loss": 1.2911, "step": 17 }, { "epoch": 0.03515625, "grad_norm": 1.1738444068957379, "learning_rate": 1.8e-05, "loss": 1.3346, "step": 18 }, { "epoch": 0.037109375, "grad_norm": 1.2332387671295317, "learning_rate": 1.9e-05, "loss": 1.3761, "step": 19 }, { "epoch": 0.0390625, "grad_norm": 1.268714744941341, "learning_rate": 2e-05, "loss": 1.3042, "step": 20 }, { "epoch": 0.041015625, "grad_norm": 1.078415802927275, "learning_rate": 2.1000000000000002e-05, "loss": 1.2102, "step": 21 }, { "epoch": 0.04296875, "grad_norm": 1.330999136602917, "learning_rate": 2.2000000000000003e-05, "loss": 1.2755, "step": 22 }, { "epoch": 0.044921875, "grad_norm": 0.7130882289363479, "learning_rate": 2.3e-05, "loss": 1.1706, "step": 23 }, { "epoch": 0.046875, "grad_norm": 0.5729960230193528, "learning_rate": 2.4e-05, "loss": 1.3215, "step": 24 }, { "epoch": 0.048828125, "grad_norm": 0.6125271472968751, "learning_rate": 2.5e-05, "loss": 1.3213, "step": 25 }, { "epoch": 0.05078125, "grad_norm": 0.6108864130655043, "learning_rate": 2.6000000000000002e-05, "loss": 1.2865, "step": 26 }, { "epoch": 0.052734375, "grad_norm": 0.6479528408256864, "learning_rate": 2.7000000000000002e-05, "loss": 1.3383, "step": 27 }, { "epoch": 0.0546875, "grad_norm": 0.8412108818700305, "learning_rate": 2.8e-05, "loss": 1.2763, "step": 28 }, { "epoch": 0.056640625, "grad_norm": 
0.8629612077288169, "learning_rate": 2.9e-05, "loss": 1.3045, "step": 29 }, { "epoch": 0.05859375, "grad_norm": 0.7600858737745863, "learning_rate": 3.0000000000000004e-05, "loss": 1.2352, "step": 30 }, { "epoch": 0.060546875, "grad_norm": 0.7130629485255873, "learning_rate": 3.1e-05, "loss": 1.2299, "step": 31 }, { "epoch": 0.0625, "grad_norm": 0.5912964724458128, "learning_rate": 3.2000000000000005e-05, "loss": 1.2234, "step": 32 }, { "epoch": 0.064453125, "grad_norm": 0.5368820032381596, "learning_rate": 3.3e-05, "loss": 1.1934, "step": 33 }, { "epoch": 0.06640625, "grad_norm": 0.5570421986755116, "learning_rate": 3.4e-05, "loss": 1.2581, "step": 34 }, { "epoch": 0.068359375, "grad_norm": 0.46598864760360764, "learning_rate": 3.5000000000000004e-05, "loss": 1.2535, "step": 35 }, { "epoch": 0.0703125, "grad_norm": 0.6392299897042107, "learning_rate": 3.6e-05, "loss": 1.2331, "step": 36 }, { "epoch": 0.072265625, "grad_norm": 0.49983937474417145, "learning_rate": 3.7000000000000005e-05, "loss": 1.2432, "step": 37 }, { "epoch": 0.07421875, "grad_norm": 0.652858138736506, "learning_rate": 3.8e-05, "loss": 1.2759, "step": 38 }, { "epoch": 0.076171875, "grad_norm": 0.5926189930170476, "learning_rate": 3.9e-05, "loss": 1.3016, "step": 39 }, { "epoch": 0.078125, "grad_norm": 0.6646763351870284, "learning_rate": 4e-05, "loss": 1.344, "step": 40 }, { "epoch": 0.080078125, "grad_norm": 0.6228429864196855, "learning_rate": 3.99998980683206e-05, "loss": 1.2794, "step": 41 }, { "epoch": 0.08203125, "grad_norm": 0.5633101870154669, "learning_rate": 3.9999592274321385e-05, "loss": 1.2931, "step": 42 }, { "epoch": 0.083984375, "grad_norm": 0.6866774046182069, "learning_rate": 3.999908262111937e-05, "loss": 1.2647, "step": 43 }, { "epoch": 0.0859375, "grad_norm": 0.5312790576505163, "learning_rate": 3.9998369113909555e-05, "loss": 1.2255, "step": 44 }, { "epoch": 0.087890625, "grad_norm": 0.5694229658922494, "learning_rate": 3.999745175996481e-05, "loss": 1.3104, "step": 45 }, { "epoch": 0.08984375, "grad_norm": 0.5068013674566277, "learning_rate": 3.999633056863589e-05, "loss": 1.1771, "step": 46 }, { "epoch": 0.091796875, "grad_norm": 0.5428027277075501, "learning_rate": 3.999500555135129e-05, "loss": 1.3508, "step": 47 }, { "epoch": 0.09375, "grad_norm": 0.4792441915562371, "learning_rate": 3.999347672161713e-05, "loss": 1.1144, "step": 48 }, { "epoch": 0.095703125, "grad_norm": 0.5033945174929487, "learning_rate": 3.999174409501703e-05, "loss": 1.1474, "step": 49 }, { "epoch": 0.09765625, "grad_norm": 0.5609150975698594, "learning_rate": 3.9989807689211946e-05, "loss": 1.2558, "step": 50 }, { "epoch": 0.099609375, "grad_norm": 0.5558707293914855, "learning_rate": 3.998766752393998e-05, "loss": 1.1411, "step": 51 }, { "epoch": 0.1015625, "grad_norm": 0.4429585853749615, "learning_rate": 3.99853236210162e-05, "loss": 1.1715, "step": 52 }, { "epoch": 0.103515625, "grad_norm": 0.5064052852591816, "learning_rate": 3.998277600433241e-05, "loss": 1.2018, "step": 53 }, { "epoch": 0.10546875, "grad_norm": 0.526020419983389, "learning_rate": 3.998002469985688e-05, "loss": 1.1164, "step": 54 }, { "epoch": 0.107421875, "grad_norm": 0.504222879676158, "learning_rate": 3.997706973563413e-05, "loss": 1.191, "step": 55 }, { "epoch": 0.109375, "grad_norm": 0.5614145336635687, "learning_rate": 3.9973911141784605e-05, "loss": 1.3011, "step": 56 }, { "epoch": 0.111328125, "grad_norm": 0.4391770801146478, "learning_rate": 3.997054895050437e-05, "loss": 1.2535, "step": 57 }, { "epoch": 0.11328125, "grad_norm": 
0.5583307267784473, "learning_rate": 3.996698319606482e-05, "loss": 1.153, "step": 58 }, { "epoch": 0.115234375, "grad_norm": 0.4576133947689655, "learning_rate": 3.996321391481229e-05, "loss": 1.1564, "step": 59 }, { "epoch": 0.1171875, "grad_norm": 0.41970646962377184, "learning_rate": 3.995924114516769e-05, "loss": 1.1935, "step": 60 }, { "epoch": 0.119140625, "grad_norm": 0.44805324266797203, "learning_rate": 3.995506492762613e-05, "loss": 1.1339, "step": 61 }, { "epoch": 0.12109375, "grad_norm": 0.5208068893189155, "learning_rate": 3.9950685304756494e-05, "loss": 1.2092, "step": 62 }, { "epoch": 0.123046875, "grad_norm": 0.44195618774115664, "learning_rate": 3.994610232120101e-05, "loss": 1.1292, "step": 63 }, { "epoch": 0.125, "grad_norm": 0.4514887790554273, "learning_rate": 3.994131602367481e-05, "loss": 1.1658, "step": 64 }, { "epoch": 0.126953125, "grad_norm": 0.5908686231033371, "learning_rate": 3.9936326460965423e-05, "loss": 1.2076, "step": 65 }, { "epoch": 0.12890625, "grad_norm": 0.46799815417666174, "learning_rate": 3.99311336839323e-05, "loss": 1.1889, "step": 66 }, { "epoch": 0.130859375, "grad_norm": 0.45939729407525115, "learning_rate": 3.992573774550629e-05, "loss": 1.1704, "step": 67 }, { "epoch": 0.1328125, "grad_norm": 0.4142175477343616, "learning_rate": 3.9920138700689095e-05, "loss": 1.1848, "step": 68 }, { "epoch": 0.134765625, "grad_norm": 0.37685838553537837, "learning_rate": 3.991433660655273e-05, "loss": 1.1041, "step": 69 }, { "epoch": 0.13671875, "grad_norm": 0.39832807246827023, "learning_rate": 3.99083315222389e-05, "loss": 1.2002, "step": 70 }, { "epoch": 0.138671875, "grad_norm": 0.43218323629933336, "learning_rate": 3.990212350895845e-05, "loss": 1.1487, "step": 71 }, { "epoch": 0.140625, "grad_norm": 0.43302460007599547, "learning_rate": 3.98957126299907e-05, "loss": 1.1638, "step": 72 }, { "epoch": 0.142578125, "grad_norm": 0.41150363252077565, "learning_rate": 3.988909895068281e-05, "loss": 1.1353, "step": 73 }, { "epoch": 0.14453125, "grad_norm": 0.4362254605938381, "learning_rate": 3.988228253844913e-05, "loss": 1.2202, "step": 74 }, { "epoch": 0.146484375, "grad_norm": 0.4696684841153936, "learning_rate": 3.987526346277049e-05, "loss": 1.1722, "step": 75 }, { "epoch": 0.1484375, "grad_norm": 0.42274900639715757, "learning_rate": 3.9868041795193505e-05, "loss": 1.179, "step": 76 }, { "epoch": 0.150390625, "grad_norm": 0.47381294364503707, "learning_rate": 3.9860617609329856e-05, "loss": 1.1978, "step": 77 }, { "epoch": 0.15234375, "grad_norm": 0.448192967722078, "learning_rate": 3.9852990980855505e-05, "loss": 1.2042, "step": 78 }, { "epoch": 0.154296875, "grad_norm": 0.388483486919693, "learning_rate": 3.984516198750997e-05, "loss": 1.148, "step": 79 }, { "epoch": 0.15625, "grad_norm": 0.4057112657252388, "learning_rate": 3.9837130709095475e-05, "loss": 1.1267, "step": 80 }, { "epoch": 0.158203125, "grad_norm": 0.5111257616377479, "learning_rate": 3.982889722747621e-05, "loss": 1.1992, "step": 81 }, { "epoch": 0.16015625, "grad_norm": 0.42800919524357695, "learning_rate": 3.9820461626577426e-05, "loss": 1.2214, "step": 82 }, { "epoch": 0.162109375, "grad_norm": 0.6604320971658805, "learning_rate": 3.981182399238462e-05, "loss": 1.1046, "step": 83 }, { "epoch": 0.1640625, "grad_norm": 0.4650529995861808, "learning_rate": 3.980298441294265e-05, "loss": 1.1485, "step": 84 }, { "epoch": 0.166015625, "grad_norm": 0.8247014006092652, "learning_rate": 3.9793942978354835e-05, "loss": 1.2345, "step": 85 }, { "epoch": 0.16796875, "grad_norm": 
0.5111463246016623, "learning_rate": 3.978469978078203e-05, "loss": 1.1406, "step": 86 }, { "epoch": 0.169921875, "grad_norm": 0.3980549366997817, "learning_rate": 3.977525491444171e-05, "loss": 1.138, "step": 87 }, { "epoch": 0.171875, "grad_norm": 0.4500013345653544, "learning_rate": 3.976560847560697e-05, "loss": 1.1803, "step": 88 }, { "epoch": 0.173828125, "grad_norm": 0.6144879263096161, "learning_rate": 3.975576056260559e-05, "loss": 1.376, "step": 89 }, { "epoch": 0.17578125, "grad_norm": 0.45250166677505255, "learning_rate": 3.974571127581901e-05, "loss": 1.2616, "step": 90 }, { "epoch": 0.177734375, "grad_norm": 0.7260361194779941, "learning_rate": 3.973546071768128e-05, "loss": 1.207, "step": 91 }, { "epoch": 0.1796875, "grad_norm": 0.40590569325939646, "learning_rate": 3.972500899267807e-05, "loss": 1.1857, "step": 92 }, { "epoch": 0.181640625, "grad_norm": 0.7059204956983739, "learning_rate": 3.971435620734557e-05, "loss": 1.1629, "step": 93 }, { "epoch": 0.18359375, "grad_norm": 0.4166494769492577, "learning_rate": 3.97035024702694e-05, "loss": 1.2105, "step": 94 }, { "epoch": 0.185546875, "grad_norm": 0.4708428232528331, "learning_rate": 3.969244789208354e-05, "loss": 1.2074, "step": 95 }, { "epoch": 0.1875, "grad_norm": 0.46187395897944283, "learning_rate": 3.9681192585469146e-05, "loss": 1.2411, "step": 96 }, { "epoch": 0.189453125, "grad_norm": 0.40887786827875044, "learning_rate": 3.9669736665153455e-05, "loss": 1.181, "step": 97 }, { "epoch": 0.19140625, "grad_norm": 0.5783677933870661, "learning_rate": 3.96580802479086e-05, "loss": 1.2412, "step": 98 }, { "epoch": 0.193359375, "grad_norm": 0.46098155681455955, "learning_rate": 3.9646223452550374e-05, "loss": 1.0478, "step": 99 }, { "epoch": 0.1953125, "grad_norm": 0.4421189367731534, "learning_rate": 3.9634166399937104e-05, "loss": 1.1528, "step": 100 }, { "epoch": 0.197265625, "grad_norm": 0.44208897843282735, "learning_rate": 3.962190921296834e-05, "loss": 1.1294, "step": 101 }, { "epoch": 0.19921875, "grad_norm": 0.41115810620405063, "learning_rate": 3.9609452016583654e-05, "loss": 1.0787, "step": 102 }, { "epoch": 0.201171875, "grad_norm": 0.4592703963732682, "learning_rate": 3.959679493776134e-05, "loss": 1.2084, "step": 103 }, { "epoch": 0.203125, "grad_norm": 0.46514364761525706, "learning_rate": 3.9583938105517127e-05, "loss": 1.169, "step": 104 }, { "epoch": 0.205078125, "grad_norm": 0.5044144386089332, "learning_rate": 3.957088165090287e-05, "loss": 1.121, "step": 105 }, { "epoch": 0.20703125, "grad_norm": 0.4160320267546915, "learning_rate": 3.9557625707005185e-05, "loss": 1.1133, "step": 106 }, { "epoch": 0.208984375, "grad_norm": 0.46611013560363507, "learning_rate": 3.954417040894416e-05, "loss": 1.0846, "step": 107 }, { "epoch": 0.2109375, "grad_norm": 0.494489354902747, "learning_rate": 3.953051589387189e-05, "loss": 1.1762, "step": 108 }, { "epoch": 0.212890625, "grad_norm": 0.4226200871032249, "learning_rate": 3.951666230097115e-05, "loss": 1.0346, "step": 109 }, { "epoch": 0.21484375, "grad_norm": 0.4032354878018358, "learning_rate": 3.9502609771453934e-05, "loss": 1.1223, "step": 110 }, { "epoch": 0.216796875, "grad_norm": 0.4148468151686513, "learning_rate": 3.948835844856004e-05, "loss": 1.1581, "step": 111 }, { "epoch": 0.21875, "grad_norm": 0.4655201875464092, "learning_rate": 3.947390847755559e-05, "loss": 1.141, "step": 112 }, { "epoch": 0.220703125, "grad_norm": 0.44131202754652804, "learning_rate": 3.945926000573156e-05, "loss": 1.228, "step": 113 }, { "epoch": 0.22265625, "grad_norm": 
0.4878464713519324, "learning_rate": 3.94444131824023e-05, "loss": 1.2023, "step": 114 }, { "epoch": 0.224609375, "grad_norm": 0.4433704308856408, "learning_rate": 3.942936815890396e-05, "loss": 1.2479, "step": 115 }, { "epoch": 0.2265625, "grad_norm": 0.4848454824446459, "learning_rate": 3.941412508859299e-05, "loss": 1.1269, "step": 116 }, { "epoch": 0.228515625, "grad_norm": 0.419630467357436, "learning_rate": 3.939868412684458e-05, "loss": 1.1806, "step": 117 }, { "epoch": 0.23046875, "grad_norm": 0.39683375502836515, "learning_rate": 3.938304543105104e-05, "loss": 1.1054, "step": 118 }, { "epoch": 0.232421875, "grad_norm": 0.4832371787668091, "learning_rate": 3.936720916062022e-05, "loss": 1.1174, "step": 119 }, { "epoch": 0.234375, "grad_norm": 0.5986867637436046, "learning_rate": 3.935117547697387e-05, "loss": 1.1791, "step": 120 }, { "epoch": 0.236328125, "grad_norm": 0.4150490343483682, "learning_rate": 3.933494454354605e-05, "loss": 1.2129, "step": 121 }, { "epoch": 0.23828125, "grad_norm": 0.4215588087170942, "learning_rate": 3.931851652578137e-05, "loss": 1.1414, "step": 122 }, { "epoch": 0.240234375, "grad_norm": 0.42515318009071157, "learning_rate": 3.9301891591133377e-05, "loss": 1.0854, "step": 123 }, { "epoch": 0.2421875, "grad_norm": 0.4488701042494301, "learning_rate": 3.928506990906282e-05, "loss": 1.0725, "step": 124 }, { "epoch": 0.244140625, "grad_norm": 0.41531581194897543, "learning_rate": 3.9268051651035944e-05, "loss": 1.0746, "step": 125 }, { "epoch": 0.24609375, "grad_norm": 0.46204021714125687, "learning_rate": 3.9250836990522685e-05, "loss": 1.2164, "step": 126 }, { "epoch": 0.248046875, "grad_norm": 0.6677384727690392, "learning_rate": 3.923342610299499e-05, "loss": 1.1834, "step": 127 }, { "epoch": 0.25, "grad_norm": 0.4961785465516465, "learning_rate": 3.9215819165924956e-05, "loss": 1.2178, "step": 128 }, { "epoch": 0.251953125, "grad_norm": 0.4651476735438144, "learning_rate": 3.919801635878305e-05, "loss": 1.1005, "step": 129 }, { "epoch": 0.25390625, "grad_norm": 0.49434332973849215, "learning_rate": 3.918001786303627e-05, "loss": 1.1922, "step": 130 }, { "epoch": 0.255859375, "grad_norm": 0.45671514667179935, "learning_rate": 3.9161823862146297e-05, "loss": 1.0617, "step": 131 }, { "epoch": 0.2578125, "grad_norm": 0.49674226929417115, "learning_rate": 3.9143434541567654e-05, "loss": 1.2203, "step": 132 }, { "epoch": 0.259765625, "grad_norm": 0.5208683235687923, "learning_rate": 3.912485008874577e-05, "loss": 1.1587, "step": 133 }, { "epoch": 0.26171875, "grad_norm": 0.517022288962491, "learning_rate": 3.9106070693115087e-05, "loss": 1.1427, "step": 134 }, { "epoch": 0.263671875, "grad_norm": 0.38942661826422087, "learning_rate": 3.908709654609715e-05, "loss": 1.0629, "step": 135 }, { "epoch": 0.265625, "grad_norm": 0.4564236281556844, "learning_rate": 3.9067927841098614e-05, "loss": 1.0919, "step": 136 }, { "epoch": 0.267578125, "grad_norm": 0.4929559987928741, "learning_rate": 3.9048564773509314e-05, "loss": 1.1502, "step": 137 }, { "epoch": 0.26953125, "grad_norm": 0.48513251932309925, "learning_rate": 3.902900754070025e-05, "loss": 1.1158, "step": 138 }, { "epoch": 0.271484375, "grad_norm": 0.5349569441078609, "learning_rate": 3.900925634202158e-05, "loss": 1.1279, "step": 139 }, { "epoch": 0.2734375, "grad_norm": 0.47177459620840684, "learning_rate": 3.898931137880059e-05, "loss": 1.1595, "step": 140 }, { "epoch": 0.275390625, "grad_norm": 0.4904546697998669, "learning_rate": 3.896917285433964e-05, "loss": 1.2615, "step": 141 }, { "epoch": 
0.27734375, "grad_norm": 0.5768180408665089, "learning_rate": 3.894884097391409e-05, "loss": 1.1688, "step": 142 }, { "epoch": 0.279296875, "grad_norm": 0.4362108519904031, "learning_rate": 3.892831594477021e-05, "loss": 1.0983, "step": 143 }, { "epoch": 0.28125, "grad_norm": 0.4570710320413065, "learning_rate": 3.890759797612307e-05, "loss": 1.3706, "step": 144 }, { "epoch": 0.283203125, "grad_norm": 0.4465318663671251, "learning_rate": 3.888668727915441e-05, "loss": 1.1377, "step": 145 }, { "epoch": 0.28515625, "grad_norm": 0.5047852656660148, "learning_rate": 3.886558406701046e-05, "loss": 1.0747, "step": 146 }, { "epoch": 0.287109375, "grad_norm": 0.4412295789497703, "learning_rate": 3.884428855479983e-05, "loss": 1.1261, "step": 147 }, { "epoch": 0.2890625, "grad_norm": 0.4476476539228374, "learning_rate": 3.8822800959591236e-05, "loss": 1.1769, "step": 148 }, { "epoch": 0.291015625, "grad_norm": 0.45924117326794117, "learning_rate": 3.880112150041134e-05, "loss": 1.1564, "step": 149 }, { "epoch": 0.29296875, "grad_norm": 0.43931168833110684, "learning_rate": 3.877925039824253e-05, "loss": 1.1682, "step": 150 }, { "epoch": 0.294921875, "grad_norm": 0.5438637955362605, "learning_rate": 3.8757187876020603e-05, "loss": 1.1448, "step": 151 }, { "epoch": 0.296875, "grad_norm": 0.42928963297461137, "learning_rate": 3.873493415863256e-05, "loss": 1.2078, "step": 152 }, { "epoch": 0.298828125, "grad_norm": 0.4381709802123583, "learning_rate": 3.8712489472914286e-05, "loss": 1.0604, "step": 153 }, { "epoch": 0.30078125, "grad_norm": 0.4988490117613772, "learning_rate": 3.8689854047648224e-05, "loss": 1.1424, "step": 154 }, { "epoch": 0.302734375, "grad_norm": 0.4257038437137218, "learning_rate": 3.866702811356107e-05, "loss": 1.0955, "step": 155 }, { "epoch": 0.3046875, "grad_norm": 0.4893472968930594, "learning_rate": 3.86440119033214e-05, "loss": 1.1854, "step": 156 }, { "epoch": 0.306640625, "grad_norm": 0.5731240348991923, "learning_rate": 3.862080565153731e-05, "loss": 1.2505, "step": 157 }, { "epoch": 0.30859375, "grad_norm": 0.4594995644663965, "learning_rate": 3.8597409594754025e-05, "loss": 1.1047, "step": 158 }, { "epoch": 0.310546875, "grad_norm": 0.3898970756217597, "learning_rate": 3.857382397145148e-05, "loss": 1.1728, "step": 159 }, { "epoch": 0.3125, "grad_norm": 0.5165759238716673, "learning_rate": 3.85500490220419e-05, "loss": 1.1232, "step": 160 }, { "epoch": 0.314453125, "grad_norm": 0.42169317869735606, "learning_rate": 3.852608498886732e-05, "loss": 1.1087, "step": 161 }, { "epoch": 0.31640625, "grad_norm": 0.4831766592421198, "learning_rate": 3.850193211619718e-05, "loss": 1.0902, "step": 162 }, { "epoch": 0.318359375, "grad_norm": 0.5168422003190449, "learning_rate": 3.8477590650225735e-05, "loss": 1.1979, "step": 163 }, { "epoch": 0.3203125, "grad_norm": 0.44267326014624, "learning_rate": 3.845306083906967e-05, "loss": 1.1311, "step": 164 }, { "epoch": 0.322265625, "grad_norm": 0.42634229457641887, "learning_rate": 3.842834293276545e-05, "loss": 1.1729, "step": 165 }, { "epoch": 0.32421875, "grad_norm": 0.40628491116146026, "learning_rate": 3.8403437183266834e-05, "loss": 1.0984, "step": 166 }, { "epoch": 0.326171875, "grad_norm": 0.4159045672550255, "learning_rate": 3.8378343844442344e-05, "loss": 1.1731, "step": 167 }, { "epoch": 0.328125, "grad_norm": 0.5968785135150301, "learning_rate": 3.8353063172072564e-05, "loss": 1.0247, "step": 168 }, { "epoch": 0.330078125, "grad_norm": 0.4649591605790638, "learning_rate": 3.8327595423847645e-05, "loss": 1.139, "step": 169 
}, { "epoch": 0.33203125, "grad_norm": 0.48079030109724175, "learning_rate": 3.830194085936463e-05, "loss": 1.1268, "step": 170 }, { "epoch": 0.333984375, "grad_norm": 0.46348618416181137, "learning_rate": 3.8276099740124794e-05, "loss": 1.2004, "step": 171 }, { "epoch": 0.3359375, "grad_norm": 0.4832617358199499, "learning_rate": 3.8250072329531004e-05, "loss": 1.0743, "step": 172 }, { "epoch": 0.337890625, "grad_norm": 0.4420229534375586, "learning_rate": 3.822385889288503e-05, "loss": 1.141, "step": 173 }, { "epoch": 0.33984375, "grad_norm": 0.39752191495545935, "learning_rate": 3.819745969738484e-05, "loss": 1.0972, "step": 174 }, { "epoch": 0.341796875, "grad_norm": 0.4411421700040708, "learning_rate": 3.817087501212185e-05, "loss": 1.0233, "step": 175 }, { "epoch": 0.34375, "grad_norm": 0.4017237336736879, "learning_rate": 3.8144105108078246e-05, "loss": 1.1563, "step": 176 }, { "epoch": 0.345703125, "grad_norm": 0.686922962042273, "learning_rate": 3.8117150258124134e-05, "loss": 1.147, "step": 177 }, { "epoch": 0.34765625, "grad_norm": 0.4294357539370898, "learning_rate": 3.8090010737014836e-05, "loss": 1.1116, "step": 178 }, { "epoch": 0.349609375, "grad_norm": 0.41962832297995667, "learning_rate": 3.806268682138805e-05, "loss": 1.0827, "step": 179 }, { "epoch": 0.3515625, "grad_norm": 0.4413195950046206, "learning_rate": 3.803517878976103e-05, "loss": 1.0814, "step": 180 }, { "epoch": 0.353515625, "grad_norm": 0.45365068157119814, "learning_rate": 3.8007486922527774e-05, "loss": 1.0599, "step": 181 }, { "epoch": 0.35546875, "grad_norm": 0.5286445380979327, "learning_rate": 3.7979611501956124e-05, "loss": 1.2251, "step": 182 }, { "epoch": 0.357421875, "grad_norm": 0.38599209970455534, "learning_rate": 3.795155281218493e-05, "loss": 1.1676, "step": 183 }, { "epoch": 0.359375, "grad_norm": 0.44025531979392435, "learning_rate": 3.7923311139221114e-05, "loss": 1.0514, "step": 184 }, { "epoch": 0.361328125, "grad_norm": 0.42167205583593925, "learning_rate": 3.789488677093681e-05, "loss": 1.1002, "step": 185 }, { "epoch": 0.36328125, "grad_norm": 0.4466402130651366, "learning_rate": 3.786627999706638e-05, "loss": 1.1013, "step": 186 }, { "epoch": 0.365234375, "grad_norm": 0.496760952886551, "learning_rate": 3.783749110920345e-05, "loss": 1.1465, "step": 187 }, { "epoch": 0.3671875, "grad_norm": 0.4367613213432748, "learning_rate": 3.780852040079802e-05, "loss": 1.0657, "step": 188 }, { "epoch": 0.369140625, "grad_norm": 0.41447069424638583, "learning_rate": 3.777936816715336e-05, "loss": 1.116, "step": 189 }, { "epoch": 0.37109375, "grad_norm": 0.4361134375016492, "learning_rate": 3.7750034705423095e-05, "loss": 1.2767, "step": 190 }, { "epoch": 0.373046875, "grad_norm": 0.4066150259484398, "learning_rate": 3.772052031460812e-05, "loss": 1.0785, "step": 191 }, { "epoch": 0.375, "grad_norm": 0.40407841923262816, "learning_rate": 3.769082529555359e-05, "loss": 1.1644, "step": 192 }, { "epoch": 0.376953125, "grad_norm": 0.44561296429853814, "learning_rate": 3.766094995094581e-05, "loss": 1.0663, "step": 193 }, { "epoch": 0.37890625, "grad_norm": 0.5352430776738828, "learning_rate": 3.7630894585309195e-05, "loss": 1.0209, "step": 194 }, { "epoch": 0.380859375, "grad_norm": 0.43636357529723163, "learning_rate": 3.7600659505003125e-05, "loss": 1.0621, "step": 195 }, { "epoch": 0.3828125, "grad_norm": 0.4264879021475797, "learning_rate": 3.757024501821885e-05, "loss": 1.1336, "step": 196 }, { "epoch": 0.384765625, "grad_norm": 0.3873402520476977, "learning_rate": 3.753965143497635e-05, "loss": 
1.1378, "step": 197 }, { "epoch": 0.38671875, "grad_norm": 0.40092066811193233, "learning_rate": 3.750887906712115e-05, "loss": 1.0685, "step": 198 }, { "epoch": 0.388671875, "grad_norm": 0.43572366333630774, "learning_rate": 3.747792822832117e-05, "loss": 1.1723, "step": 199 }, { "epoch": 0.390625, "grad_norm": 0.37730662296410905, "learning_rate": 3.744679923406351e-05, "loss": 1.0823, "step": 200 }, { "epoch": 0.392578125, "grad_norm": 0.4578098403628755, "learning_rate": 3.741549240165122e-05, "loss": 1.1354, "step": 201 }, { "epoch": 0.39453125, "grad_norm": 0.4402925550279655, "learning_rate": 3.738400805020011e-05, "loss": 1.0921, "step": 202 }, { "epoch": 0.396484375, "grad_norm": 0.3814506298253285, "learning_rate": 3.7352346500635466e-05, "loss": 1.0813, "step": 203 }, { "epoch": 0.3984375, "grad_norm": 0.5352313284178145, "learning_rate": 3.732050807568878e-05, "loss": 1.2286, "step": 204 }, { "epoch": 0.400390625, "grad_norm": 0.4394941726895711, "learning_rate": 3.728849309989445e-05, "loss": 1.1362, "step": 205 }, { "epoch": 0.40234375, "grad_norm": 0.40009193940161264, "learning_rate": 3.7256301899586524e-05, "loss": 1.014, "step": 206 }, { "epoch": 0.404296875, "grad_norm": 0.4093033957375515, "learning_rate": 3.7223934802895294e-05, "loss": 1.0731, "step": 207 }, { "epoch": 0.40625, "grad_norm": 0.47801078784248796, "learning_rate": 3.719139213974403e-05, "loss": 1.2081, "step": 208 }, { "epoch": 0.408203125, "grad_norm": 0.5965083454407833, "learning_rate": 3.715867424184554e-05, "loss": 1.1495, "step": 209 }, { "epoch": 0.41015625, "grad_norm": 0.43672026913516004, "learning_rate": 3.712578144269887e-05, "loss": 1.1201, "step": 210 }, { "epoch": 0.412109375, "grad_norm": 0.5253144641112631, "learning_rate": 3.7092714077585836e-05, "loss": 1.2268, "step": 211 }, { "epoch": 0.4140625, "grad_norm": 0.4738073414405108, "learning_rate": 3.705947248356765e-05, "loss": 1.1188, "step": 212 }, { "epoch": 0.416015625, "grad_norm": 0.4477140058126639, "learning_rate": 3.7026056999481464e-05, "loss": 1.0571, "step": 213 }, { "epoch": 0.41796875, "grad_norm": 0.4471574730565842, "learning_rate": 3.699246796593692e-05, "loss": 1.0847, "step": 214 }, { "epoch": 0.419921875, "grad_norm": 0.41405988952981876, "learning_rate": 3.6958705725312655e-05, "loss": 1.1401, "step": 215 }, { "epoch": 0.421875, "grad_norm": 0.49370245896699827, "learning_rate": 3.692477062175289e-05, "loss": 1.0703, "step": 216 }, { "epoch": 0.423828125, "grad_norm": 0.4406399072344879, "learning_rate": 3.689066300116381e-05, "loss": 1.1793, "step": 217 }, { "epoch": 0.42578125, "grad_norm": 0.43483619180179833, "learning_rate": 3.6856383211210134e-05, "loss": 1.1305, "step": 218 }, { "epoch": 0.427734375, "grad_norm": 0.43256055966703133, "learning_rate": 3.682193160131152e-05, "loss": 1.0943, "step": 219 }, { "epoch": 0.4296875, "grad_norm": 0.5598257236379292, "learning_rate": 3.678730852263901e-05, "loss": 1.2309, "step": 220 }, { "epoch": 0.431640625, "grad_norm": 0.39045352547405415, "learning_rate": 3.675251432811144e-05, "loss": 1.0047, "step": 221 }, { "epoch": 0.43359375, "grad_norm": 0.44912102512870905, "learning_rate": 3.671754937239191e-05, "loss": 1.1087, "step": 222 }, { "epoch": 0.435546875, "grad_norm": 0.4174420596478436, "learning_rate": 3.668241401188407e-05, "loss": 1.0313, "step": 223 }, { "epoch": 0.4375, "grad_norm": 0.36458359932139156, "learning_rate": 3.6647108604728546e-05, "loss": 0.9782, "step": 224 }, { "epoch": 0.439453125, "grad_norm": 0.4419635662052487, "learning_rate": 
3.661163351079929e-05, "loss": 1.1076, "step": 225 }, { "epoch": 0.44140625, "grad_norm": 0.4537093691655119, "learning_rate": 3.6575989091699895e-05, "loss": 1.1265, "step": 226 }, { "epoch": 0.443359375, "grad_norm": 0.4515222234083662, "learning_rate": 3.65401757107599e-05, "loss": 1.124, "step": 227 }, { "epoch": 0.4453125, "grad_norm": 0.4509933735945529, "learning_rate": 3.650419373303112e-05, "loss": 1.2212, "step": 228 }, { "epoch": 0.447265625, "grad_norm": 0.39315970041656184, "learning_rate": 3.646804352528389e-05, "loss": 1.1003, "step": 229 }, { "epoch": 0.44921875, "grad_norm": 0.583897939706095, "learning_rate": 3.643172545600336e-05, "loss": 1.0984, "step": 230 }, { "epoch": 0.451171875, "grad_norm": 0.5164803615434137, "learning_rate": 3.63952398953857e-05, "loss": 1.0738, "step": 231 }, { "epoch": 0.453125, "grad_norm": 0.4070265753872102, "learning_rate": 3.6358587215334355e-05, "loss": 1.034, "step": 232 }, { "epoch": 0.455078125, "grad_norm": 0.4101472350679783, "learning_rate": 3.632176778945626e-05, "loss": 1.1234, "step": 233 }, { "epoch": 0.45703125, "grad_norm": 0.410956088362877, "learning_rate": 3.628478199305799e-05, "loss": 1.1062, "step": 234 }, { "epoch": 0.458984375, "grad_norm": 0.42181972355385416, "learning_rate": 3.624763020314199e-05, "loss": 1.1848, "step": 235 }, { "epoch": 0.4609375, "grad_norm": 0.4069735981570203, "learning_rate": 3.62103127984027e-05, "loss": 1.1203, "step": 236 }, { "epoch": 0.462890625, "grad_norm": 0.4142934678480609, "learning_rate": 3.617283015922268e-05, "loss": 1.1044, "step": 237 }, { "epoch": 0.46484375, "grad_norm": 0.4697374307040272, "learning_rate": 3.6135182667668764e-05, "loss": 1.1947, "step": 238 }, { "epoch": 0.466796875, "grad_norm": 0.3985058819632944, "learning_rate": 3.6097370707488175e-05, "loss": 1.0906, "step": 239 }, { "epoch": 0.46875, "grad_norm": 0.40215610602620183, "learning_rate": 3.6059394664104554e-05, "loss": 1.1607, "step": 240 }, { "epoch": 0.470703125, "grad_norm": 0.3985665062059567, "learning_rate": 3.60212549246141e-05, "loss": 1.0787, "step": 241 }, { "epoch": 0.47265625, "grad_norm": 0.43711415007382576, "learning_rate": 3.598295187778158e-05, "loss": 1.1554, "step": 242 }, { "epoch": 0.474609375, "grad_norm": 0.4382023321095773, "learning_rate": 3.5944485914036384e-05, "loss": 1.0126, "step": 243 }, { "epoch": 0.4765625, "grad_norm": 0.37488265505774904, "learning_rate": 3.590585742546853e-05, "loss": 1.1054, "step": 244 }, { "epoch": 0.478515625, "grad_norm": 0.40930451172856447, "learning_rate": 3.586706680582471e-05, "loss": 1.0321, "step": 245 }, { "epoch": 0.48046875, "grad_norm": 0.5059310227059168, "learning_rate": 3.5828114450504205e-05, "loss": 1.1239, "step": 246 }, { "epoch": 0.482421875, "grad_norm": 0.45898297435796365, "learning_rate": 3.5789000756554927e-05, "loss": 1.0467, "step": 247 }, { "epoch": 0.484375, "grad_norm": 0.42551550838444063, "learning_rate": 3.5749726122669316e-05, "loss": 1.051, "step": 248 }, { "epoch": 0.486328125, "grad_norm": 0.4451344613451106, "learning_rate": 3.5710290949180325e-05, "loss": 1.1036, "step": 249 }, { "epoch": 0.48828125, "grad_norm": 0.43151805025113255, "learning_rate": 3.5670695638057285e-05, "loss": 1.1906, "step": 250 }, { "epoch": 0.490234375, "grad_norm": 0.492114391902568, "learning_rate": 3.563094059290186e-05, "loss": 1.1629, "step": 251 }, { "epoch": 0.4921875, "grad_norm": 0.4144331093915329, "learning_rate": 3.5591026218943905e-05, "loss": 1.1485, "step": 252 }, { "epoch": 0.494140625, "grad_norm": 0.4201461662795515, 
"learning_rate": 3.5550952923037337e-05, "loss": 1.1451, "step": 253 }, { "epoch": 0.49609375, "grad_norm": 0.41132936789582963, "learning_rate": 3.551072111365598e-05, "loss": 1.1216, "step": 254 }, { "epoch": 0.498046875, "grad_norm": 0.40892606177310264, "learning_rate": 3.547033120088943e-05, "loss": 1.0282, "step": 255 }, { "epoch": 0.5, "grad_norm": 0.39721649148962185, "learning_rate": 3.5429783596438864e-05, "loss": 1.113, "step": 256 } ], "logging_steps": 1, "max_steps": 1024, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 256, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 531064116215808.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }