{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5,
  "eval_steps": 500,
  "global_step": 256,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001953125,
      "grad_norm": 2.2842363876082494,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.7076,
      "step": 1
    },
    {
      "epoch": 0.00390625,
      "grad_norm": 2.317015212334916,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.6296,
      "step": 2
    },
    {
      "epoch": 0.005859375,
      "grad_norm": 2.0835939653262883,
      "learning_rate": 3e-06,
      "loss": 1.5593,
      "step": 3
    },
    {
      "epoch": 0.0078125,
      "grad_norm": 2.1357657121975797,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.6713,
      "step": 4
    },
    {
      "epoch": 0.009765625,
      "grad_norm": 2.0362735997756847,
      "learning_rate": 5e-06,
      "loss": 1.5327,
      "step": 5
    },
    {
      "epoch": 0.01171875,
      "grad_norm": 2.1597413317388523,
      "learning_rate": 6e-06,
      "loss": 1.6435,
      "step": 6
    },
    {
      "epoch": 0.013671875,
      "grad_norm": 2.1354234831872616,
      "learning_rate": 7e-06,
      "loss": 1.539,
      "step": 7
    },
    {
      "epoch": 0.015625,
      "grad_norm": 2.0222980997885682,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.491,
      "step": 8
    },
    {
      "epoch": 0.017578125,
      "grad_norm": 1.8336578914749888,
      "learning_rate": 9e-06,
      "loss": 1.567,
      "step": 9
    },
    {
      "epoch": 0.01953125,
      "grad_norm": 1.7535364548043673,
      "learning_rate": 1e-05,
      "loss": 1.5181,
      "step": 10
    },
    {
      "epoch": 0.021484375,
      "grad_norm": 1.348232072077207,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.4633,
      "step": 11
    },
    {
      "epoch": 0.0234375,
      "grad_norm": 1.079057032053978,
      "learning_rate": 1.2e-05,
      "loss": 1.36,
      "step": 12
    },
    {
      "epoch": 0.025390625,
      "grad_norm": 0.7143765277543237,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 1.3195,
      "step": 13
    },
    {
      "epoch": 0.02734375,
      "grad_norm": 0.8120880164824964,
      "learning_rate": 1.4e-05,
      "loss": 1.3469,
      "step": 14
    },
    {
      "epoch": 0.029296875,
      "grad_norm": 0.6746494578904082,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.3626,
      "step": 15
    },
    {
      "epoch": 0.03125,
      "grad_norm": 0.9663545707089416,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.2772,
      "step": 16
    },
    {
      "epoch": 0.033203125,
      "grad_norm": 0.961439588523319,
      "learning_rate": 1.7e-05,
      "loss": 1.2911,
      "step": 17
    },
    {
      "epoch": 0.03515625,
      "grad_norm": 1.1738444068957379,
      "learning_rate": 1.8e-05,
      "loss": 1.3346,
      "step": 18
    },
    {
      "epoch": 0.037109375,
      "grad_norm": 1.2332387671295317,
      "learning_rate": 1.9e-05,
      "loss": 1.3761,
      "step": 19
    },
    {
      "epoch": 0.0390625,
      "grad_norm": 1.268714744941341,
      "learning_rate": 2e-05,
      "loss": 1.3042,
      "step": 20
    },
    {
      "epoch": 0.041015625,
      "grad_norm": 1.078415802927275,
      "learning_rate": 2.1000000000000002e-05,
      "loss": 1.2102,
      "step": 21
    },
    {
      "epoch": 0.04296875,
      "grad_norm": 1.330999136602917,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 1.2755,
      "step": 22
    },
    {
      "epoch": 0.044921875,
      "grad_norm": 0.7130882289363479,
      "learning_rate": 2.3e-05,
      "loss": 1.1706,
      "step": 23
    },
    {
      "epoch": 0.046875,
      "grad_norm": 0.5729960230193528,
      "learning_rate": 2.4e-05,
      "loss": 1.3215,
      "step": 24
    },
    {
      "epoch": 0.048828125,
      "grad_norm": 0.6125271472968751,
      "learning_rate": 2.5e-05,
      "loss": 1.3213,
      "step": 25
    },
    {
      "epoch": 0.05078125,
      "grad_norm": 0.6108864130655043,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.2865,
      "step": 26
    },
    {
      "epoch": 0.052734375,
      "grad_norm": 0.6479528408256864,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 1.3383,
      "step": 27
    },
    {
      "epoch": 0.0546875,
      "grad_norm": 0.8412108818700305,
      "learning_rate": 2.8e-05,
      "loss": 1.2763,
      "step": 28
    },
    {
      "epoch": 0.056640625,
      "grad_norm": 0.8629612077288169,
      "learning_rate": 2.9e-05,
      "loss": 1.3045,
      "step": 29
    },
    {
      "epoch": 0.05859375,
      "grad_norm": 0.7600858737745863,
      "learning_rate": 3.0000000000000004e-05,
      "loss": 1.2352,
      "step": 30
    },
    {
      "epoch": 0.060546875,
      "grad_norm": 0.7130629485255873,
      "learning_rate": 3.1e-05,
      "loss": 1.2299,
      "step": 31
    },
    {
      "epoch": 0.0625,
      "grad_norm": 0.5912964724458128,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.2234,
      "step": 32
    },
    {
      "epoch": 0.064453125,
      "grad_norm": 0.5368820032381596,
      "learning_rate": 3.3e-05,
      "loss": 1.1934,
      "step": 33
    },
    {
      "epoch": 0.06640625,
      "grad_norm": 0.5570421986755116,
      "learning_rate": 3.4e-05,
      "loss": 1.2581,
      "step": 34
    },
    {
      "epoch": 0.068359375,
      "grad_norm": 0.46598864760360764,
      "learning_rate": 3.5000000000000004e-05,
      "loss": 1.2535,
      "step": 35
    },
    {
      "epoch": 0.0703125,
      "grad_norm": 0.6392299897042107,
      "learning_rate": 3.6e-05,
      "loss": 1.2331,
      "step": 36
    },
    {
      "epoch": 0.072265625,
      "grad_norm": 0.49983937474417145,
      "learning_rate": 3.7000000000000005e-05,
      "loss": 1.2432,
      "step": 37
    },
    {
      "epoch": 0.07421875,
      "grad_norm": 0.652858138736506,
      "learning_rate": 3.8e-05,
      "loss": 1.2759,
      "step": 38
    },
    {
      "epoch": 0.076171875,
      "grad_norm": 0.5926189930170476,
      "learning_rate": 3.9e-05,
      "loss": 1.3016,
      "step": 39
    },
    {
      "epoch": 0.078125,
      "grad_norm": 0.6646763351870284,
      "learning_rate": 4e-05,
      "loss": 1.344,
      "step": 40
    },
    {
      "epoch": 0.080078125,
      "grad_norm": 0.6228429864196855,
      "learning_rate": 3.99998980683206e-05,
      "loss": 1.2794,
      "step": 41
    },
    {
      "epoch": 0.08203125,
      "grad_norm": 0.5633101870154669,
      "learning_rate": 3.9999592274321385e-05,
      "loss": 1.2931,
      "step": 42
    },
    {
      "epoch": 0.083984375,
      "grad_norm": 0.6866774046182069,
      "learning_rate": 3.999908262111937e-05,
      "loss": 1.2647,
      "step": 43
    },
    {
      "epoch": 0.0859375,
      "grad_norm": 0.5312790576505163,
      "learning_rate": 3.9998369113909555e-05,
      "loss": 1.2255,
      "step": 44
    },
    {
      "epoch": 0.087890625,
      "grad_norm": 0.5694229658922494,
      "learning_rate": 3.999745175996481e-05,
      "loss": 1.3104,
      "step": 45
    },
    {
      "epoch": 0.08984375,
      "grad_norm": 0.5068013674566277,
      "learning_rate": 3.999633056863589e-05,
      "loss": 1.1771,
      "step": 46
    },
    {
      "epoch": 0.091796875,
      "grad_norm": 0.5428027277075501,
      "learning_rate": 3.999500555135129e-05,
      "loss": 1.3508,
      "step": 47
    },
    {
      "epoch": 0.09375,
      "grad_norm": 0.4792441915562371,
      "learning_rate": 3.999347672161713e-05,
      "loss": 1.1144,
      "step": 48
    },
    {
      "epoch": 0.095703125,
      "grad_norm": 0.5033945174929487,
      "learning_rate": 3.999174409501703e-05,
      "loss": 1.1474,
      "step": 49
    },
    {
      "epoch": 0.09765625,
      "grad_norm": 0.5609150975698594,
      "learning_rate": 3.9989807689211946e-05,
      "loss": 1.2558,
      "step": 50
    },
    {
      "epoch": 0.099609375,
      "grad_norm": 0.5558707293914855,
      "learning_rate": 3.998766752393998e-05,
      "loss": 1.1411,
      "step": 51
    },
    {
      "epoch": 0.1015625,
      "grad_norm": 0.4429585853749615,
      "learning_rate": 3.99853236210162e-05,
      "loss": 1.1715,
      "step": 52
    },
    {
      "epoch": 0.103515625,
      "grad_norm": 0.5064052852591816,
      "learning_rate": 3.998277600433241e-05,
      "loss": 1.2018,
      "step": 53
    },
    {
      "epoch": 0.10546875,
      "grad_norm": 0.526020419983389,
      "learning_rate": 3.998002469985688e-05,
      "loss": 1.1164,
      "step": 54
    },
    {
      "epoch": 0.107421875,
      "grad_norm": 0.504222879676158,
      "learning_rate": 3.997706973563413e-05,
      "loss": 1.191,
      "step": 55
    },
    {
      "epoch": 0.109375,
      "grad_norm": 0.5614145336635687,
      "learning_rate": 3.9973911141784605e-05,
      "loss": 1.3011,
      "step": 56
    },
    {
      "epoch": 0.111328125,
      "grad_norm": 0.4391770801146478,
      "learning_rate": 3.997054895050437e-05,
      "loss": 1.2535,
      "step": 57
    },
    {
      "epoch": 0.11328125,
      "grad_norm": 0.5583307267784473,
      "learning_rate": 3.996698319606482e-05,
      "loss": 1.153,
      "step": 58
    },
    {
      "epoch": 0.115234375,
      "grad_norm": 0.4576133947689655,
      "learning_rate": 3.996321391481229e-05,
      "loss": 1.1564,
      "step": 59
    },
    {
      "epoch": 0.1171875,
      "grad_norm": 0.41970646962377184,
      "learning_rate": 3.995924114516769e-05,
      "loss": 1.1935,
      "step": 60
    },
    {
      "epoch": 0.119140625,
      "grad_norm": 0.44805324266797203,
      "learning_rate": 3.995506492762613e-05,
      "loss": 1.1339,
      "step": 61
    },
    {
      "epoch": 0.12109375,
      "grad_norm": 0.5208068893189155,
      "learning_rate": 3.9950685304756494e-05,
      "loss": 1.2092,
      "step": 62
    },
    {
      "epoch": 0.123046875,
      "grad_norm": 0.44195618774115664,
      "learning_rate": 3.994610232120101e-05,
      "loss": 1.1292,
      "step": 63
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.4514887790554273,
      "learning_rate": 3.994131602367481e-05,
      "loss": 1.1658,
      "step": 64
    },
    {
      "epoch": 0.126953125,
      "grad_norm": 0.5908686231033371,
      "learning_rate": 3.9936326460965423e-05,
      "loss": 1.2076,
      "step": 65
    },
    {
      "epoch": 0.12890625,
      "grad_norm": 0.46799815417666174,
      "learning_rate": 3.99311336839323e-05,
      "loss": 1.1889,
      "step": 66
    },
    {
      "epoch": 0.130859375,
      "grad_norm": 0.45939729407525115,
      "learning_rate": 3.992573774550629e-05,
      "loss": 1.1704,
      "step": 67
    },
    {
      "epoch": 0.1328125,
      "grad_norm": 0.4142175477343616,
      "learning_rate": 3.9920138700689095e-05,
      "loss": 1.1848,
      "step": 68
    },
    {
      "epoch": 0.134765625,
      "grad_norm": 0.37685838553537837,
      "learning_rate": 3.991433660655273e-05,
      "loss": 1.1041,
      "step": 69
    },
    {
      "epoch": 0.13671875,
      "grad_norm": 0.39832807246827023,
      "learning_rate": 3.99083315222389e-05,
      "loss": 1.2002,
      "step": 70
    },
    {
      "epoch": 0.138671875,
      "grad_norm": 0.43218323629933336,
      "learning_rate": 3.990212350895845e-05,
      "loss": 1.1487,
      "step": 71
    },
    {
      "epoch": 0.140625,
      "grad_norm": 0.43302460007599547,
      "learning_rate": 3.98957126299907e-05,
      "loss": 1.1638,
      "step": 72
    },
    {
      "epoch": 0.142578125,
      "grad_norm": 0.41150363252077565,
      "learning_rate": 3.988909895068281e-05,
      "loss": 1.1353,
      "step": 73
    },
    {
      "epoch": 0.14453125,
      "grad_norm": 0.4362254605938381,
      "learning_rate": 3.988228253844913e-05,
      "loss": 1.2202,
      "step": 74
    },
    {
      "epoch": 0.146484375,
      "grad_norm": 0.4696684841153936,
      "learning_rate": 3.987526346277049e-05,
      "loss": 1.1722,
      "step": 75
    },
    {
      "epoch": 0.1484375,
      "grad_norm": 0.42274900639715757,
      "learning_rate": 3.9868041795193505e-05,
      "loss": 1.179,
      "step": 76
    },
    {
      "epoch": 0.150390625,
      "grad_norm": 0.47381294364503707,
      "learning_rate": 3.9860617609329856e-05,
      "loss": 1.1978,
      "step": 77
    },
    {
      "epoch": 0.15234375,
      "grad_norm": 0.448192967722078,
      "learning_rate": 3.9852990980855505e-05,
      "loss": 1.2042,
      "step": 78
    },
    {
      "epoch": 0.154296875,
      "grad_norm": 0.388483486919693,
      "learning_rate": 3.984516198750997e-05,
      "loss": 1.148,
      "step": 79
    },
    {
      "epoch": 0.15625,
      "grad_norm": 0.4057112657252388,
      "learning_rate": 3.9837130709095475e-05,
      "loss": 1.1267,
      "step": 80
    },
    {
      "epoch": 0.158203125,
      "grad_norm": 0.5111257616377479,
      "learning_rate": 3.982889722747621e-05,
      "loss": 1.1992,
      "step": 81
    },
    {
      "epoch": 0.16015625,
      "grad_norm": 0.42800919524357695,
      "learning_rate": 3.9820461626577426e-05,
      "loss": 1.2214,
      "step": 82
    },
    {
      "epoch": 0.162109375,
      "grad_norm": 0.6604320971658805,
      "learning_rate": 3.981182399238462e-05,
      "loss": 1.1046,
      "step": 83
    },
    {
      "epoch": 0.1640625,
      "grad_norm": 0.4650529995861808,
      "learning_rate": 3.980298441294265e-05,
      "loss": 1.1485,
      "step": 84
    },
    {
      "epoch": 0.166015625,
      "grad_norm": 0.8247014006092652,
      "learning_rate": 3.9793942978354835e-05,
      "loss": 1.2345,
      "step": 85
    },
    {
      "epoch": 0.16796875,
      "grad_norm": 0.5111463246016623,
      "learning_rate": 3.978469978078203e-05,
      "loss": 1.1406,
      "step": 86
    },
    {
      "epoch": 0.169921875,
      "grad_norm": 0.3980549366997817,
      "learning_rate": 3.977525491444171e-05,
      "loss": 1.138,
      "step": 87
    },
    {
      "epoch": 0.171875,
      "grad_norm": 0.4500013345653544,
      "learning_rate": 3.976560847560697e-05,
      "loss": 1.1803,
      "step": 88
    },
    {
      "epoch": 0.173828125,
      "grad_norm": 0.6144879263096161,
      "learning_rate": 3.975576056260559e-05,
      "loss": 1.376,
      "step": 89
    },
    {
      "epoch": 0.17578125,
      "grad_norm": 0.45250166677505255,
      "learning_rate": 3.974571127581901e-05,
      "loss": 1.2616,
      "step": 90
    },
    {
      "epoch": 0.177734375,
      "grad_norm": 0.7260361194779941,
      "learning_rate": 3.973546071768128e-05,
      "loss": 1.207,
      "step": 91
    },
    {
      "epoch": 0.1796875,
      "grad_norm": 0.40590569325939646,
      "learning_rate": 3.972500899267807e-05,
      "loss": 1.1857,
      "step": 92
    },
    {
      "epoch": 0.181640625,
      "grad_norm": 0.7059204956983739,
      "learning_rate": 3.971435620734557e-05,
      "loss": 1.1629,
      "step": 93
    },
    {
      "epoch": 0.18359375,
      "grad_norm": 0.4166494769492577,
      "learning_rate": 3.97035024702694e-05,
      "loss": 1.2105,
      "step": 94
    },
    {
      "epoch": 0.185546875,
      "grad_norm": 0.4708428232528331,
      "learning_rate": 3.969244789208354e-05,
      "loss": 1.2074,
      "step": 95
    },
    {
      "epoch": 0.1875,
      "grad_norm": 0.46187395897944283,
      "learning_rate": 3.9681192585469146e-05,
      "loss": 1.2411,
      "step": 96
    },
    {
      "epoch": 0.189453125,
      "grad_norm": 0.40887786827875044,
      "learning_rate": 3.9669736665153455e-05,
      "loss": 1.181,
      "step": 97
    },
    {
      "epoch": 0.19140625,
      "grad_norm": 0.5783677933870661,
      "learning_rate": 3.96580802479086e-05,
      "loss": 1.2412,
      "step": 98
    },
    {
      "epoch": 0.193359375,
      "grad_norm": 0.46098155681455955,
      "learning_rate": 3.9646223452550374e-05,
      "loss": 1.0478,
      "step": 99
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 0.4421189367731534,
      "learning_rate": 3.9634166399937104e-05,
      "loss": 1.1528,
      "step": 100
    },
    {
      "epoch": 0.197265625,
      "grad_norm": 0.44208897843282735,
      "learning_rate": 3.962190921296834e-05,
      "loss": 1.1294,
      "step": 101
    },
    {
      "epoch": 0.19921875,
      "grad_norm": 0.41115810620405063,
      "learning_rate": 3.9609452016583654e-05,
      "loss": 1.0787,
      "step": 102
    },
    {
      "epoch": 0.201171875,
      "grad_norm": 0.4592703963732682,
      "learning_rate": 3.959679493776134e-05,
      "loss": 1.2084,
      "step": 103
    },
    {
      "epoch": 0.203125,
      "grad_norm": 0.46514364761525706,
      "learning_rate": 3.9583938105517127e-05,
      "loss": 1.169,
      "step": 104
    },
    {
      "epoch": 0.205078125,
      "grad_norm": 0.5044144386089332,
      "learning_rate": 3.957088165090287e-05,
      "loss": 1.121,
      "step": 105
    },
    {
      "epoch": 0.20703125,
      "grad_norm": 0.4160320267546915,
      "learning_rate": 3.9557625707005185e-05,
      "loss": 1.1133,
      "step": 106
    },
    {
      "epoch": 0.208984375,
      "grad_norm": 0.46611013560363507,
      "learning_rate": 3.954417040894416e-05,
      "loss": 1.0846,
      "step": 107
    },
    {
      "epoch": 0.2109375,
      "grad_norm": 0.494489354902747,
      "learning_rate": 3.953051589387189e-05,
      "loss": 1.1762,
      "step": 108
    },
    {
      "epoch": 0.212890625,
      "grad_norm": 0.4226200871032249,
      "learning_rate": 3.951666230097115e-05,
      "loss": 1.0346,
      "step": 109
    },
    {
      "epoch": 0.21484375,
      "grad_norm": 0.4032354878018358,
      "learning_rate": 3.9502609771453934e-05,
      "loss": 1.1223,
      "step": 110
    },
    {
      "epoch": 0.216796875,
      "grad_norm": 0.4148468151686513,
      "learning_rate": 3.948835844856004e-05,
      "loss": 1.1581,
      "step": 111
    },
    {
      "epoch": 0.21875,
      "grad_norm": 0.4655201875464092,
      "learning_rate": 3.947390847755559e-05,
      "loss": 1.141,
      "step": 112
    },
    {
      "epoch": 0.220703125,
      "grad_norm": 0.44131202754652804,
      "learning_rate": 3.945926000573156e-05,
      "loss": 1.228,
      "step": 113
    },
    {
      "epoch": 0.22265625,
      "grad_norm": 0.4878464713519324,
      "learning_rate": 3.94444131824023e-05,
      "loss": 1.2023,
      "step": 114
    },
    {
      "epoch": 0.224609375,
      "grad_norm": 0.4433704308856408,
      "learning_rate": 3.942936815890396e-05,
      "loss": 1.2479,
      "step": 115
    },
    {
      "epoch": 0.2265625,
      "grad_norm": 0.4848454824446459,
      "learning_rate": 3.941412508859299e-05,
      "loss": 1.1269,
      "step": 116
    },
    {
      "epoch": 0.228515625,
      "grad_norm": 0.419630467357436,
      "learning_rate": 3.939868412684458e-05,
      "loss": 1.1806,
      "step": 117
    },
    {
      "epoch": 0.23046875,
      "grad_norm": 0.39683375502836515,
      "learning_rate": 3.938304543105104e-05,
      "loss": 1.1054,
      "step": 118
    },
    {
      "epoch": 0.232421875,
      "grad_norm": 0.4832371787668091,
      "learning_rate": 3.936720916062022e-05,
      "loss": 1.1174,
      "step": 119
    },
    {
      "epoch": 0.234375,
      "grad_norm": 0.5986867637436046,
      "learning_rate": 3.935117547697387e-05,
      "loss": 1.1791,
      "step": 120
    },
    {
      "epoch": 0.236328125,
      "grad_norm": 0.4150490343483682,
      "learning_rate": 3.933494454354605e-05,
      "loss": 1.2129,
      "step": 121
    },
    {
      "epoch": 0.23828125,
      "grad_norm": 0.4215588087170942,
      "learning_rate": 3.931851652578137e-05,
      "loss": 1.1414,
      "step": 122
    },
    {
      "epoch": 0.240234375,
      "grad_norm": 0.42515318009071157,
      "learning_rate": 3.9301891591133377e-05,
      "loss": 1.0854,
      "step": 123
    },
    {
      "epoch": 0.2421875,
      "grad_norm": 0.4488701042494301,
      "learning_rate": 3.928506990906282e-05,
      "loss": 1.0725,
      "step": 124
    },
    {
      "epoch": 0.244140625,
      "grad_norm": 0.41531581194897543,
      "learning_rate": 3.9268051651035944e-05,
      "loss": 1.0746,
      "step": 125
    },
    {
      "epoch": 0.24609375,
      "grad_norm": 0.46204021714125687,
      "learning_rate": 3.9250836990522685e-05,
      "loss": 1.2164,
      "step": 126
    },
    {
      "epoch": 0.248046875,
      "grad_norm": 0.6677384727690392,
      "learning_rate": 3.923342610299499e-05,
      "loss": 1.1834,
      "step": 127
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.4961785465516465,
      "learning_rate": 3.9215819165924956e-05,
      "loss": 1.2178,
      "step": 128
    },
    {
      "epoch": 0.251953125,
      "grad_norm": 0.4651476735438144,
      "learning_rate": 3.919801635878305e-05,
      "loss": 1.1005,
      "step": 129
    },
    {
      "epoch": 0.25390625,
      "grad_norm": 0.49434332973849215,
      "learning_rate": 3.918001786303627e-05,
      "loss": 1.1922,
      "step": 130
    },
    {
      "epoch": 0.255859375,
      "grad_norm": 0.45671514667179935,
      "learning_rate": 3.9161823862146297e-05,
      "loss": 1.0617,
      "step": 131
    },
    {
      "epoch": 0.2578125,
      "grad_norm": 0.49674226929417115,
      "learning_rate": 3.9143434541567654e-05,
      "loss": 1.2203,
      "step": 132
    },
    {
      "epoch": 0.259765625,
      "grad_norm": 0.5208683235687923,
      "learning_rate": 3.912485008874577e-05,
      "loss": 1.1587,
      "step": 133
    },
    {
      "epoch": 0.26171875,
      "grad_norm": 0.517022288962491,
      "learning_rate": 3.9106070693115087e-05,
      "loss": 1.1427,
      "step": 134
    },
    {
      "epoch": 0.263671875,
      "grad_norm": 0.38942661826422087,
      "learning_rate": 3.908709654609715e-05,
      "loss": 1.0629,
      "step": 135
    },
    {
      "epoch": 0.265625,
      "grad_norm": 0.4564236281556844,
      "learning_rate": 3.9067927841098614e-05,
      "loss": 1.0919,
      "step": 136
    },
    {
      "epoch": 0.267578125,
      "grad_norm": 0.4929559987928741,
      "learning_rate": 3.9048564773509314e-05,
      "loss": 1.1502,
      "step": 137
    },
    {
      "epoch": 0.26953125,
      "grad_norm": 0.48513251932309925,
      "learning_rate": 3.902900754070025e-05,
      "loss": 1.1158,
      "step": 138
    },
    {
      "epoch": 0.271484375,
      "grad_norm": 0.5349569441078609,
      "learning_rate": 3.900925634202158e-05,
      "loss": 1.1279,
      "step": 139
    },
    {
      "epoch": 0.2734375,
      "grad_norm": 0.47177459620840684,
      "learning_rate": 3.898931137880059e-05,
      "loss": 1.1595,
      "step": 140
    },
    {
      "epoch": 0.275390625,
      "grad_norm": 0.4904546697998669,
      "learning_rate": 3.896917285433964e-05,
      "loss": 1.2615,
      "step": 141
    },
    {
      "epoch": 0.27734375,
      "grad_norm": 0.5768180408665089,
      "learning_rate": 3.894884097391409e-05,
      "loss": 1.1688,
      "step": 142
    },
    {
      "epoch": 0.279296875,
      "grad_norm": 0.4362108519904031,
      "learning_rate": 3.892831594477021e-05,
      "loss": 1.0983,
      "step": 143
    },
    {
      "epoch": 0.28125,
      "grad_norm": 0.4570710320413065,
      "learning_rate": 3.890759797612307e-05,
      "loss": 1.3706,
      "step": 144
    },
    {
      "epoch": 0.283203125,
      "grad_norm": 0.4465318663671251,
      "learning_rate": 3.888668727915441e-05,
      "loss": 1.1377,
      "step": 145
    },
    {
      "epoch": 0.28515625,
      "grad_norm": 0.5047852656660148,
      "learning_rate": 3.886558406701046e-05,
      "loss": 1.0747,
      "step": 146
    },
    {
      "epoch": 0.287109375,
      "grad_norm": 0.4412295789497703,
      "learning_rate": 3.884428855479983e-05,
      "loss": 1.1261,
      "step": 147
    },
    {
      "epoch": 0.2890625,
      "grad_norm": 0.4476476539228374,
      "learning_rate": 3.8822800959591236e-05,
      "loss": 1.1769,
      "step": 148
    },
    {
      "epoch": 0.291015625,
      "grad_norm": 0.45924117326794117,
      "learning_rate": 3.880112150041134e-05,
      "loss": 1.1564,
      "step": 149
    },
    {
      "epoch": 0.29296875,
      "grad_norm": 0.43931168833110684,
      "learning_rate": 3.877925039824253e-05,
      "loss": 1.1682,
      "step": 150
    },
    {
      "epoch": 0.294921875,
      "grad_norm": 0.5438637955362605,
      "learning_rate": 3.8757187876020603e-05,
      "loss": 1.1448,
      "step": 151
    },
    {
      "epoch": 0.296875,
      "grad_norm": 0.42928963297461137,
      "learning_rate": 3.873493415863256e-05,
      "loss": 1.2078,
      "step": 152
    },
    {
      "epoch": 0.298828125,
      "grad_norm": 0.4381709802123583,
      "learning_rate": 3.8712489472914286e-05,
      "loss": 1.0604,
      "step": 153
    },
    {
      "epoch": 0.30078125,
      "grad_norm": 0.4988490117613772,
      "learning_rate": 3.8689854047648224e-05,
      "loss": 1.1424,
      "step": 154
    },
    {
      "epoch": 0.302734375,
      "grad_norm": 0.4257038437137218,
      "learning_rate": 3.866702811356107e-05,
      "loss": 1.0955,
      "step": 155
    },
    {
      "epoch": 0.3046875,
      "grad_norm": 0.4893472968930594,
      "learning_rate": 3.86440119033214e-05,
      "loss": 1.1854,
      "step": 156
    },
    {
      "epoch": 0.306640625,
      "grad_norm": 0.5731240348991923,
      "learning_rate": 3.862080565153731e-05,
      "loss": 1.2505,
      "step": 157
    },
    {
      "epoch": 0.30859375,
      "grad_norm": 0.4594995644663965,
      "learning_rate": 3.8597409594754025e-05,
      "loss": 1.1047,
      "step": 158
    },
    {
      "epoch": 0.310546875,
      "grad_norm": 0.3898970756217597,
      "learning_rate": 3.857382397145148e-05,
      "loss": 1.1728,
      "step": 159
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.5165759238716673,
      "learning_rate": 3.85500490220419e-05,
      "loss": 1.1232,
      "step": 160
    },
    {
      "epoch": 0.314453125,
      "grad_norm": 0.42169317869735606,
      "learning_rate": 3.852608498886732e-05,
      "loss": 1.1087,
      "step": 161
    },
    {
      "epoch": 0.31640625,
      "grad_norm": 0.4831766592421198,
      "learning_rate": 3.850193211619718e-05,
      "loss": 1.0902,
      "step": 162
    },
    {
      "epoch": 0.318359375,
      "grad_norm": 0.5168422003190449,
      "learning_rate": 3.8477590650225735e-05,
      "loss": 1.1979,
      "step": 163
    },
    {
      "epoch": 0.3203125,
      "grad_norm": 0.44267326014624,
      "learning_rate": 3.845306083906967e-05,
      "loss": 1.1311,
      "step": 164
    },
    {
      "epoch": 0.322265625,
      "grad_norm": 0.42634229457641887,
      "learning_rate": 3.842834293276545e-05,
      "loss": 1.1729,
      "step": 165
    },
    {
      "epoch": 0.32421875,
      "grad_norm": 0.40628491116146026,
      "learning_rate": 3.8403437183266834e-05,
      "loss": 1.0984,
      "step": 166
    },
    {
      "epoch": 0.326171875,
      "grad_norm": 0.4159045672550255,
      "learning_rate": 3.8378343844442344e-05,
      "loss": 1.1731,
      "step": 167
    },
    {
      "epoch": 0.328125,
      "grad_norm": 0.5968785135150301,
      "learning_rate": 3.8353063172072564e-05,
      "loss": 1.0247,
      "step": 168
    },
    {
      "epoch": 0.330078125,
      "grad_norm": 0.4649591605790638,
      "learning_rate": 3.8327595423847645e-05,
      "loss": 1.139,
      "step": 169
    },
    {
      "epoch": 0.33203125,
      "grad_norm": 0.48079030109724175,
      "learning_rate": 3.830194085936463e-05,
      "loss": 1.1268,
      "step": 170
    },
    {
      "epoch": 0.333984375,
      "grad_norm": 0.46348618416181137,
      "learning_rate": 3.8276099740124794e-05,
      "loss": 1.2004,
      "step": 171
    },
    {
      "epoch": 0.3359375,
      "grad_norm": 0.4832617358199499,
      "learning_rate": 3.8250072329531004e-05,
      "loss": 1.0743,
      "step": 172
    },
    {
      "epoch": 0.337890625,
      "grad_norm": 0.4420229534375586,
      "learning_rate": 3.822385889288503e-05,
      "loss": 1.141,
      "step": 173
    },
    {
      "epoch": 0.33984375,
      "grad_norm": 0.39752191495545935,
      "learning_rate": 3.819745969738484e-05,
      "loss": 1.0972,
      "step": 174
    },
    {
      "epoch": 0.341796875,
      "grad_norm": 0.4411421700040708,
      "learning_rate": 3.817087501212185e-05,
      "loss": 1.0233,
      "step": 175
    },
    {
      "epoch": 0.34375,
      "grad_norm": 0.4017237336736879,
      "learning_rate": 3.8144105108078246e-05,
      "loss": 1.1563,
      "step": 176
    },
    {
      "epoch": 0.345703125,
      "grad_norm": 0.686922962042273,
      "learning_rate": 3.8117150258124134e-05,
      "loss": 1.147,
      "step": 177
    },
    {
      "epoch": 0.34765625,
      "grad_norm": 0.4294357539370898,
      "learning_rate": 3.8090010737014836e-05,
      "loss": 1.1116,
      "step": 178
    },
    {
      "epoch": 0.349609375,
      "grad_norm": 0.41962832297995667,
      "learning_rate": 3.806268682138805e-05,
      "loss": 1.0827,
      "step": 179
    },
    {
      "epoch": 0.3515625,
      "grad_norm": 0.4413195950046206,
      "learning_rate": 3.803517878976103e-05,
      "loss": 1.0814,
      "step": 180
    },
    {
      "epoch": 0.353515625,
      "grad_norm": 0.45365068157119814,
      "learning_rate": 3.8007486922527774e-05,
      "loss": 1.0599,
      "step": 181
    },
    {
      "epoch": 0.35546875,
      "grad_norm": 0.5286445380979327,
      "learning_rate": 3.7979611501956124e-05,
      "loss": 1.2251,
      "step": 182
    },
    {
      "epoch": 0.357421875,
      "grad_norm": 0.38599209970455534,
      "learning_rate": 3.795155281218493e-05,
      "loss": 1.1676,
      "step": 183
    },
    {
      "epoch": 0.359375,
      "grad_norm": 0.44025531979392435,
      "learning_rate": 3.7923311139221114e-05,
      "loss": 1.0514,
      "step": 184
    },
    {
      "epoch": 0.361328125,
      "grad_norm": 0.42167205583593925,
      "learning_rate": 3.789488677093681e-05,
      "loss": 1.1002,
      "step": 185
    },
    {
      "epoch": 0.36328125,
      "grad_norm": 0.4466402130651366,
      "learning_rate": 3.786627999706638e-05,
      "loss": 1.1013,
      "step": 186
    },
    {
      "epoch": 0.365234375,
      "grad_norm": 0.496760952886551,
      "learning_rate": 3.783749110920345e-05,
      "loss": 1.1465,
      "step": 187
    },
    {
      "epoch": 0.3671875,
      "grad_norm": 0.4367613213432748,
      "learning_rate": 3.780852040079802e-05,
      "loss": 1.0657,
      "step": 188
    },
    {
      "epoch": 0.369140625,
      "grad_norm": 0.41447069424638583,
      "learning_rate": 3.777936816715336e-05,
      "loss": 1.116,
      "step": 189
    },
    {
      "epoch": 0.37109375,
      "grad_norm": 0.4361134375016492,
      "learning_rate": 3.7750034705423095e-05,
      "loss": 1.2767,
      "step": 190
    },
    {
      "epoch": 0.373046875,
      "grad_norm": 0.4066150259484398,
      "learning_rate": 3.772052031460812e-05,
      "loss": 1.0785,
      "step": 191
    },
    {
      "epoch": 0.375,
      "grad_norm": 0.40407841923262816,
      "learning_rate": 3.769082529555359e-05,
      "loss": 1.1644,
      "step": 192
    },
    {
      "epoch": 0.376953125,
      "grad_norm": 0.44561296429853814,
      "learning_rate": 3.766094995094581e-05,
      "loss": 1.0663,
      "step": 193
    },
    {
      "epoch": 0.37890625,
      "grad_norm": 0.5352430776738828,
      "learning_rate": 3.7630894585309195e-05,
      "loss": 1.0209,
      "step": 194
    },
    {
      "epoch": 0.380859375,
      "grad_norm": 0.43636357529723163,
      "learning_rate": 3.7600659505003125e-05,
      "loss": 1.0621,
      "step": 195
    },
    {
      "epoch": 0.3828125,
      "grad_norm": 0.4264879021475797,
      "learning_rate": 3.757024501821885e-05,
      "loss": 1.1336,
      "step": 196
    },
    {
      "epoch": 0.384765625,
      "grad_norm": 0.3873402520476977,
      "learning_rate": 3.753965143497635e-05,
      "loss": 1.1378,
      "step": 197
    },
    {
      "epoch": 0.38671875,
      "grad_norm": 0.40092066811193233,
      "learning_rate": 3.750887906712115e-05,
      "loss": 1.0685,
      "step": 198
    },
    {
      "epoch": 0.388671875,
      "grad_norm": 0.43572366333630774,
      "learning_rate": 3.747792822832117e-05,
      "loss": 1.1723,
      "step": 199
    },
    {
      "epoch": 0.390625,
      "grad_norm": 0.37730662296410905,
      "learning_rate": 3.744679923406351e-05,
      "loss": 1.0823,
      "step": 200
    },
    {
      "epoch": 0.392578125,
      "grad_norm": 0.4578098403628755,
      "learning_rate": 3.741549240165122e-05,
      "loss": 1.1354,
      "step": 201
    },
    {
      "epoch": 0.39453125,
      "grad_norm": 0.4402925550279655,
      "learning_rate": 3.738400805020011e-05,
      "loss": 1.0921,
      "step": 202
    },
    {
      "epoch": 0.396484375,
      "grad_norm": 0.3814506298253285,
      "learning_rate": 3.7352346500635466e-05,
      "loss": 1.0813,
      "step": 203
    },
    {
      "epoch": 0.3984375,
      "grad_norm": 0.5352313284178145,
      "learning_rate": 3.732050807568878e-05,
      "loss": 1.2286,
      "step": 204
    },
    {
      "epoch": 0.400390625,
      "grad_norm": 0.4394941726895711,
      "learning_rate": 3.728849309989445e-05,
      "loss": 1.1362,
      "step": 205
    },
    {
      "epoch": 0.40234375,
      "grad_norm": 0.40009193940161264,
      "learning_rate": 3.7256301899586524e-05,
      "loss": 1.014,
      "step": 206
    },
    {
      "epoch": 0.404296875,
      "grad_norm": 0.4093033957375515,
      "learning_rate": 3.7223934802895294e-05,
      "loss": 1.0731,
      "step": 207
    },
    {
      "epoch": 0.40625,
      "grad_norm": 0.47801078784248796,
      "learning_rate": 3.719139213974403e-05,
      "loss": 1.2081,
      "step": 208
    },
    {
      "epoch": 0.408203125,
      "grad_norm": 0.5965083454407833,
      "learning_rate": 3.715867424184554e-05,
      "loss": 1.1495,
      "step": 209
    },
    {
      "epoch": 0.41015625,
      "grad_norm": 0.43672026913516004,
      "learning_rate": 3.712578144269887e-05,
      "loss": 1.1201,
      "step": 210
    },
    {
      "epoch": 0.412109375,
      "grad_norm": 0.5253144641112631,
      "learning_rate": 3.7092714077585836e-05,
      "loss": 1.2268,
      "step": 211
    },
    {
      "epoch": 0.4140625,
      "grad_norm": 0.4738073414405108,
      "learning_rate": 3.705947248356765e-05,
      "loss": 1.1188,
      "step": 212
    },
    {
      "epoch": 0.416015625,
      "grad_norm": 0.4477140058126639,
      "learning_rate": 3.7026056999481464e-05,
      "loss": 1.0571,
      "step": 213
    },
    {
      "epoch": 0.41796875,
      "grad_norm": 0.4471574730565842,
      "learning_rate": 3.699246796593692e-05,
      "loss": 1.0847,
      "step": 214
    },
    {
      "epoch": 0.419921875,
      "grad_norm": 0.41405988952981876,
      "learning_rate": 3.6958705725312655e-05,
      "loss": 1.1401,
      "step": 215
    },
    {
      "epoch": 0.421875,
      "grad_norm": 0.49370245896699827,
      "learning_rate": 3.692477062175289e-05,
      "loss": 1.0703,
      "step": 216
    },
    {
      "epoch": 0.423828125,
      "grad_norm": 0.4406399072344879,
      "learning_rate": 3.689066300116381e-05,
      "loss": 1.1793,
      "step": 217
    },
    {
      "epoch": 0.42578125,
      "grad_norm": 0.43483619180179833,
      "learning_rate": 3.6856383211210134e-05,
      "loss": 1.1305,
      "step": 218
    },
    {
      "epoch": 0.427734375,
      "grad_norm": 0.43256055966703133,
      "learning_rate": 3.682193160131152e-05,
      "loss": 1.0943,
      "step": 219
    },
    {
      "epoch": 0.4296875,
      "grad_norm": 0.5598257236379292,
      "learning_rate": 3.678730852263901e-05,
      "loss": 1.2309,
      "step": 220
    },
    {
      "epoch": 0.431640625,
      "grad_norm": 0.39045352547405415,
      "learning_rate": 3.675251432811144e-05,
      "loss": 1.0047,
      "step": 221
    },
    {
      "epoch": 0.43359375,
      "grad_norm": 0.44912102512870905,
      "learning_rate": 3.671754937239191e-05,
      "loss": 1.1087,
      "step": 222
    },
    {
      "epoch": 0.435546875,
      "grad_norm": 0.4174420596478436,
      "learning_rate": 3.668241401188407e-05,
      "loss": 1.0313,
      "step": 223
    },
    {
      "epoch": 0.4375,
      "grad_norm": 0.36458359932139156,
      "learning_rate": 3.6647108604728546e-05,
      "loss": 0.9782,
      "step": 224
    },
    {
      "epoch": 0.439453125,
      "grad_norm": 0.4419635662052487,
      "learning_rate": 3.661163351079929e-05,
      "loss": 1.1076,
      "step": 225
    },
    {
      "epoch": 0.44140625,
      "grad_norm": 0.4537093691655119,
      "learning_rate": 3.6575989091699895e-05,
      "loss": 1.1265,
      "step": 226
    },
    {
      "epoch": 0.443359375,
      "grad_norm": 0.4515222234083662,
      "learning_rate": 3.65401757107599e-05,
      "loss": 1.124,
      "step": 227
    },
    {
      "epoch": 0.4453125,
      "grad_norm": 0.4509933735945529,
      "learning_rate": 3.650419373303112e-05,
      "loss": 1.2212,
      "step": 228
    },
    {
      "epoch": 0.447265625,
      "grad_norm": 0.39315970041656184,
      "learning_rate": 3.646804352528389e-05,
      "loss": 1.1003,
      "step": 229
    },
    {
      "epoch": 0.44921875,
      "grad_norm": 0.583897939706095,
      "learning_rate": 3.643172545600336e-05,
      "loss": 1.0984,
      "step": 230
    },
    {
      "epoch": 0.451171875,
      "grad_norm": 0.5164803615434137,
      "learning_rate": 3.63952398953857e-05,
      "loss": 1.0738,
      "step": 231
    },
    {
      "epoch": 0.453125,
      "grad_norm": 0.4070265753872102,
      "learning_rate": 3.6358587215334355e-05,
      "loss": 1.034,
      "step": 232
    },
    {
      "epoch": 0.455078125,
      "grad_norm": 0.4101472350679783,
      "learning_rate": 3.632176778945626e-05,
      "loss": 1.1234,
      "step": 233
    },
    {
      "epoch": 0.45703125,
      "grad_norm": 0.410956088362877,
      "learning_rate": 3.628478199305799e-05,
      "loss": 1.1062,
      "step": 234
    },
    {
      "epoch": 0.458984375,
      "grad_norm": 0.42181972355385416,
      "learning_rate": 3.624763020314199e-05,
      "loss": 1.1848,
      "step": 235
    },
    {
      "epoch": 0.4609375,
      "grad_norm": 0.4069735981570203,
      "learning_rate": 3.62103127984027e-05,
      "loss": 1.1203,
      "step": 236
    },
    {
      "epoch": 0.462890625,
      "grad_norm": 0.4142934678480609,
      "learning_rate": 3.617283015922268e-05,
      "loss": 1.1044,
      "step": 237
    },
    {
      "epoch": 0.46484375,
      "grad_norm": 0.4697374307040272,
      "learning_rate": 3.6135182667668764e-05,
      "loss": 1.1947,
      "step": 238
    },
    {
      "epoch": 0.466796875,
      "grad_norm": 0.3985058819632944,
      "learning_rate": 3.6097370707488175e-05,
      "loss": 1.0906,
      "step": 239
    },
    {
      "epoch": 0.46875,
      "grad_norm": 0.40215610602620183,
      "learning_rate": 3.6059394664104554e-05,
      "loss": 1.1607,
      "step": 240
    },
    {
      "epoch": 0.470703125,
      "grad_norm": 0.3985665062059567,
      "learning_rate": 3.60212549246141e-05,
      "loss": 1.0787,
      "step": 241
    },
    {
      "epoch": 0.47265625,
      "grad_norm": 0.43711415007382576,
      "learning_rate": 3.598295187778158e-05,
      "loss": 1.1554,
      "step": 242
    },
    {
      "epoch": 0.474609375,
      "grad_norm": 0.4382023321095773,
      "learning_rate": 3.5944485914036384e-05,
      "loss": 1.0126,
      "step": 243
    },
    {
      "epoch": 0.4765625,
      "grad_norm": 0.37488265505774904,
      "learning_rate": 3.590585742546853e-05,
      "loss": 1.1054,
      "step": 244
    },
    {
      "epoch": 0.478515625,
      "grad_norm": 0.40930451172856447,
      "learning_rate": 3.586706680582471e-05,
      "loss": 1.0321,
      "step": 245
    },
    {
      "epoch": 0.48046875,
      "grad_norm": 0.5059310227059168,
      "learning_rate": 3.5828114450504205e-05,
      "loss": 1.1239,
      "step": 246
    },
    {
      "epoch": 0.482421875,
      "grad_norm": 0.45898297435796365,
      "learning_rate": 3.5789000756554927e-05,
      "loss": 1.0467,
      "step": 247
    },
    {
      "epoch": 0.484375,
      "grad_norm": 0.42551550838444063,
      "learning_rate": 3.5749726122669316e-05,
      "loss": 1.051,
      "step": 248
    },
    {
      "epoch": 0.486328125,
      "grad_norm": 0.4451344613451106,
      "learning_rate": 3.5710290949180325e-05,
      "loss": 1.1036,
      "step": 249
    },
    {
      "epoch": 0.48828125,
      "grad_norm": 0.43151805025113255,
      "learning_rate": 3.5670695638057285e-05,
      "loss": 1.1906,
      "step": 250
    },
    {
      "epoch": 0.490234375,
      "grad_norm": 0.492114391902568,
      "learning_rate": 3.563094059290186e-05,
      "loss": 1.1629,
      "step": 251
    },
    {
      "epoch": 0.4921875,
      "grad_norm": 0.4144331093915329,
      "learning_rate": 3.5591026218943905e-05,
      "loss": 1.1485,
      "step": 252
    },
    {
      "epoch": 0.494140625,
      "grad_norm": 0.4201461662795515,
      "learning_rate": 3.5550952923037337e-05,
      "loss": 1.1451,
      "step": 253
    },
    {
      "epoch": 0.49609375,
      "grad_norm": 0.41132936789582963,
      "learning_rate": 3.551072111365598e-05,
      "loss": 1.1216,
      "step": 254
    },
    {
      "epoch": 0.498046875,
      "grad_norm": 0.40892606177310264,
      "learning_rate": 3.547033120088943e-05,
      "loss": 1.0282,
      "step": 255
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.39721649148962185,
      "learning_rate": 3.5429783596438864e-05,
      "loss": 1.113,
      "step": 256
    }
  ],
  "logging_steps": 1,
  "max_steps": 1024,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 256,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 531064116215808.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}