|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 0, |
|
"global_step": 154, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006493506493506494, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 9.935064935064936e-06, |
|
"loss": 1.9216, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012987012987012988, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 9.87012987012987e-06, |
|
"loss": 1.9203, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01948051948051948, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 9.805194805194806e-06, |
|
"loss": 1.8268, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.025974025974025976, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 9.740259740259742e-06, |
|
"loss": 1.8553, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.032467532467532464, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 9.675324675324677e-06, |
|
"loss": 1.8987, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03896103896103896, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.610389610389611e-06, |
|
"loss": 1.9061, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.045454545454545456, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 9.545454545454547e-06, |
|
"loss": 1.8952, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05194805194805195, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.48051948051948e-06, |
|
"loss": 1.8343, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05844155844155844, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 9.415584415584416e-06, |
|
"loss": 1.8108, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06493506493506493, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.350649350649352e-06, |
|
"loss": 1.7731, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.285714285714288e-06, |
|
"loss": 1.8083, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07792207792207792, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 9.220779220779221e-06, |
|
"loss": 1.8363, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08441558441558442, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 9.155844155844157e-06, |
|
"loss": 1.7684, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09090909090909091, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.6423, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09740259740259741, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 9.025974025974027e-06, |
|
"loss": 1.7531, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1038961038961039, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 8.96103896103896e-06, |
|
"loss": 1.718, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.11038961038961038, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 8.896103896103896e-06, |
|
"loss": 1.6784, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11688311688311688, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 8.831168831168832e-06, |
|
"loss": 1.6809, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12337662337662338, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 8.766233766233767e-06, |
|
"loss": 1.7605, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12987012987012986, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 8.701298701298701e-06, |
|
"loss": 1.6866, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13636363636363635, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 8.636363636363637e-06, |
|
"loss": 1.6806, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 1.7233, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14935064935064934, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 8.506493506493507e-06, |
|
"loss": 1.702, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.15584415584415584, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 8.441558441558442e-06, |
|
"loss": 1.6865, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16233766233766234, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 8.376623376623378e-06, |
|
"loss": 1.7162, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16883116883116883, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 8.311688311688313e-06, |
|
"loss": 1.6615, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.17532467532467533, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 8.246753246753247e-06, |
|
"loss": 1.6482, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 8.181818181818183e-06, |
|
"loss": 1.6257, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.18831168831168832, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 8.116883116883117e-06, |
|
"loss": 1.7048, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.19480519480519481, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 8.051948051948052e-06, |
|
"loss": 1.6033, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2012987012987013, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 7.987012987012988e-06, |
|
"loss": 1.6595, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2077922077922078, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 7.922077922077924e-06, |
|
"loss": 1.6255, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 7.857142857142858e-06, |
|
"loss": 1.6825, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.22077922077922077, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 7.792207792207793e-06, |
|
"loss": 1.6171, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 7.727272727272727e-06, |
|
"loss": 1.6086, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23376623376623376, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 7.662337662337663e-06, |
|
"loss": 1.5977, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.24025974025974026, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 7.597402597402598e-06, |
|
"loss": 1.6018, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.24675324675324675, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 7.532467532467533e-06, |
|
"loss": 1.6183, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2532467532467532, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 7.467532467532468e-06, |
|
"loss": 1.5952, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 7.402597402597404e-06, |
|
"loss": 1.564, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2662337662337662, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 7.3376623376623375e-06, |
|
"loss": 1.5618, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2727272727272727, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 7.272727272727273e-06, |
|
"loss": 1.5533, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2792207792207792, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 7.207792207792208e-06, |
|
"loss": 1.6762, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 1.5851, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2922077922077922, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 7.077922077922078e-06, |
|
"loss": 1.5862, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2987012987012987, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 7.012987012987014e-06, |
|
"loss": 1.5626, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3051948051948052, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 6.948051948051948e-06, |
|
"loss": 1.4968, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3116883116883117, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 6.8831168831168835e-06, |
|
"loss": 1.5374, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3181818181818182, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 1.6145, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3246753246753247, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 6.753246753246754e-06, |
|
"loss": 1.5309, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.33116883116883117, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 6.688311688311689e-06, |
|
"loss": 1.5764, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.33766233766233766, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 6.623376623376624e-06, |
|
"loss": 1.558, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.34415584415584416, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 6.55844155844156e-06, |
|
"loss": 1.51, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.35064935064935066, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 6.493506493506494e-06, |
|
"loss": 1.5634, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 6.4285714285714295e-06, |
|
"loss": 1.5444, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 6.363636363636364e-06, |
|
"loss": 1.5565, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.37012987012987014, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 6.2987012987013e-06, |
|
"loss": 1.4929, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.37662337662337664, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 6.233766233766234e-06, |
|
"loss": 1.5763, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.38311688311688313, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 6.168831168831169e-06, |
|
"loss": 1.5113, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.38961038961038963, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 6.103896103896104e-06, |
|
"loss": 1.5285, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3961038961038961, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 6.03896103896104e-06, |
|
"loss": 1.5784, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4025974025974026, |
|
"grad_norm": 0.1279296875, |
|
"learning_rate": 5.9740259740259746e-06, |
|
"loss": 1.5411, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4090909090909091, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 5.90909090909091e-06, |
|
"loss": 1.5138, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4155844155844156, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 5.844155844155844e-06, |
|
"loss": 1.5787, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.42207792207792205, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 5.77922077922078e-06, |
|
"loss": 1.5045, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 1.528, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.43506493506493504, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 5.64935064935065e-06, |
|
"loss": 1.5024, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.44155844155844154, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 5.584415584415585e-06, |
|
"loss": 1.5593, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.44805194805194803, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 5.5194805194805205e-06, |
|
"loss": 1.4983, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 5.4545454545454545e-06, |
|
"loss": 1.5183, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.461038961038961, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 5.38961038961039e-06, |
|
"loss": 1.5491, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4675324675324675, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 5.324675324675325e-06, |
|
"loss": 1.43, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.474025974025974, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 5.2597402597402605e-06, |
|
"loss": 1.5414, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4805194805194805, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 5.194805194805194e-06, |
|
"loss": 1.5338, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.487012987012987, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 5.12987012987013e-06, |
|
"loss": 1.5223, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4935064935064935, |
|
"grad_norm": 0.1181640625, |
|
"learning_rate": 5.064935064935065e-06, |
|
"loss": 1.5068, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 5e-06, |
|
"loss": 1.506, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5064935064935064, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 4.935064935064935e-06, |
|
"loss": 1.4808, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.512987012987013, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 4.870129870129871e-06, |
|
"loss": 1.5765, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"grad_norm": 0.1279296875, |
|
"learning_rate": 4.805194805194806e-06, |
|
"loss": 1.4691, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.525974025974026, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 4.74025974025974e-06, |
|
"loss": 1.4995, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5324675324675324, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 4.675324675324676e-06, |
|
"loss": 1.5206, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.538961038961039, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 4.610389610389611e-06, |
|
"loss": 1.4908, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.4625, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.551948051948052, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 4.48051948051948e-06, |
|
"loss": 1.4645, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5584415584415584, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 4.415584415584416e-06, |
|
"loss": 1.5982, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.564935064935065, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 4.350649350649351e-06, |
|
"loss": 1.4427, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 1.4655, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.577922077922078, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 4.220779220779221e-06, |
|
"loss": 1.5021, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5844155844155844, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 4.155844155844157e-06, |
|
"loss": 1.4549, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5909090909090909, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 4.0909090909090915e-06, |
|
"loss": 1.4115, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5974025974025974, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 4.025974025974026e-06, |
|
"loss": 1.4747, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6038961038961039, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 3.961038961038962e-06, |
|
"loss": 1.4906, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6103896103896104, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 3.896103896103897e-06, |
|
"loss": 1.5562, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6168831168831169, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 3.831168831168831e-06, |
|
"loss": 1.5198, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6233766233766234, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 3.7662337662337666e-06, |
|
"loss": 1.4991, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6298701298701299, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 3.701298701298702e-06, |
|
"loss": 1.4371, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6363636363636364, |
|
"grad_norm": 0.11962890625, |
|
"learning_rate": 3.6363636363636366e-06, |
|
"loss": 1.4679, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6428571428571429, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 1.5392, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 3.506493506493507e-06, |
|
"loss": 1.5854, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6558441558441559, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 3.4415584415584418e-06, |
|
"loss": 1.4859, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6623376623376623, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 3.376623376623377e-06, |
|
"loss": 1.3568, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6688311688311688, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 3.311688311688312e-06, |
|
"loss": 1.5947, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6753246753246753, |
|
"grad_norm": 0.11767578125, |
|
"learning_rate": 3.246753246753247e-06, |
|
"loss": 1.4949, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 3.181818181818182e-06, |
|
"loss": 1.4853, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6883116883116883, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 3.116883116883117e-06, |
|
"loss": 1.425, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6948051948051948, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 3.051948051948052e-06, |
|
"loss": 1.4564, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.7012987012987013, |
|
"grad_norm": 0.11328125, |
|
"learning_rate": 2.9870129870129873e-06, |
|
"loss": 1.4962, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7077922077922078, |
|
"grad_norm": 0.115234375, |
|
"learning_rate": 2.922077922077922e-06, |
|
"loss": 1.4239, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.11279296875, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 1.4496, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7207792207792207, |
|
"grad_norm": 0.11865234375, |
|
"learning_rate": 2.7922077922077925e-06, |
|
"loss": 1.5018, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 2.7272727272727272e-06, |
|
"loss": 1.4486, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7337662337662337, |
|
"grad_norm": 0.134765625, |
|
"learning_rate": 2.6623376623376624e-06, |
|
"loss": 1.4482, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7402597402597403, |
|
"grad_norm": 0.1240234375, |
|
"learning_rate": 2.597402597402597e-06, |
|
"loss": 1.4378, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7467532467532467, |
|
"grad_norm": 0.1201171875, |
|
"learning_rate": 2.5324675324675324e-06, |
|
"loss": 1.4501, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7532467532467533, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 2.4675324675324676e-06, |
|
"loss": 1.5188, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7597402597402597, |
|
"grad_norm": 0.12109375, |
|
"learning_rate": 2.402597402597403e-06, |
|
"loss": 1.4609, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7662337662337663, |
|
"grad_norm": 0.10498046875, |
|
"learning_rate": 2.337662337662338e-06, |
|
"loss": 1.4207, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7727272727272727, |
|
"grad_norm": 0.12109375, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 1.4122, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"grad_norm": 0.1064453125, |
|
"learning_rate": 2.207792207792208e-06, |
|
"loss": 1.4604, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7857142857142857, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 2.1428571428571427e-06, |
|
"loss": 1.4657, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7922077922077922, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 2.0779220779220784e-06, |
|
"loss": 1.4832, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7987012987012987, |
|
"grad_norm": 0.12451171875, |
|
"learning_rate": 2.012987012987013e-06, |
|
"loss": 1.3874, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.8051948051948052, |
|
"grad_norm": 0.11474609375, |
|
"learning_rate": 1.9480519480519483e-06, |
|
"loss": 1.4578, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8116883116883117, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 1.8831168831168833e-06, |
|
"loss": 1.4518, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"grad_norm": 0.107421875, |
|
"learning_rate": 1.8181818181818183e-06, |
|
"loss": 1.446, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8246753246753247, |
|
"grad_norm": 0.10888671875, |
|
"learning_rate": 1.7532467532467535e-06, |
|
"loss": 1.4921, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8311688311688312, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 1.6883116883116885e-06, |
|
"loss": 1.4776, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8376623376623377, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 1.6233766233766235e-06, |
|
"loss": 1.557, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8441558441558441, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 1.5584415584415584e-06, |
|
"loss": 1.4844, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8506493506493507, |
|
"grad_norm": 0.11083984375, |
|
"learning_rate": 1.4935064935064936e-06, |
|
"loss": 1.5005, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.10791015625, |
|
"learning_rate": 1.4285714285714286e-06, |
|
"loss": 1.4655, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8636363636363636, |
|
"grad_norm": 0.107421875, |
|
"learning_rate": 1.3636363636363636e-06, |
|
"loss": 1.4916, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8701298701298701, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 1.2987012987012986e-06, |
|
"loss": 1.4502, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8766233766233766, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 1.2337662337662338e-06, |
|
"loss": 1.4147, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8831168831168831, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 1.168831168831169e-06, |
|
"loss": 1.5244, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8896103896103896, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 1.103896103896104e-06, |
|
"loss": 1.4393, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8961038961038961, |
|
"grad_norm": 0.12255859375, |
|
"learning_rate": 1.0389610389610392e-06, |
|
"loss": 1.3782, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.9025974025974026, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 9.740259740259742e-07, |
|
"loss": 1.3691, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 9.090909090909091e-07, |
|
"loss": 1.4324, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9155844155844156, |
|
"grad_norm": 0.12109375, |
|
"learning_rate": 8.441558441558442e-07, |
|
"loss": 1.4545, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.922077922077922, |
|
"grad_norm": 0.11083984375, |
|
"learning_rate": 7.792207792207792e-07, |
|
"loss": 1.4364, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9285714285714286, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 1.5295, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.935064935064935, |
|
"grad_norm": 0.11328125, |
|
"learning_rate": 6.493506493506493e-07, |
|
"loss": 1.4032, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9415584415584416, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 5.844155844155845e-07, |
|
"loss": 1.483, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.948051948051948, |
|
"grad_norm": 0.1220703125, |
|
"learning_rate": 5.194805194805196e-07, |
|
"loss": 1.4865, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9545454545454546, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 4.5454545454545457e-07, |
|
"loss": 1.4349, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.961038961038961, |
|
"grad_norm": 0.1279296875, |
|
"learning_rate": 3.896103896103896e-07, |
|
"loss": 1.4733, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9675324675324676, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 3.2467532467532465e-07, |
|
"loss": 1.4665, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.974025974025974, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 2.597402597402598e-07, |
|
"loss": 1.5085, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9805194805194806, |
|
"grad_norm": 0.1201171875, |
|
"learning_rate": 1.948051948051948e-07, |
|
"loss": 1.4872, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.987012987012987, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 1.298701298701299e-07, |
|
"loss": 1.4249, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9935064935064936, |
|
"grad_norm": 0.125, |
|
"learning_rate": 6.493506493506495e-08, |
|
"loss": 1.4824, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 0.0, |
|
"loss": 1.4958, |
|
"step": 154 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 154, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.350584389810258e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|