{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1372169899582112,
"eval_steps": 500,
"global_step": 1100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00016188433364361165,
"grad_norm": 24103.080078125,
"learning_rate": 1.6000000000000001e-06,
"loss": 416.3619,
"step": 1
},
{
"epoch": 0.0003237686672872233,
"grad_norm": 8362.611328125,
"learning_rate": 3.2000000000000003e-06,
"loss": 406.7542,
"step": 2
},
{
"epoch": 0.0004856530009308349,
"grad_norm": 7507.2021484375,
"learning_rate": 4.800000000000001e-06,
"loss": 361.5908,
"step": 3
},
{
"epoch": 0.0006475373345744466,
"grad_norm": 5111.49853515625,
"learning_rate": 6.4000000000000006e-06,
"loss": 291.2071,
"step": 4
},
{
"epoch": 0.0008094216682180582,
"grad_norm": 9454.7490234375,
"learning_rate": 8.000000000000001e-06,
"loss": 261.3245,
"step": 5
},
{
"epoch": 0.0009713060018616698,
"grad_norm": 2652.902099609375,
"learning_rate": 9.600000000000001e-06,
"loss": 243.8967,
"step": 6
},
{
"epoch": 0.0011331903355052814,
"grad_norm": 2334.48486328125,
"learning_rate": 1.1200000000000001e-05,
"loss": 231.5565,
"step": 7
},
{
"epoch": 0.0012950746691488932,
"grad_norm": 1413.470703125,
"learning_rate": 1.2800000000000001e-05,
"loss": 210.4121,
"step": 8
},
{
"epoch": 0.0014569590027925048,
"grad_norm": 1339.1798095703125,
"learning_rate": 1.4400000000000001e-05,
"loss": 206.2145,
"step": 9
},
{
"epoch": 0.0016188433364361164,
"grad_norm": 1151.4617919921875,
"learning_rate": 1.6000000000000003e-05,
"loss": 201.3061,
"step": 10
},
{
"epoch": 0.001780727670079728,
"grad_norm": 613.8676147460938,
"learning_rate": 1.76e-05,
"loss": 192.7747,
"step": 11
},
{
"epoch": 0.0019426120037233396,
"grad_norm": 419.9425354003906,
"learning_rate": 1.9200000000000003e-05,
"loss": 183.5707,
"step": 12
},
{
"epoch": 0.002104496337366951,
"grad_norm": 11710.265625,
"learning_rate": 2.08e-05,
"loss": 182.2417,
"step": 13
},
{
"epoch": 0.0022663806710105628,
"grad_norm": 822.8071899414062,
"learning_rate": 2.2400000000000002e-05,
"loss": 179.644,
"step": 14
},
{
"epoch": 0.0024282650046541744,
"grad_norm": 949.3472900390625,
"learning_rate": 2.4e-05,
"loss": 173.091,
"step": 15
},
{
"epoch": 0.0025901493382977864,
"grad_norm": 1254.593505859375,
"learning_rate": 2.5600000000000002e-05,
"loss": 174.3419,
"step": 16
},
{
"epoch": 0.002752033671941398,
"grad_norm": 2232.380859375,
"learning_rate": 2.7200000000000004e-05,
"loss": 178.925,
"step": 17
},
{
"epoch": 0.0029139180055850096,
"grad_norm": 596.9391479492188,
"learning_rate": 2.8800000000000002e-05,
"loss": 173.8553,
"step": 18
},
{
"epoch": 0.003075802339228621,
"grad_norm": 237.45948791503906,
"learning_rate": 3.0400000000000004e-05,
"loss": 169.216,
"step": 19
},
{
"epoch": 0.003237686672872233,
"grad_norm": 676.1233520507812,
"learning_rate": 3.2000000000000005e-05,
"loss": 169.3529,
"step": 20
},
{
"epoch": 0.0033995710065158444,
"grad_norm": 1278.8111572265625,
"learning_rate": 3.3600000000000004e-05,
"loss": 166.7066,
"step": 21
},
{
"epoch": 0.003561455340159456,
"grad_norm": 284.8902587890625,
"learning_rate": 3.52e-05,
"loss": 164.5595,
"step": 22
},
{
"epoch": 0.0037233396738030676,
"grad_norm": 573.7898559570312,
"learning_rate": 3.680000000000001e-05,
"loss": 159.1037,
"step": 23
},
{
"epoch": 0.003885224007446679,
"grad_norm": 1203.91357421875,
"learning_rate": 3.8400000000000005e-05,
"loss": 159.4815,
"step": 24
},
{
"epoch": 0.004047108341090291,
"grad_norm": 1167.90185546875,
"learning_rate": 4e-05,
"loss": 166.8396,
"step": 25
},
{
"epoch": 0.004208992674733902,
"grad_norm": 2555.7080078125,
"learning_rate": 4.16e-05,
"loss": 152.0457,
"step": 26
},
{
"epoch": 0.004370877008377514,
"grad_norm": 625.3372802734375,
"learning_rate": 4.3200000000000007e-05,
"loss": 156.2104,
"step": 27
},
{
"epoch": 0.0045327613420211256,
"grad_norm": 705.95654296875,
"learning_rate": 4.4800000000000005e-05,
"loss": 150.5511,
"step": 28
},
{
"epoch": 0.004694645675664738,
"grad_norm": 1175.135986328125,
"learning_rate": 4.64e-05,
"loss": 155.9813,
"step": 29
},
{
"epoch": 0.004856530009308349,
"grad_norm": 1093.1334228515625,
"learning_rate": 4.8e-05,
"loss": 152.7447,
"step": 30
},
{
"epoch": 0.005018414342951961,
"grad_norm": 1421.1544189453125,
"learning_rate": 4.9600000000000006e-05,
"loss": 147.1483,
"step": 31
},
{
"epoch": 0.005180298676595573,
"grad_norm": 2280.692626953125,
"learning_rate": 5.1200000000000004e-05,
"loss": 152.8863,
"step": 32
},
{
"epoch": 0.005342183010239184,
"grad_norm": 4522.5048828125,
"learning_rate": 5.280000000000001e-05,
"loss": 148.1559,
"step": 33
},
{
"epoch": 0.005504067343882796,
"grad_norm": 1268.85888671875,
"learning_rate": 5.440000000000001e-05,
"loss": 150.9389,
"step": 34
},
{
"epoch": 0.005665951677526407,
"grad_norm": 2783.045166015625,
"learning_rate": 5.6e-05,
"loss": 154.5852,
"step": 35
},
{
"epoch": 0.005827836011170019,
"grad_norm": 2022.623046875,
"learning_rate": 5.7600000000000004e-05,
"loss": 154.8941,
"step": 36
},
{
"epoch": 0.00598972034481363,
"grad_norm": 853.1558837890625,
"learning_rate": 5.92e-05,
"loss": 143.959,
"step": 37
},
{
"epoch": 0.006151604678457242,
"grad_norm": 6000.6416015625,
"learning_rate": 6.080000000000001e-05,
"loss": 143.1304,
"step": 38
},
{
"epoch": 0.0063134890121008536,
"grad_norm": 907.4410400390625,
"learning_rate": 6.240000000000001e-05,
"loss": 144.4922,
"step": 39
},
{
"epoch": 0.006475373345744466,
"grad_norm": 609.7958374023438,
"learning_rate": 6.400000000000001e-05,
"loss": 143.3392,
"step": 40
},
{
"epoch": 0.006637257679388078,
"grad_norm": 2869.024658203125,
"learning_rate": 6.56e-05,
"loss": 144.1269,
"step": 41
},
{
"epoch": 0.006799142013031689,
"grad_norm": 104268.515625,
"learning_rate": 6.720000000000001e-05,
"loss": 144.2501,
"step": 42
},
{
"epoch": 0.006961026346675301,
"grad_norm": 13787.85546875,
"learning_rate": 6.88e-05,
"loss": 147.7556,
"step": 43
},
{
"epoch": 0.007122910680318912,
"grad_norm": 1340.838623046875,
"learning_rate": 7.04e-05,
"loss": 146.0247,
"step": 44
},
{
"epoch": 0.007284795013962524,
"grad_norm": 2306.974853515625,
"learning_rate": 7.2e-05,
"loss": 151.1902,
"step": 45
},
{
"epoch": 0.007446679347606135,
"grad_norm": 10211.7421875,
"learning_rate": 7.360000000000001e-05,
"loss": 142.9976,
"step": 46
},
{
"epoch": 0.007608563681249747,
"grad_norm": 13293.638671875,
"learning_rate": 7.52e-05,
"loss": 153.0655,
"step": 47
},
{
"epoch": 0.007770448014893358,
"grad_norm": 1096.130615234375,
"learning_rate": 7.680000000000001e-05,
"loss": 146.9181,
"step": 48
},
{
"epoch": 0.00793233234853697,
"grad_norm": 1578.099365234375,
"learning_rate": 7.840000000000001e-05,
"loss": 144.2133,
"step": 49
},
{
"epoch": 0.008094216682180582,
"grad_norm": 1912.48583984375,
"learning_rate": 8e-05,
"loss": 145.5814,
"step": 50
},
{
"epoch": 0.008256101015824194,
"grad_norm": 239.28268432617188,
"learning_rate": 7.999999967535102e-05,
"loss": 138.508,
"step": 51
},
{
"epoch": 0.008417985349467805,
"grad_norm": 236.72445678710938,
"learning_rate": 7.999999870140409e-05,
"loss": 138.9372,
"step": 52
},
{
"epoch": 0.008579869683111417,
"grad_norm": 373.92010498046875,
"learning_rate": 7.99999970781592e-05,
"loss": 146.0557,
"step": 53
},
{
"epoch": 0.008741754016755029,
"grad_norm": 2691.880615234375,
"learning_rate": 7.999999480561641e-05,
"loss": 142.9117,
"step": 54
},
{
"epoch": 0.00890363835039864,
"grad_norm": 497.7879943847656,
"learning_rate": 7.999999188377575e-05,
"loss": 147.3189,
"step": 55
},
{
"epoch": 0.009065522684042251,
"grad_norm": 231.4438934326172,
"learning_rate": 7.999998831263725e-05,
"loss": 142.1491,
"step": 56
},
{
"epoch": 0.009227407017685863,
"grad_norm": 154.6627197265625,
"learning_rate": 7.999998409220098e-05,
"loss": 141.9744,
"step": 57
},
{
"epoch": 0.009389291351329475,
"grad_norm": 197.37191772460938,
"learning_rate": 7.999997922246699e-05,
"loss": 142.1519,
"step": 58
},
{
"epoch": 0.009551175684973087,
"grad_norm": 127.02053833007812,
"learning_rate": 7.99999737034354e-05,
"loss": 137.5525,
"step": 59
},
{
"epoch": 0.009713060018616698,
"grad_norm": 240.41546630859375,
"learning_rate": 7.999996753510626e-05,
"loss": 136.2462,
"step": 60
},
{
"epoch": 0.00987494435226031,
"grad_norm": 159.99560546875,
"learning_rate": 7.99999607174797e-05,
"loss": 140.3864,
"step": 61
},
{
"epoch": 0.010036828685903922,
"grad_norm": 161.4716796875,
"learning_rate": 7.999995325055579e-05,
"loss": 143.2473,
"step": 62
},
{
"epoch": 0.010198713019547534,
"grad_norm": 155.2477569580078,
"learning_rate": 7.999994513433469e-05,
"loss": 133.2346,
"step": 63
},
{
"epoch": 0.010360597353191146,
"grad_norm": 100.8320541381836,
"learning_rate": 7.999993636881653e-05,
"loss": 135.0861,
"step": 64
},
{
"epoch": 0.010522481686834756,
"grad_norm": 85.67591094970703,
"learning_rate": 7.999992695400142e-05,
"loss": 133.2635,
"step": 65
},
{
"epoch": 0.010684366020478368,
"grad_norm": 69.6679916381836,
"learning_rate": 7.999991688988955e-05,
"loss": 128.1581,
"step": 66
},
{
"epoch": 0.01084625035412198,
"grad_norm": 85.88739776611328,
"learning_rate": 7.999990617648107e-05,
"loss": 136.6223,
"step": 67
},
{
"epoch": 0.011008134687765592,
"grad_norm": 87.6140365600586,
"learning_rate": 7.999989481377614e-05,
"loss": 131.995,
"step": 68
},
{
"epoch": 0.011170019021409202,
"grad_norm": 133.94168090820312,
"learning_rate": 7.999988280177496e-05,
"loss": 134.3478,
"step": 69
},
{
"epoch": 0.011331903355052814,
"grad_norm": 63.528648376464844,
"learning_rate": 7.999987014047773e-05,
"loss": 125.9856,
"step": 70
},
{
"epoch": 0.011493787688696426,
"grad_norm": 74.52141571044922,
"learning_rate": 7.999985682988462e-05,
"loss": 129.8778,
"step": 71
},
{
"epoch": 0.011655672022340038,
"grad_norm": 453.0955505371094,
"learning_rate": 7.99998428699959e-05,
"loss": 132.9664,
"step": 72
},
{
"epoch": 0.01181755635598365,
"grad_norm": 101.82209014892578,
"learning_rate": 7.999982826081175e-05,
"loss": 135.4103,
"step": 73
},
{
"epoch": 0.01197944068962726,
"grad_norm": 164.14788818359375,
"learning_rate": 7.999981300233244e-05,
"loss": 129.22,
"step": 74
},
{
"epoch": 0.012141325023270873,
"grad_norm": 89.10115051269531,
"learning_rate": 7.99997970945582e-05,
"loss": 133.9528,
"step": 75
},
{
"epoch": 0.012303209356914485,
"grad_norm": 82.56475067138672,
"learning_rate": 7.999978053748929e-05,
"loss": 132.1314,
"step": 76
},
{
"epoch": 0.012465093690558097,
"grad_norm": 112.5723648071289,
"learning_rate": 7.999976333112596e-05,
"loss": 134.234,
"step": 77
},
{
"epoch": 0.012626978024201707,
"grad_norm": 129.81126403808594,
"learning_rate": 7.999974547546854e-05,
"loss": 126.8794,
"step": 78
},
{
"epoch": 0.01278886235784532,
"grad_norm": 165.73876953125,
"learning_rate": 7.999972697051726e-05,
"loss": 132.2223,
"step": 79
},
{
"epoch": 0.012950746691488931,
"grad_norm": 88.56263732910156,
"learning_rate": 7.999970781627248e-05,
"loss": 127.7658,
"step": 80
},
{
"epoch": 0.013112631025132543,
"grad_norm": 64.36229705810547,
"learning_rate": 7.999968801273448e-05,
"loss": 125.5283,
"step": 81
},
{
"epoch": 0.013274515358776155,
"grad_norm": 66.01007843017578,
"learning_rate": 7.999966755990356e-05,
"loss": 126.7069,
"step": 82
},
{
"epoch": 0.013436399692419766,
"grad_norm": 51.27324295043945,
"learning_rate": 7.999964645778009e-05,
"loss": 129.4959,
"step": 83
},
{
"epoch": 0.013598284026063378,
"grad_norm": 58.374027252197266,
"learning_rate": 7.999962470636439e-05,
"loss": 123.1187,
"step": 84
},
{
"epoch": 0.01376016835970699,
"grad_norm": 49.6333122253418,
"learning_rate": 7.999960230565682e-05,
"loss": 124.5258,
"step": 85
},
{
"epoch": 0.013922052693350602,
"grad_norm": 75.77702331542969,
"learning_rate": 7.999957925565775e-05,
"loss": 123.124,
"step": 86
},
{
"epoch": 0.014083937026994212,
"grad_norm": 74.09357452392578,
"learning_rate": 7.999955555636756e-05,
"loss": 127.6408,
"step": 87
},
{
"epoch": 0.014245821360637824,
"grad_norm": 68.89026641845703,
"learning_rate": 7.99995312077866e-05,
"loss": 122.5667,
"step": 88
},
{
"epoch": 0.014407705694281436,
"grad_norm": 146.59506225585938,
"learning_rate": 7.99995062099153e-05,
"loss": 124.9104,
"step": 89
},
{
"epoch": 0.014569590027925048,
"grad_norm": 86.42540740966797,
"learning_rate": 7.999948056275404e-05,
"loss": 126.9094,
"step": 90
},
{
"epoch": 0.01473147436156866,
"grad_norm": 56.522586822509766,
"learning_rate": 7.999945426630326e-05,
"loss": 121.8118,
"step": 91
},
{
"epoch": 0.01489335869521227,
"grad_norm": 78.91332244873047,
"learning_rate": 7.999942732056337e-05,
"loss": 128.3101,
"step": 92
},
{
"epoch": 0.015055243028855882,
"grad_norm": 54.05183792114258,
"learning_rate": 7.999939972553482e-05,
"loss": 125.511,
"step": 93
},
{
"epoch": 0.015217127362499494,
"grad_norm": 89.96048736572266,
"learning_rate": 7.999937148121805e-05,
"loss": 123.1977,
"step": 94
},
{
"epoch": 0.015379011696143106,
"grad_norm": 60.051822662353516,
"learning_rate": 7.999934258761353e-05,
"loss": 126.068,
"step": 95
},
{
"epoch": 0.015540896029786717,
"grad_norm": 49.81428909301758,
"learning_rate": 7.999931304472171e-05,
"loss": 125.8266,
"step": 96
},
{
"epoch": 0.01570278036343033,
"grad_norm": 49.57832336425781,
"learning_rate": 7.999928285254308e-05,
"loss": 119.8124,
"step": 97
},
{
"epoch": 0.01586466469707394,
"grad_norm": 60.71403121948242,
"learning_rate": 7.999925201107813e-05,
"loss": 126.2386,
"step": 98
},
{
"epoch": 0.01602654903071755,
"grad_norm": 57.2307014465332,
"learning_rate": 7.999922052032736e-05,
"loss": 121.1685,
"step": 99
},
{
"epoch": 0.016188433364361165,
"grad_norm": 64.66001892089844,
"learning_rate": 7.999918838029128e-05,
"loss": 122.7087,
"step": 100
},
{
"epoch": 0.016350317698004775,
"grad_norm": 49.135162353515625,
"learning_rate": 7.999915559097041e-05,
"loss": 124.1852,
"step": 101
},
{
"epoch": 0.01651220203164839,
"grad_norm": 59.58419418334961,
"learning_rate": 7.999912215236528e-05,
"loss": 121.8455,
"step": 102
},
{
"epoch": 0.016674086365292,
"grad_norm": 49.92463684082031,
"learning_rate": 7.999908806447645e-05,
"loss": 126.9805,
"step": 103
},
{
"epoch": 0.01683597069893561,
"grad_norm": 106.82424926757812,
"learning_rate": 7.999905332730446e-05,
"loss": 121.7415,
"step": 104
},
{
"epoch": 0.016997855032579223,
"grad_norm": 78.89348602294922,
"learning_rate": 7.999901794084987e-05,
"loss": 119.2107,
"step": 105
},
{
"epoch": 0.017159739366222834,
"grad_norm": 79.10139465332031,
"learning_rate": 7.999898190511326e-05,
"loss": 122.9975,
"step": 106
},
{
"epoch": 0.017321623699866444,
"grad_norm": 162.663330078125,
"learning_rate": 7.999894522009522e-05,
"loss": 120.7979,
"step": 107
},
{
"epoch": 0.017483508033510058,
"grad_norm": 57.36802291870117,
"learning_rate": 7.999890788579633e-05,
"loss": 120.7847,
"step": 108
},
{
"epoch": 0.017645392367153668,
"grad_norm": 275.1585388183594,
"learning_rate": 7.999886990221721e-05,
"loss": 126.1534,
"step": 109
},
{
"epoch": 0.01780727670079728,
"grad_norm": 68.8106460571289,
"learning_rate": 7.999883126935849e-05,
"loss": 121.7999,
"step": 110
},
{
"epoch": 0.017969161034440892,
"grad_norm": 73.94481658935547,
"learning_rate": 7.999879198722075e-05,
"loss": 123.5686,
"step": 111
},
{
"epoch": 0.018131045368084502,
"grad_norm": 45.65679931640625,
"learning_rate": 7.999875205580468e-05,
"loss": 122.1366,
"step": 112
},
{
"epoch": 0.018292929701728116,
"grad_norm": 65.0594711303711,
"learning_rate": 7.999871147511088e-05,
"loss": 122.6376,
"step": 113
},
{
"epoch": 0.018454814035371726,
"grad_norm": 78.8693618774414,
"learning_rate": 7.999867024514006e-05,
"loss": 121.3849,
"step": 114
},
{
"epoch": 0.01861669836901534,
"grad_norm": 61.9366455078125,
"learning_rate": 7.999862836589285e-05,
"loss": 115.2652,
"step": 115
},
{
"epoch": 0.01877858270265895,
"grad_norm": 377.050537109375,
"learning_rate": 7.999858583736995e-05,
"loss": 122.9807,
"step": 116
},
{
"epoch": 0.01894046703630256,
"grad_norm": 79.6026611328125,
"learning_rate": 7.999854265957204e-05,
"loss": 118.3197,
"step": 117
},
{
"epoch": 0.019102351369946174,
"grad_norm": 164.14610290527344,
"learning_rate": 7.999849883249982e-05,
"loss": 122.7552,
"step": 118
},
{
"epoch": 0.019264235703589785,
"grad_norm": 59.93232727050781,
"learning_rate": 7.999845435615401e-05,
"loss": 119.5683,
"step": 119
},
{
"epoch": 0.019426120037233395,
"grad_norm": 64.89917755126953,
"learning_rate": 7.999840923053533e-05,
"loss": 125.2309,
"step": 120
},
{
"epoch": 0.01958800437087701,
"grad_norm": 54.14371871948242,
"learning_rate": 7.99983634556445e-05,
"loss": 118.7013,
"step": 121
},
{
"epoch": 0.01974988870452062,
"grad_norm": 48.026832580566406,
"learning_rate": 7.999831703148229e-05,
"loss": 113.3792,
"step": 122
},
{
"epoch": 0.019911773038164233,
"grad_norm": 48.39354705810547,
"learning_rate": 7.999826995804942e-05,
"loss": 118.6348,
"step": 123
},
{
"epoch": 0.020073657371807843,
"grad_norm": 85.40504455566406,
"learning_rate": 7.999822223534668e-05,
"loss": 119.9578,
"step": 124
},
{
"epoch": 0.020235541705451453,
"grad_norm": 44.40727233886719,
"learning_rate": 7.999817386337483e-05,
"loss": 122.5157,
"step": 125
},
{
"epoch": 0.020397426039095067,
"grad_norm": 67.95315551757812,
"learning_rate": 7.999812484213467e-05,
"loss": 122.867,
"step": 126
},
{
"epoch": 0.020559310372738678,
"grad_norm": 75.48043823242188,
"learning_rate": 7.999807517162698e-05,
"loss": 124.8626,
"step": 127
},
{
"epoch": 0.02072119470638229,
"grad_norm": 224.90846252441406,
"learning_rate": 7.999802485185257e-05,
"loss": 122.3649,
"step": 128
},
{
"epoch": 0.0208830790400259,
"grad_norm": 74.21021270751953,
"learning_rate": 7.999797388281227e-05,
"loss": 122.9342,
"step": 129
},
{
"epoch": 0.021044963373669512,
"grad_norm": 58.15437316894531,
"learning_rate": 7.99979222645069e-05,
"loss": 123.0002,
"step": 130
},
{
"epoch": 0.021206847707313126,
"grad_norm": 42.266761779785156,
"learning_rate": 7.999786999693728e-05,
"loss": 117.3017,
"step": 131
},
{
"epoch": 0.021368732040956736,
"grad_norm": 56.28133773803711,
"learning_rate": 7.999781708010426e-05,
"loss": 121.0497,
"step": 132
},
{
"epoch": 0.02153061637460035,
"grad_norm": 79.92772674560547,
"learning_rate": 7.999776351400874e-05,
"loss": 122.6974,
"step": 133
},
{
"epoch": 0.02169250070824396,
"grad_norm": 97.44646453857422,
"learning_rate": 7.999770929865157e-05,
"loss": 120.2693,
"step": 134
},
{
"epoch": 0.02185438504188757,
"grad_norm": 705.3715209960938,
"learning_rate": 7.999765443403359e-05,
"loss": 124.3555,
"step": 135
},
{
"epoch": 0.022016269375531184,
"grad_norm": 371.02191162109375,
"learning_rate": 7.999759892015574e-05,
"loss": 122.0452,
"step": 136
},
{
"epoch": 0.022178153709174794,
"grad_norm": 109.09474182128906,
"learning_rate": 7.99975427570189e-05,
"loss": 120.4164,
"step": 137
},
{
"epoch": 0.022340038042818405,
"grad_norm": 66.44133758544922,
"learning_rate": 7.999748594462399e-05,
"loss": 113.6669,
"step": 138
},
{
"epoch": 0.02250192237646202,
"grad_norm": 83.61808013916016,
"learning_rate": 7.999742848297192e-05,
"loss": 118.8697,
"step": 139
},
{
"epoch": 0.02266380671010563,
"grad_norm": 75.45435333251953,
"learning_rate": 7.999737037206363e-05,
"loss": 114.4902,
"step": 140
},
{
"epoch": 0.022825691043749242,
"grad_norm": 91.38420867919922,
"learning_rate": 7.999731161190006e-05,
"loss": 118.9704,
"step": 141
},
{
"epoch": 0.022987575377392853,
"grad_norm": 45.258827209472656,
"learning_rate": 7.999725220248218e-05,
"loss": 113.6098,
"step": 142
},
{
"epoch": 0.023149459711036463,
"grad_norm": 52.511253356933594,
"learning_rate": 7.999719214381094e-05,
"loss": 113.9329,
"step": 143
},
{
"epoch": 0.023311344044680077,
"grad_norm": 46.202606201171875,
"learning_rate": 7.999713143588731e-05,
"loss": 120.0421,
"step": 144
},
{
"epoch": 0.023473228378323687,
"grad_norm": 49.14585876464844,
"learning_rate": 7.999707007871228e-05,
"loss": 120.4388,
"step": 145
},
{
"epoch": 0.0236351127119673,
"grad_norm": 49.412445068359375,
"learning_rate": 7.999700807228686e-05,
"loss": 121.0349,
"step": 146
},
{
"epoch": 0.02379699704561091,
"grad_norm": 59.629974365234375,
"learning_rate": 7.999694541661203e-05,
"loss": 119.7618,
"step": 147
},
{
"epoch": 0.02395888137925452,
"grad_norm": 41.098506927490234,
"learning_rate": 7.999688211168883e-05,
"loss": 112.5112,
"step": 148
},
{
"epoch": 0.024120765712898135,
"grad_norm": 48.01566696166992,
"learning_rate": 7.999681815751828e-05,
"loss": 114.618,
"step": 149
},
{
"epoch": 0.024282650046541746,
"grad_norm": 42.75497817993164,
"learning_rate": 7.999675355410141e-05,
"loss": 116.463,
"step": 150
},
{
"epoch": 0.02444453438018536,
"grad_norm": 49.553123474121094,
"learning_rate": 7.999668830143928e-05,
"loss": 117.1506,
"step": 151
},
{
"epoch": 0.02460641871382897,
"grad_norm": 43.47461700439453,
"learning_rate": 7.999662239953294e-05,
"loss": 116.0322,
"step": 152
},
{
"epoch": 0.02476830304747258,
"grad_norm": 55.971641540527344,
"learning_rate": 7.999655584838347e-05,
"loss": 114.7504,
"step": 153
},
{
"epoch": 0.024930187381116194,
"grad_norm": 44.42329406738281,
"learning_rate": 7.999648864799195e-05,
"loss": 112.4409,
"step": 154
},
{
"epoch": 0.025092071714759804,
"grad_norm": 498.4210510253906,
"learning_rate": 7.999642079835947e-05,
"loss": 117.3467,
"step": 155
},
{
"epoch": 0.025253956048403414,
"grad_norm": 119.55657196044922,
"learning_rate": 7.999635229948711e-05,
"loss": 123.9309,
"step": 156
},
{
"epoch": 0.025415840382047028,
"grad_norm": 51.79906463623047,
"learning_rate": 7.999628315137601e-05,
"loss": 116.7525,
"step": 157
},
{
"epoch": 0.02557772471569064,
"grad_norm": 77.05377960205078,
"learning_rate": 7.999621335402727e-05,
"loss": 121.5105,
"step": 158
},
{
"epoch": 0.025739609049334252,
"grad_norm": 54.6245231628418,
"learning_rate": 7.999614290744205e-05,
"loss": 120.5251,
"step": 159
},
{
"epoch": 0.025901493382977862,
"grad_norm": 92.46525573730469,
"learning_rate": 7.999607181162148e-05,
"loss": 117.9801,
"step": 160
},
{
"epoch": 0.026063377716621473,
"grad_norm": 76.75994873046875,
"learning_rate": 7.999600006656669e-05,
"loss": 120.8726,
"step": 161
},
{
"epoch": 0.026225262050265086,
"grad_norm": 97.33930206298828,
"learning_rate": 7.999592767227889e-05,
"loss": 118.02,
"step": 162
},
{
"epoch": 0.026387146383908697,
"grad_norm": 46.77922439575195,
"learning_rate": 7.999585462875922e-05,
"loss": 119.7141,
"step": 163
},
{
"epoch": 0.02654903071755231,
"grad_norm": 42.398468017578125,
"learning_rate": 7.999578093600889e-05,
"loss": 113.7869,
"step": 164
},
{
"epoch": 0.02671091505119592,
"grad_norm": 50.29785919189453,
"learning_rate": 7.999570659402908e-05,
"loss": 116.579,
"step": 165
},
{
"epoch": 0.02687279938483953,
"grad_norm": 44.221405029296875,
"learning_rate": 7.999563160282098e-05,
"loss": 116.4308,
"step": 166
},
{
"epoch": 0.027034683718483145,
"grad_norm": 184.91827392578125,
"learning_rate": 7.999555596238585e-05,
"loss": 117.5965,
"step": 167
},
{
"epoch": 0.027196568052126755,
"grad_norm": 53.792728424072266,
"learning_rate": 7.999547967272489e-05,
"loss": 112.9841,
"step": 168
},
{
"epoch": 0.027358452385770365,
"grad_norm": 48.12382888793945,
"learning_rate": 7.999540273383934e-05,
"loss": 114.204,
"step": 169
},
{
"epoch": 0.02752033671941398,
"grad_norm": 39.53679275512695,
"learning_rate": 7.999532514573046e-05,
"loss": 115.2311,
"step": 170
},
{
"epoch": 0.02768222105305759,
"grad_norm": 40.317108154296875,
"learning_rate": 7.999524690839951e-05,
"loss": 116.0226,
"step": 171
},
{
"epoch": 0.027844105386701203,
"grad_norm": 45.769554138183594,
"learning_rate": 7.999516802184772e-05,
"loss": 113.7299,
"step": 172
},
{
"epoch": 0.028005989720344814,
"grad_norm": 82.4209213256836,
"learning_rate": 7.999508848607644e-05,
"loss": 119.3476,
"step": 173
},
{
"epoch": 0.028167874053988424,
"grad_norm": 45.29478454589844,
"learning_rate": 7.99950083010869e-05,
"loss": 112.4692,
"step": 174
},
{
"epoch": 0.028329758387632038,
"grad_norm": 38.68589782714844,
"learning_rate": 7.999492746688044e-05,
"loss": 112.9427,
"step": 175
},
{
"epoch": 0.028491642721275648,
"grad_norm": 83.98104858398438,
"learning_rate": 7.999484598345834e-05,
"loss": 111.3235,
"step": 176
},
{
"epoch": 0.02865352705491926,
"grad_norm": 46.8161506652832,
"learning_rate": 7.999476385082196e-05,
"loss": 115.0109,
"step": 177
},
{
"epoch": 0.028815411388562872,
"grad_norm": 307.06634521484375,
"learning_rate": 7.99946810689726e-05,
"loss": 119.3569,
"step": 178
},
{
"epoch": 0.028977295722206482,
"grad_norm": 74.05174255371094,
"learning_rate": 7.999459763791162e-05,
"loss": 116.2485,
"step": 179
},
{
"epoch": 0.029139180055850096,
"grad_norm": 56.12079620361328,
"learning_rate": 7.999451355764038e-05,
"loss": 116.3827,
"step": 180
},
{
"epoch": 0.029301064389493706,
"grad_norm": 249.50506591796875,
"learning_rate": 7.999442882816024e-05,
"loss": 114.3779,
"step": 181
},
{
"epoch": 0.02946294872313732,
"grad_norm": 77.33687591552734,
"learning_rate": 7.999434344947256e-05,
"loss": 109.4699,
"step": 182
},
{
"epoch": 0.02962483305678093,
"grad_norm": 53.129337310791016,
"learning_rate": 7.999425742157874e-05,
"loss": 117.3508,
"step": 183
},
{
"epoch": 0.02978671739042454,
"grad_norm": 70.03951263427734,
"learning_rate": 7.999417074448018e-05,
"loss": 113.5749,
"step": 184
},
{
"epoch": 0.029948601724068154,
"grad_norm": 1390.8787841796875,
"learning_rate": 7.999408341817827e-05,
"loss": 125.7953,
"step": 185
},
{
"epoch": 0.030110486057711765,
"grad_norm": 64.23770141601562,
"learning_rate": 7.999399544267445e-05,
"loss": 122.2353,
"step": 186
},
{
"epoch": 0.030272370391355375,
"grad_norm": 281.6788024902344,
"learning_rate": 7.999390681797013e-05,
"loss": 120.5905,
"step": 187
},
{
"epoch": 0.03043425472499899,
"grad_norm": 891.4392700195312,
"learning_rate": 7.999381754406676e-05,
"loss": 117.5163,
"step": 188
},
{
"epoch": 0.0305961390586426,
"grad_norm": 98.16897583007812,
"learning_rate": 7.999372762096578e-05,
"loss": 114.4067,
"step": 189
},
{
"epoch": 0.030758023392286213,
"grad_norm": 63.46809768676758,
"learning_rate": 7.999363704866865e-05,
"loss": 115.9111,
"step": 190
},
{
"epoch": 0.030919907725929823,
"grad_norm": 236.7198486328125,
"learning_rate": 7.999354582717685e-05,
"loss": 117.1484,
"step": 191
},
{
"epoch": 0.031081792059573433,
"grad_norm": 113.45466613769531,
"learning_rate": 7.999345395649185e-05,
"loss": 120.786,
"step": 192
},
{
"epoch": 0.031243676393217047,
"grad_norm": 62.65834426879883,
"learning_rate": 7.999336143661517e-05,
"loss": 116.0947,
"step": 193
},
{
"epoch": 0.03140556072686066,
"grad_norm": 50.163082122802734,
"learning_rate": 7.999326826754826e-05,
"loss": 118.1403,
"step": 194
},
{
"epoch": 0.03156744506050427,
"grad_norm": 58.482852935791016,
"learning_rate": 7.999317444929268e-05,
"loss": 115.2894,
"step": 195
},
{
"epoch": 0.03172932939414788,
"grad_norm": 61.55516815185547,
"learning_rate": 7.999307998184992e-05,
"loss": 112.5059,
"step": 196
},
{
"epoch": 0.031891213727791495,
"grad_norm": 49.308223724365234,
"learning_rate": 7.999298486522152e-05,
"loss": 121.2084,
"step": 197
},
{
"epoch": 0.0320530980614351,
"grad_norm": 50.27874755859375,
"learning_rate": 7.999288909940905e-05,
"loss": 113.5961,
"step": 198
},
{
"epoch": 0.032214982395078716,
"grad_norm": 77.31110382080078,
"learning_rate": 7.999279268441404e-05,
"loss": 116.1164,
"step": 199
},
{
"epoch": 0.03237686672872233,
"grad_norm": 622.5211791992188,
"learning_rate": 7.999269562023806e-05,
"loss": 115.2331,
"step": 200
},
{
"epoch": 0.025073286346909498,
"grad_norm": 105.12308502197266,
"learning_rate": 7.999560757516067e-05,
"loss": 116.1936,
"step": 201
},
{
"epoch": 0.025198029065053328,
"grad_norm": 118.732666015625,
"learning_rate": 7.999554920578897e-05,
"loss": 117.1372,
"step": 202
},
{
"epoch": 0.025322771783197157,
"grad_norm": 168.808837890625,
"learning_rate": 7.999549045116955e-05,
"loss": 120.1179,
"step": 203
},
{
"epoch": 0.025447514501340984,
"grad_norm": 125.00849914550781,
"learning_rate": 7.999543131130301e-05,
"loss": 125.3942,
"step": 204
},
{
"epoch": 0.025572257219484813,
"grad_norm": 511.9884338378906,
"learning_rate": 7.999537178618988e-05,
"loss": 112.4453,
"step": 205
},
{
"epoch": 0.02569699993762864,
"grad_norm": 68.77386474609375,
"learning_rate": 7.999531187583077e-05,
"loss": 112.7173,
"step": 206
},
{
"epoch": 0.02582174265577247,
"grad_norm": 87.68142700195312,
"learning_rate": 7.999525158022624e-05,
"loss": 106.5586,
"step": 207
},
{
"epoch": 0.0259464853739163,
"grad_norm": 72.27877807617188,
"learning_rate": 7.999519089937685e-05,
"loss": 115.5773,
"step": 208
},
{
"epoch": 0.026071228092060125,
"grad_norm": 57.863582611083984,
"learning_rate": 7.999512983328323e-05,
"loss": 114.1945,
"step": 209
},
{
"epoch": 0.026195970810203955,
"grad_norm": 42.40835952758789,
"learning_rate": 7.999506838194593e-05,
"loss": 114.9819,
"step": 210
},
{
"epoch": 0.02632071352834778,
"grad_norm": 76.22808837890625,
"learning_rate": 7.999500654536556e-05,
"loss": 113.9357,
"step": 211
},
{
"epoch": 0.02644545624649161,
"grad_norm": 43.060150146484375,
"learning_rate": 7.999494432354271e-05,
"loss": 118.4651,
"step": 212
},
{
"epoch": 0.02657019896463544,
"grad_norm": 47.212493896484375,
"learning_rate": 7.999488171647798e-05,
"loss": 109.7108,
"step": 213
},
{
"epoch": 0.026694941682779267,
"grad_norm": 46.4222412109375,
"learning_rate": 7.999481872417197e-05,
"loss": 113.8001,
"step": 214
},
{
"epoch": 0.026819684400923097,
"grad_norm": 102.76275634765625,
"learning_rate": 7.999475534662529e-05,
"loss": 117.6439,
"step": 215
},
{
"epoch": 0.026944427119066923,
"grad_norm": 48.28736114501953,
"learning_rate": 7.999469158383856e-05,
"loss": 116.2615,
"step": 216
},
{
"epoch": 0.027069169837210753,
"grad_norm": 46.31293869018555,
"learning_rate": 7.999462743581238e-05,
"loss": 115.7722,
"step": 217
},
{
"epoch": 0.027193912555354582,
"grad_norm": 69.12755584716797,
"learning_rate": 7.999456290254736e-05,
"loss": 113.3705,
"step": 218
},
{
"epoch": 0.02731865527349841,
"grad_norm": 61.42793273925781,
"learning_rate": 7.999449798404416e-05,
"loss": 112.418,
"step": 219
},
{
"epoch": 0.02744339799164224,
"grad_norm": 57.58517074584961,
"learning_rate": 7.999443268030336e-05,
"loss": 116.1657,
"step": 220
},
{
"epoch": 0.027568140709786065,
"grad_norm": 45.741390228271484,
"learning_rate": 7.99943669913256e-05,
"loss": 118.0532,
"step": 221
},
{
"epoch": 0.027692883427929894,
"grad_norm": 77.13056945800781,
"learning_rate": 7.999430091711153e-05,
"loss": 113.6944,
"step": 222
},
{
"epoch": 0.027817626146073724,
"grad_norm": 76.47047424316406,
"learning_rate": 7.999423445766179e-05,
"loss": 109.7807,
"step": 223
},
{
"epoch": 0.02794236886421755,
"grad_norm": 125.53425598144531,
"learning_rate": 7.9994167612977e-05,
"loss": 109.5459,
"step": 224
},
{
"epoch": 0.02806711158236138,
"grad_norm": 131.52804565429688,
"learning_rate": 7.999410038305782e-05,
"loss": 113.8478,
"step": 225
},
{
"epoch": 0.028191854300505206,
"grad_norm": 50.395442962646484,
"learning_rate": 7.999403276790488e-05,
"loss": 112.5497,
"step": 226
},
{
"epoch": 0.028316597018649036,
"grad_norm": 42.5948371887207,
"learning_rate": 7.999396476751884e-05,
"loss": 114.8396,
"step": 227
},
{
"epoch": 0.028441339736792866,
"grad_norm": 46.43785095214844,
"learning_rate": 7.999389638190035e-05,
"loss": 115.9784,
"step": 228
},
{
"epoch": 0.028566082454936692,
"grad_norm": 48.53314208984375,
"learning_rate": 7.999382761105008e-05,
"loss": 113.7063,
"step": 229
},
{
"epoch": 0.028690825173080522,
"grad_norm": 50.68423080444336,
"learning_rate": 7.999375845496869e-05,
"loss": 117.9796,
"step": 230
},
{
"epoch": 0.028815567891224348,
"grad_norm": 45.6437873840332,
"learning_rate": 7.999368891365685e-05,
"loss": 112.1453,
"step": 231
},
{
"epoch": 0.028940310609368178,
"grad_norm": 42.53821563720703,
"learning_rate": 7.99936189871152e-05,
"loss": 113.976,
"step": 232
},
{
"epoch": 0.029065053327512008,
"grad_norm": 45.97554397583008,
"learning_rate": 7.999354867534445e-05,
"loss": 115.7249,
"step": 233
},
{
"epoch": 0.029189796045655834,
"grad_norm": 43.29530715942383,
"learning_rate": 7.999347797834526e-05,
"loss": 112.1225,
"step": 234
},
{
"epoch": 0.029314538763799664,
"grad_norm": 71.46146392822266,
"learning_rate": 7.999340689611833e-05,
"loss": 111.0506,
"step": 235
},
{
"epoch": 0.029439281481943493,
"grad_norm": 71.78829193115234,
"learning_rate": 7.99933354286643e-05,
"loss": 110.5349,
"step": 236
},
{
"epoch": 0.02956402420008732,
"grad_norm": 98.57122802734375,
"learning_rate": 7.999326357598392e-05,
"loss": 115.8976,
"step": 237
},
{
"epoch": 0.02968876691823115,
"grad_norm": 119.7531967163086,
"learning_rate": 7.999319133807783e-05,
"loss": 115.0299,
"step": 238
},
{
"epoch": 0.029813509636374975,
"grad_norm": 54.207393646240234,
"learning_rate": 7.999311871494675e-05,
"loss": 112.0084,
"step": 239
},
{
"epoch": 0.029938252354518805,
"grad_norm": 62.05988311767578,
"learning_rate": 7.999304570659138e-05,
"loss": 114.0009,
"step": 240
},
{
"epoch": 0.030062995072662635,
"grad_norm": 119.48426818847656,
"learning_rate": 7.99929723130124e-05,
"loss": 117.775,
"step": 241
},
{
"epoch": 0.03018773779080646,
"grad_norm": 77.92237854003906,
"learning_rate": 7.999289853421054e-05,
"loss": 116.1383,
"step": 242
},
{
"epoch": 0.03031248050895029,
"grad_norm": 159.76934814453125,
"learning_rate": 7.999282437018652e-05,
"loss": 116.2029,
"step": 243
},
{
"epoch": 0.030437223227094117,
"grad_norm": 112.40584564208984,
"learning_rate": 7.999274982094104e-05,
"loss": 109.8428,
"step": 244
},
{
"epoch": 0.030561965945237947,
"grad_norm": 1105.4193115234375,
"learning_rate": 7.99926748864748e-05,
"loss": 117.7431,
"step": 245
},
{
"epoch": 0.030686708663381777,
"grad_norm": 166.08311462402344,
"learning_rate": 7.999259956678857e-05,
"loss": 113.0148,
"step": 246
},
{
"epoch": 0.030811451381525603,
"grad_norm": 319.7405700683594,
"learning_rate": 7.999252386188302e-05,
"loss": 115.9864,
"step": 247
},
{
"epoch": 0.030936194099669433,
"grad_norm": 101.30410766601562,
"learning_rate": 7.999244777175891e-05,
"loss": 118.5819,
"step": 248
},
{
"epoch": 0.03106093681781326,
"grad_norm": 100.4976806640625,
"learning_rate": 7.999237129641697e-05,
"loss": 110.9641,
"step": 249
},
{
"epoch": 0.03118567953595709,
"grad_norm": 86.59204864501953,
"learning_rate": 7.999229443585793e-05,
"loss": 115.846,
"step": 250
},
{
"epoch": 0.03131042225410092,
"grad_norm": 100.86474609375,
"learning_rate": 7.999221719008254e-05,
"loss": 112.824,
"step": 251
},
{
"epoch": 0.031435164972244745,
"grad_norm": 77.95992279052734,
"learning_rate": 7.999213955909154e-05,
"loss": 110.218,
"step": 252
},
{
"epoch": 0.03155990769038857,
"grad_norm": 57.19294357299805,
"learning_rate": 7.999206154288567e-05,
"loss": 107.5719,
"step": 253
},
{
"epoch": 0.031684650408532404,
"grad_norm": 82.20671081542969,
"learning_rate": 7.99919831414657e-05,
"loss": 109.2522,
"step": 254
},
{
"epoch": 0.03180939312667623,
"grad_norm": 357.7333068847656,
"learning_rate": 7.999190435483237e-05,
"loss": 113.72,
"step": 255
},
{
"epoch": 0.03193413584482006,
"grad_norm": 69.6839828491211,
"learning_rate": 7.999182518298644e-05,
"loss": 110.0215,
"step": 256
},
{
"epoch": 0.03205887856296389,
"grad_norm": 84.4172592163086,
"learning_rate": 7.999174562592866e-05,
"loss": 115.6207,
"step": 257
},
{
"epoch": 0.032183621281107716,
"grad_norm": 48.480587005615234,
"learning_rate": 7.999166568365982e-05,
"loss": 117.9731,
"step": 258
},
{
"epoch": 0.03230836399925154,
"grad_norm": 47.569271087646484,
"learning_rate": 7.99915853561807e-05,
"loss": 109.5783,
"step": 259
},
{
"epoch": 0.032433106717395375,
"grad_norm": 46.10083770751953,
"learning_rate": 7.999150464349202e-05,
"loss": 111.3293,
"step": 260
},
{
"epoch": 0.0325578494355392,
"grad_norm": 94.08402252197266,
"learning_rate": 7.999142354559462e-05,
"loss": 116.551,
"step": 261
},
{
"epoch": 0.03268259215368303,
"grad_norm": 44.781005859375,
"learning_rate": 7.999134206248924e-05,
"loss": 113.2368,
"step": 262
},
{
"epoch": 0.032807334871826854,
"grad_norm": 722.23876953125,
"learning_rate": 7.999126019417668e-05,
"loss": 113.3793,
"step": 263
},
{
"epoch": 0.03293207758997069,
"grad_norm": 91.24082946777344,
"learning_rate": 7.999117794065773e-05,
"loss": 111.8051,
"step": 264
},
{
"epoch": 0.033056820308114514,
"grad_norm": 114.59203338623047,
"learning_rate": 7.999109530193317e-05,
"loss": 116.2187,
"step": 265
},
{
"epoch": 0.03318156302625834,
"grad_norm": 771.6221923828125,
"learning_rate": 7.999101227800382e-05,
"loss": 114.8163,
"step": 266
},
{
"epoch": 0.03330630574440217,
"grad_norm": 328.97900390625,
"learning_rate": 7.999092886887045e-05,
"loss": 109.051,
"step": 267
},
{
"epoch": 0.033431048462546,
"grad_norm": 76.53649139404297,
"learning_rate": 7.99908450745339e-05,
"loss": 115.6839,
"step": 268
},
{
"epoch": 0.033555791180689826,
"grad_norm": 78.25635528564453,
"learning_rate": 7.999076089499493e-05,
"loss": 114.3509,
"step": 269
},
{
"epoch": 0.03368053389883366,
"grad_norm": 49.23170852661133,
"learning_rate": 7.999067633025439e-05,
"loss": 106.3253,
"step": 270
},
{
"epoch": 0.033805276616977485,
"grad_norm": 58.18213653564453,
"learning_rate": 7.999059138031309e-05,
"loss": 118.5565,
"step": 271
},
{
"epoch": 0.03393001933512131,
"grad_norm": 144.3839111328125,
"learning_rate": 7.999050604517183e-05,
"loss": 114.6147,
"step": 272
},
{
"epoch": 0.03405476205326514,
"grad_norm": 163.0501251220703,
"learning_rate": 7.999042032483143e-05,
"loss": 112.4761,
"step": 273
},
{
"epoch": 0.03417950477140897,
"grad_norm": 154.41696166992188,
"learning_rate": 7.999033421929273e-05,
"loss": 112.8006,
"step": 274
},
{
"epoch": 0.0343042474895528,
"grad_norm": 1187.2044677734375,
"learning_rate": 7.999024772855657e-05,
"loss": 131.112,
"step": 275
},
{
"epoch": 0.03442899020769662,
"grad_norm": 479.05755615234375,
"learning_rate": 7.999016085262375e-05,
"loss": 117.9143,
"step": 276
},
{
"epoch": 0.03455373292584046,
"grad_norm": 125.08583068847656,
"learning_rate": 7.999007359149513e-05,
"loss": 119.7999,
"step": 277
},
{
"epoch": 0.03467847564398428,
"grad_norm": 201.83090209960938,
"learning_rate": 7.998998594517157e-05,
"loss": 113.1415,
"step": 278
},
{
"epoch": 0.03480321836212811,
"grad_norm": 67.61177825927734,
"learning_rate": 7.998989791365387e-05,
"loss": 116.9025,
"step": 279
},
{
"epoch": 0.03492796108027194,
"grad_norm": 72.33734130859375,
"learning_rate": 7.99898094969429e-05,
"loss": 115.6189,
"step": 280
},
{
"epoch": 0.03505270379841577,
"grad_norm": 78.88639831542969,
"learning_rate": 7.99897206950395e-05,
"loss": 109.8411,
"step": 281
},
{
"epoch": 0.035177446516559595,
"grad_norm": 51.89592361450195,
"learning_rate": 7.998963150794455e-05,
"loss": 110.4134,
"step": 282
},
{
"epoch": 0.03530218923470342,
"grad_norm": 81.0772705078125,
"learning_rate": 7.998954193565889e-05,
"loss": 112.6962,
"step": 283
},
{
"epoch": 0.035426931952847254,
"grad_norm": 57.40787124633789,
"learning_rate": 7.998945197818339e-05,
"loss": 118.4073,
"step": 284
},
{
"epoch": 0.03555167467099108,
"grad_norm": 58.126365661621094,
"learning_rate": 7.998936163551892e-05,
"loss": 111.6763,
"step": 285
},
{
"epoch": 0.03567641738913491,
"grad_norm": 78.57954406738281,
"learning_rate": 7.998927090766633e-05,
"loss": 116.7808,
"step": 286
},
{
"epoch": 0.03580116010727874,
"grad_norm": 154.63348388671875,
"learning_rate": 7.998917979462652e-05,
"loss": 116.942,
"step": 287
},
{
"epoch": 0.035925902825422566,
"grad_norm": 839.3560791015625,
"learning_rate": 7.998908829640035e-05,
"loss": 121.1782,
"step": 288
},
{
"epoch": 0.03605064554356639,
"grad_norm": 545.6408081054688,
"learning_rate": 7.99889964129887e-05,
"loss": 112.7618,
"step": 289
},
{
"epoch": 0.036175388261710226,
"grad_norm": 437.62237548828125,
"learning_rate": 7.998890414439247e-05,
"loss": 114.082,
"step": 290
},
{
"epoch": 0.03630013097985405,
"grad_norm": 367.7287902832031,
"learning_rate": 7.998881149061255e-05,
"loss": 118.9622,
"step": 291
},
{
"epoch": 0.03642487369799788,
"grad_norm": 127.22196960449219,
"learning_rate": 7.998871845164981e-05,
"loss": 110.2612,
"step": 292
},
{
"epoch": 0.036549616416141704,
"grad_norm": 106.7751693725586,
"learning_rate": 7.998862502750517e-05,
"loss": 114.9125,
"step": 293
},
{
"epoch": 0.03667435913428554,
"grad_norm": 344.0660095214844,
"learning_rate": 7.99885312181795e-05,
"loss": 117.8494,
"step": 294
},
{
"epoch": 0.036799101852429364,
"grad_norm": 50.17765426635742,
"learning_rate": 7.998843702367374e-05,
"loss": 115.9412,
"step": 295
},
{
"epoch": 0.03692384457057319,
"grad_norm": 52.166080474853516,
"learning_rate": 7.998834244398877e-05,
"loss": 112.8982,
"step": 296
},
{
"epoch": 0.03704858728871702,
"grad_norm": 52.18928146362305,
"learning_rate": 7.998824747912552e-05,
"loss": 117.6271,
"step": 297
},
{
"epoch": 0.03717333000686085,
"grad_norm": 47.1049690246582,
"learning_rate": 7.99881521290849e-05,
"loss": 114.3812,
"step": 298
},
{
"epoch": 0.037298072725004676,
"grad_norm": 45.30157470703125,
"learning_rate": 7.998805639386781e-05,
"loss": 115.5785,
"step": 299
},
{
"epoch": 0.03742281544314851,
"grad_norm": 48.85715103149414,
"learning_rate": 7.99879602734752e-05,
"loss": 106.8577,
"step": 300
},
{
"epoch": 0.037547558161292335,
"grad_norm": 50.428287506103516,
"learning_rate": 7.998786376790798e-05,
"loss": 113.6527,
"step": 301
},
{
"epoch": 0.03767230087943616,
"grad_norm": 68.31146240234375,
"learning_rate": 7.998776687716708e-05,
"loss": 118.1029,
"step": 302
},
{
"epoch": 0.03779704359757999,
"grad_norm": 71.05658721923828,
"learning_rate": 7.998766960125344e-05,
"loss": 115.2668,
"step": 303
},
{
"epoch": 0.03792178631572382,
"grad_norm": 43.14583206176758,
"learning_rate": 7.998757194016799e-05,
"loss": 113.1149,
"step": 304
},
{
"epoch": 0.03804652903386765,
"grad_norm": 50.23191452026367,
"learning_rate": 7.998747389391167e-05,
"loss": 107.7411,
"step": 305
},
{
"epoch": 0.038171271752011474,
"grad_norm": 49.16249084472656,
"learning_rate": 7.998737546248542e-05,
"loss": 111.6142,
"step": 306
},
{
"epoch": 0.03829601447015531,
"grad_norm": 44.528621673583984,
"learning_rate": 7.99872766458902e-05,
"loss": 108.265,
"step": 307
},
{
"epoch": 0.03842075718829913,
"grad_norm": 47.962974548339844,
"learning_rate": 7.998717744412697e-05,
"loss": 113.5101,
"step": 308
},
{
"epoch": 0.03854549990644296,
"grad_norm": 70.02986907958984,
"learning_rate": 7.998707785719666e-05,
"loss": 113.568,
"step": 309
},
{
"epoch": 0.03867024262458679,
"grad_norm": 43.98558807373047,
"learning_rate": 7.998697788510024e-05,
"loss": 111.3562,
"step": 310
},
{
"epoch": 0.03879498534273062,
"grad_norm": 47.76047134399414,
"learning_rate": 7.998687752783869e-05,
"loss": 114.7694,
"step": 311
},
{
"epoch": 0.038919728060874445,
"grad_norm": 55.51844787597656,
"learning_rate": 7.998677678541293e-05,
"loss": 110.2691,
"step": 312
},
{
"epoch": 0.03904447077901828,
"grad_norm": 44.33373260498047,
"learning_rate": 7.998667565782399e-05,
"loss": 111.0799,
"step": 313
},
{
"epoch": 0.039169213497162104,
"grad_norm": 57.86030197143555,
"learning_rate": 7.998657414507281e-05,
"loss": 113.2832,
"step": 314
},
{
"epoch": 0.03929395621530593,
"grad_norm": 67.02445983886719,
"learning_rate": 7.998647224716038e-05,
"loss": 116.5833,
"step": 315
},
{
"epoch": 0.03941869893344976,
"grad_norm": 50.096012115478516,
"learning_rate": 7.998636996408768e-05,
"loss": 116.1737,
"step": 316
},
{
"epoch": 0.03954344165159359,
"grad_norm": 43.61912155151367,
"learning_rate": 7.998626729585567e-05,
"loss": 109.8736,
"step": 317
},
{
"epoch": 0.039668184369737416,
"grad_norm": 269.91156005859375,
"learning_rate": 7.998616424246537e-05,
"loss": 114.4917,
"step": 318
},
{
"epoch": 0.03979292708788124,
"grad_norm": 52.25141525268555,
"learning_rate": 7.998606080391776e-05,
"loss": 116.3288,
"step": 319
},
{
"epoch": 0.039917669806025076,
"grad_norm": 46.691043853759766,
"learning_rate": 7.998595698021384e-05,
"loss": 108.5532,
"step": 320
},
{
"epoch": 0.0400424125241689,
"grad_norm": 43.550960540771484,
"learning_rate": 7.998585277135462e-05,
"loss": 108.8213,
"step": 321
},
{
"epoch": 0.04016715524231273,
"grad_norm": 55.430694580078125,
"learning_rate": 7.998574817734107e-05,
"loss": 113.6213,
"step": 322
},
{
"epoch": 0.04029189796045656,
"grad_norm": 69.84290313720703,
"learning_rate": 7.998564319817423e-05,
"loss": 115.1024,
"step": 323
},
{
"epoch": 0.04041664067860039,
"grad_norm": 42.40615463256836,
"learning_rate": 7.998553783385512e-05,
"loss": 110.7397,
"step": 324
},
{
"epoch": 0.040541383396744214,
"grad_norm": 91.48319244384766,
"learning_rate": 7.99854320843847e-05,
"loss": 110.5998,
"step": 325
},
{
"epoch": 0.04066612611488804,
"grad_norm": 58.631927490234375,
"learning_rate": 7.998532594976406e-05,
"loss": 112.1909,
"step": 326
},
{
"epoch": 0.040790868833031874,
"grad_norm": 68.29501342773438,
"learning_rate": 7.998521942999417e-05,
"loss": 113.6327,
"step": 327
},
{
"epoch": 0.0409156115511757,
"grad_norm": 65.35039520263672,
"learning_rate": 7.998511252507608e-05,
"loss": 110.0244,
"step": 328
},
{
"epoch": 0.041040354269319526,
"grad_norm": 52.01390075683594,
"learning_rate": 7.998500523501079e-05,
"loss": 106.7796,
"step": 329
},
{
"epoch": 0.04116509698746336,
"grad_norm": 61.199378967285156,
"learning_rate": 7.998489755979938e-05,
"loss": 107.1548,
"step": 330
},
{
"epoch": 0.041289839705607186,
"grad_norm": 88.75727081298828,
"learning_rate": 7.998478949944286e-05,
"loss": 112.4001,
"step": 331
},
{
"epoch": 0.04141458242375101,
"grad_norm": 55.24753952026367,
"learning_rate": 7.998468105394226e-05,
"loss": 109.2197,
"step": 332
},
{
"epoch": 0.041539325141894845,
"grad_norm": 43.90126419067383,
"learning_rate": 7.998457222329865e-05,
"loss": 108.2061,
"step": 333
},
{
"epoch": 0.04166406786003867,
"grad_norm": 56.548641204833984,
"learning_rate": 7.998446300751307e-05,
"loss": 111.9715,
"step": 334
},
{
"epoch": 0.0417888105781825,
"grad_norm": 46.53313446044922,
"learning_rate": 7.998435340658656e-05,
"loss": 109.7335,
"step": 335
},
{
"epoch": 0.041913553296326324,
"grad_norm": 91.63359832763672,
"learning_rate": 7.998424342052019e-05,
"loss": 111.1686,
"step": 336
},
{
"epoch": 0.04203829601447016,
"grad_norm": 46.20280456542969,
"learning_rate": 7.998413304931503e-05,
"loss": 113.3594,
"step": 337
},
{
"epoch": 0.04216303873261398,
"grad_norm": 48.1473503112793,
"learning_rate": 7.99840222929721e-05,
"loss": 112.963,
"step": 338
},
{
"epoch": 0.04228778145075781,
"grad_norm": 43.75265121459961,
"learning_rate": 7.998391115149251e-05,
"loss": 112.8245,
"step": 339
},
{
"epoch": 0.04241252416890164,
"grad_norm": 48.73992919921875,
"learning_rate": 7.998379962487731e-05,
"loss": 110.8602,
"step": 340
},
{
"epoch": 0.04253726688704547,
"grad_norm": 44.90249252319336,
"learning_rate": 7.998368771312757e-05,
"loss": 109.2814,
"step": 341
},
{
"epoch": 0.042662009605189295,
"grad_norm": 48.73604202270508,
"learning_rate": 7.99835754162444e-05,
"loss": 113.5373,
"step": 342
},
{
"epoch": 0.04278675232333313,
"grad_norm": 48.41697311401367,
"learning_rate": 7.998346273422883e-05,
"loss": 106.7752,
"step": 343
},
{
"epoch": 0.042911495041476955,
"grad_norm": 53.00064468383789,
"learning_rate": 7.998334966708199e-05,
"loss": 107.1834,
"step": 344
},
{
"epoch": 0.04303623775962078,
"grad_norm": 39.42155075073242,
"learning_rate": 7.998323621480496e-05,
"loss": 109.805,
"step": 345
},
{
"epoch": 0.04316098047776461,
"grad_norm": 42.144554138183594,
"learning_rate": 7.998312237739882e-05,
"loss": 112.5872,
"step": 346
},
{
"epoch": 0.04328572319590844,
"grad_norm": 79.71678924560547,
"learning_rate": 7.998300815486467e-05,
"loss": 105.5922,
"step": 347
},
{
"epoch": 0.04341046591405227,
"grad_norm": 83.48787689208984,
"learning_rate": 7.99828935472036e-05,
"loss": 112.9582,
"step": 348
},
{
"epoch": 0.04353520863219609,
"grad_norm": 49.306270599365234,
"learning_rate": 7.998277855441674e-05,
"loss": 109.0257,
"step": 349
},
{
"epoch": 0.043659951350339926,
"grad_norm": 42.597591400146484,
"learning_rate": 7.998266317650519e-05,
"loss": 107.7132,
"step": 350
},
{
"epoch": 0.04378469406848375,
"grad_norm": 47.67483139038086,
"learning_rate": 7.998254741347006e-05,
"loss": 111.3873,
"step": 351
},
{
"epoch": 0.04390943678662758,
"grad_norm": 44.571205139160156,
"learning_rate": 7.998243126531244e-05,
"loss": 116.1524,
"step": 352
},
{
"epoch": 0.04403417950477141,
"grad_norm": 151.65357971191406,
"learning_rate": 7.998231473203348e-05,
"loss": 109.0785,
"step": 353
},
{
"epoch": 0.04415892222291524,
"grad_norm": 47.44921875,
"learning_rate": 7.998219781363428e-05,
"loss": 107.1083,
"step": 354
},
{
"epoch": 0.044283664941059064,
"grad_norm": 155.82046508789062,
"learning_rate": 7.9982080510116e-05,
"loss": 112.6771,
"step": 355
},
{
"epoch": 0.04440840765920289,
"grad_norm": 45.865482330322266,
"learning_rate": 7.998196282147974e-05,
"loss": 113.0289,
"step": 356
},
{
"epoch": 0.044533150377346724,
"grad_norm": 74.91254425048828,
"learning_rate": 7.998184474772662e-05,
"loss": 113.1085,
"step": 357
},
{
"epoch": 0.04465789309549055,
"grad_norm": 99.66394805908203,
"learning_rate": 7.998172628885782e-05,
"loss": 108.1667,
"step": 358
},
{
"epoch": 0.044782635813634376,
"grad_norm": 128.02420043945312,
"learning_rate": 7.998160744487446e-05,
"loss": 106.8772,
"step": 359
},
{
"epoch": 0.04490737853177821,
"grad_norm": 47.10701370239258,
"learning_rate": 7.998148821577768e-05,
"loss": 114.4381,
"step": 360
},
{
"epoch": 0.045032121249922036,
"grad_norm": 54.99660110473633,
"learning_rate": 7.998136860156864e-05,
"loss": 112.5437,
"step": 361
},
{
"epoch": 0.04515686396806586,
"grad_norm": 97.91155242919922,
"learning_rate": 7.998124860224848e-05,
"loss": 106.907,
"step": 362
},
{
"epoch": 0.045281606686209695,
"grad_norm": 113.19481658935547,
"learning_rate": 7.998112821781835e-05,
"loss": 112.0859,
"step": 363
},
{
"epoch": 0.04540634940435352,
"grad_norm": 68.8985595703125,
"learning_rate": 7.998100744827943e-05,
"loss": 112.1743,
"step": 364
},
{
"epoch": 0.04553109212249735,
"grad_norm": 48.79725646972656,
"learning_rate": 7.998088629363289e-05,
"loss": 108.8169,
"step": 365
},
{
"epoch": 0.04565583484064118,
"grad_norm": 212.30215454101562,
"learning_rate": 7.998076475387986e-05,
"loss": 112.0246,
"step": 366
},
{
"epoch": 0.04578057755878501,
"grad_norm": 203.7574920654297,
"learning_rate": 7.998064282902153e-05,
"loss": 113.507,
"step": 367
},
{
"epoch": 0.045905320276928833,
"grad_norm": 87.0968246459961,
"learning_rate": 7.998052051905909e-05,
"loss": 111.6263,
"step": 368
},
{
"epoch": 0.04603006299507266,
"grad_norm": 66.83361053466797,
"learning_rate": 7.99803978239937e-05,
"loss": 115.3663,
"step": 369
},
{
"epoch": 0.04615480571321649,
"grad_norm": 48.28069305419922,
"learning_rate": 7.998027474382653e-05,
"loss": 105.5364,
"step": 370
},
{
"epoch": 0.04627954843136032,
"grad_norm": 65.44700622558594,
"learning_rate": 7.99801512785588e-05,
"loss": 110.4523,
"step": 371
},
{
"epoch": 0.046404291149504145,
"grad_norm": 50.70936584472656,
"learning_rate": 7.998002742819168e-05,
"loss": 116.3054,
"step": 372
},
{
"epoch": 0.04652903386764798,
"grad_norm": 116.09341430664062,
"learning_rate": 7.997990319272635e-05,
"loss": 112.0927,
"step": 373
},
{
"epoch": 0.046653776585791805,
"grad_norm": 47.97681427001953,
"learning_rate": 7.997977857216404e-05,
"loss": 109.8502,
"step": 374
},
{
"epoch": 0.04677851930393563,
"grad_norm": 345.10882568359375,
"learning_rate": 7.997965356650592e-05,
"loss": 109.4424,
"step": 375
},
{
"epoch": 0.046903262022079464,
"grad_norm": 60.53480911254883,
"learning_rate": 7.99795281757532e-05,
"loss": 108.5812,
"step": 376
},
{
"epoch": 0.04702800474022329,
"grad_norm": 68.96783447265625,
"learning_rate": 7.99794023999071e-05,
"loss": 111.185,
"step": 377
},
{
"epoch": 0.04715274745836712,
"grad_norm": 888.7139282226562,
"learning_rate": 7.997927623896882e-05,
"loss": 112.0229,
"step": 378
},
{
"epoch": 0.04727749017651094,
"grad_norm": 81.00037384033203,
"learning_rate": 7.997914969293958e-05,
"loss": 112.5362,
"step": 379
},
{
"epoch": 0.047402232894654776,
"grad_norm": 44.987770080566406,
"learning_rate": 7.997902276182061e-05,
"loss": 114.1376,
"step": 380
},
{
"epoch": 0.0475269756127986,
"grad_norm": 61.59324645996094,
"learning_rate": 7.99788954456131e-05,
"loss": 108.5044,
"step": 381
},
{
"epoch": 0.04765171833094243,
"grad_norm": 47.72024917602539,
"learning_rate": 7.997876774431831e-05,
"loss": 109.7798,
"step": 382
},
{
"epoch": 0.04777646104908626,
"grad_norm": 45.34351348876953,
"learning_rate": 7.997863965793746e-05,
"loss": 109.4833,
"step": 383
},
{
"epoch": 0.04790120376723009,
"grad_norm": 49.744873046875,
"learning_rate": 7.997851118647177e-05,
"loss": 111.6404,
"step": 384
},
{
"epoch": 0.048025946485373915,
"grad_norm": 48.63509750366211,
"learning_rate": 7.99783823299225e-05,
"loss": 113.4954,
"step": 385
},
{
"epoch": 0.04815068920351775,
"grad_norm": 43.381961822509766,
"learning_rate": 7.997825308829087e-05,
"loss": 111.8188,
"step": 386
},
{
"epoch": 0.048275431921661574,
"grad_norm": 49.75021743774414,
"learning_rate": 7.997812346157815e-05,
"loss": 116.6581,
"step": 387
},
{
"epoch": 0.0484001746398054,
"grad_norm": 55.746620178222656,
"learning_rate": 7.997799344978555e-05,
"loss": 107.2333,
"step": 388
},
{
"epoch": 0.04852491735794923,
"grad_norm": 49.89067077636719,
"learning_rate": 7.997786305291437e-05,
"loss": 111.8551,
"step": 389
},
{
"epoch": 0.04864966007609306,
"grad_norm": 85.20364379882812,
"learning_rate": 7.997773227096583e-05,
"loss": 113.6097,
"step": 390
},
{
"epoch": 0.048774402794236886,
"grad_norm": 44.94792938232422,
"learning_rate": 7.99776011039412e-05,
"loss": 108.8617,
"step": 391
},
{
"epoch": 0.04889914551238071,
"grad_norm": 47.735260009765625,
"learning_rate": 7.997746955184174e-05,
"loss": 111.1817,
"step": 392
},
{
"epoch": 0.049023888230524545,
"grad_norm": 46.30888748168945,
"learning_rate": 7.997733761466872e-05,
"loss": 109.0251,
"step": 393
},
{
"epoch": 0.04914863094866837,
"grad_norm": 46.03862762451172,
"learning_rate": 7.997720529242342e-05,
"loss": 114.6094,
"step": 394
},
{
"epoch": 0.0492733736668122,
"grad_norm": 41.99230194091797,
"learning_rate": 7.997707258510711e-05,
"loss": 109.9235,
"step": 395
},
{
"epoch": 0.04939811638495603,
"grad_norm": 53.81755447387695,
"learning_rate": 7.997693949272107e-05,
"loss": 109.1505,
"step": 396
},
{
"epoch": 0.04952285910309986,
"grad_norm": 38.50382614135742,
"learning_rate": 7.997680601526657e-05,
"loss": 111.5721,
"step": 397
},
{
"epoch": 0.049647601821243684,
"grad_norm": 66.82505798339844,
"learning_rate": 7.99766721527449e-05,
"loss": 113.2395,
"step": 398
},
{
"epoch": 0.04977234453938751,
"grad_norm": 67.15796661376953,
"learning_rate": 7.997653790515735e-05,
"loss": 110.2326,
"step": 399
},
{
"epoch": 0.04989708725753134,
"grad_norm": 50.49867630004883,
"learning_rate": 7.997640327250523e-05,
"loss": 112.31,
"step": 400
},
{
"epoch": 0.05002182997567517,
"grad_norm": 38.90182113647461,
"learning_rate": 7.997626825478982e-05,
"loss": 107.1969,
"step": 401
},
{
"epoch": 0.050146572693818996,
"grad_norm": 55.61573028564453,
"learning_rate": 7.997613285201241e-05,
"loss": 107.8303,
"step": 402
},
{
"epoch": 0.05027131541196283,
"grad_norm": 56.26730728149414,
"learning_rate": 7.997599706417433e-05,
"loss": 104.8799,
"step": 403
},
{
"epoch": 0.050396058130106655,
"grad_norm": 43.704776763916016,
"learning_rate": 7.997586089127688e-05,
"loss": 106.8678,
"step": 404
},
{
"epoch": 0.05052080084825048,
"grad_norm": 40.53383255004883,
"learning_rate": 7.997572433332136e-05,
"loss": 111.3877,
"step": 405
},
{
"epoch": 0.050645543566394315,
"grad_norm": 44.30156707763672,
"learning_rate": 7.997558739030907e-05,
"loss": 112.9497,
"step": 406
},
{
"epoch": 0.05077028628453814,
"grad_norm": 53.10981369018555,
"learning_rate": 7.997545006224137e-05,
"loss": 109.728,
"step": 407
},
{
"epoch": 0.05089502900268197,
"grad_norm": 39.82461166381836,
"learning_rate": 7.997531234911957e-05,
"loss": 113.7085,
"step": 408
},
{
"epoch": 0.05101977172082579,
"grad_norm": 46.92598342895508,
"learning_rate": 7.997517425094499e-05,
"loss": 108.0431,
"step": 409
},
{
"epoch": 0.05114451443896963,
"grad_norm": 51.97957992553711,
"learning_rate": 7.997503576771895e-05,
"loss": 115.3918,
"step": 410
},
{
"epoch": 0.05126925715711345,
"grad_norm": 137.64361572265625,
"learning_rate": 7.997489689944281e-05,
"loss": 111.1459,
"step": 411
},
{
"epoch": 0.05139399987525728,
"grad_norm": 74.92753601074219,
"learning_rate": 7.997475764611787e-05,
"loss": 101.5177,
"step": 412
},
{
"epoch": 0.05151874259340111,
"grad_norm": 42.835811614990234,
"learning_rate": 7.997461800774551e-05,
"loss": 112.3244,
"step": 413
},
{
"epoch": 0.05164348531154494,
"grad_norm": 41.18304443359375,
"learning_rate": 7.997447798432706e-05,
"loss": 110.1583,
"step": 414
},
{
"epoch": 0.051768228029688765,
"grad_norm": 70.586181640625,
"learning_rate": 7.997433757586386e-05,
"loss": 109.417,
"step": 415
},
{
"epoch": 0.0518929707478326,
"grad_norm": 42.05277633666992,
"learning_rate": 7.997419678235729e-05,
"loss": 111.8823,
"step": 416
},
{
"epoch": 0.052017713465976424,
"grad_norm": 58.777809143066406,
"learning_rate": 7.997405560380867e-05,
"loss": 110.4972,
"step": 417
},
{
"epoch": 0.05214245618412025,
"grad_norm": 52.07402038574219,
"learning_rate": 7.997391404021937e-05,
"loss": 112.2989,
"step": 418
},
{
"epoch": 0.052267198902264084,
"grad_norm": 64.20259857177734,
"learning_rate": 7.997377209159076e-05,
"loss": 111.9527,
"step": 419
},
{
"epoch": 0.05239194162040791,
"grad_norm": 45.94440841674805,
"learning_rate": 7.997362975792421e-05,
"loss": 112.8634,
"step": 420
},
{
"epoch": 0.052516684338551736,
"grad_norm": 44.264583587646484,
"learning_rate": 7.997348703922109e-05,
"loss": 113.4373,
"step": 421
},
{
"epoch": 0.05264142705669556,
"grad_norm": 44.01023483276367,
"learning_rate": 7.997334393548277e-05,
"loss": 104.0684,
"step": 422
},
{
"epoch": 0.052766169774839396,
"grad_norm": 41.45041275024414,
"learning_rate": 7.997320044671064e-05,
"loss": 110.7127,
"step": 423
},
{
"epoch": 0.05289091249298322,
"grad_norm": 47.00846481323242,
"learning_rate": 7.997305657290606e-05,
"loss": 113.1113,
"step": 424
},
{
"epoch": 0.05301565521112705,
"grad_norm": 60.392372131347656,
"learning_rate": 7.997291231407043e-05,
"loss": 111.5279,
"step": 425
},
{
"epoch": 0.05314039792927088,
"grad_norm": 45.56961441040039,
"learning_rate": 7.997276767020514e-05,
"loss": 110.9575,
"step": 426
},
{
"epoch": 0.05326514064741471,
"grad_norm": 41.345619201660156,
"learning_rate": 7.99726226413116e-05,
"loss": 109.7846,
"step": 427
},
{
"epoch": 0.053389883365558534,
"grad_norm": 99.81289672851562,
"learning_rate": 7.997247722739118e-05,
"loss": 109.0746,
"step": 428
},
{
"epoch": 0.05351462608370237,
"grad_norm": 61.65415954589844,
"learning_rate": 7.997233142844526e-05,
"loss": 115.9337,
"step": 429
},
{
"epoch": 0.05363936880184619,
"grad_norm": 66.80583953857422,
"learning_rate": 7.99721852444753e-05,
"loss": 106.7807,
"step": 430
},
{
"epoch": 0.05376411151999002,
"grad_norm": 42.357669830322266,
"learning_rate": 7.997203867548267e-05,
"loss": 111.6367,
"step": 431
},
{
"epoch": 0.053888854238133846,
"grad_norm": 42.70869445800781,
"learning_rate": 7.997189172146881e-05,
"loss": 106.6895,
"step": 432
},
{
"epoch": 0.05401359695627768,
"grad_norm": 37.493900299072266,
"learning_rate": 7.997174438243511e-05,
"loss": 112.0144,
"step": 433
},
{
"epoch": 0.054138339674421505,
"grad_norm": 37.78601837158203,
"learning_rate": 7.9971596658383e-05,
"loss": 109.6203,
"step": 434
},
{
"epoch": 0.05426308239256533,
"grad_norm": 37.59940719604492,
"learning_rate": 7.99714485493139e-05,
"loss": 108.4705,
"step": 435
},
{
"epoch": 0.054387825110709165,
"grad_norm": 38.27436447143555,
"learning_rate": 7.997130005522924e-05,
"loss": 106.622,
"step": 436
},
{
"epoch": 0.05451256782885299,
"grad_norm": 44.28715896606445,
"learning_rate": 7.997115117613045e-05,
"loss": 107.7663,
"step": 437
},
{
"epoch": 0.05463731054699682,
"grad_norm": 38.746665954589844,
"learning_rate": 7.997100191201896e-05,
"loss": 106.3057,
"step": 438
},
{
"epoch": 0.05476205326514065,
"grad_norm": 51.35078811645508,
"learning_rate": 7.99708522628962e-05,
"loss": 108.617,
"step": 439
},
{
"epoch": 0.05488679598328448,
"grad_norm": 43.55855941772461,
"learning_rate": 7.997070222876362e-05,
"loss": 111.9156,
"step": 440
},
{
"epoch": 0.0550115387014283,
"grad_norm": 44.59617614746094,
"learning_rate": 7.997055180962268e-05,
"loss": 112.6384,
"step": 441
},
{
"epoch": 0.05513628141957213,
"grad_norm": 46.06429672241211,
"learning_rate": 7.99704010054748e-05,
"loss": 110.126,
"step": 442
},
{
"epoch": 0.05526102413771596,
"grad_norm": 45.208656311035156,
"learning_rate": 7.997024981632146e-05,
"loss": 112.6224,
"step": 443
},
{
"epoch": 0.05538576685585979,
"grad_norm": 38.09982681274414,
"learning_rate": 7.997009824216411e-05,
"loss": 105.5123,
"step": 444
},
{
"epoch": 0.055510509574003615,
"grad_norm": 42.186119079589844,
"learning_rate": 7.996994628300419e-05,
"loss": 107.9967,
"step": 445
},
{
"epoch": 0.05563525229214745,
"grad_norm": 43.967994689941406,
"learning_rate": 7.99697939388432e-05,
"loss": 106.9768,
"step": 446
},
{
"epoch": 0.055759995010291274,
"grad_norm": 53.384239196777344,
"learning_rate": 7.996964120968257e-05,
"loss": 113.2065,
"step": 447
},
{
"epoch": 0.0558847377284351,
"grad_norm": 49.633148193359375,
"learning_rate": 7.996948809552378e-05,
"loss": 109.0759,
"step": 448
},
{
"epoch": 0.056009480446578934,
"grad_norm": 324.83343505859375,
"learning_rate": 7.996933459636832e-05,
"loss": 114.3023,
"step": 449
},
{
"epoch": 0.05613422316472276,
"grad_norm": 43.87398147583008,
"learning_rate": 7.996918071221766e-05,
"loss": 108.2289,
"step": 450
},
{
"epoch": 0.056258965882866586,
"grad_norm": 57.3250732421875,
"learning_rate": 7.996902644307328e-05,
"loss": 113.0327,
"step": 451
},
{
"epoch": 0.05638370860101041,
"grad_norm": 41.39522171020508,
"learning_rate": 7.996887178893667e-05,
"loss": 109.307,
"step": 452
},
{
"epoch": 0.056508451319154246,
"grad_norm": 49.11154556274414,
"learning_rate": 7.996871674980932e-05,
"loss": 106.8853,
"step": 453
},
{
"epoch": 0.05663319403729807,
"grad_norm": 64.08037567138672,
"learning_rate": 7.99685613256927e-05,
"loss": 110.1903,
"step": 454
},
{
"epoch": 0.0567579367554419,
"grad_norm": 42.89550018310547,
"learning_rate": 7.996840551658836e-05,
"loss": 106.0608,
"step": 455
},
{
"epoch": 0.05688267947358573,
"grad_norm": 49.20216369628906,
"learning_rate": 7.996824932249775e-05,
"loss": 106.8496,
"step": 456
},
{
"epoch": 0.05700742219172956,
"grad_norm": 50.31205749511719,
"learning_rate": 7.99680927434224e-05,
"loss": 111.2042,
"step": 457
},
{
"epoch": 0.057132164909873384,
"grad_norm": 43.67504119873047,
"learning_rate": 7.99679357793638e-05,
"loss": 109.3573,
"step": 458
},
{
"epoch": 0.05725690762801722,
"grad_norm": 39.90746307373047,
"learning_rate": 7.99677784303235e-05,
"loss": 110.3428,
"step": 459
},
{
"epoch": 0.057381650346161044,
"grad_norm": 44.7550163269043,
"learning_rate": 7.996762069630298e-05,
"loss": 103.7398,
"step": 460
},
{
"epoch": 0.05750639306430487,
"grad_norm": 43.251731872558594,
"learning_rate": 7.996746257730375e-05,
"loss": 109.4099,
"step": 461
},
{
"epoch": 0.057631135782448696,
"grad_norm": 42.20736312866211,
"learning_rate": 7.996730407332736e-05,
"loss": 109.548,
"step": 462
},
{
"epoch": 0.05775587850059253,
"grad_norm": 44.85642623901367,
"learning_rate": 7.996714518437533e-05,
"loss": 114.2823,
"step": 463
},
{
"epoch": 0.057880621218736356,
"grad_norm": 55.39629364013672,
"learning_rate": 7.996698591044919e-05,
"loss": 108.737,
"step": 464
},
{
"epoch": 0.05800536393688018,
"grad_norm": 41.346282958984375,
"learning_rate": 7.996682625155048e-05,
"loss": 110.5066,
"step": 465
},
{
"epoch": 0.058130106655024015,
"grad_norm": 54.856895446777344,
"learning_rate": 7.996666620768071e-05,
"loss": 107.4771,
"step": 466
},
{
"epoch": 0.05825484937316784,
"grad_norm": 45.15315628051758,
"learning_rate": 7.996650577884147e-05,
"loss": 109.2763,
"step": 467
},
{
"epoch": 0.05837959209131167,
"grad_norm": 40.95747375488281,
"learning_rate": 7.996634496503425e-05,
"loss": 110.0234,
"step": 468
},
{
"epoch": 0.0585043348094555,
"grad_norm": 41.91789627075195,
"learning_rate": 7.996618376626066e-05,
"loss": 110.0417,
"step": 469
},
{
"epoch": 0.05862907752759933,
"grad_norm": 64.76249694824219,
"learning_rate": 7.99660221825222e-05,
"loss": 106.658,
"step": 470
},
{
"epoch": 0.05875382024574315,
"grad_norm": 41.87682342529297,
"learning_rate": 7.996586021382045e-05,
"loss": 111.2904,
"step": 471
},
{
"epoch": 0.058878562963886986,
"grad_norm": 58.87449645996094,
"learning_rate": 7.996569786015696e-05,
"loss": 107.9845,
"step": 472
},
{
"epoch": 0.05900330568203081,
"grad_norm": 45.59565353393555,
"learning_rate": 7.99655351215333e-05,
"loss": 108.5766,
"step": 473
},
{
"epoch": 0.05912804840017464,
"grad_norm": 36.895816802978516,
"learning_rate": 7.996537199795104e-05,
"loss": 108.789,
"step": 474
},
{
"epoch": 0.059252791118318465,
"grad_norm": 38.27165222167969,
"learning_rate": 7.996520848941175e-05,
"loss": 111.6161,
"step": 475
},
{
"epoch": 0.0593775338364623,
"grad_norm": 42.49201965332031,
"learning_rate": 7.9965044595917e-05,
"loss": 107.5036,
"step": 476
},
{
"epoch": 0.059502276554606125,
"grad_norm": 43.45027160644531,
"learning_rate": 7.996488031746839e-05,
"loss": 106.4966,
"step": 477
},
{
"epoch": 0.05962701927274995,
"grad_norm": 46.35447692871094,
"learning_rate": 7.996471565406746e-05,
"loss": 113.0325,
"step": 478
},
{
"epoch": 0.059751761990893784,
"grad_norm": 185.98251342773438,
"learning_rate": 7.996455060571583e-05,
"loss": 107.1076,
"step": 479
},
{
"epoch": 0.05987650470903761,
"grad_norm": 43.02204895019531,
"learning_rate": 7.996438517241509e-05,
"loss": 108.5553,
"step": 480
},
{
"epoch": 0.06000124742718144,
"grad_norm": 50.793701171875,
"learning_rate": 7.996421935416681e-05,
"loss": 101.7499,
"step": 481
},
{
"epoch": 0.06012599014532527,
"grad_norm": 43.51426315307617,
"learning_rate": 7.99640531509726e-05,
"loss": 109.3193,
"step": 482
},
{
"epoch": 0.060250732863469096,
"grad_norm": 41.8190803527832,
"learning_rate": 7.996388656283407e-05,
"loss": 108.2197,
"step": 483
},
{
"epoch": 0.06037547558161292,
"grad_norm": 47.07117462158203,
"learning_rate": 7.996371958975282e-05,
"loss": 105.2699,
"step": 484
},
{
"epoch": 0.06050021829975675,
"grad_norm": 40.79714584350586,
"learning_rate": 7.996355223173046e-05,
"loss": 109.0768,
"step": 485
},
{
"epoch": 0.06062496101790058,
"grad_norm": 42.76191329956055,
"learning_rate": 7.996338448876858e-05,
"loss": 108.954,
"step": 486
},
{
"epoch": 0.06074970373604441,
"grad_norm": 43.45090103149414,
"learning_rate": 7.996321636086882e-05,
"loss": 110.2291,
"step": 487
},
{
"epoch": 0.060874446454188234,
"grad_norm": 56.796974182128906,
"learning_rate": 7.99630478480328e-05,
"loss": 110.625,
"step": 488
},
{
"epoch": 0.06099918917233207,
"grad_norm": 45.878482818603516,
"learning_rate": 7.996287895026213e-05,
"loss": 104.9758,
"step": 489
},
{
"epoch": 0.061123931890475894,
"grad_norm": 39.95423889160156,
"learning_rate": 7.996270966755843e-05,
"loss": 108.0018,
"step": 490
},
{
"epoch": 0.06124867460861972,
"grad_norm": 43.178077697753906,
"learning_rate": 7.996253999992336e-05,
"loss": 106.1532,
"step": 491
},
{
"epoch": 0.06137341732676355,
"grad_norm": 46.03929901123047,
"learning_rate": 7.996236994735853e-05,
"loss": 108.7368,
"step": 492
},
{
"epoch": 0.06149816004490738,
"grad_norm": 40.41731643676758,
"learning_rate": 7.99621995098656e-05,
"loss": 113.5876,
"step": 493
},
{
"epoch": 0.061622902763051206,
"grad_norm": 47.84632110595703,
"learning_rate": 7.996202868744617e-05,
"loss": 106.2748,
"step": 494
},
{
"epoch": 0.06174764548119503,
"grad_norm": 39.101505279541016,
"learning_rate": 7.996185748010193e-05,
"loss": 106.6452,
"step": 495
},
{
"epoch": 0.061872388199338865,
"grad_norm": 43.00922393798828,
"learning_rate": 7.99616858878345e-05,
"loss": 109.5444,
"step": 496
},
{
"epoch": 0.06199713091748269,
"grad_norm": 55.39699935913086,
"learning_rate": 7.996151391064555e-05,
"loss": 104.6551,
"step": 497
},
{
"epoch": 0.06212187363562652,
"grad_norm": 41.510711669921875,
"learning_rate": 7.996134154853674e-05,
"loss": 104.6918,
"step": 498
},
{
"epoch": 0.06224661635377035,
"grad_norm": 44.09526824951172,
"learning_rate": 7.996116880150972e-05,
"loss": 110.1849,
"step": 499
},
{
"epoch": 0.06237135907191418,
"grad_norm": 44.76930236816406,
"learning_rate": 7.996099566956615e-05,
"loss": 107.9706,
"step": 500
},
{
"epoch": 0.062496101790058003,
"grad_norm": 113.53074645996094,
"learning_rate": 7.996082215270769e-05,
"loss": 108.7049,
"step": 501
},
{
"epoch": 0.06262084450820184,
"grad_norm": 43.60973358154297,
"learning_rate": 7.996064825093603e-05,
"loss": 111.4565,
"step": 502
},
{
"epoch": 0.06274558722634566,
"grad_norm": 44.48723602294922,
"learning_rate": 7.996047396425285e-05,
"loss": 112.0623,
"step": 503
},
{
"epoch": 0.06287032994448949,
"grad_norm": 51.564537048339844,
"learning_rate": 7.996029929265982e-05,
"loss": 103.8537,
"step": 504
},
{
"epoch": 0.06299507266263332,
"grad_norm": 68.52472686767578,
"learning_rate": 7.996012423615862e-05,
"loss": 110.9377,
"step": 505
},
{
"epoch": 0.06311981538077714,
"grad_norm": 44.44695281982422,
"learning_rate": 7.995994879475092e-05,
"loss": 105.0381,
"step": 506
},
{
"epoch": 0.06324455809892097,
"grad_norm": 45.37175750732422,
"learning_rate": 7.995977296843844e-05,
"loss": 104.8582,
"step": 507
},
{
"epoch": 0.06336930081706481,
"grad_norm": 44.6590576171875,
"learning_rate": 7.995959675722285e-05,
"loss": 109.3032,
"step": 508
},
{
"epoch": 0.06349404353520863,
"grad_norm": 44.328487396240234,
"learning_rate": 7.995942016110587e-05,
"loss": 107.0624,
"step": 509
},
{
"epoch": 0.06361878625335246,
"grad_norm": 41.63954162597656,
"learning_rate": 7.995924318008918e-05,
"loss": 111.5054,
"step": 510
},
{
"epoch": 0.0637435289714963,
"grad_norm": 46.916133880615234,
"learning_rate": 7.99590658141745e-05,
"loss": 109.2073,
"step": 511
},
{
"epoch": 0.06386827168964011,
"grad_norm": 41.07780456542969,
"learning_rate": 7.995888806336352e-05,
"loss": 106.1475,
"step": 512
},
{
"epoch": 0.06399301440778395,
"grad_norm": 105.76994323730469,
"learning_rate": 7.995870992765797e-05,
"loss": 111.1441,
"step": 513
},
{
"epoch": 0.06411775712592778,
"grad_norm": 40.03337097167969,
"learning_rate": 7.995853140705956e-05,
"loss": 108.6795,
"step": 514
},
{
"epoch": 0.0642424998440716,
"grad_norm": 55.95668029785156,
"learning_rate": 7.995835250157e-05,
"loss": 107.0958,
"step": 515
},
{
"epoch": 0.06436724256221543,
"grad_norm": 68.05082702636719,
"learning_rate": 7.995817321119105e-05,
"loss": 103.0981,
"step": 516
},
{
"epoch": 0.06449198528035927,
"grad_norm": 194.20814514160156,
"learning_rate": 7.995799353592438e-05,
"loss": 108.7744,
"step": 517
},
{
"epoch": 0.06461672799850308,
"grad_norm": 48.73104476928711,
"learning_rate": 7.995781347577176e-05,
"loss": 108.1342,
"step": 518
},
{
"epoch": 0.06474147071664692,
"grad_norm": 49.77934646606445,
"learning_rate": 7.995763303073491e-05,
"loss": 108.9714,
"step": 519
},
{
"epoch": 0.06486621343479075,
"grad_norm": 47.1278190612793,
"learning_rate": 7.995745220081558e-05,
"loss": 107.1164,
"step": 520
},
{
"epoch": 0.06499095615293457,
"grad_norm": 65.26655578613281,
"learning_rate": 7.99572709860155e-05,
"loss": 104.2223,
"step": 521
},
{
"epoch": 0.0651156988710784,
"grad_norm": 61.42082977294922,
"learning_rate": 7.99570893863364e-05,
"loss": 114.7821,
"step": 522
},
{
"epoch": 0.06524044158922222,
"grad_norm": 59.47987747192383,
"learning_rate": 7.995690740178008e-05,
"loss": 107.6004,
"step": 523
},
{
"epoch": 0.06536518430736606,
"grad_norm": 41.58585739135742,
"learning_rate": 7.995672503234826e-05,
"loss": 105.1208,
"step": 524
},
{
"epoch": 0.06548992702550989,
"grad_norm": 41.565364837646484,
"learning_rate": 7.995654227804269e-05,
"loss": 107.8084,
"step": 525
},
{
"epoch": 0.06561466974365371,
"grad_norm": 42.82693099975586,
"learning_rate": 7.995635913886514e-05,
"loss": 111.2114,
"step": 526
},
{
"epoch": 0.06573941246179754,
"grad_norm": 43.36703109741211,
"learning_rate": 7.995617561481737e-05,
"loss": 105.257,
"step": 527
},
{
"epoch": 0.06586415517994137,
"grad_norm": 38.68037796020508,
"learning_rate": 7.995599170590116e-05,
"loss": 103.4582,
"step": 528
},
{
"epoch": 0.0659888978980852,
"grad_norm": 65.8855209350586,
"learning_rate": 7.995580741211826e-05,
"loss": 107.4576,
"step": 529
},
{
"epoch": 0.06611364061622903,
"grad_norm": 136.07855224609375,
"learning_rate": 7.995562273347046e-05,
"loss": 109.2926,
"step": 530
},
{
"epoch": 0.06623838333437286,
"grad_norm": 41.07363510131836,
"learning_rate": 7.995543766995954e-05,
"loss": 108.2975,
"step": 531
},
{
"epoch": 0.06636312605251668,
"grad_norm": 41.75198745727539,
"learning_rate": 7.995525222158729e-05,
"loss": 109.1072,
"step": 532
},
{
"epoch": 0.06648786877066051,
"grad_norm": 40.21206283569336,
"learning_rate": 7.995506638835545e-05,
"loss": 105.4424,
"step": 533
},
{
"epoch": 0.06661261148880435,
"grad_norm": 47.10055160522461,
"learning_rate": 7.995488017026588e-05,
"loss": 109.3388,
"step": 534
},
{
"epoch": 0.06673735420694817,
"grad_norm": 39.7672233581543,
"learning_rate": 7.995469356732033e-05,
"loss": 107.9269,
"step": 535
},
{
"epoch": 0.066862096925092,
"grad_norm": 44.60293197631836,
"learning_rate": 7.99545065795206e-05,
"loss": 109.5374,
"step": 536
},
{
"epoch": 0.06698683964323583,
"grad_norm": 135.0781707763672,
"learning_rate": 7.99543192068685e-05,
"loss": 109.8438,
"step": 537
},
{
"epoch": 0.06711158236137965,
"grad_norm": 54.93995666503906,
"learning_rate": 7.995413144936584e-05,
"loss": 109.5679,
"step": 538
},
{
"epoch": 0.06723632507952348,
"grad_norm": 41.30718231201172,
"learning_rate": 7.995394330701441e-05,
"loss": 103.3309,
"step": 539
},
{
"epoch": 0.06736106779766732,
"grad_norm": 67.57846069335938,
"learning_rate": 7.995375477981603e-05,
"loss": 110.9092,
"step": 540
},
{
"epoch": 0.06748581051581114,
"grad_norm": 49.075225830078125,
"learning_rate": 7.995356586777252e-05,
"loss": 107.742,
"step": 541
},
{
"epoch": 0.06761055323395497,
"grad_norm": 43.79375076293945,
"learning_rate": 7.99533765708857e-05,
"loss": 111.4678,
"step": 542
},
{
"epoch": 0.06773529595209879,
"grad_norm": 49.00505065917969,
"learning_rate": 7.99531868891574e-05,
"loss": 108.9275,
"step": 543
},
{
"epoch": 0.06786003867024262,
"grad_norm": 58.2841682434082,
"learning_rate": 7.995299682258943e-05,
"loss": 110.0014,
"step": 544
},
{
"epoch": 0.06798478138838646,
"grad_norm": 41.18083953857422,
"learning_rate": 7.995280637118364e-05,
"loss": 111.0127,
"step": 545
},
{
"epoch": 0.06810952410653028,
"grad_norm": 39.835872650146484,
"learning_rate": 7.995261553494183e-05,
"loss": 106.499,
"step": 546
},
{
"epoch": 0.06823426682467411,
"grad_norm": 39.67194747924805,
"learning_rate": 7.995242431386589e-05,
"loss": 102.9768,
"step": 547
},
{
"epoch": 0.06835900954281794,
"grad_norm": 159.96546936035156,
"learning_rate": 7.995223270795762e-05,
"loss": 111.2534,
"step": 548
},
{
"epoch": 0.06848375226096176,
"grad_norm": 41.861961364746094,
"learning_rate": 7.995204071721889e-05,
"loss": 102.42,
"step": 549
},
{
"epoch": 0.0686084949791056,
"grad_norm": 40.4280891418457,
"learning_rate": 7.995184834165153e-05,
"loss": 108.8337,
"step": 550
},
{
"epoch": 0.06873323769724943,
"grad_norm": 45.16064453125,
"learning_rate": 7.99516555812574e-05,
"loss": 112.2044,
"step": 551
},
{
"epoch": 0.06885798041539325,
"grad_norm": 46.67704772949219,
"learning_rate": 7.995146243603836e-05,
"loss": 103.9661,
"step": 552
},
{
"epoch": 0.06898272313353708,
"grad_norm": 215.15719604492188,
"learning_rate": 7.995126890599629e-05,
"loss": 108.6657,
"step": 553
},
{
"epoch": 0.06910746585168091,
"grad_norm": 40.48838424682617,
"learning_rate": 7.995107499113302e-05,
"loss": 100.9273,
"step": 554
},
{
"epoch": 0.06923220856982473,
"grad_norm": 53.09294509887695,
"learning_rate": 7.995088069145041e-05,
"loss": 106.9633,
"step": 555
},
{
"epoch": 0.06935695128796857,
"grad_norm": 40.80364990234375,
"learning_rate": 7.995068600695037e-05,
"loss": 106.0769,
"step": 556
},
{
"epoch": 0.0694816940061124,
"grad_norm": 49.59056854248047,
"learning_rate": 7.995049093763476e-05,
"loss": 111.5405,
"step": 557
},
{
"epoch": 0.06960643672425622,
"grad_norm": 36.67897415161133,
"learning_rate": 7.995029548350547e-05,
"loss": 106.377,
"step": 558
},
{
"epoch": 0.06973117944240005,
"grad_norm": 48.19470977783203,
"learning_rate": 7.995009964456435e-05,
"loss": 103.1056,
"step": 559
},
{
"epoch": 0.06985592216054388,
"grad_norm": 187.04112243652344,
"learning_rate": 7.99499034208133e-05,
"loss": 113.4918,
"step": 560
},
{
"epoch": 0.0699806648786877,
"grad_norm": 43.95857620239258,
"learning_rate": 7.994970681225424e-05,
"loss": 107.237,
"step": 561
},
{
"epoch": 0.07010540759683154,
"grad_norm": 95.65042877197266,
"learning_rate": 7.994950981888903e-05,
"loss": 109.9392,
"step": 562
},
{
"epoch": 0.07023015031497536,
"grad_norm": 47.80479431152344,
"learning_rate": 7.994931244071957e-05,
"loss": 109.7152,
"step": 563
},
{
"epoch": 0.07035489303311919,
"grad_norm": 45.60518264770508,
"learning_rate": 7.994911467774777e-05,
"loss": 107.9085,
"step": 564
},
{
"epoch": 0.07047963575126302,
"grad_norm": 41.16765594482422,
"learning_rate": 7.994891652997555e-05,
"loss": 104.2381,
"step": 565
},
{
"epoch": 0.07060437846940684,
"grad_norm": 42.75853729248047,
"learning_rate": 7.994871799740478e-05,
"loss": 107.3365,
"step": 566
},
{
"epoch": 0.07072912118755068,
"grad_norm": 58.23500061035156,
"learning_rate": 7.99485190800374e-05,
"loss": 105.3102,
"step": 567
},
{
"epoch": 0.07085386390569451,
"grad_norm": 42.835880279541016,
"learning_rate": 7.994831977787532e-05,
"loss": 105.0044,
"step": 568
},
{
"epoch": 0.07097860662383833,
"grad_norm": 37.81028747558594,
"learning_rate": 7.994812009092046e-05,
"loss": 105.0157,
"step": 569
},
{
"epoch": 0.07110334934198216,
"grad_norm": 73.62580871582031,
"learning_rate": 7.994792001917475e-05,
"loss": 106.7293,
"step": 570
},
{
"epoch": 0.071228092060126,
"grad_norm": 40.63880920410156,
"learning_rate": 7.99477195626401e-05,
"loss": 99.5903,
"step": 571
},
{
"epoch": 0.07135283477826981,
"grad_norm": 40.66940689086914,
"learning_rate": 7.994751872131847e-05,
"loss": 110.1632,
"step": 572
},
{
"epoch": 0.07147757749641365,
"grad_norm": 36.45511245727539,
"learning_rate": 7.994731749521177e-05,
"loss": 102.8861,
"step": 573
},
{
"epoch": 0.07160232021455748,
"grad_norm": 42.30653381347656,
"learning_rate": 7.994711588432194e-05,
"loss": 114.3612,
"step": 574
},
{
"epoch": 0.0717270629327013,
"grad_norm": 54.46384048461914,
"learning_rate": 7.994691388865094e-05,
"loss": 111.6793,
"step": 575
},
{
"epoch": 0.07185180565084513,
"grad_norm": 42.14632797241211,
"learning_rate": 7.994671150820067e-05,
"loss": 107.1911,
"step": 576
},
{
"epoch": 0.07197654836898897,
"grad_norm": 38.54106140136719,
"learning_rate": 7.994650874297315e-05,
"loss": 106.1023,
"step": 577
},
{
"epoch": 0.07210129108713279,
"grad_norm": 42.634342193603516,
"learning_rate": 7.994630559297026e-05,
"loss": 104.9337,
"step": 578
},
{
"epoch": 0.07222603380527662,
"grad_norm": 41.22761154174805,
"learning_rate": 7.9946102058194e-05,
"loss": 104.0727,
"step": 579
},
{
"epoch": 0.07235077652342045,
"grad_norm": 46.69293212890625,
"learning_rate": 7.994589813864633e-05,
"loss": 110.3875,
"step": 580
},
{
"epoch": 0.07247551924156427,
"grad_norm": 122.36430358886719,
"learning_rate": 7.994569383432922e-05,
"loss": 107.5812,
"step": 581
},
{
"epoch": 0.0726002619597081,
"grad_norm": 61.05815887451172,
"learning_rate": 7.994548914524461e-05,
"loss": 103.0144,
"step": 582
},
{
"epoch": 0.07272500467785194,
"grad_norm": 39.230323791503906,
"learning_rate": 7.994528407139447e-05,
"loss": 104.646,
"step": 583
},
{
"epoch": 0.07284974739599576,
"grad_norm": 42.06929397583008,
"learning_rate": 7.994507861278082e-05,
"loss": 110.0576,
"step": 584
},
{
"epoch": 0.07297449011413959,
"grad_norm": 170.7604522705078,
"learning_rate": 7.994487276940558e-05,
"loss": 107.8427,
"step": 585
},
{
"epoch": 0.07309923283228341,
"grad_norm": 42.023712158203125,
"learning_rate": 7.994466654127078e-05,
"loss": 106.7374,
"step": 586
},
{
"epoch": 0.07322397555042724,
"grad_norm": 45.65747833251953,
"learning_rate": 7.994445992837839e-05,
"loss": 113.4892,
"step": 587
},
{
"epoch": 0.07334871826857108,
"grad_norm": 44.761940002441406,
"learning_rate": 7.99442529307304e-05,
"loss": 109.591,
"step": 588
},
{
"epoch": 0.0734734609867149,
"grad_norm": 41.18270492553711,
"learning_rate": 7.994404554832879e-05,
"loss": 109.4065,
"step": 589
},
{
"epoch": 0.07359820370485873,
"grad_norm": 44.871578216552734,
"learning_rate": 7.994383778117559e-05,
"loss": 104.8187,
"step": 590
},
{
"epoch": 0.07372294642300256,
"grad_norm": 37.495784759521484,
"learning_rate": 7.994362962927277e-05,
"loss": 108.5832,
"step": 591
},
{
"epoch": 0.07384768914114638,
"grad_norm": 38.73908996582031,
"learning_rate": 7.994342109262235e-05,
"loss": 108.9924,
"step": 592
},
{
"epoch": 0.07397243185929021,
"grad_norm": 38.3057746887207,
"learning_rate": 7.994321217122632e-05,
"loss": 108.4933,
"step": 593
},
{
"epoch": 0.07409717457743405,
"grad_norm": 42.46907424926758,
"learning_rate": 7.994300286508674e-05,
"loss": 106.7844,
"step": 594
},
{
"epoch": 0.07422191729557787,
"grad_norm": 50.73369598388672,
"learning_rate": 7.994279317420557e-05,
"loss": 109.7542,
"step": 595
},
{
"epoch": 0.0743466600137217,
"grad_norm": 57.01716613769531,
"learning_rate": 7.994258309858487e-05,
"loss": 103.7236,
"step": 596
},
{
"epoch": 0.07447140273186553,
"grad_norm": 43.53101348876953,
"learning_rate": 7.994237263822662e-05,
"loss": 111.4677,
"step": 597
},
{
"epoch": 0.07459614545000935,
"grad_norm": 138.70054626464844,
"learning_rate": 7.99421617931329e-05,
"loss": 108.4913,
"step": 598
},
{
"epoch": 0.07472088816815319,
"grad_norm": 42.204872131347656,
"learning_rate": 7.994195056330571e-05,
"loss": 104.4333,
"step": 599
},
{
"epoch": 0.07484563088629702,
"grad_norm": 49.68149948120117,
"learning_rate": 7.994173894874708e-05,
"loss": 106.922,
"step": 600
},
{
"epoch": 0.07497037360444084,
"grad_norm": 55.42933654785156,
"learning_rate": 7.994152694945907e-05,
"loss": 108.5966,
"step": 601
},
{
"epoch": 0.07509511632258467,
"grad_norm": 48.31084060668945,
"learning_rate": 7.99413145654437e-05,
"loss": 110.9727,
"step": 602
},
{
"epoch": 0.0752198590407285,
"grad_norm": 40.81968307495117,
"learning_rate": 7.994110179670304e-05,
"loss": 105.6283,
"step": 603
},
{
"epoch": 0.07534460175887232,
"grad_norm": 44.72487258911133,
"learning_rate": 7.994088864323912e-05,
"loss": 104.8582,
"step": 604
},
{
"epoch": 0.07546934447701616,
"grad_norm": 53.18965148925781,
"learning_rate": 7.9940675105054e-05,
"loss": 107.6085,
"step": 605
},
{
"epoch": 0.07559408719515998,
"grad_norm": 47.69053649902344,
"learning_rate": 7.994046118214973e-05,
"loss": 104.4708,
"step": 606
},
{
"epoch": 0.07571882991330381,
"grad_norm": 41.48191833496094,
"learning_rate": 7.994024687452839e-05,
"loss": 105.0886,
"step": 607
},
{
"epoch": 0.07584357263144764,
"grad_norm": 39.237884521484375,
"learning_rate": 7.994003218219201e-05,
"loss": 109.7915,
"step": 608
},
{
"epoch": 0.07596831534959146,
"grad_norm": 83.7771987915039,
"learning_rate": 7.99398171051427e-05,
"loss": 107.2816,
"step": 609
},
{
"epoch": 0.0760930580677353,
"grad_norm": 68.00579833984375,
"learning_rate": 7.99396016433825e-05,
"loss": 102.5963,
"step": 610
},
{
"epoch": 0.07621780078587913,
"grad_norm": 53.65129089355469,
"learning_rate": 7.993938579691348e-05,
"loss": 108.5811,
"step": 611
},
{
"epoch": 0.07634254350402295,
"grad_norm": 41.581912994384766,
"learning_rate": 7.993916956573776e-05,
"loss": 99.4722,
"step": 612
},
{
"epoch": 0.07646728622216678,
"grad_norm": 80.12490844726562,
"learning_rate": 7.993895294985738e-05,
"loss": 111.0181,
"step": 613
},
{
"epoch": 0.07659202894031061,
"grad_norm": 73.69095611572266,
"learning_rate": 7.993873594927446e-05,
"loss": 108.661,
"step": 614
},
{
"epoch": 0.07671677165845443,
"grad_norm": 43.16099548339844,
"learning_rate": 7.993851856399106e-05,
"loss": 108.6126,
"step": 615
},
{
"epoch": 0.07684151437659827,
"grad_norm": 53.59756851196289,
"learning_rate": 7.99383007940093e-05,
"loss": 101.9882,
"step": 616
},
{
"epoch": 0.0769662570947421,
"grad_norm": 40.67134475708008,
"learning_rate": 7.993808263933124e-05,
"loss": 103.8262,
"step": 617
},
{
"epoch": 0.07709099981288592,
"grad_norm": 54.83049774169922,
"learning_rate": 7.993786409995904e-05,
"loss": 105.0586,
"step": 618
},
{
"epoch": 0.07721574253102975,
"grad_norm": 58.17955017089844,
"learning_rate": 7.993764517589476e-05,
"loss": 103.4199,
"step": 619
},
{
"epoch": 0.07734048524917359,
"grad_norm": 60.76917266845703,
"learning_rate": 7.993742586714052e-05,
"loss": 106.6427,
"step": 620
},
{
"epoch": 0.0774652279673174,
"grad_norm": 66.48900604248047,
"learning_rate": 7.993720617369842e-05,
"loss": 108.36,
"step": 621
},
{
"epoch": 0.07758997068546124,
"grad_norm": 233.0039825439453,
"learning_rate": 7.99369860955706e-05,
"loss": 109.4235,
"step": 622
},
{
"epoch": 0.07771471340360507,
"grad_norm": 89.35187530517578,
"learning_rate": 7.993676563275918e-05,
"loss": 113.0599,
"step": 623
},
{
"epoch": 0.07783945612174889,
"grad_norm": 322.77301025390625,
"learning_rate": 7.993654478526626e-05,
"loss": 106.4018,
"step": 624
},
{
"epoch": 0.07796419883989272,
"grad_norm": 65.39131164550781,
"learning_rate": 7.993632355309399e-05,
"loss": 106.8766,
"step": 625
},
{
"epoch": 0.07808894155803656,
"grad_norm": 40.76368713378906,
"learning_rate": 7.993610193624447e-05,
"loss": 103.5935,
"step": 626
},
{
"epoch": 0.07821368427618038,
"grad_norm": 44.89019775390625,
"learning_rate": 7.993587993471988e-05,
"loss": 111.043,
"step": 627
},
{
"epoch": 0.07833842699432421,
"grad_norm": 42.3843994140625,
"learning_rate": 7.993565754852232e-05,
"loss": 105.5962,
"step": 628
},
{
"epoch": 0.07846316971246803,
"grad_norm": 41.023658752441406,
"learning_rate": 7.993543477765394e-05,
"loss": 109.0287,
"step": 629
},
{
"epoch": 0.07858791243061186,
"grad_norm": 45.35258483886719,
"learning_rate": 7.993521162211691e-05,
"loss": 108.2701,
"step": 630
},
{
"epoch": 0.0787126551487557,
"grad_norm": 39.16484069824219,
"learning_rate": 7.993498808191335e-05,
"loss": 110.6227,
"step": 631
},
{
"epoch": 0.07883739786689951,
"grad_norm": 46.19753646850586,
"learning_rate": 7.993476415704543e-05,
"loss": 105.7411,
"step": 632
},
{
"epoch": 0.07896214058504335,
"grad_norm": 44.90829849243164,
"learning_rate": 7.993453984751531e-05,
"loss": 102.5427,
"step": 633
},
{
"epoch": 0.07908688330318718,
"grad_norm": 47.81891632080078,
"learning_rate": 7.993431515332513e-05,
"loss": 110.5123,
"step": 634
},
{
"epoch": 0.079211626021331,
"grad_norm": 51.73146057128906,
"learning_rate": 7.993409007447706e-05,
"loss": 102.33,
"step": 635
},
{
"epoch": 0.07933636873947483,
"grad_norm": 51.33571243286133,
"learning_rate": 7.993386461097329e-05,
"loss": 106.9358,
"step": 636
},
{
"epoch": 0.07946111145761867,
"grad_norm": 38.96233367919922,
"learning_rate": 7.993363876281597e-05,
"loss": 104.1574,
"step": 637
},
{
"epoch": 0.07958585417576249,
"grad_norm": 49.49008560180664,
"learning_rate": 7.993341253000727e-05,
"loss": 113.0015,
"step": 638
},
{
"epoch": 0.07971059689390632,
"grad_norm": 96.30732727050781,
"learning_rate": 7.993318591254939e-05,
"loss": 106.5426,
"step": 639
},
{
"epoch": 0.07983533961205015,
"grad_norm": 41.0528678894043,
"learning_rate": 7.993295891044452e-05,
"loss": 105.3848,
"step": 640
},
{
"epoch": 0.07996008233019397,
"grad_norm": 44.76206588745117,
"learning_rate": 7.99327315236948e-05,
"loss": 106.2929,
"step": 641
},
{
"epoch": 0.0800848250483378,
"grad_norm": 49.43620681762695,
"learning_rate": 7.993250375230248e-05,
"loss": 109.397,
"step": 642
},
{
"epoch": 0.08020956776648164,
"grad_norm": 46.82504653930664,
"learning_rate": 7.99322755962697e-05,
"loss": 106.4483,
"step": 643
},
{
"epoch": 0.08033431048462546,
"grad_norm": 47.27984619140625,
"learning_rate": 7.99320470555987e-05,
"loss": 107.3205,
"step": 644
},
{
"epoch": 0.08045905320276929,
"grad_norm": 41.160770416259766,
"learning_rate": 7.993181813029164e-05,
"loss": 107.4188,
"step": 645
},
{
"epoch": 0.08058379592091312,
"grad_norm": 41.994773864746094,
"learning_rate": 7.993158882035077e-05,
"loss": 104.8079,
"step": 646
},
{
"epoch": 0.08070853863905694,
"grad_norm": 42.39130783081055,
"learning_rate": 7.993135912577827e-05,
"loss": 111.5084,
"step": 647
},
{
"epoch": 0.08083328135720078,
"grad_norm": 40.742820739746094,
"learning_rate": 7.993112904657637e-05,
"loss": 104.9538,
"step": 648
},
{
"epoch": 0.0809580240753446,
"grad_norm": 44.740196228027344,
"learning_rate": 7.993089858274726e-05,
"loss": 105.4505,
"step": 649
},
{
"epoch": 0.08108276679348843,
"grad_norm": 49.82853317260742,
"learning_rate": 7.993066773429318e-05,
"loss": 107.1045,
"step": 650
},
{
"epoch": 0.08120750951163226,
"grad_norm": 42.36725997924805,
"learning_rate": 7.993043650121636e-05,
"loss": 103.678,
"step": 651
},
{
"epoch": 0.08133225222977608,
"grad_norm": 56.55183029174805,
"learning_rate": 7.9930204883519e-05,
"loss": 108.9454,
"step": 652
},
{
"epoch": 0.08145699494791991,
"grad_norm": 49.053436279296875,
"learning_rate": 7.992997288120335e-05,
"loss": 102.7042,
"step": 653
},
{
"epoch": 0.08158173766606375,
"grad_norm": 42.100643157958984,
"learning_rate": 7.992974049427165e-05,
"loss": 106.1473,
"step": 654
},
{
"epoch": 0.08170648038420757,
"grad_norm": 45.25568771362305,
"learning_rate": 7.992950772272613e-05,
"loss": 108.5189,
"step": 655
},
{
"epoch": 0.0818312231023514,
"grad_norm": 43.13885498046875,
"learning_rate": 7.992927456656902e-05,
"loss": 107.2207,
"step": 656
},
{
"epoch": 0.08195596582049523,
"grad_norm": 42.76498794555664,
"learning_rate": 7.99290410258026e-05,
"loss": 105.2226,
"step": 657
},
{
"epoch": 0.08208070853863905,
"grad_norm": 39.23299789428711,
"learning_rate": 7.992880710042909e-05,
"loss": 103.3401,
"step": 658
},
{
"epoch": 0.08220545125678289,
"grad_norm": 43.75583267211914,
"learning_rate": 7.992857279045074e-05,
"loss": 108.7762,
"step": 659
},
{
"epoch": 0.08233019397492672,
"grad_norm": 45.87968444824219,
"learning_rate": 7.992833809586983e-05,
"loss": 109.7689,
"step": 660
},
{
"epoch": 0.08245493669307054,
"grad_norm": 39.33365249633789,
"learning_rate": 7.992810301668862e-05,
"loss": 101.986,
"step": 661
},
{
"epoch": 0.08257967941121437,
"grad_norm": 46.72096633911133,
"learning_rate": 7.992786755290935e-05,
"loss": 104.0511,
"step": 662
},
{
"epoch": 0.0827044221293582,
"grad_norm": 46.310935974121094,
"learning_rate": 7.99276317045343e-05,
"loss": 107.4769,
"step": 663
},
{
"epoch": 0.08282916484750202,
"grad_norm": 41.02562713623047,
"learning_rate": 7.992739547156574e-05,
"loss": 108.3204,
"step": 664
},
{
"epoch": 0.08295390756564586,
"grad_norm": 46.08228302001953,
"learning_rate": 7.992715885400595e-05,
"loss": 110.8911,
"step": 665
},
{
"epoch": 0.08307865028378969,
"grad_norm": 43.23478698730469,
"learning_rate": 7.992692185185721e-05,
"loss": 108.3253,
"step": 666
},
{
"epoch": 0.08320339300193351,
"grad_norm": 59.85237503051758,
"learning_rate": 7.992668446512181e-05,
"loss": 107.5154,
"step": 667
},
{
"epoch": 0.08332813572007734,
"grad_norm": 53.061336517333984,
"learning_rate": 7.992644669380202e-05,
"loss": 104.7316,
"step": 668
},
{
"epoch": 0.08345287843822116,
"grad_norm": 69.10020446777344,
"learning_rate": 7.992620853790014e-05,
"loss": 103.0065,
"step": 669
},
{
"epoch": 0.083577621156365,
"grad_norm": 38.546260833740234,
"learning_rate": 7.992596999741847e-05,
"loss": 107.0554,
"step": 670
},
{
"epoch": 0.08370236387450883,
"grad_norm": 38.52355194091797,
"learning_rate": 7.992573107235927e-05,
"loss": 106.9307,
"step": 671
},
{
"epoch": 0.08382710659265265,
"grad_norm": 83.6611557006836,
"learning_rate": 7.992549176272489e-05,
"loss": 110.4223,
"step": 672
},
{
"epoch": 0.08395184931079648,
"grad_norm": 90.02223205566406,
"learning_rate": 7.992525206851762e-05,
"loss": 109.0296,
"step": 673
},
{
"epoch": 0.08407659202894031,
"grad_norm": 47.07838439941406,
"learning_rate": 7.992501198973976e-05,
"loss": 104.4575,
"step": 674
},
{
"epoch": 0.08420133474708413,
"grad_norm": 49.113651275634766,
"learning_rate": 7.992477152639362e-05,
"loss": 107.0104,
"step": 675
},
{
"epoch": 0.08432607746522797,
"grad_norm": 56.979713439941406,
"learning_rate": 7.992453067848153e-05,
"loss": 110.0994,
"step": 676
},
{
"epoch": 0.0844508201833718,
"grad_norm": 38.09870910644531,
"learning_rate": 7.99242894460058e-05,
"loss": 102.0461,
"step": 677
},
{
"epoch": 0.08457556290151562,
"grad_norm": 41.27483367919922,
"learning_rate": 7.992404782896876e-05,
"loss": 102.304,
"step": 678
},
{
"epoch": 0.08470030561965945,
"grad_norm": 43.7674674987793,
"learning_rate": 7.992380582737273e-05,
"loss": 104.4165,
"step": 679
},
{
"epoch": 0.08482504833780329,
"grad_norm": 39.6673469543457,
"learning_rate": 7.992356344122006e-05,
"loss": 103.409,
"step": 680
},
{
"epoch": 0.0849497910559471,
"grad_norm": 41.36875534057617,
"learning_rate": 7.992332067051305e-05,
"loss": 104.8153,
"step": 681
},
{
"epoch": 0.08507453377409094,
"grad_norm": 42.88503646850586,
"learning_rate": 7.992307751525406e-05,
"loss": 108.2798,
"step": 682
},
{
"epoch": 0.08519927649223477,
"grad_norm": 42.5274543762207,
"learning_rate": 7.992283397544544e-05,
"loss": 103.5675,
"step": 683
},
{
"epoch": 0.08532401921037859,
"grad_norm": 42.73109436035156,
"learning_rate": 7.992259005108953e-05,
"loss": 106.8281,
"step": 684
},
{
"epoch": 0.08544876192852242,
"grad_norm": 38.829708099365234,
"learning_rate": 7.992234574218866e-05,
"loss": 106.1184,
"step": 685
},
{
"epoch": 0.08557350464666626,
"grad_norm": 44.844303131103516,
"learning_rate": 7.99221010487452e-05,
"loss": 111.0628,
"step": 686
},
{
"epoch": 0.08569824736481008,
"grad_norm": 40.221763610839844,
"learning_rate": 7.992185597076152e-05,
"loss": 105.6422,
"step": 687
},
{
"epoch": 0.08582299008295391,
"grad_norm": 49.15947341918945,
"learning_rate": 7.992161050823996e-05,
"loss": 106.0515,
"step": 688
},
{
"epoch": 0.08594773280109774,
"grad_norm": 42.66958999633789,
"learning_rate": 7.992136466118289e-05,
"loss": 109.9914,
"step": 689
},
{
"epoch": 0.08607247551924156,
"grad_norm": 44.506282806396484,
"learning_rate": 7.992111842959268e-05,
"loss": 105.6297,
"step": 690
},
{
"epoch": 0.0861972182373854,
"grad_norm": 90.39801788330078,
"learning_rate": 7.992087181347171e-05,
"loss": 108.0484,
"step": 691
},
{
"epoch": 0.08632196095552921,
"grad_norm": 54.906681060791016,
"learning_rate": 7.992062481282234e-05,
"loss": 106.7374,
"step": 692
},
{
"epoch": 0.08644670367367305,
"grad_norm": 61.74980163574219,
"learning_rate": 7.992037742764694e-05,
"loss": 110.7419,
"step": 693
},
{
"epoch": 0.08657144639181688,
"grad_norm": 42.0678596496582,
"learning_rate": 7.992012965794792e-05,
"loss": 107.4323,
"step": 694
},
{
"epoch": 0.0866961891099607,
"grad_norm": 49.055519104003906,
"learning_rate": 7.991988150372764e-05,
"loss": 105.9663,
"step": 695
},
{
"epoch": 0.08682093182810453,
"grad_norm": 76.31434631347656,
"learning_rate": 7.991963296498853e-05,
"loss": 107.8577,
"step": 696
},
{
"epoch": 0.08694567454624837,
"grad_norm": 45.10988998413086,
"learning_rate": 7.991938404173296e-05,
"loss": 105.9756,
"step": 697
},
{
"epoch": 0.08707041726439219,
"grad_norm": 48.952816009521484,
"learning_rate": 7.991913473396332e-05,
"loss": 106.8229,
"step": 698
},
{
"epoch": 0.08719515998253602,
"grad_norm": 79.2990951538086,
"learning_rate": 7.991888504168201e-05,
"loss": 103.2979,
"step": 699
},
{
"epoch": 0.08731990270067985,
"grad_norm": 50.37455749511719,
"learning_rate": 7.991863496489145e-05,
"loss": 106.9249,
"step": 700
},
{
"epoch": 0.08744464541882367,
"grad_norm": 41.227943420410156,
"learning_rate": 7.991838450359403e-05,
"loss": 107.4157,
"step": 701
},
{
"epoch": 0.0875693881369675,
"grad_norm": 44.825679779052734,
"learning_rate": 7.991813365779218e-05,
"loss": 104.4835,
"step": 702
},
{
"epoch": 0.08769413085511134,
"grad_norm": 71.43273162841797,
"learning_rate": 7.991788242748833e-05,
"loss": 110.2309,
"step": 703
},
{
"epoch": 0.08781887357325516,
"grad_norm": 41.10805130004883,
"learning_rate": 7.991763081268486e-05,
"loss": 110.4019,
"step": 704
},
{
"epoch": 0.08794361629139899,
"grad_norm": 53.66149139404297,
"learning_rate": 7.991737881338423e-05,
"loss": 97.7135,
"step": 705
},
{
"epoch": 0.08806835900954282,
"grad_norm": 47.988792419433594,
"learning_rate": 7.991712642958883e-05,
"loss": 101.7143,
"step": 706
},
{
"epoch": 0.08819310172768664,
"grad_norm": 42.717777252197266,
"learning_rate": 7.991687366130113e-05,
"loss": 107.3156,
"step": 707
},
{
"epoch": 0.08831784444583048,
"grad_norm": 41.4083366394043,
"learning_rate": 7.991662050852354e-05,
"loss": 103.1054,
"step": 708
},
{
"epoch": 0.08844258716397431,
"grad_norm": 41.523681640625,
"learning_rate": 7.991636697125851e-05,
"loss": 105.57,
"step": 709
},
{
"epoch": 0.08856732988211813,
"grad_norm": 39.47271728515625,
"learning_rate": 7.991611304950847e-05,
"loss": 107.3682,
"step": 710
},
{
"epoch": 0.08869207260026196,
"grad_norm": 45.82164764404297,
"learning_rate": 7.991585874327588e-05,
"loss": 105.3516,
"step": 711
},
{
"epoch": 0.08881681531840578,
"grad_norm": 59.3039665222168,
"learning_rate": 7.991560405256319e-05,
"loss": 101.3,
"step": 712
},
{
"epoch": 0.08894155803654961,
"grad_norm": 50.733421325683594,
"learning_rate": 7.991534897737283e-05,
"loss": 108.7571,
"step": 713
},
{
"epoch": 0.08906630075469345,
"grad_norm": 48.42889404296875,
"learning_rate": 7.99150935177073e-05,
"loss": 108.1726,
"step": 714
},
{
"epoch": 0.08919104347283727,
"grad_norm": 50.049869537353516,
"learning_rate": 7.991483767356901e-05,
"loss": 101.2996,
"step": 715
},
{
"epoch": 0.0893157861909811,
"grad_norm": 122.33171081542969,
"learning_rate": 7.991458144496045e-05,
"loss": 103.8954,
"step": 716
},
{
"epoch": 0.08944052890912493,
"grad_norm": 42.81237030029297,
"learning_rate": 7.991432483188411e-05,
"loss": 110.4419,
"step": 717
},
{
"epoch": 0.08956527162726875,
"grad_norm": 44.00999069213867,
"learning_rate": 7.991406783434243e-05,
"loss": 106.0822,
"step": 718
},
{
"epoch": 0.08969001434541259,
"grad_norm": 46.16698455810547,
"learning_rate": 7.991381045233788e-05,
"loss": 106.8559,
"step": 719
},
{
"epoch": 0.08981475706355642,
"grad_norm": 42.88228988647461,
"learning_rate": 7.991355268587296e-05,
"loss": 109.3094,
"step": 720
},
{
"epoch": 0.08993949978170024,
"grad_norm": 42.74678421020508,
"learning_rate": 7.991329453495015e-05,
"loss": 104.0673,
"step": 721
},
{
"epoch": 0.09006424249984407,
"grad_norm": 47.917152404785156,
"learning_rate": 7.991303599957193e-05,
"loss": 106.0147,
"step": 722
},
{
"epoch": 0.0901889852179879,
"grad_norm": 42.79666519165039,
"learning_rate": 7.991277707974078e-05,
"loss": 109.2056,
"step": 723
},
{
"epoch": 0.09031372793613172,
"grad_norm": 38.00052261352539,
"learning_rate": 7.991251777545922e-05,
"loss": 105.7645,
"step": 724
},
{
"epoch": 0.09043847065427556,
"grad_norm": 47.057369232177734,
"learning_rate": 7.991225808672973e-05,
"loss": 103.2221,
"step": 725
},
{
"epoch": 0.09056321337241939,
"grad_norm": 54.878883361816406,
"learning_rate": 7.991199801355482e-05,
"loss": 106.4581,
"step": 726
},
{
"epoch": 0.09068795609056321,
"grad_norm": 37.271907806396484,
"learning_rate": 7.991173755593698e-05,
"loss": 101.2744,
"step": 727
},
{
"epoch": 0.09081269880870704,
"grad_norm": 50.39563751220703,
"learning_rate": 7.991147671387874e-05,
"loss": 104.0232,
"step": 728
},
{
"epoch": 0.09093744152685088,
"grad_norm": 50.6055793762207,
"learning_rate": 7.99112154873826e-05,
"loss": 104.7903,
"step": 729
},
{
"epoch": 0.0910621842449947,
"grad_norm": 508.98358154296875,
"learning_rate": 7.991095387645109e-05,
"loss": 104.2333,
"step": 730
},
{
"epoch": 0.09118692696313853,
"grad_norm": 42.40628433227539,
"learning_rate": 7.991069188108671e-05,
"loss": 107.5562,
"step": 731
},
{
"epoch": 0.09131166968128236,
"grad_norm": 113.87471008300781,
"learning_rate": 7.9910429501292e-05,
"loss": 106.6088,
"step": 732
},
{
"epoch": 0.09143641239942618,
"grad_norm": 61.01314163208008,
"learning_rate": 7.991016673706946e-05,
"loss": 113.2846,
"step": 733
},
{
"epoch": 0.09156115511757001,
"grad_norm": 47.210121154785156,
"learning_rate": 7.990990358842165e-05,
"loss": 110.2903,
"step": 734
},
{
"epoch": 0.09168589783571383,
"grad_norm": 48.640010833740234,
"learning_rate": 7.990964005535108e-05,
"loss": 112.807,
"step": 735
},
{
"epoch": 0.09181064055385767,
"grad_norm": 53.44465255737305,
"learning_rate": 7.990937613786033e-05,
"loss": 103.3546,
"step": 736
},
{
"epoch": 0.0919353832720015,
"grad_norm": 47.38629150390625,
"learning_rate": 7.990911183595191e-05,
"loss": 105.4458,
"step": 737
},
{
"epoch": 0.09206012599014532,
"grad_norm": 54.86217498779297,
"learning_rate": 7.990884714962837e-05,
"loss": 105.1445,
"step": 738
},
{
"epoch": 0.09218486870828915,
"grad_norm": 142.8068389892578,
"learning_rate": 7.990858207889226e-05,
"loss": 103.8207,
"step": 739
},
{
"epoch": 0.09230961142643299,
"grad_norm": 43.48946762084961,
"learning_rate": 7.990831662374612e-05,
"loss": 103.2966,
"step": 740
},
{
"epoch": 0.0924343541445768,
"grad_norm": 44.73538589477539,
"learning_rate": 7.990805078419253e-05,
"loss": 106.0971,
"step": 741
},
{
"epoch": 0.09255909686272064,
"grad_norm": 43.6491813659668,
"learning_rate": 7.990778456023405e-05,
"loss": 110.8985,
"step": 742
},
{
"epoch": 0.09268383958086447,
"grad_norm": 74.74948120117188,
"learning_rate": 7.990751795187324e-05,
"loss": 101.0157,
"step": 743
},
{
"epoch": 0.09280858229900829,
"grad_norm": 38.11289978027344,
"learning_rate": 7.990725095911264e-05,
"loss": 99.2408,
"step": 744
},
{
"epoch": 0.09293332501715212,
"grad_norm": 36.607906341552734,
"learning_rate": 7.990698358195486e-05,
"loss": 105.1861,
"step": 745
},
{
"epoch": 0.09305806773529596,
"grad_norm": 45.420860290527344,
"learning_rate": 7.990671582040247e-05,
"loss": 104.6754,
"step": 746
},
{
"epoch": 0.09318281045343978,
"grad_norm": 50.80894470214844,
"learning_rate": 7.990644767445803e-05,
"loss": 106.7719,
"step": 747
},
{
"epoch": 0.09330755317158361,
"grad_norm": 53.706722259521484,
"learning_rate": 7.990617914412414e-05,
"loss": 106.2123,
"step": 748
},
{
"epoch": 0.09343229588972744,
"grad_norm": 85.75050354003906,
"learning_rate": 7.990591022940338e-05,
"loss": 105.6857,
"step": 749
},
{
"epoch": 0.09355703860787126,
"grad_norm": 44.52595901489258,
"learning_rate": 7.990564093029832e-05,
"loss": 106.1281,
"step": 750
},
{
"epoch": 0.0936817813260151,
"grad_norm": 43.202125549316406,
"learning_rate": 7.99053712468116e-05,
"loss": 109.4265,
"step": 751
},
{
"epoch": 0.09380652404415893,
"grad_norm": 42.681949615478516,
"learning_rate": 7.990510117894578e-05,
"loss": 108.5805,
"step": 752
},
{
"epoch": 0.09393126676230275,
"grad_norm": 37.958431243896484,
"learning_rate": 7.990483072670348e-05,
"loss": 103.1166,
"step": 753
},
{
"epoch": 0.09405600948044658,
"grad_norm": 47.77585983276367,
"learning_rate": 7.990455989008728e-05,
"loss": 108.3645,
"step": 754
},
{
"epoch": 0.0941807521985904,
"grad_norm": 40.16176223754883,
"learning_rate": 7.990428866909983e-05,
"loss": 103.7373,
"step": 755
},
{
"epoch": 0.09430549491673423,
"grad_norm": 49.80518341064453,
"learning_rate": 7.990401706374371e-05,
"loss": 109.7067,
"step": 756
},
{
"epoch": 0.09443023763487807,
"grad_norm": 47.979522705078125,
"learning_rate": 7.990374507402155e-05,
"loss": 102.4978,
"step": 757
},
{
"epoch": 0.09455498035302189,
"grad_norm": 40.75996780395508,
"learning_rate": 7.990347269993595e-05,
"loss": 110.7373,
"step": 758
},
{
"epoch": 0.09467972307116572,
"grad_norm": 42.08850860595703,
"learning_rate": 7.990319994148958e-05,
"loss": 108.5893,
"step": 759
},
{
"epoch": 0.09480446578930955,
"grad_norm": 45.48325729370117,
"learning_rate": 7.9902926798685e-05,
"loss": 106.778,
"step": 760
},
{
"epoch": 0.09492920850745337,
"grad_norm": 46.223411560058594,
"learning_rate": 7.99026532715249e-05,
"loss": 106.4908,
"step": 761
},
{
"epoch": 0.0950539512255972,
"grad_norm": 58.42659378051758,
"learning_rate": 7.990237936001189e-05,
"loss": 110.1278,
"step": 762
},
{
"epoch": 0.09517869394374104,
"grad_norm": 41.85593032836914,
"learning_rate": 7.99021050641486e-05,
"loss": 104.7367,
"step": 763
},
{
"epoch": 0.09530343666188486,
"grad_norm": 336.8876037597656,
"learning_rate": 7.990183038393768e-05,
"loss": 104.4902,
"step": 764
},
{
"epoch": 0.09542817938002869,
"grad_norm": 61.45979690551758,
"learning_rate": 7.99015553193818e-05,
"loss": 104.4082,
"step": 765
},
{
"epoch": 0.09555292209817252,
"grad_norm": 39.66130447387695,
"learning_rate": 7.990127987048358e-05,
"loss": 103.0021,
"step": 766
},
{
"epoch": 0.09567766481631634,
"grad_norm": 55.256771087646484,
"learning_rate": 7.990100403724567e-05,
"loss": 106.8181,
"step": 767
},
{
"epoch": 0.09580240753446018,
"grad_norm": 499.0802917480469,
"learning_rate": 7.990072781967075e-05,
"loss": 102.9369,
"step": 768
},
{
"epoch": 0.09592715025260401,
"grad_norm": 51.57876968383789,
"learning_rate": 7.990045121776146e-05,
"loss": 109.7766,
"step": 769
},
{
"epoch": 0.09605189297074783,
"grad_norm": 76.01288604736328,
"learning_rate": 7.990017423152048e-05,
"loss": 102.3456,
"step": 770
},
{
"epoch": 0.09617663568889166,
"grad_norm": 54.761348724365234,
"learning_rate": 7.989989686095046e-05,
"loss": 101.4553,
"step": 771
},
{
"epoch": 0.0963013784070355,
"grad_norm": 57.642066955566406,
"learning_rate": 7.989961910605409e-05,
"loss": 109.8551,
"step": 772
},
{
"epoch": 0.09642612112517931,
"grad_norm": 46.24718475341797,
"learning_rate": 7.989934096683403e-05,
"loss": 106.0546,
"step": 773
},
{
"epoch": 0.09655086384332315,
"grad_norm": 59.15684509277344,
"learning_rate": 7.989906244329298e-05,
"loss": 109.5013,
"step": 774
},
{
"epoch": 0.09667560656146698,
"grad_norm": 42.57009506225586,
"learning_rate": 7.98987835354336e-05,
"loss": 98.1647,
"step": 775
},
{
"epoch": 0.0968003492796108,
"grad_norm": 43.09354782104492,
"learning_rate": 7.98985042432586e-05,
"loss": 104.0455,
"step": 776
},
{
"epoch": 0.09692509199775463,
"grad_norm": 39.82168197631836,
"learning_rate": 7.989822456677063e-05,
"loss": 100.2606,
"step": 777
},
{
"epoch": 0.09704983471589845,
"grad_norm": 45.10736846923828,
"learning_rate": 7.989794450597244e-05,
"loss": 103.48,
"step": 778
},
{
"epoch": 0.09717457743404229,
"grad_norm": 38.477699279785156,
"learning_rate": 7.989766406086669e-05,
"loss": 103.3725,
"step": 779
},
{
"epoch": 0.09729932015218612,
"grad_norm": 39.18229293823242,
"learning_rate": 7.989738323145607e-05,
"loss": 111.0592,
"step": 780
},
{
"epoch": 0.09742406287032994,
"grad_norm": 45.862674713134766,
"learning_rate": 7.989710201774332e-05,
"loss": 107.9973,
"step": 781
},
{
"epoch": 0.09754880558847377,
"grad_norm": 44.29547882080078,
"learning_rate": 7.989682041973114e-05,
"loss": 111.2779,
"step": 782
},
{
"epoch": 0.0976735483066176,
"grad_norm": 46.02275466918945,
"learning_rate": 7.989653843742222e-05,
"loss": 109.4454,
"step": 783
},
{
"epoch": 0.09779829102476142,
"grad_norm": 39.75370788574219,
"learning_rate": 7.98962560708193e-05,
"loss": 105.7309,
"step": 784
},
{
"epoch": 0.09792303374290526,
"grad_norm": 44.61409378051758,
"learning_rate": 7.98959733199251e-05,
"loss": 110.2811,
"step": 785
},
{
"epoch": 0.09804777646104909,
"grad_norm": 60.925636291503906,
"learning_rate": 7.989569018474232e-05,
"loss": 103.8995,
"step": 786
},
{
"epoch": 0.09817251917919291,
"grad_norm": 44.189571380615234,
"learning_rate": 7.98954066652737e-05,
"loss": 100.0444,
"step": 787
},
{
"epoch": 0.09829726189733674,
"grad_norm": 42.29519271850586,
"learning_rate": 7.9895122761522e-05,
"loss": 111.4144,
"step": 788
},
{
"epoch": 0.09842200461548058,
"grad_norm": 61.63140869140625,
"learning_rate": 7.98948384734899e-05,
"loss": 106.5681,
"step": 789
},
{
"epoch": 0.0985467473336244,
"grad_norm": 45.61201477050781,
"learning_rate": 7.989455380118017e-05,
"loss": 101.0333,
"step": 790
},
{
"epoch": 0.09867149005176823,
"grad_norm": 40.22490310668945,
"learning_rate": 7.989426874459557e-05,
"loss": 104.7518,
"step": 791
},
{
"epoch": 0.09879623276991206,
"grad_norm": 48.98191833496094,
"learning_rate": 7.98939833037388e-05,
"loss": 106.4405,
"step": 792
},
{
"epoch": 0.09892097548805588,
"grad_norm": 39.18212127685547,
"learning_rate": 7.989369747861264e-05,
"loss": 102.1501,
"step": 793
},
{
"epoch": 0.09904571820619971,
"grad_norm": 40.757080078125,
"learning_rate": 7.989341126921984e-05,
"loss": 107.2878,
"step": 794
},
{
"epoch": 0.09917046092434355,
"grad_norm": 45.26984786987305,
"learning_rate": 7.989312467556316e-05,
"loss": 110.3787,
"step": 795
},
{
"epoch": 0.09929520364248737,
"grad_norm": 41.0319938659668,
"learning_rate": 7.989283769764534e-05,
"loss": 103.3471,
"step": 796
},
{
"epoch": 0.0994199463606312,
"grad_norm": 43.914649963378906,
"learning_rate": 7.989255033546917e-05,
"loss": 112.2347,
"step": 797
},
{
"epoch": 0.09954468907877502,
"grad_norm": 36.17337417602539,
"learning_rate": 7.98922625890374e-05,
"loss": 101.1091,
"step": 798
},
{
"epoch": 0.09966943179691885,
"grad_norm": 42.49659729003906,
"learning_rate": 7.98919744583528e-05,
"loss": 105.4335,
"step": 799
},
{
"epoch": 0.09979417451506269,
"grad_norm": 50.60047149658203,
"learning_rate": 7.989168594341817e-05,
"loss": 104.0879,
"step": 800
},
{
"epoch": 0.0999189172332065,
"grad_norm": 50.78673553466797,
"learning_rate": 7.989139704423626e-05,
"loss": 108.062,
"step": 801
},
{
"epoch": 0.10004365995135034,
"grad_norm": 200.33123779296875,
"learning_rate": 7.989110776080988e-05,
"loss": 100.7159,
"step": 802
},
{
"epoch": 0.10016840266949417,
"grad_norm": 44.31495666503906,
"learning_rate": 7.989081809314178e-05,
"loss": 100.621,
"step": 803
},
{
"epoch": 0.10029314538763799,
"grad_norm": 43.07558822631836,
"learning_rate": 7.989052804123478e-05,
"loss": 103.0756,
"step": 804
},
{
"epoch": 0.10041788810578182,
"grad_norm": 41.00045394897461,
"learning_rate": 7.989023760509167e-05,
"loss": 105.6907,
"step": 805
},
{
"epoch": 0.10054263082392566,
"grad_norm": 52.24927520751953,
"learning_rate": 7.988994678471524e-05,
"loss": 106.8971,
"step": 806
},
{
"epoch": 0.10066737354206948,
"grad_norm": 80.47367095947266,
"learning_rate": 7.98896555801083e-05,
"loss": 101.5454,
"step": 807
},
{
"epoch": 0.10079211626021331,
"grad_norm": 41.2133674621582,
"learning_rate": 7.988936399127364e-05,
"loss": 103.9278,
"step": 808
},
{
"epoch": 0.10091685897835714,
"grad_norm": 64.10038757324219,
"learning_rate": 7.988907201821409e-05,
"loss": 108.6362,
"step": 809
},
{
"epoch": 0.10104160169650096,
"grad_norm": 62.752601623535156,
"learning_rate": 7.988877966093243e-05,
"loss": 107.9309,
"step": 810
},
{
"epoch": 0.1011663444146448,
"grad_norm": 40.524864196777344,
"learning_rate": 7.988848691943151e-05,
"loss": 109.8847,
"step": 811
},
{
"epoch": 0.10129108713278863,
"grad_norm": 44.92561721801758,
"learning_rate": 7.988819379371414e-05,
"loss": 104.7545,
"step": 812
},
{
"epoch": 0.10141582985093245,
"grad_norm": 48.244590759277344,
"learning_rate": 7.988790028378314e-05,
"loss": 106.1893,
"step": 813
},
{
"epoch": 0.10154057256907628,
"grad_norm": 77.95331573486328,
"learning_rate": 7.988760638964133e-05,
"loss": 101.8331,
"step": 814
},
{
"epoch": 0.10166531528722011,
"grad_norm": 77.04405212402344,
"learning_rate": 7.988731211129154e-05,
"loss": 109.2841,
"step": 815
},
{
"epoch": 0.10179005800536393,
"grad_norm": 45.87663269042969,
"learning_rate": 7.988701744873663e-05,
"loss": 100.363,
"step": 816
},
{
"epoch": 0.10191480072350777,
"grad_norm": 81.59390258789062,
"learning_rate": 7.988672240197941e-05,
"loss": 112.6016,
"step": 817
},
{
"epoch": 0.10203954344165159,
"grad_norm": 44.2415771484375,
"learning_rate": 7.988642697102273e-05,
"loss": 106.8036,
"step": 818
},
{
"epoch": 0.10216428615979542,
"grad_norm": 39.88206481933594,
"learning_rate": 7.988613115586944e-05,
"loss": 105.9072,
"step": 819
},
{
"epoch": 0.10228902887793925,
"grad_norm": 50.07147216796875,
"learning_rate": 7.988583495652239e-05,
"loss": 103.1147,
"step": 820
},
{
"epoch": 0.10241377159608307,
"grad_norm": 39.9105110168457,
"learning_rate": 7.988553837298443e-05,
"loss": 104.759,
"step": 821
},
{
"epoch": 0.1025385143142269,
"grad_norm": 53.808650970458984,
"learning_rate": 7.988524140525843e-05,
"loss": 106.6496,
"step": 822
},
{
"epoch": 0.10266325703237074,
"grad_norm": 41.861778259277344,
"learning_rate": 7.988494405334721e-05,
"loss": 105.2179,
"step": 823
},
{
"epoch": 0.10278799975051456,
"grad_norm": 282.9111633300781,
"learning_rate": 7.988464631725369e-05,
"loss": 103.6762,
"step": 824
},
{
"epoch": 0.10291274246865839,
"grad_norm": 41.241756439208984,
"learning_rate": 7.988434819698068e-05,
"loss": 105.4725,
"step": 825
},
{
"epoch": 0.10303748518680222,
"grad_norm": 39.310672760009766,
"learning_rate": 7.98840496925311e-05,
"loss": 103.0432,
"step": 826
},
{
"epoch": 0.10316222790494604,
"grad_norm": 40.880271911621094,
"learning_rate": 7.988375080390781e-05,
"loss": 105.192,
"step": 827
},
{
"epoch": 0.10328697062308988,
"grad_norm": 39.575096130371094,
"learning_rate": 7.988345153111368e-05,
"loss": 102.2648,
"step": 828
},
{
"epoch": 0.10341171334123371,
"grad_norm": 41.8486328125,
"learning_rate": 7.98831518741516e-05,
"loss": 107.5677,
"step": 829
},
{
"epoch": 0.10353645605937753,
"grad_norm": 40.05039596557617,
"learning_rate": 7.988285183302445e-05,
"loss": 106.2296,
"step": 830
},
{
"epoch": 0.10366119877752136,
"grad_norm": 45.02509689331055,
"learning_rate": 7.988255140773514e-05,
"loss": 108.0904,
"step": 831
},
{
"epoch": 0.1037859414956652,
"grad_norm": 60.33889389038086,
"learning_rate": 7.988225059828653e-05,
"loss": 111.0632,
"step": 832
},
{
"epoch": 0.10391068421380902,
"grad_norm": 40.7230110168457,
"learning_rate": 7.988194940468154e-05,
"loss": 102.5616,
"step": 833
},
{
"epoch": 0.10403542693195285,
"grad_norm": 50.49104690551758,
"learning_rate": 7.988164782692308e-05,
"loss": 102.9625,
"step": 834
},
{
"epoch": 0.10416016965009668,
"grad_norm": 56.1006965637207,
"learning_rate": 7.988134586501401e-05,
"loss": 106.196,
"step": 835
},
{
"epoch": 0.1042849123682405,
"grad_norm": 41.65760803222656,
"learning_rate": 7.988104351895731e-05,
"loss": 104.8416,
"step": 836
},
{
"epoch": 0.10440965508638433,
"grad_norm": 42.21990203857422,
"learning_rate": 7.988074078875583e-05,
"loss": 107.6498,
"step": 837
},
{
"epoch": 0.10453439780452817,
"grad_norm": 40.52500534057617,
"learning_rate": 7.988043767441251e-05,
"loss": 104.1593,
"step": 838
},
{
"epoch": 0.10465914052267199,
"grad_norm": 62.55048751831055,
"learning_rate": 7.988013417593028e-05,
"loss": 106.9048,
"step": 839
},
{
"epoch": 0.10478388324081582,
"grad_norm": 178.96156311035156,
"learning_rate": 7.987983029331204e-05,
"loss": 103.9648,
"step": 840
},
{
"epoch": 0.10490862595895964,
"grad_norm": 42.93333435058594,
"learning_rate": 7.987952602656073e-05,
"loss": 102.5713,
"step": 841
},
{
"epoch": 0.10503336867710347,
"grad_norm": 37.63547897338867,
"learning_rate": 7.987922137567929e-05,
"loss": 106.8622,
"step": 842
},
{
"epoch": 0.1051581113952473,
"grad_norm": 40.6112060546875,
"learning_rate": 7.987891634067064e-05,
"loss": 104.8328,
"step": 843
},
{
"epoch": 0.10528285411339112,
"grad_norm": 45.14109420776367,
"learning_rate": 7.987861092153772e-05,
"loss": 102.7242,
"step": 844
},
{
"epoch": 0.10540759683153496,
"grad_norm": 38.93944549560547,
"learning_rate": 7.987830511828346e-05,
"loss": 104.042,
"step": 845
},
{
"epoch": 0.10553233954967879,
"grad_norm": 38.24641036987305,
"learning_rate": 7.987799893091084e-05,
"loss": 104.492,
"step": 846
},
{
"epoch": 0.10565708226782261,
"grad_norm": 203.7810516357422,
"learning_rate": 7.987769235942279e-05,
"loss": 106.5983,
"step": 847
},
{
"epoch": 0.10578182498596644,
"grad_norm": 57.402645111083984,
"learning_rate": 7.987738540382225e-05,
"loss": 106.0824,
"step": 848
},
{
"epoch": 0.10590656770411028,
"grad_norm": 41.6092414855957,
"learning_rate": 7.98770780641122e-05,
"loss": 106.6543,
"step": 849
},
{
"epoch": 0.1060313104222541,
"grad_norm": 44.0145263671875,
"learning_rate": 7.987677034029559e-05,
"loss": 109.0277,
"step": 850
},
{
"epoch": 0.10615605314039793,
"grad_norm": 38.79643249511719,
"learning_rate": 7.987646223237537e-05,
"loss": 106.1932,
"step": 851
},
{
"epoch": 0.10628079585854176,
"grad_norm": 41.33721160888672,
"learning_rate": 7.987615374035453e-05,
"loss": 102.2222,
"step": 852
},
{
"epoch": 0.10640553857668558,
"grad_norm": 54.679168701171875,
"learning_rate": 7.987584486423603e-05,
"loss": 100.287,
"step": 853
},
{
"epoch": 0.10653028129482942,
"grad_norm": 45.882591247558594,
"learning_rate": 7.987553560402285e-05,
"loss": 111.3827,
"step": 854
},
{
"epoch": 0.10665502401297325,
"grad_norm": 48.85630798339844,
"learning_rate": 7.987522595971797e-05,
"loss": 102.9787,
"step": 855
},
{
"epoch": 0.10677976673111707,
"grad_norm": 54.37158966064453,
"learning_rate": 7.987491593132436e-05,
"loss": 105.97,
"step": 856
},
{
"epoch": 0.1069045094492609,
"grad_norm": 42.33659744262695,
"learning_rate": 7.987460551884501e-05,
"loss": 100.2218,
"step": 857
},
{
"epoch": 0.10702925216740473,
"grad_norm": 45.72772216796875,
"learning_rate": 7.987429472228293e-05,
"loss": 107.5223,
"step": 858
},
{
"epoch": 0.10715399488554855,
"grad_norm": 41.7576789855957,
"learning_rate": 7.987398354164109e-05,
"loss": 100.588,
"step": 859
},
{
"epoch": 0.10727873760369239,
"grad_norm": 42.842140197753906,
"learning_rate": 7.987367197692251e-05,
"loss": 106.3858,
"step": 860
},
{
"epoch": 0.1074034803218362,
"grad_norm": 44.517799377441406,
"learning_rate": 7.987336002813016e-05,
"loss": 105.162,
"step": 861
},
{
"epoch": 0.10752822303998004,
"grad_norm": 49.97475814819336,
"learning_rate": 7.987304769526707e-05,
"loss": 107.6431,
"step": 862
},
{
"epoch": 0.10765296575812387,
"grad_norm": 53.77640151977539,
"learning_rate": 7.987273497833625e-05,
"loss": 110.8637,
"step": 863
},
{
"epoch": 0.10777770847626769,
"grad_norm": 43.64461898803711,
"learning_rate": 7.987242187734069e-05,
"loss": 106.3153,
"step": 864
},
{
"epoch": 0.10790245119441152,
"grad_norm": 40.418739318847656,
"learning_rate": 7.987210839228343e-05,
"loss": 108.0903,
"step": 865
},
{
"epoch": 0.10802719391255536,
"grad_norm": 46.151283264160156,
"learning_rate": 7.987179452316747e-05,
"loss": 103.5777,
"step": 866
},
{
"epoch": 0.10815193663069918,
"grad_norm": 49.916908264160156,
"learning_rate": 7.987148026999585e-05,
"loss": 108.3025,
"step": 867
},
{
"epoch": 0.10827667934884301,
"grad_norm": 53.61832809448242,
"learning_rate": 7.987116563277157e-05,
"loss": 105.6921,
"step": 868
},
{
"epoch": 0.10840142206698684,
"grad_norm": 45.600406646728516,
"learning_rate": 7.98708506114977e-05,
"loss": 106.3967,
"step": 869
},
{
"epoch": 0.10852616478513066,
"grad_norm": 42.39280700683594,
"learning_rate": 7.987053520617725e-05,
"loss": 100.8755,
"step": 870
},
{
"epoch": 0.1086509075032745,
"grad_norm": 59.4989128112793,
"learning_rate": 7.987021941681324e-05,
"loss": 107.5212,
"step": 871
},
{
"epoch": 0.10877565022141833,
"grad_norm": 35.99620819091797,
"learning_rate": 7.986990324340876e-05,
"loss": 103.4469,
"step": 872
},
{
"epoch": 0.10890039293956215,
"grad_norm": 35.833152770996094,
"learning_rate": 7.986958668596682e-05,
"loss": 107.0715,
"step": 873
},
{
"epoch": 0.10902513565770598,
"grad_norm": 36.77204895019531,
"learning_rate": 7.986926974449047e-05,
"loss": 103.4294,
"step": 874
},
{
"epoch": 0.10914987837584982,
"grad_norm": 43.57014083862305,
"learning_rate": 7.986895241898278e-05,
"loss": 104.7524,
"step": 875
},
{
"epoch": 0.10927462109399363,
"grad_norm": 39.66145706176758,
"learning_rate": 7.98686347094468e-05,
"loss": 102.1263,
"step": 876
},
{
"epoch": 0.10939936381213747,
"grad_norm": 41.383113861083984,
"learning_rate": 7.986831661588558e-05,
"loss": 104.2822,
"step": 877
},
{
"epoch": 0.1095241065302813,
"grad_norm": 41.776611328125,
"learning_rate": 7.98679981383022e-05,
"loss": 103.5317,
"step": 878
},
{
"epoch": 0.10964884924842512,
"grad_norm": 38.522159576416016,
"learning_rate": 7.986767927669971e-05,
"loss": 102.35,
"step": 879
},
{
"epoch": 0.10977359196656895,
"grad_norm": 39.333011627197266,
"learning_rate": 7.986736003108119e-05,
"loss": 103.1551,
"step": 880
},
{
"epoch": 0.10989833468471279,
"grad_norm": 45.68740463256836,
"learning_rate": 7.986704040144974e-05,
"loss": 106.6324,
"step": 881
},
{
"epoch": 0.1100230774028566,
"grad_norm": 40.96712875366211,
"learning_rate": 7.986672038780839e-05,
"loss": 106.8072,
"step": 882
},
{
"epoch": 0.11014782012100044,
"grad_norm": 44.82123947143555,
"learning_rate": 7.986639999016024e-05,
"loss": 106.9951,
"step": 883
},
{
"epoch": 0.11027256283914426,
"grad_norm": 134.05068969726562,
"learning_rate": 7.986607920850842e-05,
"loss": 96.2491,
"step": 884
},
{
"epoch": 0.11039730555728809,
"grad_norm": 357.70623779296875,
"learning_rate": 7.986575804285595e-05,
"loss": 106.8527,
"step": 885
},
{
"epoch": 0.11052204827543193,
"grad_norm": 120.42707061767578,
"learning_rate": 7.986543649320597e-05,
"loss": 109.6929,
"step": 886
},
{
"epoch": 0.11064679099357574,
"grad_norm": 65.52249908447266,
"learning_rate": 7.986511455956155e-05,
"loss": 100.4111,
"step": 887
},
{
"epoch": 0.11077153371171958,
"grad_norm": 56.955753326416016,
"learning_rate": 7.986479224192582e-05,
"loss": 107.0889,
"step": 888
},
{
"epoch": 0.11089627642986341,
"grad_norm": 50.047325134277344,
"learning_rate": 7.986446954030186e-05,
"loss": 102.7183,
"step": 889
},
{
"epoch": 0.11102101914800723,
"grad_norm": 41.446136474609375,
"learning_rate": 7.986414645469281e-05,
"loss": 104.003,
"step": 890
},
{
"epoch": 0.11114576186615106,
"grad_norm": 39.65242004394531,
"learning_rate": 7.986382298510173e-05,
"loss": 98.6816,
"step": 891
},
{
"epoch": 0.1112705045842949,
"grad_norm": 45.511962890625,
"learning_rate": 7.986349913153178e-05,
"loss": 101.1219,
"step": 892
},
{
"epoch": 0.11139524730243872,
"grad_norm": 36.03805160522461,
"learning_rate": 7.986317489398607e-05,
"loss": 98.1383,
"step": 893
},
{
"epoch": 0.11151999002058255,
"grad_norm": 41.99919891357422,
"learning_rate": 7.986285027246771e-05,
"loss": 108.2437,
"step": 894
},
{
"epoch": 0.11164473273872638,
"grad_norm": 41.75285720825195,
"learning_rate": 7.986252526697983e-05,
"loss": 106.9758,
"step": 895
},
{
"epoch": 0.1117694754568702,
"grad_norm": 42.276554107666016,
"learning_rate": 7.986219987752558e-05,
"loss": 105.2391,
"step": 896
},
{
"epoch": 0.11189421817501403,
"grad_norm": 41.39913558959961,
"learning_rate": 7.986187410410806e-05,
"loss": 100.9819,
"step": 897
},
{
"epoch": 0.11201896089315787,
"grad_norm": 48.709537506103516,
"learning_rate": 7.986154794673046e-05,
"loss": 105.0192,
"step": 898
},
{
"epoch": 0.11214370361130169,
"grad_norm": 40.13815689086914,
"learning_rate": 7.986122140539586e-05,
"loss": 107.7475,
"step": 899
},
{
"epoch": 0.11226844632944552,
"grad_norm": 41.38735580444336,
"learning_rate": 7.986089448010744e-05,
"loss": 101.6496,
"step": 900
},
{
"epoch": 0.11239318904758935,
"grad_norm": 45.525936126708984,
"learning_rate": 7.986056717086835e-05,
"loss": 104.8874,
"step": 901
},
{
"epoch": 0.11251793176573317,
"grad_norm": 39.9875602722168,
"learning_rate": 7.986023947768173e-05,
"loss": 104.7757,
"step": 902
},
{
"epoch": 0.112642674483877,
"grad_norm": 50.26521301269531,
"learning_rate": 7.985991140055076e-05,
"loss": 100.241,
"step": 903
},
{
"epoch": 0.11276741720202083,
"grad_norm": 43.497703552246094,
"learning_rate": 7.985958293947856e-05,
"loss": 103.1805,
"step": 904
},
{
"epoch": 0.11289215992016466,
"grad_norm": 45.00985336303711,
"learning_rate": 7.985925409446832e-05,
"loss": 103.3763,
"step": 905
},
{
"epoch": 0.11301690263830849,
"grad_norm": 45.723628997802734,
"learning_rate": 7.985892486552323e-05,
"loss": 103.5462,
"step": 906
},
{
"epoch": 0.11314164535645231,
"grad_norm": 36.972415924072266,
"learning_rate": 7.985859525264642e-05,
"loss": 106.5025,
"step": 907
},
{
"epoch": 0.11326638807459614,
"grad_norm": 45.01524353027344,
"learning_rate": 7.985826525584106e-05,
"loss": 104.2281,
"step": 908
},
{
"epoch": 0.11339113079273998,
"grad_norm": 44.276493072509766,
"learning_rate": 7.985793487511038e-05,
"loss": 104.5315,
"step": 909
},
{
"epoch": 0.1135158735108838,
"grad_norm": 50.3570671081543,
"learning_rate": 7.985760411045752e-05,
"loss": 105.6181,
"step": 910
},
{
"epoch": 0.11364061622902763,
"grad_norm": 35.83014678955078,
"learning_rate": 7.985727296188567e-05,
"loss": 105.2212,
"step": 911
},
{
"epoch": 0.11376535894717146,
"grad_norm": 38.70330810546875,
"learning_rate": 7.985694142939804e-05,
"loss": 102.6777,
"step": 912
},
{
"epoch": 0.11389010166531528,
"grad_norm": 48.43614959716797,
"learning_rate": 7.985660951299779e-05,
"loss": 102.7453,
"step": 913
},
{
"epoch": 0.11401484438345912,
"grad_norm": 42.26115798950195,
"learning_rate": 7.985627721268815e-05,
"loss": 105.541,
"step": 914
},
{
"epoch": 0.11413958710160295,
"grad_norm": 36.86589431762695,
"learning_rate": 7.985594452847231e-05,
"loss": 105.0147,
"step": 915
},
{
"epoch": 0.11426432981974677,
"grad_norm": 39.08838653564453,
"learning_rate": 7.985561146035349e-05,
"loss": 102.7998,
"step": 916
},
{
"epoch": 0.1143890725378906,
"grad_norm": 42.86225128173828,
"learning_rate": 7.985527800833485e-05,
"loss": 107.9756,
"step": 917
},
{
"epoch": 0.11451381525603443,
"grad_norm": 38.890342712402344,
"learning_rate": 7.985494417241965e-05,
"loss": 100.7751,
"step": 918
},
{
"epoch": 0.11463855797417825,
"grad_norm": 40.036407470703125,
"learning_rate": 7.98546099526111e-05,
"loss": 103.0615,
"step": 919
},
{
"epoch": 0.11476330069232209,
"grad_norm": 43.01128005981445,
"learning_rate": 7.985427534891238e-05,
"loss": 105.8695,
"step": 920
},
{
"epoch": 0.11488804341046592,
"grad_norm": 40.76323699951172,
"learning_rate": 7.985394036132675e-05,
"loss": 96.6547,
"step": 921
},
{
"epoch": 0.11501278612860974,
"grad_norm": 42.03172302246094,
"learning_rate": 7.985360498985744e-05,
"loss": 102.1727,
"step": 922
},
{
"epoch": 0.11513752884675357,
"grad_norm": 40.702693939208984,
"learning_rate": 7.985326923450766e-05,
"loss": 98.7282,
"step": 923
},
{
"epoch": 0.11526227156489739,
"grad_norm": 48.374271392822266,
"learning_rate": 7.985293309528066e-05,
"loss": 103.6249,
"step": 924
},
{
"epoch": 0.11538701428304123,
"grad_norm": 42.64836120605469,
"learning_rate": 7.985259657217966e-05,
"loss": 108.0894,
"step": 925
},
{
"epoch": 0.11551175700118506,
"grad_norm": 37.28689193725586,
"learning_rate": 7.985225966520791e-05,
"loss": 107.6258,
"step": 926
},
{
"epoch": 0.11563649971932888,
"grad_norm": 43.419681549072266,
"learning_rate": 7.985192237436867e-05,
"loss": 98.6488,
"step": 927
},
{
"epoch": 0.11576124243747271,
"grad_norm": 42.134422302246094,
"learning_rate": 7.985158469966517e-05,
"loss": 99.5028,
"step": 928
},
{
"epoch": 0.11588598515561654,
"grad_norm": 38.71235275268555,
"learning_rate": 7.985124664110066e-05,
"loss": 101.7445,
"step": 929
},
{
"epoch": 0.11601072787376036,
"grad_norm": 41.5626106262207,
"learning_rate": 7.98509081986784e-05,
"loss": 103.9377,
"step": 930
},
{
"epoch": 0.1161354705919042,
"grad_norm": 38.62205123901367,
"learning_rate": 7.985056937240167e-05,
"loss": 105.5863,
"step": 931
},
{
"epoch": 0.11626021331004803,
"grad_norm": 42.40933609008789,
"learning_rate": 7.98502301622737e-05,
"loss": 101.6871,
"step": 932
},
{
"epoch": 0.11638495602819185,
"grad_norm": 255.85252380371094,
"learning_rate": 7.984989056829779e-05,
"loss": 102.0786,
"step": 933
},
{
"epoch": 0.11650969874633568,
"grad_norm": 46.96572494506836,
"learning_rate": 7.98495505904772e-05,
"loss": 103.4196,
"step": 934
},
{
"epoch": 0.11663444146447952,
"grad_norm": 39.9399299621582,
"learning_rate": 7.984921022881519e-05,
"loss": 106.5107,
"step": 935
},
{
"epoch": 0.11675918418262334,
"grad_norm": 47.718692779541016,
"learning_rate": 7.984886948331506e-05,
"loss": 104.1496,
"step": 936
},
{
"epoch": 0.11688392690076717,
"grad_norm": 41.77791976928711,
"learning_rate": 7.984852835398007e-05,
"loss": 108.9352,
"step": 937
},
{
"epoch": 0.117008669618911,
"grad_norm": 42.15372085571289,
"learning_rate": 7.984818684081353e-05,
"loss": 103.684,
"step": 938
},
{
"epoch": 0.11713341233705482,
"grad_norm": 54.79545593261719,
"learning_rate": 7.984784494381871e-05,
"loss": 105.8331,
"step": 939
},
{
"epoch": 0.11725815505519865,
"grad_norm": 41.773746490478516,
"learning_rate": 7.984750266299891e-05,
"loss": 108.9262,
"step": 940
},
{
"epoch": 0.11738289777334249,
"grad_norm": 58.15671920776367,
"learning_rate": 7.984715999835743e-05,
"loss": 100.3065,
"step": 941
},
{
"epoch": 0.1175076404914863,
"grad_norm": 40.945064544677734,
"learning_rate": 7.984681694989755e-05,
"loss": 102.2534,
"step": 942
},
{
"epoch": 0.11763238320963014,
"grad_norm": 41.1585807800293,
"learning_rate": 7.984647351762262e-05,
"loss": 107.23,
"step": 943
},
{
"epoch": 0.11775712592777397,
"grad_norm": 49.104671478271484,
"learning_rate": 7.984612970153591e-05,
"loss": 105.3267,
"step": 944
},
{
"epoch": 0.11788186864591779,
"grad_norm": 42.548301696777344,
"learning_rate": 7.984578550164073e-05,
"loss": 100.4022,
"step": 945
},
{
"epoch": 0.11800661136406163,
"grad_norm": 39.86195755004883,
"learning_rate": 7.984544091794043e-05,
"loss": 105.1634,
"step": 946
},
{
"epoch": 0.11813135408220544,
"grad_norm": 42.331966400146484,
"learning_rate": 7.984509595043829e-05,
"loss": 107.292,
"step": 947
},
{
"epoch": 0.11825609680034928,
"grad_norm": 40.497005462646484,
"learning_rate": 7.984475059913764e-05,
"loss": 104.1257,
"step": 948
},
{
"epoch": 0.11838083951849311,
"grad_norm": 47.4644775390625,
"learning_rate": 7.984440486404184e-05,
"loss": 102.0771,
"step": 949
},
{
"epoch": 0.11850558223663693,
"grad_norm": 60.29157638549805,
"learning_rate": 7.984405874515418e-05,
"loss": 108.2686,
"step": 950
},
{
"epoch": 0.11863032495478076,
"grad_norm": 42.314125061035156,
"learning_rate": 7.984371224247802e-05,
"loss": 103.8263,
"step": 951
},
{
"epoch": 0.1187550676729246,
"grad_norm": 41.67093276977539,
"learning_rate": 7.984336535601668e-05,
"loss": 102.7027,
"step": 952
},
{
"epoch": 0.11887981039106842,
"grad_norm": 45.128570556640625,
"learning_rate": 7.984301808577352e-05,
"loss": 107.38,
"step": 953
},
{
"epoch": 0.11900455310921225,
"grad_norm": 38.7513542175293,
"learning_rate": 7.984267043175186e-05,
"loss": 105.6137,
"step": 954
},
{
"epoch": 0.11912929582735608,
"grad_norm": 43.012454986572266,
"learning_rate": 7.984232239395508e-05,
"loss": 103.1292,
"step": 955
},
{
"epoch": 0.1192540385454999,
"grad_norm": 46.62331008911133,
"learning_rate": 7.98419739723865e-05,
"loss": 107.5923,
"step": 956
},
{
"epoch": 0.11937878126364374,
"grad_norm": 52.90629959106445,
"learning_rate": 7.984162516704949e-05,
"loss": 111.8281,
"step": 957
},
{
"epoch": 0.11950352398178757,
"grad_norm": 42.814151763916016,
"learning_rate": 7.984127597794741e-05,
"loss": 103.1731,
"step": 958
},
{
"epoch": 0.11962826669993139,
"grad_norm": 40.70502853393555,
"learning_rate": 7.984092640508364e-05,
"loss": 102.712,
"step": 959
},
{
"epoch": 0.11975300941807522,
"grad_norm": 38.64878463745117,
"learning_rate": 7.984057644846152e-05,
"loss": 101.0569,
"step": 960
},
{
"epoch": 0.11987775213621905,
"grad_norm": 44.21554946899414,
"learning_rate": 7.984022610808444e-05,
"loss": 100.6943,
"step": 961
},
{
"epoch": 0.12000249485436287,
"grad_norm": 47.2590446472168,
"learning_rate": 7.983987538395574e-05,
"loss": 107.211,
"step": 962
},
{
"epoch": 0.1201272375725067,
"grad_norm": 153.22581481933594,
"learning_rate": 7.983952427607886e-05,
"loss": 101.4361,
"step": 963
},
{
"epoch": 0.12025198029065054,
"grad_norm": 43.805885314941406,
"learning_rate": 7.983917278445713e-05,
"loss": 102.1416,
"step": 964
},
{
"epoch": 0.12037672300879436,
"grad_norm": 53.655426025390625,
"learning_rate": 7.983882090909396e-05,
"loss": 106.817,
"step": 965
},
{
"epoch": 0.12050146572693819,
"grad_norm": 46.62437057495117,
"learning_rate": 7.983846864999273e-05,
"loss": 108.9798,
"step": 966
},
{
"epoch": 0.12062620844508201,
"grad_norm": 41.91475296020508,
"learning_rate": 7.983811600715683e-05,
"loss": 101.471,
"step": 967
},
{
"epoch": 0.12075095116322584,
"grad_norm": 41.89741897583008,
"learning_rate": 7.983776298058967e-05,
"loss": 105.0124,
"step": 968
},
{
"epoch": 0.12087569388136968,
"grad_norm": 44.07170104980469,
"learning_rate": 7.983740957029463e-05,
"loss": 109.3714,
"step": 969
},
{
"epoch": 0.1210004365995135,
"grad_norm": 39.240535736083984,
"learning_rate": 7.983705577627515e-05,
"loss": 102.3179,
"step": 970
},
{
"epoch": 0.12112517931765733,
"grad_norm": 47.37531280517578,
"learning_rate": 7.983670159853459e-05,
"loss": 110.4658,
"step": 971
},
{
"epoch": 0.12124992203580116,
"grad_norm": 38.722755432128906,
"learning_rate": 7.98363470370764e-05,
"loss": 96.1563,
"step": 972
},
{
"epoch": 0.12137466475394498,
"grad_norm": 39.32034683227539,
"learning_rate": 7.983599209190397e-05,
"loss": 99.9755,
"step": 973
},
{
"epoch": 0.12149940747208882,
"grad_norm": 58.30826187133789,
"learning_rate": 7.983563676302075e-05,
"loss": 102.0683,
"step": 974
},
{
"epoch": 0.12162415019023265,
"grad_norm": 37.27293395996094,
"learning_rate": 7.983528105043013e-05,
"loss": 103.4321,
"step": 975
},
{
"epoch": 0.12174889290837647,
"grad_norm": 48.57150650024414,
"learning_rate": 7.983492495413555e-05,
"loss": 105.3405,
"step": 976
},
{
"epoch": 0.1218736356265203,
"grad_norm": 44.16020965576172,
"learning_rate": 7.983456847414044e-05,
"loss": 106.619,
"step": 977
},
{
"epoch": 0.12199837834466414,
"grad_norm": 142.11207580566406,
"learning_rate": 7.983421161044822e-05,
"loss": 103.2059,
"step": 978
},
{
"epoch": 0.12212312106280795,
"grad_norm": 41.91746139526367,
"learning_rate": 7.983385436306236e-05,
"loss": 99.7876,
"step": 979
},
{
"epoch": 0.12224786378095179,
"grad_norm": 43.13427734375,
"learning_rate": 7.983349673198627e-05,
"loss": 105.1217,
"step": 980
},
{
"epoch": 0.12237260649909562,
"grad_norm": 38.959285736083984,
"learning_rate": 7.983313871722341e-05,
"loss": 105.5762,
"step": 981
},
{
"epoch": 0.12249734921723944,
"grad_norm": 39.69189453125,
"learning_rate": 7.983278031877722e-05,
"loss": 100.622,
"step": 982
},
{
"epoch": 0.12262209193538327,
"grad_norm": 37.50962448120117,
"learning_rate": 7.983242153665116e-05,
"loss": 107.0912,
"step": 983
},
{
"epoch": 0.1227468346535271,
"grad_norm": 41.64042282104492,
"learning_rate": 7.983206237084868e-05,
"loss": 105.614,
"step": 984
},
{
"epoch": 0.12287157737167093,
"grad_norm": 37.61865997314453,
"learning_rate": 7.983170282137325e-05,
"loss": 102.5061,
"step": 985
},
{
"epoch": 0.12299632008981476,
"grad_norm": 41.88395309448242,
"learning_rate": 7.983134288822832e-05,
"loss": 106.9319,
"step": 986
},
{
"epoch": 0.12312106280795859,
"grad_norm": 48.3463020324707,
"learning_rate": 7.983098257141736e-05,
"loss": 99.6823,
"step": 987
},
{
"epoch": 0.12324580552610241,
"grad_norm": 41.7379035949707,
"learning_rate": 7.983062187094386e-05,
"loss": 109.6658,
"step": 988
},
{
"epoch": 0.12337054824424624,
"grad_norm": 44.162452697753906,
"learning_rate": 7.983026078681125e-05,
"loss": 101.1912,
"step": 989
},
{
"epoch": 0.12349529096239006,
"grad_norm": 38.681819915771484,
"learning_rate": 7.982989931902306e-05,
"loss": 105.3247,
"step": 990
},
{
"epoch": 0.1236200336805339,
"grad_norm": 49.75293731689453,
"learning_rate": 7.982953746758274e-05,
"loss": 101.1417,
"step": 991
},
{
"epoch": 0.12374477639867773,
"grad_norm": 44.92335891723633,
"learning_rate": 7.982917523249377e-05,
"loss": 104.315,
"step": 992
},
{
"epoch": 0.12386951911682155,
"grad_norm": 39.5562744140625,
"learning_rate": 7.982881261375967e-05,
"loss": 98.5226,
"step": 993
},
{
"epoch": 0.12399426183496538,
"grad_norm": 53.428558349609375,
"learning_rate": 7.982844961138391e-05,
"loss": 102.486,
"step": 994
},
{
"epoch": 0.12411900455310922,
"grad_norm": 40.24176788330078,
"learning_rate": 7.982808622536998e-05,
"loss": 107.4703,
"step": 995
},
{
"epoch": 0.12424374727125304,
"grad_norm": 44.205535888671875,
"learning_rate": 7.982772245572139e-05,
"loss": 99.2608,
"step": 996
},
{
"epoch": 0.12436848998939687,
"grad_norm": 42.553348541259766,
"learning_rate": 7.982735830244166e-05,
"loss": 103.2308,
"step": 997
},
{
"epoch": 0.1244932327075407,
"grad_norm": 45.96080780029297,
"learning_rate": 7.982699376553429e-05,
"loss": 105.5205,
"step": 998
},
{
"epoch": 0.12461797542568452,
"grad_norm": 37.77010726928711,
"learning_rate": 7.982662884500277e-05,
"loss": 107.2647,
"step": 999
},
{
"epoch": 0.12474271814382835,
"grad_norm": 42.76664733886719,
"learning_rate": 7.982626354085063e-05,
"loss": 103.8956,
"step": 1000
},
{
"epoch": 0.12486746086197219,
"grad_norm": 42.41483688354492,
"learning_rate": 7.98258978530814e-05,
"loss": 106.0905,
"step": 1001
},
{
"epoch": 0.12499220358011601,
"grad_norm": 289.9287414550781,
"learning_rate": 7.982553178169858e-05,
"loss": 106.1907,
"step": 1002
},
{
"epoch": 0.12511694629825984,
"grad_norm": 44.18041229248047,
"learning_rate": 7.98251653267057e-05,
"loss": 107.0502,
"step": 1003
},
{
"epoch": 0.12524168901640367,
"grad_norm": 41.63772964477539,
"learning_rate": 7.98247984881063e-05,
"loss": 104.9224,
"step": 1004
},
{
"epoch": 0.1253664317345475,
"grad_norm": 38.821815490722656,
"learning_rate": 7.982443126590392e-05,
"loss": 103.5679,
"step": 1005
},
{
"epoch": 0.1254911744526913,
"grad_norm": 50.01060485839844,
"learning_rate": 7.982406366010208e-05,
"loss": 102.1508,
"step": 1006
},
{
"epoch": 0.12561591717083515,
"grad_norm": 38.84745788574219,
"learning_rate": 7.982369567070432e-05,
"loss": 100.7557,
"step": 1007
},
{
"epoch": 0.12574065988897898,
"grad_norm": 44.66993713378906,
"learning_rate": 7.98233272977142e-05,
"loss": 100.6399,
"step": 1008
},
{
"epoch": 0.1258654026071228,
"grad_norm": 39.005435943603516,
"learning_rate": 7.982295854113527e-05,
"loss": 102.3423,
"step": 1009
},
{
"epoch": 0.12599014532526664,
"grad_norm": 46.47213363647461,
"learning_rate": 7.982258940097106e-05,
"loss": 103.4531,
"step": 1010
},
{
"epoch": 0.12611488804341048,
"grad_norm": 49.512229919433594,
"learning_rate": 7.982221987722515e-05,
"loss": 105.9795,
"step": 1011
},
{
"epoch": 0.12623963076155428,
"grad_norm": 45.11735153198242,
"learning_rate": 7.982184996990107e-05,
"loss": 105.1818,
"step": 1012
},
{
"epoch": 0.12636437347969812,
"grad_norm": 36.413204193115234,
"learning_rate": 7.982147967900242e-05,
"loss": 109.6297,
"step": 1013
},
{
"epoch": 0.12648911619784195,
"grad_norm": 39.70576095581055,
"learning_rate": 7.982110900453274e-05,
"loss": 109.7623,
"step": 1014
},
{
"epoch": 0.12661385891598578,
"grad_norm": 42.081119537353516,
"learning_rate": 7.982073794649561e-05,
"loss": 106.0906,
"step": 1015
},
{
"epoch": 0.12673860163412962,
"grad_norm": 45.607643127441406,
"learning_rate": 7.98203665048946e-05,
"loss": 101.6033,
"step": 1016
},
{
"epoch": 0.12686334435227345,
"grad_norm": 41.00294494628906,
"learning_rate": 7.981999467973329e-05,
"loss": 102.1543,
"step": 1017
},
{
"epoch": 0.12698808707041725,
"grad_norm": 43.26812744140625,
"learning_rate": 7.981962247101526e-05,
"loss": 96.3268,
"step": 1018
},
{
"epoch": 0.1271128297885611,
"grad_norm": 41.6645622253418,
"learning_rate": 7.98192498787441e-05,
"loss": 100.9683,
"step": 1019
},
{
"epoch": 0.12723757250670492,
"grad_norm": 38.62349319458008,
"learning_rate": 7.981887690292339e-05,
"loss": 102.1583,
"step": 1020
},
{
"epoch": 0.12736231522484875,
"grad_norm": 47.12042236328125,
"learning_rate": 7.981850354355673e-05,
"loss": 103.3268,
"step": 1021
},
{
"epoch": 0.1274870579429926,
"grad_norm": 55.38361358642578,
"learning_rate": 7.981812980064772e-05,
"loss": 101.0633,
"step": 1022
},
{
"epoch": 0.1276118006611364,
"grad_norm": 44.878204345703125,
"learning_rate": 7.981775567419994e-05,
"loss": 100.6047,
"step": 1023
},
{
"epoch": 0.12773654337928023,
"grad_norm": 44.93181610107422,
"learning_rate": 7.981738116421704e-05,
"loss": 107.934,
"step": 1024
},
{
"epoch": 0.12786128609742406,
"grad_norm": 73.7461929321289,
"learning_rate": 7.981700627070256e-05,
"loss": 103.9593,
"step": 1025
},
{
"epoch": 0.1279860288155679,
"grad_norm": 39.43525695800781,
"learning_rate": 7.981663099366016e-05,
"loss": 106.4115,
"step": 1026
},
{
"epoch": 0.12811077153371173,
"grad_norm": 47.666988372802734,
"learning_rate": 7.981625533309345e-05,
"loss": 109.1624,
"step": 1027
},
{
"epoch": 0.12823551425185556,
"grad_norm": 47.647090911865234,
"learning_rate": 7.981587928900602e-05,
"loss": 102.6936,
"step": 1028
},
{
"epoch": 0.12836025696999936,
"grad_norm": 39.2376708984375,
"learning_rate": 7.981550286140152e-05,
"loss": 103.5587,
"step": 1029
},
{
"epoch": 0.1284849996881432,
"grad_norm": 43.236576080322266,
"learning_rate": 7.98151260502836e-05,
"loss": 104.3997,
"step": 1030
},
{
"epoch": 0.12860974240628703,
"grad_norm": 45.18012619018555,
"learning_rate": 7.981474885565581e-05,
"loss": 100.2446,
"step": 1031
},
{
"epoch": 0.12873448512443086,
"grad_norm": 43.01432800292969,
"learning_rate": 7.981437127752186e-05,
"loss": 102.2242,
"step": 1032
},
{
"epoch": 0.1288592278425747,
"grad_norm": 41.84919738769531,
"learning_rate": 7.981399331588534e-05,
"loss": 105.1042,
"step": 1033
},
{
"epoch": 0.12898397056071853,
"grad_norm": 40.88037109375,
"learning_rate": 7.981361497074992e-05,
"loss": 105.5376,
"step": 1034
},
{
"epoch": 0.12910871327886234,
"grad_norm": 42.820411682128906,
"learning_rate": 7.981323624211923e-05,
"loss": 101.1741,
"step": 1035
},
{
"epoch": 0.12923345599700617,
"grad_norm": 43.67140579223633,
"learning_rate": 7.981285712999692e-05,
"loss": 104.4841,
"step": 1036
},
{
"epoch": 0.12935819871515,
"grad_norm": 38.88359069824219,
"learning_rate": 7.981247763438663e-05,
"loss": 106.4145,
"step": 1037
},
{
"epoch": 0.12948294143329384,
"grad_norm": 37.679752349853516,
"learning_rate": 7.981209775529203e-05,
"loss": 101.1446,
"step": 1038
},
{
"epoch": 0.12960768415143767,
"grad_norm": 48.2840461730957,
"learning_rate": 7.98117174927168e-05,
"loss": 107.0838,
"step": 1039
},
{
"epoch": 0.1297324268695815,
"grad_norm": 48.89136505126953,
"learning_rate": 7.981133684666456e-05,
"loss": 103.0558,
"step": 1040
},
{
"epoch": 0.1298571695877253,
"grad_norm": 74.89447021484375,
"learning_rate": 7.9810955817139e-05,
"loss": 100.0681,
"step": 1041
},
{
"epoch": 0.12998191230586914,
"grad_norm": 49.1029052734375,
"learning_rate": 7.98105744041438e-05,
"loss": 107.9037,
"step": 1042
},
{
"epoch": 0.13010665502401297,
"grad_norm": 38.52644348144531,
"learning_rate": 7.981019260768261e-05,
"loss": 100.9544,
"step": 1043
},
{
"epoch": 0.1302313977421568,
"grad_norm": 43.38142395019531,
"learning_rate": 7.980981042775912e-05,
"loss": 109.065,
"step": 1044
},
{
"epoch": 0.13035614046030064,
"grad_norm": 39.806941986083984,
"learning_rate": 7.980942786437698e-05,
"loss": 102.733,
"step": 1045
},
{
"epoch": 0.13048088317844445,
"grad_norm": 37.9673957824707,
"learning_rate": 7.980904491753994e-05,
"loss": 106.2415,
"step": 1046
},
{
"epoch": 0.13060562589658828,
"grad_norm": 41.74326705932617,
"learning_rate": 7.980866158725164e-05,
"loss": 107.9857,
"step": 1047
},
{
"epoch": 0.1307303686147321,
"grad_norm": 35.681400299072266,
"learning_rate": 7.980827787351577e-05,
"loss": 103.9756,
"step": 1048
},
{
"epoch": 0.13085511133287595,
"grad_norm": 39.64601516723633,
"learning_rate": 7.980789377633607e-05,
"loss": 107.1496,
"step": 1049
},
{
"epoch": 0.13097985405101978,
"grad_norm": 42.127681732177734,
"learning_rate": 7.980750929571619e-05,
"loss": 102.569,
"step": 1050
},
{
"epoch": 0.1311045967691636,
"grad_norm": 38.39411544799805,
"learning_rate": 7.980712443165987e-05,
"loss": 98.4356,
"step": 1051
},
{
"epoch": 0.13122933948730742,
"grad_norm": 65.14961242675781,
"learning_rate": 7.98067391841708e-05,
"loss": 102.5914,
"step": 1052
},
{
"epoch": 0.13135408220545125,
"grad_norm": 43.52762985229492,
"learning_rate": 7.980635355325268e-05,
"loss": 107.5715,
"step": 1053
},
{
"epoch": 0.13147882492359508,
"grad_norm": 38.31730651855469,
"learning_rate": 7.980596753890923e-05,
"loss": 104.3053,
"step": 1054
},
{
"epoch": 0.13160356764173892,
"grad_norm": 40.169647216796875,
"learning_rate": 7.980558114114418e-05,
"loss": 105.1789,
"step": 1055
},
{
"epoch": 0.13172831035988275,
"grad_norm": 38.715938568115234,
"learning_rate": 7.980519435996126e-05,
"loss": 101.6184,
"step": 1056
},
{
"epoch": 0.13185305307802658,
"grad_norm": 39.234676361083984,
"learning_rate": 7.980480719536416e-05,
"loss": 104.7157,
"step": 1057
},
{
"epoch": 0.1319777957961704,
"grad_norm": 39.56693649291992,
"learning_rate": 7.980441964735666e-05,
"loss": 107.0278,
"step": 1058
},
{
"epoch": 0.13210253851431422,
"grad_norm": 42.31689453125,
"learning_rate": 7.980403171594244e-05,
"loss": 103.4373,
"step": 1059
},
{
"epoch": 0.13222728123245805,
"grad_norm": 44.95036697387695,
"learning_rate": 7.980364340112527e-05,
"loss": 109.1877,
"step": 1060
},
{
"epoch": 0.1323520239506019,
"grad_norm": 39.58286666870117,
"learning_rate": 7.980325470290888e-05,
"loss": 105.3288,
"step": 1061
},
{
"epoch": 0.13247676666874572,
"grad_norm": 43.56354522705078,
"learning_rate": 7.980286562129702e-05,
"loss": 104.8817,
"step": 1062
},
{
"epoch": 0.13260150938688953,
"grad_norm": 55.55686950683594,
"learning_rate": 7.980247615629342e-05,
"loss": 104.4174,
"step": 1063
},
{
"epoch": 0.13272625210503336,
"grad_norm": 38.004058837890625,
"learning_rate": 7.980208630790186e-05,
"loss": 99.3472,
"step": 1064
},
{
"epoch": 0.1328509948231772,
"grad_norm": 38.910770416259766,
"learning_rate": 7.980169607612608e-05,
"loss": 105.5136,
"step": 1065
},
{
"epoch": 0.13297573754132103,
"grad_norm": 45.89297866821289,
"learning_rate": 7.980130546096982e-05,
"loss": 106.1455,
"step": 1066
},
{
"epoch": 0.13310048025946486,
"grad_norm": 43.957679748535156,
"learning_rate": 7.980091446243687e-05,
"loss": 101.8686,
"step": 1067
},
{
"epoch": 0.1332252229776087,
"grad_norm": 39.10406494140625,
"learning_rate": 7.980052308053101e-05,
"loss": 101.8556,
"step": 1068
},
{
"epoch": 0.1333499656957525,
"grad_norm": 42.96813201904297,
"learning_rate": 7.980013131525597e-05,
"loss": 103.0391,
"step": 1069
},
{
"epoch": 0.13347470841389633,
"grad_norm": 40.30632781982422,
"learning_rate": 7.979973916661553e-05,
"loss": 104.1147,
"step": 1070
},
{
"epoch": 0.13359945113204016,
"grad_norm": 38.319923400878906,
"learning_rate": 7.979934663461348e-05,
"loss": 104.163,
"step": 1071
},
{
"epoch": 0.133724193850184,
"grad_norm": 40.895347595214844,
"learning_rate": 7.979895371925362e-05,
"loss": 101.8786,
"step": 1072
},
{
"epoch": 0.13384893656832783,
"grad_norm": 39.7868537902832,
"learning_rate": 7.979856042053968e-05,
"loss": 102.4987,
"step": 1073
},
{
"epoch": 0.13397367928647166,
"grad_norm": 42.26321029663086,
"learning_rate": 7.979816673847551e-05,
"loss": 98.1734,
"step": 1074
},
{
"epoch": 0.13409842200461547,
"grad_norm": 55.74448776245117,
"learning_rate": 7.979777267306485e-05,
"loss": 103.1582,
"step": 1075
},
{
"epoch": 0.1342231647227593,
"grad_norm": 39.195308685302734,
"learning_rate": 7.979737822431155e-05,
"loss": 97.1236,
"step": 1076
},
{
"epoch": 0.13434790744090314,
"grad_norm": 44.07651901245117,
"learning_rate": 7.979698339221936e-05,
"loss": 106.1781,
"step": 1077
},
{
"epoch": 0.13447265015904697,
"grad_norm": 40.62828826904297,
"learning_rate": 7.97965881767921e-05,
"loss": 100.5183,
"step": 1078
},
{
"epoch": 0.1345973928771908,
"grad_norm": 62.084815979003906,
"learning_rate": 7.979619257803359e-05,
"loss": 105.2758,
"step": 1079
},
{
"epoch": 0.13472213559533464,
"grad_norm": 39.98674392700195,
"learning_rate": 7.979579659594762e-05,
"loss": 106.6714,
"step": 1080
},
{
"epoch": 0.13484687831347844,
"grad_norm": 41.30696105957031,
"learning_rate": 7.979540023053802e-05,
"loss": 106.12,
"step": 1081
},
{
"epoch": 0.13497162103162227,
"grad_norm": 50.68461608886719,
"learning_rate": 7.97950034818086e-05,
"loss": 103.3589,
"step": 1082
},
{
"epoch": 0.1350963637497661,
"grad_norm": 39.29159927368164,
"learning_rate": 7.979460634976318e-05,
"loss": 104.7682,
"step": 1083
},
{
"epoch": 0.13522110646790994,
"grad_norm": 42.048763275146484,
"learning_rate": 7.97942088344056e-05,
"loss": 105.9789,
"step": 1084
},
{
"epoch": 0.13534584918605377,
"grad_norm": 40.2606086730957,
"learning_rate": 7.979381093573966e-05,
"loss": 107.7236,
"step": 1085
},
{
"epoch": 0.13547059190419758,
"grad_norm": 51.1106071472168,
"learning_rate": 7.979341265376923e-05,
"loss": 105.8351,
"step": 1086
},
{
"epoch": 0.1355953346223414,
"grad_norm": 40.24812316894531,
"learning_rate": 7.97930139884981e-05,
"loss": 106.2639,
"step": 1087
},
{
"epoch": 0.13572007734048525,
"grad_norm": 52.93092727661133,
"learning_rate": 7.979261493993015e-05,
"loss": 106.2666,
"step": 1088
},
{
"epoch": 0.13584482005862908,
"grad_norm": 37.333187103271484,
"learning_rate": 7.979221550806922e-05,
"loss": 101.8498,
"step": 1089
},
{
"epoch": 0.1359695627767729,
"grad_norm": 38.67643737792969,
"learning_rate": 7.979181569291914e-05,
"loss": 106.495,
"step": 1090
},
{
"epoch": 0.13609430549491675,
"grad_norm": 42.811004638671875,
"learning_rate": 7.979141549448377e-05,
"loss": 108.3244,
"step": 1091
},
{
"epoch": 0.13621904821306055,
"grad_norm": 37.5869255065918,
"learning_rate": 7.979101491276697e-05,
"loss": 106.0436,
"step": 1092
},
{
"epoch": 0.13634379093120438,
"grad_norm": 41.9306755065918,
"learning_rate": 7.979061394777258e-05,
"loss": 105.0257,
"step": 1093
},
{
"epoch": 0.13646853364934822,
"grad_norm": 44.89603042602539,
"learning_rate": 7.979021259950448e-05,
"loss": 103.2085,
"step": 1094
},
{
"epoch": 0.13659327636749205,
"grad_norm": 40.03647994995117,
"learning_rate": 7.978981086796653e-05,
"loss": 101.0322,
"step": 1095
},
{
"epoch": 0.13671801908563588,
"grad_norm": 60.383426666259766,
"learning_rate": 7.97894087531626e-05,
"loss": 102.2396,
"step": 1096
},
{
"epoch": 0.13684276180377972,
"grad_norm": 41.114479064941406,
"learning_rate": 7.978900625509657e-05,
"loss": 101.5029,
"step": 1097
},
{
"epoch": 0.13696750452192352,
"grad_norm": 43.51894760131836,
"learning_rate": 7.97886033737723e-05,
"loss": 103.3899,
"step": 1098
},
{
"epoch": 0.13709224724006736,
"grad_norm": 41.850276947021484,
"learning_rate": 7.978820010919368e-05,
"loss": 105.0282,
"step": 1099
},
{
"epoch": 0.1372169899582112,
"grad_norm": 41.3687744140625,
"learning_rate": 7.97877964613646e-05,
"loss": 102.1651,
"step": 1100
}
],
"logging_steps": 1,
"max_steps": 32060,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3462053003682906e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}