{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 412,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0048543689320388345,
"grad_norm": 3.3324859196579646,
"learning_rate": 9.999854640567861e-06,
"loss": 0.1339,
"step": 1
},
{
"epoch": 0.009708737864077669,
"grad_norm": 3.8088042361483816,
"learning_rate": 9.999418570723189e-06,
"loss": 0.1354,
"step": 2
},
{
"epoch": 0.014563106796116505,
"grad_norm": 5.274233478446681,
"learning_rate": 9.998691815820732e-06,
"loss": 0.2009,
"step": 3
},
{
"epoch": 0.019417475728155338,
"grad_norm": 4.101699811840383,
"learning_rate": 9.997674418116759e-06,
"loss": 0.1982,
"step": 4
},
{
"epoch": 0.024271844660194174,
"grad_norm": 3.924795885151193,
"learning_rate": 9.996366436766612e-06,
"loss": 0.1883,
"step": 5
},
{
"epoch": 0.02912621359223301,
"grad_norm": 5.771957604254228,
"learning_rate": 9.994767947821261e-06,
"loss": 0.188,
"step": 6
},
{
"epoch": 0.03398058252427184,
"grad_norm": 4.039324853603417,
"learning_rate": 9.992879044222887e-06,
"loss": 0.1602,
"step": 7
},
{
"epoch": 0.038834951456310676,
"grad_norm": 3.450341344849428,
"learning_rate": 9.99069983579947e-06,
"loss": 0.1716,
"step": 8
},
{
"epoch": 0.043689320388349516,
"grad_norm": 3.7152133363818325,
"learning_rate": 9.988230449258409e-06,
"loss": 0.1584,
"step": 9
},
{
"epoch": 0.04854368932038835,
"grad_norm": 2.9403672361375413,
"learning_rate": 9.985471028179155e-06,
"loss": 0.1338,
"step": 10
},
{
"epoch": 0.05339805825242718,
"grad_norm": 3.6346123927161558,
"learning_rate": 9.982421733004857e-06,
"loss": 0.1413,
"step": 11
},
{
"epoch": 0.05825242718446602,
"grad_norm": 3.918759862991223,
"learning_rate": 9.979082741033047e-06,
"loss": 0.1571,
"step": 12
},
{
"epoch": 0.06310679611650485,
"grad_norm": 3.3243324871274846,
"learning_rate": 9.975454246405312e-06,
"loss": 0.1373,
"step": 13
},
{
"epoch": 0.06796116504854369,
"grad_norm": 4.090995406850703,
"learning_rate": 9.971536460096021e-06,
"loss": 0.2064,
"step": 14
},
{
"epoch": 0.07281553398058252,
"grad_norm": 4.5651807246739535,
"learning_rate": 9.96732960990005e-06,
"loss": 0.213,
"step": 15
},
{
"epoch": 0.07766990291262135,
"grad_norm": 3.8522165104605186,
"learning_rate": 9.96283394041954e-06,
"loss": 0.1806,
"step": 16
},
{
"epoch": 0.0825242718446602,
"grad_norm": 4.090915249944868,
"learning_rate": 9.95804971304968e-06,
"loss": 0.1594,
"step": 17
},
{
"epoch": 0.08737864077669903,
"grad_norm": 3.343034439706098,
"learning_rate": 9.952977205963496e-06,
"loss": 0.143,
"step": 18
},
{
"epoch": 0.09223300970873786,
"grad_norm": 3.2771634678459036,
"learning_rate": 9.94761671409569e-06,
"loss": 0.1604,
"step": 19
},
{
"epoch": 0.0970873786407767,
"grad_norm": 3.4631851530033657,
"learning_rate": 9.941968549125481e-06,
"loss": 0.1618,
"step": 20
},
{
"epoch": 0.10194174757281553,
"grad_norm": 3.928255474637941,
"learning_rate": 9.936033039458494e-06,
"loss": 0.1778,
"step": 21
},
{
"epoch": 0.10679611650485436,
"grad_norm": 3.3479570283155202,
"learning_rate": 9.929810530207651e-06,
"loss": 0.1591,
"step": 22
},
{
"epoch": 0.11165048543689321,
"grad_norm": 3.6975883529242646,
"learning_rate": 9.923301383173119e-06,
"loss": 0.2006,
"step": 23
},
{
"epoch": 0.11650485436893204,
"grad_norm": 3.469665362839314,
"learning_rate": 9.916505976821262e-06,
"loss": 0.1657,
"step": 24
},
{
"epoch": 0.12135922330097088,
"grad_norm": 3.3643581564878673,
"learning_rate": 9.909424706262647e-06,
"loss": 0.1569,
"step": 25
},
{
"epoch": 0.1262135922330097,
"grad_norm": 4.110694436156542,
"learning_rate": 9.902057983229059e-06,
"loss": 0.1909,
"step": 26
},
{
"epoch": 0.13106796116504854,
"grad_norm": 3.186323923132206,
"learning_rate": 9.894406236049569e-06,
"loss": 0.163,
"step": 27
},
{
"epoch": 0.13592233009708737,
"grad_norm": 3.5498332735613127,
"learning_rate": 9.886469909625624e-06,
"loss": 0.196,
"step": 28
},
{
"epoch": 0.1407766990291262,
"grad_norm": 3.9980031295006313,
"learning_rate": 9.87824946540519e-06,
"loss": 0.208,
"step": 29
},
{
"epoch": 0.14563106796116504,
"grad_norm": 3.4906745199009417,
"learning_rate": 9.869745381355906e-06,
"loss": 0.1664,
"step": 30
},
{
"epoch": 0.15048543689320387,
"grad_norm": 3.954794586878399,
"learning_rate": 9.860958151937303e-06,
"loss": 0.1877,
"step": 31
},
{
"epoch": 0.1553398058252427,
"grad_norm": 3.6372789508629273,
"learning_rate": 9.851888288072053e-06,
"loss": 0.1735,
"step": 32
},
{
"epoch": 0.16019417475728157,
"grad_norm": 3.9820299514191384,
"learning_rate": 9.842536317116262e-06,
"loss": 0.2067,
"step": 33
},
{
"epoch": 0.1650485436893204,
"grad_norm": 3.7102209529281174,
"learning_rate": 9.832902782828801e-06,
"loss": 0.1915,
"step": 34
},
{
"epoch": 0.16990291262135923,
"grad_norm": 3.925675916083603,
"learning_rate": 9.822988245339701e-06,
"loss": 0.1979,
"step": 35
},
{
"epoch": 0.17475728155339806,
"grad_norm": 3.918214682037897,
"learning_rate": 9.81279328111758e-06,
"loss": 0.1596,
"step": 36
},
{
"epoch": 0.1796116504854369,
"grad_norm": 9.706515279117301,
"learning_rate": 9.802318482936121e-06,
"loss": 2.2571,
"step": 37
},
{
"epoch": 0.18446601941747573,
"grad_norm": 6.621327724804455,
"learning_rate": 9.791564459839609e-06,
"loss": 1.938,
"step": 38
},
{
"epoch": 0.18932038834951456,
"grad_norm": 3.7869250833437826,
"learning_rate": 9.780531837107519e-06,
"loss": 0.2153,
"step": 39
},
{
"epoch": 0.1941747572815534,
"grad_norm": 4.474156895071826,
"learning_rate": 9.769221256218165e-06,
"loss": 0.2017,
"step": 40
},
{
"epoch": 0.19902912621359223,
"grad_norm": 4.084460872286135,
"learning_rate": 9.75763337481139e-06,
"loss": 0.2182,
"step": 41
},
{
"epoch": 0.20388349514563106,
"grad_norm": 3.870693933215402,
"learning_rate": 9.745768866650339e-06,
"loss": 0.2244,
"step": 42
},
{
"epoch": 0.2087378640776699,
"grad_norm": 2.902378281214243,
"learning_rate": 9.73362842158228e-06,
"loss": 0.1467,
"step": 43
},
{
"epoch": 0.21359223300970873,
"grad_norm": 3.248116446046984,
"learning_rate": 9.721212745498493e-06,
"loss": 0.1843,
"step": 44
},
{
"epoch": 0.21844660194174756,
"grad_norm": 3.667256599988585,
"learning_rate": 9.70852256029323e-06,
"loss": 0.1765,
"step": 45
},
{
"epoch": 0.22330097087378642,
"grad_norm": 3.839808524241103,
"learning_rate": 9.695558603821735e-06,
"loss": 0.2249,
"step": 46
},
{
"epoch": 0.22815533980582525,
"grad_norm": 3.561913372270993,
"learning_rate": 9.682321629857348e-06,
"loss": 0.1945,
"step": 47
},
{
"epoch": 0.23300970873786409,
"grad_norm": 3.7140607428554686,
"learning_rate": 9.66881240804768e-06,
"loss": 0.1923,
"step": 48
},
{
"epoch": 0.23786407766990292,
"grad_norm": 4.162473919082487,
"learning_rate": 9.655031723869848e-06,
"loss": 0.2106,
"step": 49
},
{
"epoch": 0.24271844660194175,
"grad_norm": 4.088833429529486,
"learning_rate": 9.64098037858483e-06,
"loss": 0.1948,
"step": 50
},
{
"epoch": 0.24757281553398058,
"grad_norm": 6.098858386818918,
"learning_rate": 9.626659189190852e-06,
"loss": 1.3556,
"step": 51
},
{
"epoch": 0.2524271844660194,
"grad_norm": 3.2961565474058565,
"learning_rate": 9.612068988375898e-06,
"loss": 0.1675,
"step": 52
},
{
"epoch": 0.25728155339805825,
"grad_norm": 4.3986677165375525,
"learning_rate": 9.597210624469288e-06,
"loss": 0.1971,
"step": 53
},
{
"epoch": 0.2621359223300971,
"grad_norm": 3.177129834284871,
"learning_rate": 9.582084961392358e-06,
"loss": 0.1439,
"step": 54
},
{
"epoch": 0.2669902912621359,
"grad_norm": 3.3475437780719637,
"learning_rate": 9.566692878608229e-06,
"loss": 0.1618,
"step": 55
},
{
"epoch": 0.27184466019417475,
"grad_norm": 3.8249351827001177,
"learning_rate": 9.551035271070665e-06,
"loss": 0.2095,
"step": 56
},
{
"epoch": 0.2766990291262136,
"grad_norm": 3.7494052477874003,
"learning_rate": 9.53511304917204e-06,
"loss": 0.1899,
"step": 57
},
{
"epoch": 0.2815533980582524,
"grad_norm": 3.137113053607151,
"learning_rate": 9.51892713869041e-06,
"loss": 0.1361,
"step": 58
},
{
"epoch": 0.28640776699029125,
"grad_norm": 4.154996879718851,
"learning_rate": 9.502478480735678e-06,
"loss": 0.1873,
"step": 59
},
{
"epoch": 0.2912621359223301,
"grad_norm": 3.8345434030006658,
"learning_rate": 9.485768031694872e-06,
"loss": 0.1867,
"step": 60
},
{
"epoch": 0.2961165048543689,
"grad_norm": 4.01375505704675,
"learning_rate": 9.468796763176549e-06,
"loss": 0.2054,
"step": 61
},
{
"epoch": 0.30097087378640774,
"grad_norm": 4.06144310993101,
"learning_rate": 9.45156566195429e-06,
"loss": 0.1571,
"step": 62
},
{
"epoch": 0.3058252427184466,
"grad_norm": 3.792468878802408,
"learning_rate": 9.43407572990933e-06,
"loss": 0.1837,
"step": 63
},
{
"epoch": 0.3106796116504854,
"grad_norm": 3.648120070990038,
"learning_rate": 9.416327983972304e-06,
"loss": 0.1938,
"step": 64
},
{
"epoch": 0.3155339805825243,
"grad_norm": 3.489437914184596,
"learning_rate": 9.398323456064124e-06,
"loss": 0.2241,
"step": 65
},
{
"epoch": 0.32038834951456313,
"grad_norm": 3.374340511480333,
"learning_rate": 9.380063193035968e-06,
"loss": 0.177,
"step": 66
},
{
"epoch": 0.32524271844660196,
"grad_norm": 3.264598449702114,
"learning_rate": 9.361548256608421e-06,
"loss": 0.1797,
"step": 67
},
{
"epoch": 0.3300970873786408,
"grad_norm": 3.966830853033163,
"learning_rate": 9.342779723309746e-06,
"loss": 0.2554,
"step": 68
},
{
"epoch": 0.33495145631067963,
"grad_norm": 3.420409800935627,
"learning_rate": 9.323758684413272e-06,
"loss": 0.1478,
"step": 69
},
{
"epoch": 0.33980582524271846,
"grad_norm": 3.7130637493387324,
"learning_rate": 9.304486245873973e-06,
"loss": 0.1751,
"step": 70
},
{
"epoch": 0.3446601941747573,
"grad_norm": 3.5828005486461563,
"learning_rate": 9.284963528264133e-06,
"loss": 0.1823,
"step": 71
},
{
"epoch": 0.34951456310679613,
"grad_norm": 3.3855623884926134,
"learning_rate": 9.26519166670821e-06,
"loss": 0.1685,
"step": 72
},
{
"epoch": 0.35436893203883496,
"grad_norm": 8.968308123292674,
"learning_rate": 9.24517181081683e-06,
"loss": 2.1169,
"step": 73
},
{
"epoch": 0.3592233009708738,
"grad_norm": 3.4944633320992358,
"learning_rate": 9.22490512461995e-06,
"loss": 0.1636,
"step": 74
},
{
"epoch": 0.3640776699029126,
"grad_norm": 4.197357025299537,
"learning_rate": 9.204392786499168e-06,
"loss": 0.2401,
"step": 75
},
{
"epoch": 0.36893203883495146,
"grad_norm": 3.3051744285672373,
"learning_rate": 9.183635989119211e-06,
"loss": 0.1641,
"step": 76
},
{
"epoch": 0.3737864077669903,
"grad_norm": 3.6322680235984235,
"learning_rate": 9.162635939358593e-06,
"loss": 0.1614,
"step": 77
},
{
"epoch": 0.3786407766990291,
"grad_norm": 3.2637668132144895,
"learning_rate": 9.141393858239435e-06,
"loss": 0.1459,
"step": 78
},
{
"epoch": 0.38349514563106796,
"grad_norm": 3.2541730866975893,
"learning_rate": 9.119910980856477e-06,
"loss": 0.1804,
"step": 79
},
{
"epoch": 0.3883495145631068,
"grad_norm": 3.2213080042311586,
"learning_rate": 9.098188556305262e-06,
"loss": 0.19,
"step": 80
},
{
"epoch": 0.3932038834951456,
"grad_norm": 3.4206478733768075,
"learning_rate": 9.076227847609513e-06,
"loss": 0.144,
"step": 81
},
{
"epoch": 0.39805825242718446,
"grad_norm": 2.8054401115428824,
"learning_rate": 9.054030131647682e-06,
"loss": 0.1541,
"step": 82
},
{
"epoch": 0.4029126213592233,
"grad_norm": 3.833189307216376,
"learning_rate": 9.031596699078727e-06,
"loss": 0.231,
"step": 83
},
{
"epoch": 0.4077669902912621,
"grad_norm": 3.692036642881131,
"learning_rate": 9.008928854267054e-06,
"loss": 0.1729,
"step": 84
},
{
"epoch": 0.41262135922330095,
"grad_norm": 3.23541497427633,
"learning_rate": 8.986027915206686e-06,
"loss": 0.1286,
"step": 85
},
{
"epoch": 0.4174757281553398,
"grad_norm": 4.0009005067420365,
"learning_rate": 8.962895213444618e-06,
"loss": 0.1922,
"step": 86
},
{
"epoch": 0.4223300970873786,
"grad_norm": 3.371659427266,
"learning_rate": 8.939532094003409e-06,
"loss": 1.6075,
"step": 87
},
{
"epoch": 0.42718446601941745,
"grad_norm": 4.543078324073699,
"learning_rate": 8.91593991530297e-06,
"loss": 0.1903,
"step": 88
},
{
"epoch": 0.4320388349514563,
"grad_norm": 5.581237410035587,
"learning_rate": 8.892120049081577e-06,
"loss": 0.2364,
"step": 89
},
{
"epoch": 0.4368932038834951,
"grad_norm": 3.4788239166280337,
"learning_rate": 8.868073880316125e-06,
"loss": 0.2073,
"step": 90
},
{
"epoch": 0.441747572815534,
"grad_norm": 3.4847904706068893,
"learning_rate": 8.843802807141584e-06,
"loss": 0.1641,
"step": 91
},
{
"epoch": 0.44660194174757284,
"grad_norm": 4.376363674830701,
"learning_rate": 8.819308240769726e-06,
"loss": 0.2238,
"step": 92
},
{
"epoch": 0.45145631067961167,
"grad_norm": 3.269073216227844,
"learning_rate": 8.794591605407047e-06,
"loss": 0.1974,
"step": 93
},
{
"epoch": 0.4563106796116505,
"grad_norm": 4.3052039894235135,
"learning_rate": 8.769654338171986e-06,
"loss": 0.2162,
"step": 94
},
{
"epoch": 0.46116504854368934,
"grad_norm": 3.300951688264894,
"learning_rate": 8.744497889011344e-06,
"loss": 0.1779,
"step": 95
},
{
"epoch": 0.46601941747572817,
"grad_norm": 3.1987714308242885,
"learning_rate": 8.71912372061598e-06,
"loss": 0.1827,
"step": 96
},
{
"epoch": 0.470873786407767,
"grad_norm": 2.8017077028359885,
"learning_rate": 8.693533308335786e-06,
"loss": 0.148,
"step": 97
},
{
"epoch": 0.47572815533980584,
"grad_norm": 3.102368312031283,
"learning_rate": 8.667728140093876e-06,
"loss": 0.2068,
"step": 98
},
{
"epoch": 0.48058252427184467,
"grad_norm": 3.6508180598366695,
"learning_rate": 8.641709716300092e-06,
"loss": 0.2029,
"step": 99
},
{
"epoch": 0.4854368932038835,
"grad_norm": 3.3950171740747166,
"learning_rate": 8.615479549763756e-06,
"loss": 1.8884,
"step": 100
},
{
"epoch": 0.49029126213592233,
"grad_norm": 3.1312849938657896,
"learning_rate": 8.589039165605716e-06,
"loss": 0.1537,
"step": 101
},
{
"epoch": 0.49514563106796117,
"grad_norm": 3.5801315817647232,
"learning_rate": 8.56239010116966e-06,
"loss": 0.1824,
"step": 102
},
{
"epoch": 0.5,
"grad_norm": 3.546534300447041,
"learning_rate": 8.535533905932739e-06,
"loss": 0.1804,
"step": 103
},
{
"epoch": 0.5048543689320388,
"grad_norm": 3.2877479715730313,
"learning_rate": 8.508472141415468e-06,
"loss": 0.1617,
"step": 104
},
{
"epoch": 0.5097087378640777,
"grad_norm": 3.5584173121991323,
"learning_rate": 8.481206381090934e-06,
"loss": 0.1887,
"step": 105
},
{
"epoch": 0.5145631067961165,
"grad_norm": 3.805569430447853,
"learning_rate": 8.453738210293316e-06,
"loss": 0.1721,
"step": 106
},
{
"epoch": 0.5194174757281553,
"grad_norm": 3.881708238229782,
"learning_rate": 8.426069226125695e-06,
"loss": 0.1838,
"step": 107
},
{
"epoch": 0.5242718446601942,
"grad_norm": 3.98671495838238,
"learning_rate": 8.398201037367202e-06,
"loss": 0.1961,
"step": 108
},
{
"epoch": 0.529126213592233,
"grad_norm": 2.869964566560864,
"learning_rate": 8.370135264379475e-06,
"loss": 0.1474,
"step": 109
},
{
"epoch": 0.5339805825242718,
"grad_norm": 3.3720637127767605,
"learning_rate": 8.341873539012443e-06,
"loss": 0.1704,
"step": 110
},
{
"epoch": 0.5388349514563107,
"grad_norm": 3.3436728179491673,
"learning_rate": 8.313417504509446e-06,
"loss": 0.151,
"step": 111
},
{
"epoch": 0.5436893203883495,
"grad_norm": 3.1939397858342167,
"learning_rate": 8.284768815411693e-06,
"loss": 0.164,
"step": 112
},
{
"epoch": 0.5485436893203883,
"grad_norm": 3.9497595827371903,
"learning_rate": 8.255929137462049e-06,
"loss": 0.1844,
"step": 113
},
{
"epoch": 0.5533980582524272,
"grad_norm": 3.6900776204954093,
"learning_rate": 8.226900147508205e-06,
"loss": 0.1726,
"step": 114
},
{
"epoch": 0.558252427184466,
"grad_norm": 3.416053446028884,
"learning_rate": 8.197683533405156e-06,
"loss": 0.1651,
"step": 115
},
{
"epoch": 0.5631067961165048,
"grad_norm": 3.830007646061756,
"learning_rate": 8.168280993917078e-06,
"loss": 0.2018,
"step": 116
},
{
"epoch": 0.5679611650485437,
"grad_norm": 3.6356803944567706,
"learning_rate": 8.138694238618543e-06,
"loss": 0.186,
"step": 117
},
{
"epoch": 0.5728155339805825,
"grad_norm": 3.268566324515523,
"learning_rate": 8.108924987795137e-06,
"loss": 0.1807,
"step": 118
},
{
"epoch": 0.5776699029126213,
"grad_norm": 3.0731122010223157,
"learning_rate": 8.078974972343414e-06,
"loss": 0.1385,
"step": 119
},
{
"epoch": 0.5825242718446602,
"grad_norm": 3.506409488780749,
"learning_rate": 8.048845933670274e-06,
"loss": 0.1876,
"step": 120
},
{
"epoch": 0.587378640776699,
"grad_norm": 3.0091035344268167,
"learning_rate": 8.01853962359169e-06,
"loss": 0.1542,
"step": 121
},
{
"epoch": 0.5922330097087378,
"grad_norm": 4.000336192381524,
"learning_rate": 7.988057804230878e-06,
"loss": 0.1855,
"step": 122
},
{
"epoch": 0.5970873786407767,
"grad_norm": 2.6774596474593015,
"learning_rate": 7.957402247915817e-06,
"loss": 0.1398,
"step": 123
},
{
"epoch": 0.6019417475728155,
"grad_norm": 3.342400785474733,
"learning_rate": 7.92657473707621e-06,
"loss": 0.1571,
"step": 124
},
{
"epoch": 0.6067961165048543,
"grad_norm": 2.839224634847776,
"learning_rate": 7.895577064139847e-06,
"loss": 0.2838,
"step": 125
},
{
"epoch": 0.6116504854368932,
"grad_norm": 3.265216498891963,
"learning_rate": 7.864411031428379e-06,
"loss": 0.1605,
"step": 126
},
{
"epoch": 0.616504854368932,
"grad_norm": 2.826555903641288,
"learning_rate": 7.833078451052537e-06,
"loss": 0.1552,
"step": 127
},
{
"epoch": 0.6213592233009708,
"grad_norm": 3.263465403684562,
"learning_rate": 7.801581144806752e-06,
"loss": 0.1409,
"step": 128
},
{
"epoch": 0.6262135922330098,
"grad_norm": 3.115501404573261,
"learning_rate": 7.769920944063244e-06,
"loss": 0.1388,
"step": 129
},
{
"epoch": 0.6310679611650486,
"grad_norm": 3.454177176086228,
"learning_rate": 7.73809968966554e-06,
"loss": 0.1691,
"step": 130
},
{
"epoch": 0.6359223300970874,
"grad_norm": 3.669774425160351,
"learning_rate": 7.706119231821423e-06,
"loss": 2.0662,
"step": 131
},
{
"epoch": 0.6407766990291263,
"grad_norm": 2.9578717822726994,
"learning_rate": 7.673981429995372e-06,
"loss": 0.1217,
"step": 132
},
{
"epoch": 0.6456310679611651,
"grad_norm": 3.215506984482298,
"learning_rate": 7.641688152800433e-06,
"loss": 0.1641,
"step": 133
},
{
"epoch": 0.6504854368932039,
"grad_norm": 3.5261600831309683,
"learning_rate": 7.609241277889583e-06,
"loss": 0.1581,
"step": 134
},
{
"epoch": 0.6553398058252428,
"grad_norm": 3.348004239497448,
"learning_rate": 7.5766426918465455e-06,
"loss": 0.1655,
"step": 135
},
{
"epoch": 0.6601941747572816,
"grad_norm": 3.463003474785134,
"learning_rate": 7.5438942900761035e-06,
"loss": 0.179,
"step": 136
},
{
"epoch": 0.6650485436893204,
"grad_norm": 3.299946611300431,
"learning_rate": 7.51099797669389e-06,
"loss": 0.1838,
"step": 137
},
{
"epoch": 0.6699029126213593,
"grad_norm": 3.734578866130712,
"learning_rate": 7.477955664415678e-06,
"loss": 0.2121,
"step": 138
},
{
"epoch": 0.6747572815533981,
"grad_norm": 3.104951473650513,
"learning_rate": 7.444769274446168e-06,
"loss": 0.1522,
"step": 139
},
{
"epoch": 0.6796116504854369,
"grad_norm": 3.133143645012315,
"learning_rate": 7.411440736367281e-06,
"loss": 0.158,
"step": 140
},
{
"epoch": 0.6844660194174758,
"grad_norm": 3.059272264008987,
"learning_rate": 7.377971988025964e-06,
"loss": 0.1431,
"step": 141
},
{
"epoch": 0.6893203883495146,
"grad_norm": 3.4811771159052127,
"learning_rate": 7.3443649754215175e-06,
"loss": 0.1609,
"step": 142
},
{
"epoch": 0.6941747572815534,
"grad_norm": 3.85738998682577,
"learning_rate": 7.310621652592449e-06,
"loss": 0.1537,
"step": 143
},
{
"epoch": 0.6990291262135923,
"grad_norm": 3.4019001310027877,
"learning_rate": 7.276743981502856e-06,
"loss": 0.161,
"step": 144
},
{
"epoch": 0.7038834951456311,
"grad_norm": 3.396761694989364,
"learning_rate": 7.242733931928352e-06,
"loss": 1.968,
"step": 145
},
{
"epoch": 0.7087378640776699,
"grad_norm": 3.0749985922565415,
"learning_rate": 7.208593481341536e-06,
"loss": 0.164,
"step": 146
},
{
"epoch": 0.7135922330097088,
"grad_norm": 3.1539841039599077,
"learning_rate": 7.1743246147970095e-06,
"loss": 0.156,
"step": 147
},
{
"epoch": 0.7184466019417476,
"grad_norm": 3.4892734050571677,
"learning_rate": 7.139929324815965e-06,
"loss": 0.1884,
"step": 148
},
{
"epoch": 0.7233009708737864,
"grad_norm": 2.7320915520428635,
"learning_rate": 7.105409611270332e-06,
"loss": 0.1219,
"step": 149
},
{
"epoch": 0.7281553398058253,
"grad_norm": 3.0178546824462114,
"learning_rate": 7.070767481266493e-06,
"loss": 0.1732,
"step": 150
},
{
"epoch": 0.7330097087378641,
"grad_norm": 2.8056474690169986,
"learning_rate": 7.036004949028587e-06,
"loss": 0.1481,
"step": 151
},
{
"epoch": 0.7378640776699029,
"grad_norm": 3.10021388569035,
"learning_rate": 7.00112403578139e-06,
"loss": 0.1673,
"step": 152
},
{
"epoch": 0.7427184466019418,
"grad_norm": 3.26753532587258,
"learning_rate": 6.9661267696328015e-06,
"loss": 0.1883,
"step": 153
},
{
"epoch": 0.7475728155339806,
"grad_norm": 3.4255182899390033,
"learning_rate": 6.931015185455915e-06,
"loss": 0.2046,
"step": 154
},
{
"epoch": 0.7524271844660194,
"grad_norm": 3.636720170211082,
"learning_rate": 6.895791324770702e-06,
"loss": 0.2068,
"step": 155
},
{
"epoch": 0.7572815533980582,
"grad_norm": 3.053662406730595,
"learning_rate": 6.860457235625322e-06,
"loss": 0.169,
"step": 156
},
{
"epoch": 0.7621359223300971,
"grad_norm": 3.0420805095324686,
"learning_rate": 6.825014972477024e-06,
"loss": 0.1662,
"step": 157
},
{
"epoch": 0.7669902912621359,
"grad_norm": 3.5230733226696898,
"learning_rate": 6.7894665960727105e-06,
"loss": 0.2034,
"step": 158
},
{
"epoch": 0.7718446601941747,
"grad_norm": 3.891671449744887,
"learning_rate": 6.7538141733291e-06,
"loss": 0.2038,
"step": 159
},
{
"epoch": 0.7766990291262136,
"grad_norm": 4.12261748913082,
"learning_rate": 6.7180597772125665e-06,
"loss": 0.1852,
"step": 160
},
{
"epoch": 0.7815533980582524,
"grad_norm": 3.0068851640498195,
"learning_rate": 6.682205486618592e-06,
"loss": 0.1597,
"step": 161
},
{
"epoch": 0.7864077669902912,
"grad_norm": 3.5271313218773885,
"learning_rate": 6.646253386250909e-06,
"loss": 0.2022,
"step": 162
},
{
"epoch": 0.7912621359223301,
"grad_norm": 3.777114140168497,
"learning_rate": 6.610205566500272e-06,
"loss": 0.2029,
"step": 163
},
{
"epoch": 0.7961165048543689,
"grad_norm": 3.2070899922727127,
"learning_rate": 6.574064123322925e-06,
"loss": 0.1883,
"step": 164
},
{
"epoch": 0.8009708737864077,
"grad_norm": 2.856172766834049,
"learning_rate": 6.537831158118733e-06,
"loss": 0.1323,
"step": 165
},
{
"epoch": 0.8058252427184466,
"grad_norm": 3.2401941234963023,
"learning_rate": 6.50150877760899e-06,
"loss": 0.1786,
"step": 166
},
{
"epoch": 0.8106796116504854,
"grad_norm": 3.3661235424780385,
"learning_rate": 6.465099093713944e-06,
"loss": 0.1753,
"step": 167
},
{
"epoch": 0.8155339805825242,
"grad_norm": 2.688408919315557,
"learning_rate": 6.42860422342998e-06,
"loss": 0.1516,
"step": 168
},
{
"epoch": 0.8203883495145631,
"grad_norm": 3.7810659287217723,
"learning_rate": 6.392026288706549e-06,
"loss": 0.1481,
"step": 169
},
{
"epoch": 0.8252427184466019,
"grad_norm": 3.685444342996392,
"learning_rate": 6.3553674163227786e-06,
"loss": 0.1731,
"step": 170
},
{
"epoch": 0.8300970873786407,
"grad_norm": 2.946846864594844,
"learning_rate": 6.318629737763818e-06,
"loss": 0.1609,
"step": 171
},
{
"epoch": 0.8349514563106796,
"grad_norm": 3.8276099009344824,
"learning_rate": 6.281815389096903e-06,
"loss": 0.1881,
"step": 172
},
{
"epoch": 0.8398058252427184,
"grad_norm": 3.303535702159252,
"learning_rate": 6.244926510847162e-06,
"loss": 0.1479,
"step": 173
},
{
"epoch": 0.8446601941747572,
"grad_norm": 3.807883566694266,
"learning_rate": 6.207965247873151e-06,
"loss": 0.1903,
"step": 174
},
{
"epoch": 0.8495145631067961,
"grad_norm": 2.911567051543024,
"learning_rate": 6.1709337492421515e-06,
"loss": 0.1295,
"step": 175
},
{
"epoch": 0.8543689320388349,
"grad_norm": 3.2046177093058166,
"learning_rate": 6.133834168105206e-06,
"loss": 0.1568,
"step": 176
},
{
"epoch": 0.8592233009708737,
"grad_norm": 3.3349281673831803,
"learning_rate": 6.096668661571934e-06,
"loss": 0.1702,
"step": 177
},
{
"epoch": 0.8640776699029126,
"grad_norm": 3.9332514492121096,
"learning_rate": 6.0594393905851065e-06,
"loss": 0.2129,
"step": 178
},
{
"epoch": 0.8689320388349514,
"grad_norm": 3.6828564033007316,
"learning_rate": 6.0221485197949995e-06,
"loss": 0.1777,
"step": 179
},
{
"epoch": 0.8737864077669902,
"grad_norm": 3.9344402235799714,
"learning_rate": 5.9847982174335314e-06,
"loss": 0.218,
"step": 180
},
{
"epoch": 0.8786407766990292,
"grad_norm": 3.437911020577905,
"learning_rate": 5.9473906551881985e-06,
"loss": 0.1915,
"step": 181
},
{
"epoch": 0.883495145631068,
"grad_norm": 3.584257218638752,
"learning_rate": 5.9099280080758085e-06,
"loss": 0.2143,
"step": 182
},
{
"epoch": 0.8883495145631068,
"grad_norm": 3.3884751449077846,
"learning_rate": 5.872412454315999e-06,
"loss": 0.1848,
"step": 183
},
{
"epoch": 0.8932038834951457,
"grad_norm": 3.847696159285092,
"learning_rate": 5.834846175204612e-06,
"loss": 0.1914,
"step": 184
},
{
"epoch": 0.8980582524271845,
"grad_norm": 3.4294270132408733,
"learning_rate": 5.797231354986842e-06,
"loss": 0.1733,
"step": 185
},
{
"epoch": 0.9029126213592233,
"grad_norm": 2.7006091171005244,
"learning_rate": 5.759570180730255e-06,
"loss": 0.1786,
"step": 186
},
{
"epoch": 0.9077669902912622,
"grad_norm": 3.1518913649529128,
"learning_rate": 5.721864842197612e-06,
"loss": 0.1569,
"step": 187
},
{
"epoch": 0.912621359223301,
"grad_norm": 3.9310146604996876,
"learning_rate": 5.684117531719552e-06,
"loss": 0.2194,
"step": 188
},
{
"epoch": 0.9174757281553398,
"grad_norm": 3.6243954425466383,
"learning_rate": 5.646330444067121e-06,
"loss": 0.1515,
"step": 189
},
{
"epoch": 0.9223300970873787,
"grad_norm": 2.9597266924343373,
"learning_rate": 5.608505776324158e-06,
"loss": 0.1631,
"step": 190
},
{
"epoch": 0.9271844660194175,
"grad_norm": 3.29651731345373,
"learning_rate": 5.570645727759558e-06,
"loss": 0.1762,
"step": 191
},
{
"epoch": 0.9320388349514563,
"grad_norm": 3.02642514409216,
"learning_rate": 5.532752499699381e-06,
"loss": 0.1794,
"step": 192
},
{
"epoch": 0.9368932038834952,
"grad_norm": 3.1368920826260736,
"learning_rate": 5.494828295398874e-06,
"loss": 0.178,
"step": 193
},
{
"epoch": 0.941747572815534,
"grad_norm": 2.9748663003734457,
"learning_rate": 5.456875319914355e-06,
"loss": 0.1558,
"step": 194
},
{
"epoch": 0.9466019417475728,
"grad_norm": 3.447694232964055,
"learning_rate": 5.4188957799750145e-06,
"loss": 0.143,
"step": 195
},
{
"epoch": 0.9514563106796117,
"grad_norm": 2.9468105285885122,
"learning_rate": 5.380891883854591e-06,
"loss": 0.1585,
"step": 196
},
{
"epoch": 0.9563106796116505,
"grad_norm": 2.796739428227403,
"learning_rate": 5.34286584124299e-06,
"loss": 0.1345,
"step": 197
},
{
"epoch": 0.9611650485436893,
"grad_norm": 3.0833816121492337,
"learning_rate": 5.304819863117796e-06,
"loss": 0.1247,
"step": 198
},
{
"epoch": 0.9660194174757282,
"grad_norm": 3.0876458359936496,
"learning_rate": 5.266756161615719e-06,
"loss": 0.164,
"step": 199
},
{
"epoch": 0.970873786407767,
"grad_norm": 3.4454095679040484,
"learning_rate": 5.228676949903974e-06,
"loss": 0.1681,
"step": 200
},
{
"epoch": 0.9757281553398058,
"grad_norm": 3.661107038218155,
"learning_rate": 5.190584442051594e-06,
"loss": 0.2034,
"step": 201
},
{
"epoch": 0.9805825242718447,
"grad_norm": 2.9781763427858783,
"learning_rate": 5.1524808529007075e-06,
"loss": 2.0615,
"step": 202
},
{
"epoch": 0.9854368932038835,
"grad_norm": 3.4144794700513006,
"learning_rate": 5.114368397937744e-06,
"loss": 0.18,
"step": 203
},
{
"epoch": 0.9902912621359223,
"grad_norm": 2.891912771085034,
"learning_rate": 5.07624929316463e-06,
"loss": 0.1692,
"step": 204
},
{
"epoch": 0.9951456310679612,
"grad_norm": 3.940859743167215,
"learning_rate": 5.038125754969933e-06,
"loss": 0.1597,
"step": 205
},
{
"epoch": 1.0,
"grad_norm": 2.114078403400398,
"learning_rate": 5e-06,
"loss": 0.0509,
"step": 206
},
{
"epoch": 1.0048543689320388,
"grad_norm": 2.4861837147789907,
"learning_rate": 4.9618742450300675e-06,
"loss": 0.0947,
"step": 207
},
{
"epoch": 1.0097087378640777,
"grad_norm": 2.574928651437252,
"learning_rate": 4.923750706835371e-06,
"loss": 0.0877,
"step": 208
},
{
"epoch": 1.0145631067961165,
"grad_norm": 2.390628942600892,
"learning_rate": 4.8856316020622564e-06,
"loss": 0.0849,
"step": 209
},
{
"epoch": 1.0194174757281553,
"grad_norm": 2.0965587115046147,
"learning_rate": 4.847519147099294e-06,
"loss": 0.0691,
"step": 210
},
{
"epoch": 1.0242718446601942,
"grad_norm": 2.4149356959145742,
"learning_rate": 4.809415557948407e-06,
"loss": 0.095,
"step": 211
},
{
"epoch": 1.029126213592233,
"grad_norm": 2.235598621225836,
"learning_rate": 4.771323050096028e-06,
"loss": 0.0836,
"step": 212
},
{
"epoch": 1.0339805825242718,
"grad_norm": 2.777474574971205,
"learning_rate": 4.733243838384282e-06,
"loss": 0.0846,
"step": 213
},
{
"epoch": 1.0388349514563107,
"grad_norm": 2.7422183722820437,
"learning_rate": 4.6951801368822055e-06,
"loss": 0.0886,
"step": 214
},
{
"epoch": 1.0436893203883495,
"grad_norm": 3.583667258228157,
"learning_rate": 4.6571341587570114e-06,
"loss": 0.0853,
"step": 215
},
{
"epoch": 1.0485436893203883,
"grad_norm": 2.1429438147133983,
"learning_rate": 4.619108116145411e-06,
"loss": 0.0709,
"step": 216
},
{
"epoch": 1.0533980582524272,
"grad_norm": 2.638520623375338,
"learning_rate": 4.581104220024988e-06,
"loss": 0.0918,
"step": 217
},
{
"epoch": 1.058252427184466,
"grad_norm": 2.4373654485190546,
"learning_rate": 4.5431246800856455e-06,
"loss": 0.0659,
"step": 218
},
{
"epoch": 1.0631067961165048,
"grad_norm": 2.954470745196081,
"learning_rate": 4.505171704601128e-06,
"loss": 0.0865,
"step": 219
},
{
"epoch": 1.0679611650485437,
"grad_norm": 2.5357819989918675,
"learning_rate": 4.467247500300621e-06,
"loss": 0.0639,
"step": 220
},
{
"epoch": 1.0728155339805825,
"grad_norm": 3.196945279517692,
"learning_rate": 4.4293542722404435e-06,
"loss": 0.075,
"step": 221
},
{
"epoch": 1.0776699029126213,
"grad_norm": 3.602207511683068,
"learning_rate": 4.391494223675843e-06,
"loss": 0.0877,
"step": 222
},
{
"epoch": 1.0825242718446602,
"grad_norm": 3.072945774218816,
"learning_rate": 4.3536695559328816e-06,
"loss": 0.0886,
"step": 223
},
{
"epoch": 1.087378640776699,
"grad_norm": 2.9327628358781443,
"learning_rate": 4.31588246828045e-06,
"loss": 0.0779,
"step": 224
},
{
"epoch": 1.0922330097087378,
"grad_norm": 3.884559725530265,
"learning_rate": 4.278135157802389e-06,
"loss": 0.0957,
"step": 225
},
{
"epoch": 1.0970873786407767,
"grad_norm": 2.638727931000126,
"learning_rate": 4.240429819269746e-06,
"loss": 0.0578,
"step": 226
},
{
"epoch": 1.1019417475728155,
"grad_norm": 3.1117784042872283,
"learning_rate": 4.20276864501316e-06,
"loss": 0.0692,
"step": 227
},
{
"epoch": 1.1067961165048543,
"grad_norm": 2.7783624029964757,
"learning_rate": 4.165153824795391e-06,
"loss": 0.0768,
"step": 228
},
{
"epoch": 1.1116504854368932,
"grad_norm": 2.594503486377878,
"learning_rate": 4.127587545684002e-06,
"loss": 0.0669,
"step": 229
},
{
"epoch": 1.116504854368932,
"grad_norm": 2.929448813987237,
"learning_rate": 4.090071991924194e-06,
"loss": 0.0756,
"step": 230
},
{
"epoch": 1.1213592233009708,
"grad_norm": 2.6878856077703426,
"learning_rate": 4.052609344811802e-06,
"loss": 1.6006,
"step": 231
},
{
"epoch": 1.1262135922330097,
"grad_norm": 2.9800498201414296,
"learning_rate": 4.015201782566471e-06,
"loss": 0.084,
"step": 232
},
{
"epoch": 1.1310679611650485,
"grad_norm": 2.945464576525122,
"learning_rate": 3.977851480205003e-06,
"loss": 0.0831,
"step": 233
},
{
"epoch": 1.1359223300970873,
"grad_norm": 2.228512930491437,
"learning_rate": 3.940560609414894e-06,
"loss": 0.0563,
"step": 234
},
{
"epoch": 1.1407766990291262,
"grad_norm": 2.407897491051693,
"learning_rate": 3.903331338428067e-06,
"loss": 0.0633,
"step": 235
},
{
"epoch": 1.145631067961165,
"grad_norm": 2.4538104943960724,
"learning_rate": 3.866165831894796e-06,
"loss": 0.0602,
"step": 236
},
{
"epoch": 1.1504854368932038,
"grad_norm": 2.526844108676813,
"learning_rate": 3.829066250757851e-06,
"loss": 0.0772,
"step": 237
},
{
"epoch": 1.1553398058252426,
"grad_norm": 3.0770252354322003,
"learning_rate": 3.7920347521268514e-06,
"loss": 0.0761,
"step": 238
},
{
"epoch": 1.1601941747572815,
"grad_norm": 2.189508386492239,
"learning_rate": 3.7550734891528413e-06,
"loss": 1.7307,
"step": 239
},
{
"epoch": 1.1650485436893203,
"grad_norm": 2.9398466534526513,
"learning_rate": 3.7181846109031007e-06,
"loss": 0.0761,
"step": 240
},
{
"epoch": 1.1699029126213591,
"grad_norm": 2.746101050683826,
"learning_rate": 3.6813702622361858e-06,
"loss": 0.0847,
"step": 241
},
{
"epoch": 1.174757281553398,
"grad_norm": 2.7332263146255382,
"learning_rate": 3.6446325836772244e-06,
"loss": 0.0653,
"step": 242
},
{
"epoch": 1.1796116504854368,
"grad_norm": 2.7190442575382208,
"learning_rate": 3.6079737112934533e-06,
"loss": 0.0789,
"step": 243
},
{
"epoch": 1.1844660194174756,
"grad_norm": 2.658390418475969,
"learning_rate": 3.5713957765700224e-06,
"loss": 0.0603,
"step": 244
},
{
"epoch": 1.1893203883495145,
"grad_norm": 3.9381394273176276,
"learning_rate": 3.5349009062860586e-06,
"loss": 0.0811,
"step": 245
},
{
"epoch": 1.1941747572815533,
"grad_norm": 3.052172225803538,
"learning_rate": 3.4984912223910105e-06,
"loss": 0.0702,
"step": 246
},
{
"epoch": 1.1990291262135921,
"grad_norm": 3.0456430643875625,
"learning_rate": 3.46216884188127e-06,
"loss": 0.0784,
"step": 247
},
{
"epoch": 1.203883495145631,
"grad_norm": 2.5049847942964276,
"learning_rate": 3.425935876677077e-06,
"loss": 0.0656,
"step": 248
},
{
"epoch": 1.2087378640776698,
"grad_norm": 3.3960154571948,
"learning_rate": 3.38979443349973e-06,
"loss": 1.9855,
"step": 249
},
{
"epoch": 1.2135922330097086,
"grad_norm": 3.2720834167712214,
"learning_rate": 3.3537466137490937e-06,
"loss": 0.0577,
"step": 250
},
{
"epoch": 1.2184466019417475,
"grad_norm": 2.481707072834296,
"learning_rate": 3.3177945133814093e-06,
"loss": 0.0704,
"step": 251
},
{
"epoch": 1.2233009708737863,
"grad_norm": 3.154258406225022,
"learning_rate": 3.2819402227874364e-06,
"loss": 0.0873,
"step": 252
},
{
"epoch": 1.2281553398058254,
"grad_norm": 3.684874510102613,
"learning_rate": 3.2461858266709017e-06,
"loss": 0.0717,
"step": 253
},
{
"epoch": 1.233009708737864,
"grad_norm": 2.2393770842849205,
"learning_rate": 3.2105334039272924e-06,
"loss": 0.0712,
"step": 254
},
{
"epoch": 1.237864077669903,
"grad_norm": 2.5232811881585233,
"learning_rate": 3.1749850275229777e-06,
"loss": 0.059,
"step": 255
},
{
"epoch": 1.2427184466019416,
"grad_norm": 2.4330838257815612,
"learning_rate": 3.1395427643746802e-06,
"loss": 0.0665,
"step": 256
},
{
"epoch": 1.2475728155339807,
"grad_norm": 2.6008302295100245,
"learning_rate": 3.1042086752292995e-06,
"loss": 0.0714,
"step": 257
},
{
"epoch": 1.2524271844660193,
"grad_norm": 2.508640650867831,
"learning_rate": 3.068984814544087e-06,
"loss": 0.0881,
"step": 258
},
{
"epoch": 1.2572815533980584,
"grad_norm": 2.9570104245960014,
"learning_rate": 3.0338732303671993e-06,
"loss": 0.0725,
"step": 259
},
{
"epoch": 1.262135922330097,
"grad_norm": 2.8322291333835135,
"learning_rate": 2.99887596421861e-06,
"loss": 0.0779,
"step": 260
},
{
"epoch": 1.266990291262136,
"grad_norm": 3.4301825184788415,
"learning_rate": 2.9639950509714138e-06,
"loss": 0.1031,
"step": 261
},
{
"epoch": 1.2718446601941746,
"grad_norm": 2.767528223153958,
"learning_rate": 2.929232518733507e-06,
"loss": 1.7571,
"step": 262
},
{
"epoch": 1.2766990291262137,
"grad_norm": 2.5005755262187788,
"learning_rate": 2.8945903887296686e-06,
"loss": 0.0628,
"step": 263
},
{
"epoch": 1.2815533980582523,
"grad_norm": 3.101661943932367,
"learning_rate": 2.860070675184036e-06,
"loss": 0.0819,
"step": 264
},
{
"epoch": 1.2864077669902914,
"grad_norm": 3.1691737838498906,
"learning_rate": 2.8256753852029917e-06,
"loss": 0.0959,
"step": 265
},
{
"epoch": 1.29126213592233,
"grad_norm": 3.343426992113499,
"learning_rate": 2.7914065186584637e-06,
"loss": 0.0853,
"step": 266
},
{
"epoch": 1.296116504854369,
"grad_norm": 2.653786851479733,
"learning_rate": 2.757266068071648e-06,
"loss": 0.0711,
"step": 267
},
{
"epoch": 1.3009708737864076,
"grad_norm": 3.6695447681004953,
"learning_rate": 2.7232560184971437e-06,
"loss": 0.0925,
"step": 268
},
{
"epoch": 1.3058252427184467,
"grad_norm": 2.651580917610274,
"learning_rate": 2.689378347407553e-06,
"loss": 0.0733,
"step": 269
},
{
"epoch": 1.3106796116504853,
"grad_norm": 2.837228186908167,
"learning_rate": 2.6556350245784833e-06,
"loss": 1.3862,
"step": 270
},
{
"epoch": 1.3155339805825244,
"grad_norm": 2.744275874625859,
"learning_rate": 2.6220280119740376e-06,
"loss": 0.0802,
"step": 271
},
{
"epoch": 1.3203883495145632,
"grad_norm": 2.7150857011500733,
"learning_rate": 2.588559263632719e-06,
"loss": 0.0881,
"step": 272
},
{
"epoch": 1.325242718446602,
"grad_norm": 2.923244006407607,
"learning_rate": 2.555230725553832e-06,
"loss": 0.0695,
"step": 273
},
{
"epoch": 1.3300970873786409,
"grad_norm": 2.648842158951213,
"learning_rate": 2.522044335584322e-06,
"loss": 0.0808,
"step": 274
},
{
"epoch": 1.3349514563106797,
"grad_norm": 3.46448902282436,
"learning_rate": 2.489002023306112e-06,
"loss": 0.1095,
"step": 275
},
{
"epoch": 1.3398058252427185,
"grad_norm": 3.765519769264312,
"learning_rate": 2.4561057099238973e-06,
"loss": 0.124,
"step": 276
},
{
"epoch": 1.3446601941747574,
"grad_norm": 4.120315428329695,
"learning_rate": 2.423357308153454e-06,
"loss": 0.0726,
"step": 277
},
{
"epoch": 1.3495145631067962,
"grad_norm": 2.603195360336072,
"learning_rate": 2.390758722110418e-06,
"loss": 0.0746,
"step": 278
},
{
"epoch": 1.354368932038835,
"grad_norm": 2.3116504612959927,
"learning_rate": 2.358311847199567e-06,
"loss": 0.067,
"step": 279
},
{
"epoch": 1.3592233009708738,
"grad_norm": 2.510814292579119,
"learning_rate": 2.3260185700046295e-06,
"loss": 0.0553,
"step": 280
},
{
"epoch": 1.3640776699029127,
"grad_norm": 2.6478794602920432,
"learning_rate": 2.2938807681785764e-06,
"loss": 0.069,
"step": 281
},
{
"epoch": 1.3689320388349515,
"grad_norm": 2.7971908923982114,
"learning_rate": 2.2619003103344607e-06,
"loss": 0.1027,
"step": 282
},
{
"epoch": 1.3737864077669903,
"grad_norm": 2.6161854307520906,
"learning_rate": 2.2300790559367553e-06,
"loss": 0.0699,
"step": 283
},
{
"epoch": 1.3786407766990292,
"grad_norm": 2.634458727413624,
"learning_rate": 2.1984188551932513e-06,
"loss": 0.0674,
"step": 284
},
{
"epoch": 1.383495145631068,
"grad_norm": 3.3954408135653167,
"learning_rate": 2.166921548947466e-06,
"loss": 0.0978,
"step": 285
},
{
"epoch": 1.3883495145631068,
"grad_norm": 3.0988180285059483,
"learning_rate": 2.1355889685716225e-06,
"loss": 0.0867,
"step": 286
},
{
"epoch": 1.3932038834951457,
"grad_norm": 3.142246921881293,
"learning_rate": 2.1044229358601543e-06,
"loss": 0.0703,
"step": 287
},
{
"epoch": 1.3980582524271845,
"grad_norm": 2.650846344034233,
"learning_rate": 2.0734252629237892e-06,
"loss": 0.0659,
"step": 288
},
{
"epoch": 1.4029126213592233,
"grad_norm": 2.668075517191942,
"learning_rate": 2.0425977520841837e-06,
"loss": 0.0688,
"step": 289
},
{
"epoch": 1.4077669902912622,
"grad_norm": 2.2829588717500076,
"learning_rate": 2.011942195769122e-06,
"loss": 0.0559,
"step": 290
},
{
"epoch": 1.412621359223301,
"grad_norm": 2.3089636771028768,
"learning_rate": 1.9814603764083112e-06,
"loss": 0.0569,
"step": 291
},
{
"epoch": 1.4174757281553398,
"grad_norm": 3.404110096743085,
"learning_rate": 1.9511540663297284e-06,
"loss": 0.0898,
"step": 292
},
{
"epoch": 1.4223300970873787,
"grad_norm": 2.4241689478907693,
"learning_rate": 1.921025027656587e-06,
"loss": 0.0677,
"step": 293
},
{
"epoch": 1.4271844660194175,
"grad_norm": 3.421022845581842,
"learning_rate": 1.8910750122048638e-06,
"loss": 0.0753,
"step": 294
},
{
"epoch": 1.4320388349514563,
"grad_norm": 2.9505519123559205,
"learning_rate": 1.8613057613814584e-06,
"loss": 0.0575,
"step": 295
},
{
"epoch": 1.4368932038834952,
"grad_norm": 3.255569595427685,
"learning_rate": 1.8317190060829242e-06,
"loss": 0.0947,
"step": 296
},
{
"epoch": 1.441747572815534,
"grad_norm": 3.065400843837357,
"learning_rate": 1.8023164665948455e-06,
"loss": 0.0807,
"step": 297
},
{
"epoch": 1.4466019417475728,
"grad_norm": 2.9590611079634686,
"learning_rate": 1.773099852491796e-06,
"loss": 0.0736,
"step": 298
},
{
"epoch": 1.4514563106796117,
"grad_norm": 2.3025998916201313,
"learning_rate": 1.7440708625379503e-06,
"loss": 0.0625,
"step": 299
},
{
"epoch": 1.4563106796116505,
"grad_norm": 3.903839237318337,
"learning_rate": 1.7152311845883096e-06,
"loss": 0.0842,
"step": 300
},
{
"epoch": 1.4611650485436893,
"grad_norm": 2.4750320274405633,
"learning_rate": 1.686582495490554e-06,
"loss": 0.0708,
"step": 301
},
{
"epoch": 1.4660194174757282,
"grad_norm": 2.503972370862682,
"learning_rate": 1.658126460987558e-06,
"loss": 0.0637,
"step": 302
},
{
"epoch": 1.470873786407767,
"grad_norm": 3.3785939083312626,
"learning_rate": 1.6298647356205255e-06,
"loss": 0.0747,
"step": 303
},
{
"epoch": 1.4757281553398058,
"grad_norm": 2.2222765612234783,
"learning_rate": 1.601798962632799e-06,
"loss": 0.0597,
"step": 304
},
{
"epoch": 1.4805825242718447,
"grad_norm": 2.446669257372903,
"learning_rate": 1.573930773874306e-06,
"loss": 0.065,
"step": 305
},
{
"epoch": 1.4854368932038835,
"grad_norm": 2.652465423917927,
"learning_rate": 1.5462617897066863e-06,
"loss": 0.0712,
"step": 306
},
{
"epoch": 1.4902912621359223,
"grad_norm": 2.4004481423909203,
"learning_rate": 1.5187936189090668e-06,
"loss": 0.1966,
"step": 307
},
{
"epoch": 1.4951456310679612,
"grad_norm": 2.8096811136447672,
"learning_rate": 1.491527858584535e-06,
"loss": 0.0755,
"step": 308
},
{
"epoch": 1.5,
"grad_norm": 2.8236825513139565,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0955,
"step": 309
},
{
"epoch": 1.5048543689320388,
"grad_norm": 2.710817444163041,
"learning_rate": 1.4376098988303406e-06,
"loss": 0.0629,
"step": 310
},
{
"epoch": 1.5097087378640777,
"grad_norm": 3.096915505389332,
"learning_rate": 1.4109608343942855e-06,
"loss": 0.0638,
"step": 311
},
{
"epoch": 1.5145631067961165,
"grad_norm": 2.3241464797551057,
"learning_rate": 1.3845204502362442e-06,
"loss": 0.0712,
"step": 312
},
{
"epoch": 1.5194174757281553,
"grad_norm": 3.590980478005006,
"learning_rate": 1.35829028369991e-06,
"loss": 0.0877,
"step": 313
},
{
"epoch": 1.5242718446601942,
"grad_norm": 2.4429554321376723,
"learning_rate": 1.3322718599061252e-06,
"loss": 0.0568,
"step": 314
},
{
"epoch": 1.529126213592233,
"grad_norm": 3.0929878244984743,
"learning_rate": 1.306466691664216e-06,
"loss": 0.0818,
"step": 315
},
{
"epoch": 1.5339805825242718,
"grad_norm": 2.441836649677217,
"learning_rate": 1.28087627938402e-06,
"loss": 0.0686,
"step": 316
},
{
"epoch": 1.5388349514563107,
"grad_norm": 2.85692848721824,
"learning_rate": 1.2555021109886589e-06,
"loss": 0.0897,
"step": 317
},
{
"epoch": 1.5436893203883495,
"grad_norm": 3.2892259857698822,
"learning_rate": 1.2303456618280141e-06,
"loss": 0.0793,
"step": 318
},
{
"epoch": 1.5485436893203883,
"grad_norm": 2.586755097767611,
"learning_rate": 1.2054083945929534e-06,
"loss": 0.0633,
"step": 319
},
{
"epoch": 1.5533980582524272,
"grad_norm": 2.480887665282937,
"learning_rate": 1.1806917592302763e-06,
"loss": 0.0643,
"step": 320
},
{
"epoch": 1.558252427184466,
"grad_norm": 2.8797900449745115,
"learning_rate": 1.1561971928584158e-06,
"loss": 0.056,
"step": 321
},
{
"epoch": 1.5631067961165048,
"grad_norm": 2.866817525662025,
"learning_rate": 1.1319261196838782e-06,
"loss": 0.0839,
"step": 322
},
{
"epoch": 1.5679611650485437,
"grad_norm": 3.862409612419987,
"learning_rate": 1.1078799509184246e-06,
"loss": 0.097,
"step": 323
},
{
"epoch": 1.5728155339805825,
"grad_norm": 3.247997005248643,
"learning_rate": 1.0840600846970333e-06,
"loss": 0.0842,
"step": 324
},
{
"epoch": 1.5776699029126213,
"grad_norm": 2.9068905324472274,
"learning_rate": 1.0604679059965923e-06,
"loss": 0.1058,
"step": 325
},
{
"epoch": 1.5825242718446602,
"grad_norm": 2.8774681950293632,
"learning_rate": 1.0371047865553847e-06,
"loss": 0.0849,
"step": 326
},
{
"epoch": 1.587378640776699,
"grad_norm": 2.8220546167673857,
"learning_rate": 1.0139720847933166e-06,
"loss": 0.0626,
"step": 327
},
{
"epoch": 1.5922330097087378,
"grad_norm": 3.229748629712671,
"learning_rate": 9.91071145732948e-07,
"loss": 0.085,
"step": 328
},
{
"epoch": 1.5970873786407767,
"grad_norm": 3.8285741772219235,
"learning_rate": 9.684033009212752e-07,
"loss": 0.0922,
"step": 329
},
{
"epoch": 1.6019417475728155,
"grad_norm": 3.1716964433534804,
"learning_rate": 9.459698683523205e-07,
"loss": 0.0813,
"step": 330
},
{
"epoch": 1.6067961165048543,
"grad_norm": 4.025671520411109,
"learning_rate": 9.237721523904891e-07,
"loss": 0.1209,
"step": 331
},
{
"epoch": 1.6116504854368932,
"grad_norm": 1.853073997819308,
"learning_rate": 9.018114436947373e-07,
"loss": 0.0478,
"step": 332
},
{
"epoch": 1.616504854368932,
"grad_norm": 2.7252545212519985,
"learning_rate": 8.80089019143524e-07,
"loss": 0.0669,
"step": 333
},
{
"epoch": 1.6213592233009708,
"grad_norm": 2.688094491542211,
"learning_rate": 8.586061417605668e-07,
"loss": 0.0682,
"step": 334
},
{
"epoch": 1.6262135922330097,
"grad_norm": 2.4383269885779573,
"learning_rate": 8.373640606414097e-07,
"loss": 0.0559,
"step": 335
},
{
"epoch": 1.6310679611650487,
"grad_norm": 3.386771393269001,
"learning_rate": 8.163640108807897e-07,
"loss": 1.6835,
"step": 336
},
{
"epoch": 1.6359223300970873,
"grad_norm": 2.4964755511358048,
"learning_rate": 7.956072135008336e-07,
"loss": 0.0574,
"step": 337
},
{
"epoch": 1.6407766990291264,
"grad_norm": 2.804221131450833,
"learning_rate": 7.750948753800508e-07,
"loss": 1.6776,
"step": 338
},
{
"epoch": 1.645631067961165,
"grad_norm": 3.05585343475137,
"learning_rate": 7.548281891831715e-07,
"loss": 0.0599,
"step": 339
},
{
"epoch": 1.650485436893204,
"grad_norm": 3.094692901438052,
"learning_rate": 7.348083332917927e-07,
"loss": 0.0867,
"step": 340
},
{
"epoch": 1.6553398058252426,
"grad_norm": 5.994214379340551,
"learning_rate": 7.150364717358699e-07,
"loss": 0.1163,
"step": 341
},
{
"epoch": 1.6601941747572817,
"grad_norm": 2.6685684099421154,
"learning_rate": 6.955137541260287e-07,
"loss": 0.0729,
"step": 342
},
{
"epoch": 1.6650485436893203,
"grad_norm": 2.614974887848457,
"learning_rate": 6.762413155867276e-07,
"loss": 0.0669,
"step": 343
},
{
"epoch": 1.6699029126213594,
"grad_norm": 3.7605402445833884,
"learning_rate": 6.572202766902569e-07,
"loss": 0.102,
"step": 344
},
{
"epoch": 1.674757281553398,
"grad_norm": 2.530314273521925,
"learning_rate": 6.384517433915794e-07,
"loss": 0.0774,
"step": 345
},
{
"epoch": 1.679611650485437,
"grad_norm": 2.641884474987027,
"learning_rate": 6.199368069640343e-07,
"loss": 0.0674,
"step": 346
},
{
"epoch": 1.6844660194174756,
"grad_norm": 2.9272910498975544,
"learning_rate": 6.016765439358774e-07,
"loss": 0.1105,
"step": 347
},
{
"epoch": 1.6893203883495147,
"grad_norm": 2.8644898158113383,
"learning_rate": 5.836720160276971e-07,
"loss": 0.0629,
"step": 348
},
{
"epoch": 1.6941747572815533,
"grad_norm": 3.2193627421640496,
"learning_rate": 5.659242700906719e-07,
"loss": 0.0936,
"step": 349
},
{
"epoch": 1.6990291262135924,
"grad_norm": 2.6937432360871156,
"learning_rate": 5.484343380457124e-07,
"loss": 0.0835,
"step": 350
},
{
"epoch": 1.703883495145631,
"grad_norm": 2.403791791257734,
"learning_rate": 5.312032368234527e-07,
"loss": 0.0619,
"step": 351
},
{
"epoch": 1.70873786407767,
"grad_norm": 2.8985412438591696,
"learning_rate": 5.1423196830513e-07,
"loss": 0.0688,
"step": 352
},
{
"epoch": 1.7135922330097086,
"grad_norm": 2.3651095920680394,
"learning_rate": 4.975215192643246e-07,
"loss": 0.0595,
"step": 353
},
{
"epoch": 1.7184466019417477,
"grad_norm": 2.5298627296726153,
"learning_rate": 4.81072861309591e-07,
"loss": 2.1163,
"step": 354
},
{
"epoch": 1.7233009708737863,
"grad_norm": 3.119126853868973,
"learning_rate": 4.648869508279613e-07,
"loss": 0.0736,
"step": 355
},
{
"epoch": 1.7281553398058254,
"grad_norm": 2.695693195432077,
"learning_rate": 4.4896472892933693e-07,
"loss": 0.0651,
"step": 356
},
{
"epoch": 1.733009708737864,
"grad_norm": 2.324789330239615,
"learning_rate": 4.333071213917722e-07,
"loss": 0.0571,
"step": 357
},
{
"epoch": 1.737864077669903,
"grad_norm": 2.4268053939586176,
"learning_rate": 4.179150386076425e-07,
"loss": 1.55,
"step": 358
},
{
"epoch": 1.7427184466019416,
"grad_norm": 3.318783032596993,
"learning_rate": 4.027893755307144e-07,
"loss": 0.0879,
"step": 359
},
{
"epoch": 1.7475728155339807,
"grad_norm": 3.228808836385332,
"learning_rate": 3.8793101162410417e-07,
"loss": 0.0806,
"step": 360
},
{
"epoch": 1.7524271844660193,
"grad_norm": 2.5077231830691096,
"learning_rate": 3.733408108091485e-07,
"loss": 0.0592,
"step": 361
},
{
"epoch": 1.7572815533980584,
"grad_norm": 3.8907140756818803,
"learning_rate": 3.5901962141516975e-07,
"loss": 0.0906,
"step": 362
},
{
"epoch": 1.762135922330097,
"grad_norm": 3.188616645281919,
"learning_rate": 3.4496827613015206e-07,
"loss": 0.0893,
"step": 363
},
{
"epoch": 1.766990291262136,
"grad_norm": 4.00676524274261,
"learning_rate": 3.3118759195232273e-07,
"loss": 0.1123,
"step": 364
},
{
"epoch": 1.7718446601941746,
"grad_norm": 2.82176742226637,
"learning_rate": 3.176783701426528e-07,
"loss": 0.0783,
"step": 365
},
{
"epoch": 1.7766990291262137,
"grad_norm": 3.035783084499938,
"learning_rate": 3.0444139617826605e-07,
"loss": 0.0801,
"step": 366
},
{
"epoch": 1.7815533980582523,
"grad_norm": 2.68206770681917,
"learning_rate": 2.91477439706771e-07,
"loss": 0.0778,
"step": 367
},
{
"epoch": 1.7864077669902914,
"grad_norm": 2.6094497830306818,
"learning_rate": 2.787872545015069e-07,
"loss": 0.0703,
"step": 368
},
{
"epoch": 1.79126213592233,
"grad_norm": 2.6985662603724836,
"learning_rate": 2.663715784177201e-07,
"loss": 0.0713,
"step": 369
},
{
"epoch": 1.796116504854369,
"grad_norm": 3.416820336867604,
"learning_rate": 2.542311333496622e-07,
"loss": 0.0927,
"step": 370
},
{
"epoch": 1.8009708737864076,
"grad_norm": 2.967268889240781,
"learning_rate": 2.423666251886114e-07,
"loss": 0.0803,
"step": 371
},
{
"epoch": 1.8058252427184467,
"grad_norm": 2.808385614520899,
"learning_rate": 2.307787437818365e-07,
"loss": 0.0549,
"step": 372
},
{
"epoch": 1.8106796116504853,
"grad_norm": 3.204050738102251,
"learning_rate": 2.1946816289248163e-07,
"loss": 0.0768,
"step": 373
},
{
"epoch": 1.8155339805825244,
"grad_norm": 2.8887165325007063,
"learning_rate": 2.0843554016039326e-07,
"loss": 0.0812,
"step": 374
},
{
"epoch": 1.820388349514563,
"grad_norm": 2.4786338107907677,
"learning_rate": 1.9768151706388016e-07,
"loss": 0.0693,
"step": 375
},
{
"epoch": 1.825242718446602,
"grad_norm": 2.8499069296362567,
"learning_rate": 1.8720671888242058e-07,
"loss": 0.0698,
"step": 376
},
{
"epoch": 1.8300970873786406,
"grad_norm": 2.986846518621566,
"learning_rate": 1.7701175466029895e-07,
"loss": 0.073,
"step": 377
},
{
"epoch": 1.8349514563106797,
"grad_norm": 2.9569805825011954,
"learning_rate": 1.6709721717120042e-07,
"loss": 0.1111,
"step": 378
},
{
"epoch": 1.8398058252427183,
"grad_norm": 2.2641670429414633,
"learning_rate": 1.574636828837395e-07,
"loss": 0.081,
"step": 379
},
{
"epoch": 1.8446601941747574,
"grad_norm": 2.5283674709842168,
"learning_rate": 1.4811171192794628e-07,
"loss": 0.0684,
"step": 380
},
{
"epoch": 1.849514563106796,
"grad_norm": 2.897263302225223,
"learning_rate": 1.3904184806269705e-07,
"loss": 0.0841,
"step": 381
},
{
"epoch": 1.854368932038835,
"grad_norm": 2.354156493900917,
"learning_rate": 1.3025461864409395e-07,
"loss": 0.0527,
"step": 382
},
{
"epoch": 1.8592233009708736,
"grad_norm": 3.1425590941343917,
"learning_rate": 1.2175053459481e-07,
"loss": 0.0839,
"step": 383
},
{
"epoch": 1.8640776699029127,
"grad_norm": 2.8771004475766486,
"learning_rate": 1.1353009037437523e-07,
"loss": 0.0641,
"step": 384
},
{
"epoch": 1.8689320388349513,
"grad_norm": 3.7753913065176854,
"learning_rate": 1.0559376395043285e-07,
"loss": 0.0945,
"step": 385
},
{
"epoch": 1.8737864077669903,
"grad_norm": 3.045290352139969,
"learning_rate": 9.794201677094162e-08,
"loss": 0.0691,
"step": 386
},
{
"epoch": 1.8786407766990292,
"grad_norm": 2.827523364996351,
"learning_rate": 9.05752937373533e-08,
"loss": 0.067,
"step": 387
},
{
"epoch": 1.883495145631068,
"grad_norm": 2.3369063097065697,
"learning_rate": 8.34940231787379e-08,
"loss": 0.056,
"step": 388
},
{
"epoch": 1.8883495145631068,
"grad_norm": 2.7538101838954683,
"learning_rate": 7.66986168268824e-08,
"loss": 0.0538,
"step": 389
},
{
"epoch": 1.8932038834951457,
"grad_norm": 2.852941272376346,
"learning_rate": 7.018946979234997e-08,
"loss": 0.0713,
"step": 390
},
{
"epoch": 1.8980582524271845,
"grad_norm": 2.4418246951319422,
"learning_rate": 6.396696054150719e-08,
"loss": 0.0555,
"step": 391
},
{
"epoch": 1.9029126213592233,
"grad_norm": 2.9999090182347077,
"learning_rate": 5.803145087451945e-08,
"loss": 0.0621,
"step": 392
},
{
"epoch": 1.9077669902912622,
"grad_norm": 3.2442934086920667,
"learning_rate": 5.238328590431163e-08,
"loss": 0.1006,
"step": 393
},
{
"epoch": 1.912621359223301,
"grad_norm": 2.711694124657116,
"learning_rate": 4.702279403650534e-08,
"loss": 0.0677,
"step": 394
},
{
"epoch": 1.9174757281553398,
"grad_norm": 2.995618798054314,
"learning_rate": 4.195028695032133e-08,
"loss": 0.0888,
"step": 395
},
{
"epoch": 1.9223300970873787,
"grad_norm": 2.9995330453741804,
"learning_rate": 3.716605958046071e-08,
"loss": 0.0643,
"step": 396
},
{
"epoch": 1.9271844660194175,
"grad_norm": 3.3748574494740056,
"learning_rate": 3.2670390099951985e-08,
"loss": 0.07,
"step": 397
},
{
"epoch": 1.9320388349514563,
"grad_norm": 3.7760633589986488,
"learning_rate": 2.846353990398065e-08,
"loss": 0.0991,
"step": 398
},
{
"epoch": 1.9368932038834952,
"grad_norm": 2.7681700840158876,
"learning_rate": 2.4545753594688582e-08,
"loss": 1.4371,
"step": 399
},
{
"epoch": 1.941747572815534,
"grad_norm": 3.0753950908772985,
"learning_rate": 2.0917258966953735e-08,
"loss": 0.083,
"step": 400
},
{
"epoch": 1.9466019417475728,
"grad_norm": 3.136204625362315,
"learning_rate": 1.757826699514298e-08,
"loss": 0.0755,
"step": 401
},
{
"epoch": 1.9514563106796117,
"grad_norm": 2.6333380164118516,
"learning_rate": 1.4528971820846894e-08,
"loss": 0.0577,
"step": 402
},
{
"epoch": 1.9563106796116505,
"grad_norm": 2.3026842334958015,
"learning_rate": 1.176955074159214e-08,
"loss": 0.0553,
"step": 403
},
{
"epoch": 1.9611650485436893,
"grad_norm": 2.8305325809330752,
"learning_rate": 9.300164200530815e-09,
"loss": 0.0707,
"step": 404
},
{
"epoch": 1.9660194174757282,
"grad_norm": 3.4008450370034047,
"learning_rate": 7.120955777112915e-09,
"loss": 0.089,
"step": 405
},
{
"epoch": 1.970873786407767,
"grad_norm": 2.862907268733839,
"learning_rate": 5.232052178738567e-09,
"loss": 0.0642,
"step": 406
},
{
"epoch": 1.9757281553398058,
"grad_norm": 3.6074011718550447,
"learning_rate": 3.633563233388926e-09,
"loss": 0.0741,
"step": 407
},
{
"epoch": 1.9805825242718447,
"grad_norm": 2.583312048382848,
"learning_rate": 2.3255818832423894e-09,
"loss": 0.0573,
"step": 408
},
{
"epoch": 1.9854368932038835,
"grad_norm": 2.9051428728952593,
"learning_rate": 1.3081841792694783e-09,
"loss": 0.0764,
"step": 409
},
{
"epoch": 1.9902912621359223,
"grad_norm": 2.1471755925183484,
"learning_rate": 5.814292768108187e-10,
"loss": 0.0573,
"step": 410
},
{
"epoch": 1.9951456310679612,
"grad_norm": 2.8556205868247497,
"learning_rate": 1.453594321393359e-10,
"loss": 0.0679,
"step": 411
},
{
"epoch": 2.0,
"grad_norm": 1.7882390903441006,
"learning_rate": 0.0,
"loss": 0.0419,
"step": 412
},
{
"epoch": 2.0,
"step": 412,
"total_flos": 957358891008.0,
"train_loss": 0.20337393274982868,
"train_runtime": 258.6726,
"train_samples_per_second": 12.688,
"train_steps_per_second": 1.593
}
],
"logging_steps": 1,
"max_steps": 412,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 957358891008.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}