Metamath-reproduce-7b / trainer_state.json
feidfoe's picture
Add tokenizer and configs
e5a1efa
raw
history blame
113 kB
{
"best_metric": 0.1484375,
"best_model_checkpoint": "/mnt/vdc/metamath_leaderboard/checkpoint-6168",
"epoch": 3.0,
"global_step": 9252,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 7.194244604316547e-07,
"loss": 0.7531,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 1.4388489208633094e-06,
"loss": 0.6605,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 2.158273381294964e-06,
"loss": 0.4646,
"step": 30
},
{
"epoch": 0.01,
"learning_rate": 2.877697841726619e-06,
"loss": 0.3787,
"step": 40
},
{
"epoch": 0.02,
"learning_rate": 3.5971223021582737e-06,
"loss": 0.3369,
"step": 50
},
{
"epoch": 0.02,
"learning_rate": 4.316546762589928e-06,
"loss": 0.3264,
"step": 60
},
{
"epoch": 0.02,
"learning_rate": 5.035971223021583e-06,
"loss": 0.3008,
"step": 70
},
{
"epoch": 0.03,
"learning_rate": 5.755395683453238e-06,
"loss": 0.2889,
"step": 80
},
{
"epoch": 0.03,
"learning_rate": 6.474820143884892e-06,
"loss": 0.2497,
"step": 90
},
{
"epoch": 0.03,
"learning_rate": 7.194244604316547e-06,
"loss": 0.2774,
"step": 100
},
{
"epoch": 0.04,
"learning_rate": 7.913669064748202e-06,
"loss": 0.2653,
"step": 110
},
{
"epoch": 0.04,
"learning_rate": 8.633093525179856e-06,
"loss": 0.2691,
"step": 120
},
{
"epoch": 0.04,
"learning_rate": 9.35251798561151e-06,
"loss": 0.2661,
"step": 130
},
{
"epoch": 0.05,
"learning_rate": 1.0071942446043167e-05,
"loss": 0.2732,
"step": 140
},
{
"epoch": 0.05,
"learning_rate": 1.0791366906474821e-05,
"loss": 0.2507,
"step": 150
},
{
"epoch": 0.05,
"learning_rate": 1.1510791366906475e-05,
"loss": 0.2482,
"step": 160
},
{
"epoch": 0.06,
"learning_rate": 1.223021582733813e-05,
"loss": 0.2368,
"step": 170
},
{
"epoch": 0.06,
"learning_rate": 1.2949640287769784e-05,
"loss": 0.2357,
"step": 180
},
{
"epoch": 0.06,
"learning_rate": 1.3669064748201439e-05,
"loss": 0.2404,
"step": 190
},
{
"epoch": 0.06,
"learning_rate": 1.4388489208633095e-05,
"loss": 0.2479,
"step": 200
},
{
"epoch": 0.07,
"learning_rate": 1.5107913669064749e-05,
"loss": 0.2501,
"step": 210
},
{
"epoch": 0.07,
"learning_rate": 1.5827338129496403e-05,
"loss": 0.2502,
"step": 220
},
{
"epoch": 0.07,
"learning_rate": 1.6546762589928058e-05,
"loss": 0.232,
"step": 230
},
{
"epoch": 0.08,
"learning_rate": 1.7266187050359712e-05,
"loss": 0.237,
"step": 240
},
{
"epoch": 0.08,
"learning_rate": 1.7985611510791367e-05,
"loss": 0.2469,
"step": 250
},
{
"epoch": 0.08,
"learning_rate": 1.870503597122302e-05,
"loss": 0.2302,
"step": 260
},
{
"epoch": 0.09,
"learning_rate": 1.9424460431654675e-05,
"loss": 0.2409,
"step": 270
},
{
"epoch": 0.09,
"learning_rate": 1.99999975489194e-05,
"loss": 0.2106,
"step": 280
},
{
"epoch": 0.09,
"learning_rate": 1.9999911761224496e-05,
"loss": 0.2308,
"step": 290
},
{
"epoch": 0.1,
"learning_rate": 1.999970342070106e-05,
"loss": 0.2361,
"step": 300
},
{
"epoch": 0.1,
"learning_rate": 1.9999372529902386e-05,
"loss": 0.2277,
"step": 310
},
{
"epoch": 0.1,
"learning_rate": 1.9998919092883666e-05,
"loss": 0.2204,
"step": 320
},
{
"epoch": 0.11,
"learning_rate": 1.9998343115201945e-05,
"loss": 0.2374,
"step": 330
},
{
"epoch": 0.11,
"learning_rate": 1.999764460391606e-05,
"loss": 0.2073,
"step": 340
},
{
"epoch": 0.11,
"learning_rate": 1.999682356758654e-05,
"loss": 0.2217,
"step": 350
},
{
"epoch": 0.12,
"learning_rate": 1.9995880016275502e-05,
"loss": 0.2327,
"step": 360
},
{
"epoch": 0.12,
"learning_rate": 1.9994813961546543e-05,
"loss": 0.2303,
"step": 370
},
{
"epoch": 0.12,
"learning_rate": 1.9993625416464575e-05,
"loss": 0.2229,
"step": 380
},
{
"epoch": 0.13,
"learning_rate": 1.9992314395595686e-05,
"loss": 0.2188,
"step": 390
},
{
"epoch": 0.13,
"learning_rate": 1.9990880915006945e-05,
"loss": 0.2244,
"step": 400
},
{
"epoch": 0.13,
"learning_rate": 1.998932499226622e-05,
"loss": 0.2188,
"step": 410
},
{
"epoch": 0.14,
"learning_rate": 1.9987646646441956e-05,
"loss": 0.2196,
"step": 420
},
{
"epoch": 0.14,
"learning_rate": 1.9985845898102933e-05,
"loss": 0.2022,
"step": 430
},
{
"epoch": 0.14,
"learning_rate": 1.9983922769318024e-05,
"loss": 0.2219,
"step": 440
},
{
"epoch": 0.15,
"learning_rate": 1.9981877283655924e-05,
"loss": 0.2014,
"step": 450
},
{
"epoch": 0.15,
"learning_rate": 1.997970946618487e-05,
"loss": 0.1935,
"step": 460
},
{
"epoch": 0.15,
"learning_rate": 1.99774193434723e-05,
"loss": 0.2011,
"step": 470
},
{
"epoch": 0.16,
"learning_rate": 1.997500694358457e-05,
"loss": 0.2003,
"step": 480
},
{
"epoch": 0.16,
"learning_rate": 1.9972472296086583e-05,
"loss": 0.1996,
"step": 490
},
{
"epoch": 0.16,
"learning_rate": 1.9969815432041434e-05,
"loss": 0.2131,
"step": 500
},
{
"epoch": 0.17,
"learning_rate": 1.996703638401003e-05,
"loss": 0.2119,
"step": 510
},
{
"epoch": 0.17,
"learning_rate": 1.9964135186050692e-05,
"loss": 0.2192,
"step": 520
},
{
"epoch": 0.17,
"learning_rate": 1.996111187371874e-05,
"loss": 0.2051,
"step": 530
},
{
"epoch": 0.18,
"learning_rate": 1.995796648406604e-05,
"loss": 0.1961,
"step": 540
},
{
"epoch": 0.18,
"learning_rate": 1.9954699055640576e-05,
"loss": 0.2017,
"step": 550
},
{
"epoch": 0.18,
"learning_rate": 1.9951309628485963e-05,
"loss": 0.1997,
"step": 560
},
{
"epoch": 0.18,
"learning_rate": 1.9947798244140954e-05,
"loss": 0.2003,
"step": 570
},
{
"epoch": 0.19,
"learning_rate": 1.994416494563894e-05,
"loss": 0.2025,
"step": 580
},
{
"epoch": 0.19,
"learning_rate": 1.9940409777507407e-05,
"loss": 0.2038,
"step": 590
},
{
"epoch": 0.19,
"learning_rate": 1.9936532785767416e-05,
"loss": 0.2068,
"step": 600
},
{
"epoch": 0.2,
"learning_rate": 1.9932534017933015e-05,
"loss": 0.205,
"step": 610
},
{
"epoch": 0.2,
"learning_rate": 1.9928413523010667e-05,
"loss": 0.2122,
"step": 620
},
{
"epoch": 0.2,
"learning_rate": 1.9924171351498645e-05,
"loss": 0.1979,
"step": 630
},
{
"epoch": 0.21,
"learning_rate": 1.9919807555386426e-05,
"loss": 0.1921,
"step": 640
},
{
"epoch": 0.21,
"learning_rate": 1.9915322188154033e-05,
"loss": 0.2027,
"step": 650
},
{
"epoch": 0.21,
"learning_rate": 1.9910715304771396e-05,
"loss": 0.1852,
"step": 660
},
{
"epoch": 0.22,
"learning_rate": 1.9905986961697675e-05,
"loss": 0.1957,
"step": 670
},
{
"epoch": 0.22,
"learning_rate": 1.9901137216880556e-05,
"loss": 0.1945,
"step": 680
},
{
"epoch": 0.22,
"learning_rate": 1.989616612975557e-05,
"loss": 0.209,
"step": 690
},
{
"epoch": 0.23,
"learning_rate": 1.9891073761245318e-05,
"loss": 0.1963,
"step": 700
},
{
"epoch": 0.23,
"learning_rate": 1.988586017375878e-05,
"loss": 0.1757,
"step": 710
},
{
"epoch": 0.23,
"learning_rate": 1.9880525431190503e-05,
"loss": 0.1856,
"step": 720
},
{
"epoch": 0.24,
"learning_rate": 1.9875069598919844e-05,
"loss": 0.179,
"step": 730
},
{
"epoch": 0.24,
"learning_rate": 1.9869492743810163e-05,
"loss": 0.1891,
"step": 740
},
{
"epoch": 0.24,
"learning_rate": 1.9863794934207994e-05,
"loss": 0.1975,
"step": 750
},
{
"epoch": 0.25,
"learning_rate": 1.9857976239942228e-05,
"loss": 0.1819,
"step": 760
},
{
"epoch": 0.25,
"learning_rate": 1.9852036732323237e-05,
"loss": 0.2062,
"step": 770
},
{
"epoch": 0.25,
"learning_rate": 1.9845976484142003e-05,
"loss": 0.1912,
"step": 780
},
{
"epoch": 0.26,
"learning_rate": 1.9839795569669246e-05,
"loss": 0.1938,
"step": 790
},
{
"epoch": 0.26,
"learning_rate": 1.9833494064654485e-05,
"loss": 0.1864,
"step": 800
},
{
"epoch": 0.26,
"learning_rate": 1.982707204632513e-05,
"loss": 0.1907,
"step": 810
},
{
"epoch": 0.27,
"learning_rate": 1.9820529593385516e-05,
"loss": 0.188,
"step": 820
},
{
"epoch": 0.27,
"learning_rate": 1.981386678601598e-05,
"loss": 0.1889,
"step": 830
},
{
"epoch": 0.27,
"learning_rate": 1.980708370587182e-05,
"loss": 0.1871,
"step": 840
},
{
"epoch": 0.28,
"learning_rate": 1.9800180436082335e-05,
"loss": 0.1772,
"step": 850
},
{
"epoch": 0.28,
"learning_rate": 1.97931570612498e-05,
"loss": 0.1817,
"step": 860
},
{
"epoch": 0.28,
"learning_rate": 1.9786013667448416e-05,
"loss": 0.1765,
"step": 870
},
{
"epoch": 0.29,
"learning_rate": 1.977875034222327e-05,
"loss": 0.1987,
"step": 880
},
{
"epoch": 0.29,
"learning_rate": 1.977136717458925e-05,
"loss": 0.2069,
"step": 890
},
{
"epoch": 0.29,
"learning_rate": 1.9763864255029962e-05,
"loss": 0.1817,
"step": 900
},
{
"epoch": 0.3,
"learning_rate": 1.975624167549662e-05,
"loss": 0.1883,
"step": 910
},
{
"epoch": 0.3,
"learning_rate": 1.9748499529406918e-05,
"loss": 0.1738,
"step": 920
},
{
"epoch": 0.3,
"learning_rate": 1.9740637911643882e-05,
"loss": 0.1873,
"step": 930
},
{
"epoch": 0.3,
"learning_rate": 1.973265691855471e-05,
"loss": 0.193,
"step": 940
},
{
"epoch": 0.31,
"learning_rate": 1.9724556647949597e-05,
"loss": 0.1725,
"step": 950
},
{
"epoch": 0.31,
"learning_rate": 1.971633719910052e-05,
"loss": 0.1896,
"step": 960
},
{
"epoch": 0.31,
"learning_rate": 1.9707998672740045e-05,
"loss": 0.185,
"step": 970
},
{
"epoch": 0.32,
"learning_rate": 1.9699541171060068e-05,
"loss": 0.1745,
"step": 980
},
{
"epoch": 0.32,
"learning_rate": 1.9690964797710585e-05,
"loss": 0.1862,
"step": 990
},
{
"epoch": 0.32,
"learning_rate": 1.9682269657798395e-05,
"loss": 0.1801,
"step": 1000
},
{
"epoch": 0.33,
"learning_rate": 1.9673455857885846e-05,
"loss": 0.1754,
"step": 1010
},
{
"epoch": 0.33,
"learning_rate": 1.9664523505989498e-05,
"loss": 0.1882,
"step": 1020
},
{
"epoch": 0.33,
"learning_rate": 1.965547271157882e-05,
"loss": 0.1888,
"step": 1030
},
{
"epoch": 0.34,
"learning_rate": 1.9646303585574832e-05,
"loss": 0.1965,
"step": 1040
},
{
"epoch": 0.34,
"learning_rate": 1.9637016240348755e-05,
"loss": 0.1785,
"step": 1050
},
{
"epoch": 0.34,
"learning_rate": 1.9627610789720647e-05,
"loss": 0.19,
"step": 1060
},
{
"epoch": 0.35,
"learning_rate": 1.9618087348957973e-05,
"loss": 0.1789,
"step": 1070
},
{
"epoch": 0.35,
"learning_rate": 1.9608446034774225e-05,
"loss": 0.1785,
"step": 1080
},
{
"epoch": 0.35,
"learning_rate": 1.9598686965327483e-05,
"loss": 0.2006,
"step": 1090
},
{
"epoch": 0.36,
"learning_rate": 1.9588810260218955e-05,
"loss": 0.1937,
"step": 1100
},
{
"epoch": 0.36,
"learning_rate": 1.9578816040491526e-05,
"loss": 0.183,
"step": 1110
},
{
"epoch": 0.36,
"learning_rate": 1.956870442862826e-05,
"loss": 0.1834,
"step": 1120
},
{
"epoch": 0.37,
"learning_rate": 1.9558475548550924e-05,
"loss": 0.1784,
"step": 1130
},
{
"epoch": 0.37,
"learning_rate": 1.9548129525618434e-05,
"loss": 0.1753,
"step": 1140
},
{
"epoch": 0.37,
"learning_rate": 1.9537666486625352e-05,
"loss": 0.1813,
"step": 1150
},
{
"epoch": 0.38,
"learning_rate": 1.9527086559800307e-05,
"loss": 0.191,
"step": 1160
},
{
"epoch": 0.38,
"learning_rate": 1.9516389874804442e-05,
"loss": 0.1749,
"step": 1170
},
{
"epoch": 0.38,
"learning_rate": 1.9505576562729818e-05,
"loss": 0.184,
"step": 1180
},
{
"epoch": 0.39,
"learning_rate": 1.949464675609779e-05,
"loss": 0.1711,
"step": 1190
},
{
"epoch": 0.39,
"learning_rate": 1.9483600588857428e-05,
"loss": 0.1784,
"step": 1200
},
{
"epoch": 0.39,
"learning_rate": 1.9472438196383817e-05,
"loss": 0.1721,
"step": 1210
},
{
"epoch": 0.4,
"learning_rate": 1.946115971547645e-05,
"loss": 0.1883,
"step": 1220
},
{
"epoch": 0.4,
"learning_rate": 1.9449765284357514e-05,
"loss": 0.181,
"step": 1230
},
{
"epoch": 0.4,
"learning_rate": 1.943825504267022e-05,
"loss": 0.1884,
"step": 1240
},
{
"epoch": 0.41,
"learning_rate": 1.942662913147708e-05,
"loss": 0.1586,
"step": 1250
},
{
"epoch": 0.41,
"learning_rate": 1.9414887693258185e-05,
"loss": 0.1689,
"step": 1260
},
{
"epoch": 0.41,
"learning_rate": 1.9403030871909443e-05,
"loss": 0.1663,
"step": 1270
},
{
"epoch": 0.42,
"learning_rate": 1.9391058812740845e-05,
"loss": 0.1652,
"step": 1280
},
{
"epoch": 0.42,
"learning_rate": 1.9378971662474652e-05,
"loss": 0.1728,
"step": 1290
},
{
"epoch": 0.42,
"learning_rate": 1.9366769569243614e-05,
"loss": 0.1883,
"step": 1300
},
{
"epoch": 0.42,
"learning_rate": 1.9354452682589162e-05,
"loss": 0.183,
"step": 1310
},
{
"epoch": 0.43,
"learning_rate": 1.9342021153459554e-05,
"loss": 0.1786,
"step": 1320
},
{
"epoch": 0.43,
"learning_rate": 1.9329475134208037e-05,
"loss": 0.158,
"step": 1330
},
{
"epoch": 0.43,
"learning_rate": 1.9316814778590984e-05,
"loss": 0.1811,
"step": 1340
},
{
"epoch": 0.44,
"learning_rate": 1.9304040241766008e-05,
"loss": 0.1834,
"step": 1350
},
{
"epoch": 0.44,
"learning_rate": 1.9291151680290045e-05,
"loss": 0.1691,
"step": 1360
},
{
"epoch": 0.44,
"learning_rate": 1.927814925211746e-05,
"loss": 0.1707,
"step": 1370
},
{
"epoch": 0.45,
"learning_rate": 1.9265033116598096e-05,
"loss": 0.1738,
"step": 1380
},
{
"epoch": 0.45,
"learning_rate": 1.9251803434475317e-05,
"loss": 0.1783,
"step": 1390
},
{
"epoch": 0.45,
"learning_rate": 1.923846036788405e-05,
"loss": 0.1722,
"step": 1400
},
{
"epoch": 0.46,
"learning_rate": 1.92250040803488e-05,
"loss": 0.1752,
"step": 1410
},
{
"epoch": 0.46,
"learning_rate": 1.9211434736781624e-05,
"loss": 0.1702,
"step": 1420
},
{
"epoch": 0.46,
"learning_rate": 1.919775250348014e-05,
"loss": 0.1676,
"step": 1430
},
{
"epoch": 0.47,
"learning_rate": 1.918395754812546e-05,
"loss": 0.1676,
"step": 1440
},
{
"epoch": 0.47,
"learning_rate": 1.9170050039780158e-05,
"loss": 0.1753,
"step": 1450
},
{
"epoch": 0.47,
"learning_rate": 1.9156030148886193e-05,
"loss": 0.1604,
"step": 1460
},
{
"epoch": 0.48,
"learning_rate": 1.91418980472628e-05,
"loss": 0.184,
"step": 1470
},
{
"epoch": 0.48,
"learning_rate": 1.9127653908104414e-05,
"loss": 0.1724,
"step": 1480
},
{
"epoch": 0.48,
"learning_rate": 1.911329790597853e-05,
"loss": 0.1765,
"step": 1490
},
{
"epoch": 0.49,
"learning_rate": 1.9098830216823568e-05,
"loss": 0.1708,
"step": 1500
},
{
"epoch": 0.49,
"learning_rate": 1.9084251017946713e-05,
"loss": 0.1725,
"step": 1510
},
{
"epoch": 0.49,
"learning_rate": 1.9069560488021744e-05,
"loss": 0.178,
"step": 1520
},
{
"epoch": 0.5,
"learning_rate": 1.905475880708686e-05,
"loss": 0.1853,
"step": 1530
},
{
"epoch": 0.5,
"learning_rate": 1.9039846156542442e-05,
"loss": 0.1619,
"step": 1540
},
{
"epoch": 0.5,
"learning_rate": 1.9024822719148853e-05,
"loss": 0.1616,
"step": 1550
},
{
"epoch": 0.51,
"learning_rate": 1.900968867902419e-05,
"loss": 0.1689,
"step": 1560
},
{
"epoch": 0.51,
"learning_rate": 1.899444422164204e-05,
"loss": 0.1567,
"step": 1570
},
{
"epoch": 0.51,
"learning_rate": 1.8979089533829182e-05,
"loss": 0.1683,
"step": 1580
},
{
"epoch": 0.52,
"learning_rate": 1.8963624803763318e-05,
"loss": 0.1677,
"step": 1590
},
{
"epoch": 0.52,
"learning_rate": 1.8948050220970763e-05,
"loss": 0.1642,
"step": 1600
},
{
"epoch": 0.52,
"learning_rate": 1.893236597632412e-05,
"loss": 0.1792,
"step": 1610
},
{
"epoch": 0.53,
"learning_rate": 1.891657226203994e-05,
"loss": 0.1805,
"step": 1620
},
{
"epoch": 0.53,
"learning_rate": 1.8900669271676367e-05,
"loss": 0.1573,
"step": 1630
},
{
"epoch": 0.53,
"learning_rate": 1.8884657200130763e-05,
"loss": 0.1696,
"step": 1640
},
{
"epoch": 0.54,
"learning_rate": 1.8868536243637327e-05,
"loss": 0.1725,
"step": 1650
},
{
"epoch": 0.54,
"learning_rate": 1.8852306599764683e-05,
"loss": 0.1755,
"step": 1660
},
{
"epoch": 0.54,
"learning_rate": 1.8835968467413465e-05,
"loss": 0.1597,
"step": 1670
},
{
"epoch": 0.54,
"learning_rate": 1.8819522046813873e-05,
"loss": 0.1741,
"step": 1680
},
{
"epoch": 0.55,
"learning_rate": 1.8802967539523215e-05,
"loss": 0.1712,
"step": 1690
},
{
"epoch": 0.55,
"learning_rate": 1.8786305148423463e-05,
"loss": 0.1759,
"step": 1700
},
{
"epoch": 0.55,
"learning_rate": 1.8769535077718725e-05,
"loss": 0.1602,
"step": 1710
},
{
"epoch": 0.56,
"learning_rate": 1.8752657532932774e-05,
"loss": 0.1693,
"step": 1720
},
{
"epoch": 0.56,
"learning_rate": 1.8735672720906527e-05,
"loss": 0.1539,
"step": 1730
},
{
"epoch": 0.56,
"learning_rate": 1.8718580849795494e-05,
"loss": 0.166,
"step": 1740
},
{
"epoch": 0.57,
"learning_rate": 1.8701382129067232e-05,
"loss": 0.1695,
"step": 1750
},
{
"epoch": 0.57,
"learning_rate": 1.86840767694988e-05,
"loss": 0.1664,
"step": 1760
},
{
"epoch": 0.57,
"learning_rate": 1.8666664983174137e-05,
"loss": 0.1693,
"step": 1770
},
{
"epoch": 0.58,
"learning_rate": 1.864914698348149e-05,
"loss": 0.168,
"step": 1780
},
{
"epoch": 0.58,
"learning_rate": 1.8631522985110803e-05,
"loss": 0.161,
"step": 1790
},
{
"epoch": 0.58,
"learning_rate": 1.8613793204051066e-05,
"loss": 0.1825,
"step": 1800
},
{
"epoch": 0.59,
"learning_rate": 1.859595785758767e-05,
"loss": 0.1688,
"step": 1810
},
{
"epoch": 0.59,
"learning_rate": 1.8578017164299767e-05,
"loss": 0.1584,
"step": 1820
},
{
"epoch": 0.59,
"learning_rate": 1.8559971344057562e-05,
"loss": 0.1602,
"step": 1830
},
{
"epoch": 0.6,
"learning_rate": 1.8541820618019647e-05,
"loss": 0.1773,
"step": 1840
},
{
"epoch": 0.6,
"learning_rate": 1.8523565208630257e-05,
"loss": 0.1665,
"step": 1850
},
{
"epoch": 0.6,
"learning_rate": 1.8505205339616577e-05,
"loss": 0.1706,
"step": 1860
},
{
"epoch": 0.61,
"learning_rate": 1.848674123598598e-05,
"loss": 0.1699,
"step": 1870
},
{
"epoch": 0.61,
"learning_rate": 1.846817312402327e-05,
"loss": 0.1613,
"step": 1880
},
{
"epoch": 0.61,
"learning_rate": 1.8449501231287926e-05,
"loss": 0.1678,
"step": 1890
},
{
"epoch": 0.62,
"learning_rate": 1.8430725786611293e-05,
"loss": 0.1777,
"step": 1900
},
{
"epoch": 0.62,
"learning_rate": 1.8411847020093784e-05,
"loss": 0.1729,
"step": 1910
},
{
"epoch": 0.62,
"learning_rate": 1.8392865163102065e-05,
"loss": 0.1619,
"step": 1920
},
{
"epoch": 0.63,
"learning_rate": 1.8373780448266213e-05,
"loss": 0.1723,
"step": 1930
},
{
"epoch": 0.63,
"learning_rate": 1.8354593109476877e-05,
"loss": 0.1561,
"step": 1940
},
{
"epoch": 0.63,
"learning_rate": 1.833530338188239e-05,
"loss": 0.166,
"step": 1950
},
{
"epoch": 0.64,
"learning_rate": 1.8315911501885905e-05,
"loss": 0.1684,
"step": 1960
},
{
"epoch": 0.64,
"learning_rate": 1.82964177071425e-05,
"loss": 0.1627,
"step": 1970
},
{
"epoch": 0.64,
"learning_rate": 1.8276822236556246e-05,
"loss": 0.171,
"step": 1980
},
{
"epoch": 0.65,
"learning_rate": 1.82571253302773e-05,
"loss": 0.1604,
"step": 1990
},
{
"epoch": 0.65,
"learning_rate": 1.8237327229698943e-05,
"loss": 0.176,
"step": 2000
},
{
"epoch": 0.65,
"learning_rate": 1.821742817745465e-05,
"loss": 0.1724,
"step": 2010
},
{
"epoch": 0.65,
"learning_rate": 1.8197428417415075e-05,
"loss": 0.1688,
"step": 2020
},
{
"epoch": 0.66,
"learning_rate": 1.8177328194685108e-05,
"loss": 0.1579,
"step": 2030
},
{
"epoch": 0.66,
"learning_rate": 1.8157127755600826e-05,
"loss": 0.1561,
"step": 2040
},
{
"epoch": 0.66,
"learning_rate": 1.8136827347726516e-05,
"loss": 0.1663,
"step": 2050
},
{
"epoch": 0.67,
"learning_rate": 1.8116427219851615e-05,
"loss": 0.1621,
"step": 2060
},
{
"epoch": 0.67,
"learning_rate": 1.8095927621987658e-05,
"loss": 0.1647,
"step": 2070
},
{
"epoch": 0.67,
"learning_rate": 1.807532880536524e-05,
"loss": 0.1773,
"step": 2080
},
{
"epoch": 0.68,
"learning_rate": 1.8054631022430913e-05,
"loss": 0.1668,
"step": 2090
},
{
"epoch": 0.68,
"learning_rate": 1.8033834526844095e-05,
"loss": 0.1496,
"step": 2100
},
{
"epoch": 0.68,
"learning_rate": 1.8012939573473972e-05,
"loss": 0.169,
"step": 2110
},
{
"epoch": 0.69,
"learning_rate": 1.7991946418396365e-05,
"loss": 0.1706,
"step": 2120
},
{
"epoch": 0.69,
"learning_rate": 1.7970855318890606e-05,
"loss": 0.1599,
"step": 2130
},
{
"epoch": 0.69,
"learning_rate": 1.7949666533436358e-05,
"loss": 0.1673,
"step": 2140
},
{
"epoch": 0.7,
"learning_rate": 1.792838032171047e-05,
"loss": 0.1586,
"step": 2150
},
{
"epoch": 0.7,
"learning_rate": 1.79069969445838e-05,
"loss": 0.1623,
"step": 2160
},
{
"epoch": 0.7,
"learning_rate": 1.7885516664117982e-05,
"loss": 0.1572,
"step": 2170
},
{
"epoch": 0.71,
"learning_rate": 1.7863939743562266e-05,
"loss": 0.1637,
"step": 2180
},
{
"epoch": 0.71,
"learning_rate": 1.7842266447350236e-05,
"loss": 0.1637,
"step": 2190
},
{
"epoch": 0.71,
"learning_rate": 1.782049704109662e-05,
"loss": 0.1568,
"step": 2200
},
{
"epoch": 0.72,
"learning_rate": 1.7798631791594e-05,
"loss": 0.1585,
"step": 2210
},
{
"epoch": 0.72,
"learning_rate": 1.777667096680956e-05,
"loss": 0.1649,
"step": 2220
},
{
"epoch": 0.72,
"learning_rate": 1.7754614835881795e-05,
"loss": 0.1646,
"step": 2230
},
{
"epoch": 0.73,
"learning_rate": 1.7732463669117206e-05,
"loss": 0.1605,
"step": 2240
},
{
"epoch": 0.73,
"learning_rate": 1.7710217737987008e-05,
"loss": 0.1515,
"step": 2250
},
{
"epoch": 0.73,
"learning_rate": 1.768787731512379e-05,
"loss": 0.1458,
"step": 2260
},
{
"epoch": 0.74,
"learning_rate": 1.766544267431816e-05,
"loss": 0.1674,
"step": 2270
},
{
"epoch": 0.74,
"learning_rate": 1.7642914090515423e-05,
"loss": 0.1659,
"step": 2280
},
{
"epoch": 0.74,
"learning_rate": 1.762029183981217e-05,
"loss": 0.1512,
"step": 2290
},
{
"epoch": 0.75,
"learning_rate": 1.759757619945294e-05,
"loss": 0.1736,
"step": 2300
},
{
"epoch": 0.75,
"learning_rate": 1.7574767447826776e-05,
"loss": 0.1656,
"step": 2310
},
{
"epoch": 0.75,
"learning_rate": 1.7551865864463857e-05,
"loss": 0.157,
"step": 2320
},
{
"epoch": 0.76,
"learning_rate": 1.7528871730032034e-05,
"loss": 0.1588,
"step": 2330
},
{
"epoch": 0.76,
"learning_rate": 1.750578532633342e-05,
"loss": 0.1547,
"step": 2340
},
{
"epoch": 0.76,
"learning_rate": 1.748260693630092e-05,
"loss": 0.1528,
"step": 2350
},
{
"epoch": 0.77,
"learning_rate": 1.7459336843994758e-05,
"loss": 0.1541,
"step": 2360
},
{
"epoch": 0.77,
"learning_rate": 1.7435975334599026e-05,
"loss": 0.1554,
"step": 2370
},
{
"epoch": 0.77,
"learning_rate": 1.741252269441815e-05,
"loss": 0.1728,
"step": 2380
},
{
"epoch": 0.77,
"learning_rate": 1.73889792108734e-05,
"loss": 0.1678,
"step": 2390
},
{
"epoch": 0.78,
"learning_rate": 1.736534517249938e-05,
"loss": 0.1586,
"step": 2400
},
{
"epoch": 0.78,
"learning_rate": 1.7341620868940467e-05,
"loss": 0.1549,
"step": 2410
},
{
"epoch": 0.78,
"learning_rate": 1.731780659094728e-05,
"loss": 0.1561,
"step": 2420
},
{
"epoch": 0.79,
"learning_rate": 1.7293902630373103e-05,
"loss": 0.1624,
"step": 2430
},
{
"epoch": 0.79,
"learning_rate": 1.726990928017032e-05,
"loss": 0.1561,
"step": 2440
},
{
"epoch": 0.79,
"learning_rate": 1.7245826834386825e-05,
"loss": 0.1424,
"step": 2450
},
{
"epoch": 0.8,
"learning_rate": 1.7221655588162397e-05,
"loss": 0.1605,
"step": 2460
},
{
"epoch": 0.8,
"learning_rate": 1.7197395837725118e-05,
"loss": 0.1547,
"step": 2470
},
{
"epoch": 0.8,
"learning_rate": 1.717304788038771e-05,
"loss": 0.164,
"step": 2480
},
{
"epoch": 0.81,
"learning_rate": 1.7148612014543915e-05,
"loss": 0.1569,
"step": 2490
},
{
"epoch": 0.81,
"learning_rate": 1.712408853966482e-05,
"loss": 0.1527,
"step": 2500
},
{
"epoch": 0.81,
"learning_rate": 1.7099477756295195e-05,
"loss": 0.154,
"step": 2510
},
{
"epoch": 0.82,
"learning_rate": 1.7074779966049818e-05,
"loss": 0.1588,
"step": 2520
},
{
"epoch": 0.82,
"learning_rate": 1.7049995471609765e-05,
"loss": 0.1595,
"step": 2530
},
{
"epoch": 0.82,
"learning_rate": 1.70251245767187e-05,
"loss": 0.1761,
"step": 2540
},
{
"epoch": 0.83,
"learning_rate": 1.7000167586179173e-05,
"loss": 0.1563,
"step": 2550
},
{
"epoch": 0.83,
"learning_rate": 1.6975124805848852e-05,
"loss": 0.1592,
"step": 2560
},
{
"epoch": 0.83,
"learning_rate": 1.694999654263681e-05,
"loss": 0.1597,
"step": 2570
},
{
"epoch": 0.84,
"learning_rate": 1.692478310449973e-05,
"loss": 0.1611,
"step": 2580
},
{
"epoch": 0.84,
"learning_rate": 1.689948480043816e-05,
"loss": 0.1712,
"step": 2590
},
{
"epoch": 0.84,
"learning_rate": 1.6874101940492707e-05,
"loss": 0.1603,
"step": 2600
},
{
"epoch": 0.85,
"learning_rate": 1.684863483574024e-05,
"loss": 0.1666,
"step": 2610
},
{
"epoch": 0.85,
"learning_rate": 1.6823083798290092e-05,
"loss": 0.1599,
"step": 2620
},
{
"epoch": 0.85,
"learning_rate": 1.6797449141280213e-05,
"loss": 0.1468,
"step": 2630
},
{
"epoch": 0.86,
"learning_rate": 1.6771731178873344e-05,
"loss": 0.1519,
"step": 2640
},
{
"epoch": 0.86,
"learning_rate": 1.674593022625318e-05,
"loss": 0.1565,
"step": 2650
},
{
"epoch": 0.86,
"learning_rate": 1.6720046599620476e-05,
"loss": 0.1513,
"step": 2660
},
{
"epoch": 0.87,
"learning_rate": 1.6694080616189197e-05,
"loss": 0.1616,
"step": 2670
},
{
"epoch": 0.87,
"learning_rate": 1.6668032594182623e-05,
"loss": 0.1642,
"step": 2680
},
{
"epoch": 0.87,
"learning_rate": 1.664190285282945e-05,
"loss": 0.1564,
"step": 2690
},
{
"epoch": 0.88,
"learning_rate": 1.661569171235988e-05,
"loss": 0.1604,
"step": 2700
},
{
"epoch": 0.88,
"learning_rate": 1.658939949400167e-05,
"loss": 0.1552,
"step": 2710
},
{
"epoch": 0.88,
"learning_rate": 1.656302651997626e-05,
"loss": 0.1526,
"step": 2720
},
{
"epoch": 0.89,
"learning_rate": 1.6536573113494737e-05,
"loss": 0.16,
"step": 2730
},
{
"epoch": 0.89,
"learning_rate": 1.6510039598753953e-05,
"loss": 0.155,
"step": 2740
},
{
"epoch": 0.89,
"learning_rate": 1.64834263009325e-05,
"loss": 0.1641,
"step": 2750
},
{
"epoch": 0.89,
"learning_rate": 1.6456733546186755e-05,
"loss": 0.1423,
"step": 2760
},
{
"epoch": 0.9,
"learning_rate": 1.6429961661646858e-05,
"loss": 0.1604,
"step": 2770
},
{
"epoch": 0.9,
"learning_rate": 1.6403110975412723e-05,
"loss": 0.1698,
"step": 2780
},
{
"epoch": 0.9,
"learning_rate": 1.637618181655001e-05,
"loss": 0.1537,
"step": 2790
},
{
"epoch": 0.91,
"learning_rate": 1.6349174515086087e-05,
"loss": 0.158,
"step": 2800
},
{
"epoch": 0.91,
"learning_rate": 1.6322089402005995e-05,
"loss": 0.145,
"step": 2810
},
{
"epoch": 0.91,
"learning_rate": 1.629492680924839e-05,
"loss": 0.1462,
"step": 2820
},
{
"epoch": 0.92,
"learning_rate": 1.6267687069701455e-05,
"loss": 0.1536,
"step": 2830
},
{
"epoch": 0.92,
"learning_rate": 1.6240370517198855e-05,
"loss": 0.1456,
"step": 2840
},
{
"epoch": 0.92,
"learning_rate": 1.6212977486515626e-05,
"loss": 0.1576,
"step": 2850
},
{
"epoch": 0.93,
"learning_rate": 1.618550831336406e-05,
"loss": 0.1555,
"step": 2860
},
{
"epoch": 0.93,
"learning_rate": 1.6157963334389623e-05,
"loss": 0.1593,
"step": 2870
},
{
"epoch": 0.93,
"learning_rate": 1.61303428871668e-05,
"loss": 0.155,
"step": 2880
},
{
"epoch": 0.94,
"learning_rate": 1.6102647310194964e-05,
"loss": 0.1502,
"step": 2890
},
{
"epoch": 0.94,
"learning_rate": 1.607487694289425e-05,
"loss": 0.144,
"step": 2900
},
{
"epoch": 0.94,
"learning_rate": 1.6047032125601364e-05,
"loss": 0.1422,
"step": 2910
},
{
"epoch": 0.95,
"learning_rate": 1.6019113199565424e-05,
"loss": 0.1594,
"step": 2920
},
{
"epoch": 0.95,
"learning_rate": 1.599112050694379e-05,
"loss": 0.1488,
"step": 2930
},
{
"epoch": 0.95,
"learning_rate": 1.596305439079785e-05,
"loss": 0.1631,
"step": 2940
},
{
"epoch": 0.96,
"learning_rate": 1.5934915195088842e-05,
"loss": 0.1401,
"step": 2950
},
{
"epoch": 0.96,
"learning_rate": 1.5906703264673598e-05,
"loss": 0.1526,
"step": 2960
},
{
"epoch": 0.96,
"learning_rate": 1.5878418945300363e-05,
"loss": 0.15,
"step": 2970
},
{
"epoch": 0.97,
"learning_rate": 1.5850062583604534e-05,
"loss": 0.1589,
"step": 2980
},
{
"epoch": 0.97,
"learning_rate": 1.58216345271044e-05,
"loss": 0.1551,
"step": 2990
},
{
"epoch": 0.97,
"learning_rate": 1.5793135124196916e-05,
"loss": 0.1482,
"step": 3000
},
{
"epoch": 0.98,
"learning_rate": 1.5764564724153406e-05,
"loss": 0.1518,
"step": 3010
},
{
"epoch": 0.98,
"learning_rate": 1.5735923677115298e-05,
"loss": 0.1495,
"step": 3020
},
{
"epoch": 0.98,
"learning_rate": 1.570721233408981e-05,
"loss": 0.1492,
"step": 3030
},
{
"epoch": 0.99,
"learning_rate": 1.567843104694569e-05,
"loss": 0.1538,
"step": 3040
},
{
"epoch": 0.99,
"learning_rate": 1.5649580168408854e-05,
"loss": 0.1521,
"step": 3050
},
{
"epoch": 0.99,
"learning_rate": 1.5620660052058108e-05,
"loss": 0.1593,
"step": 3060
},
{
"epoch": 1.0,
"learning_rate": 1.5591671052320784e-05,
"loss": 0.1604,
"step": 3070
},
{
"epoch": 1.0,
"learning_rate": 1.55626135244684e-05,
"loss": 0.1405,
"step": 3080
},
{
"epoch": 1.0,
"eval_loss": 0.1611328125,
"eval_runtime": 6.2849,
"eval_samples_per_second": 20.366,
"eval_steps_per_second": 0.159,
"step": 3084
},
{
"epoch": 1.0,
"learning_rate": 1.553348782461233e-05,
"loss": 0.1398,
"step": 3090
},
{
"epoch": 1.01,
"learning_rate": 1.550429430969941e-05,
"loss": 0.1198,
"step": 3100
},
{
"epoch": 1.01,
"learning_rate": 1.5475033337507583e-05,
"loss": 0.109,
"step": 3110
},
{
"epoch": 1.01,
"learning_rate": 1.54457052666415e-05,
"loss": 0.1167,
"step": 3120
},
{
"epoch": 1.01,
"learning_rate": 1.541631045652814e-05,
"loss": 0.108,
"step": 3130
},
{
"epoch": 1.02,
"learning_rate": 1.5386849267412388e-05,
"loss": 0.1184,
"step": 3140
},
{
"epoch": 1.02,
"learning_rate": 1.5357322060352646e-05,
"loss": 0.1193,
"step": 3150
},
{
"epoch": 1.02,
"learning_rate": 1.5327729197216373e-05,
"loss": 0.1218,
"step": 3160
},
{
"epoch": 1.03,
"learning_rate": 1.529807104067568e-05,
"loss": 0.1152,
"step": 3170
},
{
"epoch": 1.03,
"learning_rate": 1.5268347954202872e-05,
"loss": 0.1079,
"step": 3180
},
{
"epoch": 1.03,
"learning_rate": 1.5238560302065992e-05,
"loss": 0.1128,
"step": 3190
},
{
"epoch": 1.04,
"learning_rate": 1.5208708449324369e-05,
"loss": 0.1158,
"step": 3200
},
{
"epoch": 1.04,
"learning_rate": 1.5178792761824129e-05,
"loss": 0.1204,
"step": 3210
},
{
"epoch": 1.04,
"learning_rate": 1.5148813606193715e-05,
"loss": 0.111,
"step": 3220
},
{
"epoch": 1.05,
"learning_rate": 1.5118771349839402e-05,
"loss": 0.1161,
"step": 3230
},
{
"epoch": 1.05,
"learning_rate": 1.5088666360940795e-05,
"loss": 0.1158,
"step": 3240
},
{
"epoch": 1.05,
"learning_rate": 1.5058499008446296e-05,
"loss": 0.1143,
"step": 3250
},
{
"epoch": 1.06,
"learning_rate": 1.502826966206861e-05,
"loss": 0.113,
"step": 3260
},
{
"epoch": 1.06,
"learning_rate": 1.4997978692280191e-05,
"loss": 0.122,
"step": 3270
},
{
"epoch": 1.06,
"learning_rate": 1.496762647030872e-05,
"loss": 0.1213,
"step": 3280
},
{
"epoch": 1.07,
"learning_rate": 1.4937213368132549e-05,
"loss": 0.125,
"step": 3290
},
{
"epoch": 1.07,
"learning_rate": 1.490673975847613e-05,
"loss": 0.1162,
"step": 3300
},
{
"epoch": 1.07,
"learning_rate": 1.4876206014805465e-05,
"loss": 0.1181,
"step": 3310
},
{
"epoch": 1.08,
"learning_rate": 1.4845612511323526e-05,
"loss": 0.1216,
"step": 3320
},
{
"epoch": 1.08,
"learning_rate": 1.4814959622965657e-05,
"loss": 0.1216,
"step": 3330
},
{
"epoch": 1.08,
"learning_rate": 1.478424772539499e-05,
"loss": 0.106,
"step": 3340
},
{
"epoch": 1.09,
"learning_rate": 1.4753477194997836e-05,
"loss": 0.1239,
"step": 3350
},
{
"epoch": 1.09,
"learning_rate": 1.4722648408879078e-05,
"loss": 0.1101,
"step": 3360
},
{
"epoch": 1.09,
"learning_rate": 1.4691761744857545e-05,
"loss": 0.1233,
"step": 3370
},
{
"epoch": 1.1,
"learning_rate": 1.466081758146138e-05,
"loss": 0.117,
"step": 3380
},
{
"epoch": 1.1,
"learning_rate": 1.4629816297923404e-05,
"loss": 0.1162,
"step": 3390
},
{
"epoch": 1.1,
"learning_rate": 1.4598758274176467e-05,
"loss": 0.1214,
"step": 3400
},
{
"epoch": 1.11,
"learning_rate": 1.4567643890848796e-05,
"loss": 0.1139,
"step": 3410
},
{
"epoch": 1.11,
"learning_rate": 1.4536473529259325e-05,
"loss": 0.1191,
"step": 3420
},
{
"epoch": 1.11,
"learning_rate": 1.4505247571413019e-05,
"loss": 0.1132,
"step": 3430
},
{
"epoch": 1.12,
"learning_rate": 1.4473966399996203e-05,
"loss": 0.1151,
"step": 3440
},
{
"epoch": 1.12,
"learning_rate": 1.444263039837186e-05,
"loss": 0.1244,
"step": 3450
},
{
"epoch": 1.12,
"learning_rate": 1.4411239950574946e-05,
"loss": 0.113,
"step": 3460
},
{
"epoch": 1.13,
"learning_rate": 1.4379795441307673e-05,
"loss": 0.1155,
"step": 3470
},
{
"epoch": 1.13,
"learning_rate": 1.4348297255934793e-05,
"loss": 0.12,
"step": 3480
},
{
"epoch": 1.13,
"learning_rate": 1.4316745780478885e-05,
"loss": 0.1129,
"step": 3490
},
{
"epoch": 1.13,
"learning_rate": 1.4285141401615619e-05,
"loss": 0.1191,
"step": 3500
},
{
"epoch": 1.14,
"learning_rate": 1.4253484506669012e-05,
"loss": 0.1143,
"step": 3510
},
{
"epoch": 1.14,
"learning_rate": 1.422177548360669e-05,
"loss": 0.124,
"step": 3520
},
{
"epoch": 1.14,
"learning_rate": 1.4190014721035127e-05,
"loss": 0.1236,
"step": 3530
},
{
"epoch": 1.15,
"learning_rate": 1.4158202608194893e-05,
"loss": 0.116,
"step": 3540
},
{
"epoch": 1.15,
"learning_rate": 1.4126339534955863e-05,
"loss": 0.1128,
"step": 3550
},
{
"epoch": 1.15,
"learning_rate": 1.4094425891812457e-05,
"loss": 0.1196,
"step": 3560
},
{
"epoch": 1.16,
"learning_rate": 1.4062462069878855e-05,
"loss": 0.1128,
"step": 3570
},
{
"epoch": 1.16,
"learning_rate": 1.4030448460884191e-05,
"loss": 0.1163,
"step": 3580
},
{
"epoch": 1.16,
"learning_rate": 1.3998385457167758e-05,
"loss": 0.1178,
"step": 3590
},
{
"epoch": 1.17,
"learning_rate": 1.3966273451674203e-05,
"loss": 0.1128,
"step": 3600
},
{
"epoch": 1.17,
"learning_rate": 1.3934112837948712e-05,
"loss": 0.1167,
"step": 3610
},
{
"epoch": 1.17,
"learning_rate": 1.3901904010132178e-05,
"loss": 0.1181,
"step": 3620
},
{
"epoch": 1.18,
"learning_rate": 1.3869647362956381e-05,
"loss": 0.1124,
"step": 3630
},
{
"epoch": 1.18,
"learning_rate": 1.3837343291739143e-05,
"loss": 0.1189,
"step": 3640
},
{
"epoch": 1.18,
"learning_rate": 1.3804992192379487e-05,
"loss": 0.121,
"step": 3650
},
{
"epoch": 1.19,
"learning_rate": 1.3772594461352786e-05,
"loss": 0.1185,
"step": 3660
},
{
"epoch": 1.19,
"learning_rate": 1.3740150495705904e-05,
"loss": 0.1208,
"step": 3670
},
{
"epoch": 1.19,
"learning_rate": 1.3707660693052318e-05,
"loss": 0.1214,
"step": 3680
},
{
"epoch": 1.2,
"learning_rate": 1.3675125451567268e-05,
"loss": 0.1103,
"step": 3690
},
{
"epoch": 1.2,
"learning_rate": 1.364254516998286e-05,
"loss": 0.1119,
"step": 3700
},
{
"epoch": 1.2,
"learning_rate": 1.3609920247583182e-05,
"loss": 0.1192,
"step": 3710
},
{
"epoch": 1.21,
"learning_rate": 1.3577251084199412e-05,
"loss": 0.1249,
"step": 3720
},
{
"epoch": 1.21,
"learning_rate": 1.3544538080204922e-05,
"loss": 0.1212,
"step": 3730
},
{
"epoch": 1.21,
"learning_rate": 1.351178163651037e-05,
"loss": 0.115,
"step": 3740
},
{
"epoch": 1.22,
"learning_rate": 1.3478982154558778e-05,
"loss": 0.1195,
"step": 3750
},
{
"epoch": 1.22,
"learning_rate": 1.3446140036320621e-05,
"loss": 0.1264,
"step": 3760
},
{
"epoch": 1.22,
"learning_rate": 1.34132556842889e-05,
"loss": 0.1165,
"step": 3770
},
{
"epoch": 1.23,
"learning_rate": 1.3380329501474207e-05,
"loss": 0.1211,
"step": 3780
},
{
"epoch": 1.23,
"learning_rate": 1.3347361891399786e-05,
"loss": 0.113,
"step": 3790
},
{
"epoch": 1.23,
"learning_rate": 1.3314353258096588e-05,
"loss": 0.1135,
"step": 3800
},
{
"epoch": 1.24,
"learning_rate": 1.3281304006098324e-05,
"loss": 0.1125,
"step": 3810
},
{
"epoch": 1.24,
"learning_rate": 1.3248214540436495e-05,
"loss": 0.1245,
"step": 3820
},
{
"epoch": 1.24,
"learning_rate": 1.3215085266635442e-05,
"loss": 0.1112,
"step": 3830
},
{
"epoch": 1.25,
"learning_rate": 1.3181916590707366e-05,
"loss": 0.1209,
"step": 3840
},
{
"epoch": 1.25,
"learning_rate": 1.3148708919147364e-05,
"loss": 0.117,
"step": 3850
},
{
"epoch": 1.25,
"learning_rate": 1.3115462658928434e-05,
"loss": 0.1164,
"step": 3860
},
{
"epoch": 1.25,
"learning_rate": 1.3082178217496488e-05,
"loss": 0.1148,
"step": 3870
},
{
"epoch": 1.26,
"learning_rate": 1.304885600276538e-05,
"loss": 0.1159,
"step": 3880
},
{
"epoch": 1.26,
"learning_rate": 1.3015496423111871e-05,
"loss": 0.1198,
"step": 3890
},
{
"epoch": 1.26,
"learning_rate": 1.298209988737066e-05,
"loss": 0.1186,
"step": 3900
},
{
"epoch": 1.27,
"learning_rate": 1.2948666804829345e-05,
"loss": 0.1093,
"step": 3910
},
{
"epoch": 1.27,
"learning_rate": 1.2915197585223427e-05,
"loss": 0.1189,
"step": 3920
},
{
"epoch": 1.27,
"learning_rate": 1.288169263873128e-05,
"loss": 0.1027,
"step": 3930
},
{
"epoch": 1.28,
"learning_rate": 1.284815237596912e-05,
"loss": 0.114,
"step": 3940
},
{
"epoch": 1.28,
"learning_rate": 1.2814577207985984e-05,
"loss": 0.11,
"step": 3950
},
{
"epoch": 1.28,
"learning_rate": 1.2780967546258683e-05,
"loss": 0.1129,
"step": 3960
},
{
"epoch": 1.29,
"learning_rate": 1.2747323802686761e-05,
"loss": 0.1159,
"step": 3970
},
{
"epoch": 1.29,
"learning_rate": 1.2713646389587453e-05,
"loss": 0.1213,
"step": 3980
},
{
"epoch": 1.29,
"learning_rate": 1.267993571969062e-05,
"loss": 0.1117,
"step": 3990
},
{
"epoch": 1.3,
"learning_rate": 1.2646192206133705e-05,
"loss": 0.1187,
"step": 4000
},
{
"epoch": 1.3,
"learning_rate": 1.2612416262456659e-05,
"loss": 0.1165,
"step": 4010
},
{
"epoch": 1.3,
"learning_rate": 1.2578608302596878e-05,
"loss": 0.1277,
"step": 4020
},
{
"epoch": 1.31,
"learning_rate": 1.254476874088413e-05,
"loss": 0.1223,
"step": 4030
},
{
"epoch": 1.31,
"learning_rate": 1.2510897992035475e-05,
"loss": 0.1187,
"step": 4040
},
{
"epoch": 1.31,
"learning_rate": 1.2476996471150183e-05,
"loss": 0.1177,
"step": 4050
},
{
"epoch": 1.32,
"learning_rate": 1.2443064593704645e-05,
"loss": 0.1202,
"step": 4060
},
{
"epoch": 1.32,
"learning_rate": 1.240910277554729e-05,
"loss": 0.1163,
"step": 4070
},
{
"epoch": 1.32,
"learning_rate": 1.2375111432893479e-05,
"loss": 0.1062,
"step": 4080
},
{
"epoch": 1.33,
"learning_rate": 1.2341090982320398e-05,
"loss": 0.1186,
"step": 4090
},
{
"epoch": 1.33,
"learning_rate": 1.2307041840761983e-05,
"loss": 0.1193,
"step": 4100
},
{
"epoch": 1.33,
"learning_rate": 1.2272964425503768e-05,
"loss": 0.1174,
"step": 4110
},
{
"epoch": 1.34,
"learning_rate": 1.2238859154177805e-05,
"loss": 0.109,
"step": 4120
},
{
"epoch": 1.34,
"learning_rate": 1.2204726444757527e-05,
"loss": 0.1251,
"step": 4130
},
{
"epoch": 1.34,
"learning_rate": 1.2170566715552634e-05,
"loss": 0.1166,
"step": 4140
},
{
"epoch": 1.35,
"learning_rate": 1.2136380385203965e-05,
"loss": 0.1123,
"step": 4150
},
{
"epoch": 1.35,
"learning_rate": 1.2102167872678366e-05,
"loss": 0.1273,
"step": 4160
},
{
"epoch": 1.35,
"learning_rate": 1.2067929597263552e-05,
"loss": 0.1201,
"step": 4170
},
{
"epoch": 1.36,
"learning_rate": 1.2033665978562973e-05,
"loss": 0.1197,
"step": 4180
},
{
"epoch": 1.36,
"learning_rate": 1.1999377436490682e-05,
"loss": 0.1126,
"step": 4190
},
{
"epoch": 1.36,
"learning_rate": 1.1965064391266158e-05,
"loss": 0.1264,
"step": 4200
},
{
"epoch": 1.37,
"learning_rate": 1.1930727263409194e-05,
"loss": 0.1153,
"step": 4210
},
{
"epoch": 1.37,
"learning_rate": 1.1896366473734715e-05,
"loss": 0.1085,
"step": 4220
},
{
"epoch": 1.37,
"learning_rate": 1.1861982443347633e-05,
"loss": 0.1116,
"step": 4230
},
{
"epoch": 1.37,
"learning_rate": 1.1827575593637683e-05,
"loss": 0.1107,
"step": 4240
},
{
"epoch": 1.38,
"learning_rate": 1.1793146346274262e-05,
"loss": 0.121,
"step": 4250
},
{
"epoch": 1.38,
"learning_rate": 1.1758695123201262e-05,
"loss": 0.1179,
"step": 4260
},
{
"epoch": 1.38,
"learning_rate": 1.1724222346631886e-05,
"loss": 0.1118,
"step": 4270
},
{
"epoch": 1.39,
"learning_rate": 1.1689728439043495e-05,
"loss": 0.1135,
"step": 4280
},
{
"epoch": 1.39,
"learning_rate": 1.1655213823172407e-05,
"loss": 0.1168,
"step": 4290
},
{
"epoch": 1.39,
"learning_rate": 1.1620678922008736e-05,
"loss": 0.1076,
"step": 4300
},
{
"epoch": 1.4,
"learning_rate": 1.1586124158791205e-05,
"loss": 0.1145,
"step": 4310
},
{
"epoch": 1.4,
"learning_rate": 1.1551549957001944e-05,
"loss": 0.1222,
"step": 4320
},
{
"epoch": 1.4,
"learning_rate": 1.151695674036131e-05,
"loss": 0.1219,
"step": 4330
},
{
"epoch": 1.41,
"learning_rate": 1.1482344932822706e-05,
"loss": 0.1145,
"step": 4340
},
{
"epoch": 1.41,
"learning_rate": 1.1447714958567361e-05,
"loss": 0.1201,
"step": 4350
},
{
"epoch": 1.41,
"learning_rate": 1.1413067241999153e-05,
"loss": 0.1203,
"step": 4360
},
{
"epoch": 1.42,
"learning_rate": 1.1378402207739394e-05,
"loss": 0.1135,
"step": 4370
},
{
"epoch": 1.42,
"learning_rate": 1.134372028062163e-05,
"loss": 0.1151,
"step": 4380
},
{
"epoch": 1.42,
"learning_rate": 1.1309021885686446e-05,
"loss": 0.1167,
"step": 4390
},
{
"epoch": 1.43,
"learning_rate": 1.1274307448176227e-05,
"loss": 0.1125,
"step": 4400
},
{
"epoch": 1.43,
"learning_rate": 1.1239577393529988e-05,
"loss": 0.1128,
"step": 4410
},
{
"epoch": 1.43,
"learning_rate": 1.1204832147378125e-05,
"loss": 0.1201,
"step": 4420
},
{
"epoch": 1.44,
"learning_rate": 1.1170072135537213e-05,
"loss": 0.1081,
"step": 4430
},
{
"epoch": 1.44,
"learning_rate": 1.113529778400479e-05,
"loss": 0.1055,
"step": 4440
},
{
"epoch": 1.44,
"learning_rate": 1.110050951895413e-05,
"loss": 0.1167,
"step": 4450
},
{
"epoch": 1.45,
"learning_rate": 1.1065707766729024e-05,
"loss": 0.1257,
"step": 4460
},
{
"epoch": 1.45,
"learning_rate": 1.1030892953838548e-05,
"loss": 0.1137,
"step": 4470
},
{
"epoch": 1.45,
"learning_rate": 1.0996065506951854e-05,
"loss": 0.1106,
"step": 4480
},
{
"epoch": 1.46,
"learning_rate": 1.0961225852892914e-05,
"loss": 0.111,
"step": 4490
},
{
"epoch": 1.46,
"learning_rate": 1.0926374418635317e-05,
"loss": 0.107,
"step": 4500
},
{
"epoch": 1.46,
"learning_rate": 1.0891511631297009e-05,
"loss": 0.117,
"step": 4510
},
{
"epoch": 1.47,
"learning_rate": 1.0856637918135087e-05,
"loss": 0.1237,
"step": 4520
},
{
"epoch": 1.47,
"learning_rate": 1.0821753706540539e-05,
"loss": 0.1168,
"step": 4530
},
{
"epoch": 1.47,
"learning_rate": 1.0786859424033014e-05,
"loss": 0.1055,
"step": 4540
},
{
"epoch": 1.48,
"learning_rate": 1.0751955498255595e-05,
"loss": 0.1207,
"step": 4550
},
{
"epoch": 1.48,
"learning_rate": 1.0717042356969529e-05,
"loss": 0.1104,
"step": 4560
},
{
"epoch": 1.48,
"learning_rate": 1.0682120428049025e-05,
"loss": 0.1231,
"step": 4570
},
{
"epoch": 1.49,
"learning_rate": 1.0647190139475967e-05,
"loss": 0.1176,
"step": 4580
},
{
"epoch": 1.49,
"learning_rate": 1.0612251919334703e-05,
"loss": 0.1168,
"step": 4590
},
{
"epoch": 1.49,
"learning_rate": 1.057730619580678e-05,
"loss": 0.1098,
"step": 4600
},
{
"epoch": 1.49,
"learning_rate": 1.0542353397165706e-05,
"loss": 0.1119,
"step": 4610
},
{
"epoch": 1.5,
"learning_rate": 1.0507393951771695e-05,
"loss": 0.111,
"step": 4620
},
{
"epoch": 1.5,
"learning_rate": 1.0472428288066413e-05,
"loss": 0.1134,
"step": 4630
},
{
"epoch": 1.5,
"learning_rate": 1.043745683456775e-05,
"loss": 0.1146,
"step": 4640
},
{
"epoch": 1.51,
"learning_rate": 1.040248001986453e-05,
"loss": 0.1133,
"step": 4650
},
{
"epoch": 1.51,
"learning_rate": 1.0367498272611303e-05,
"loss": 0.1121,
"step": 4660
},
{
"epoch": 1.51,
"learning_rate": 1.0332512021523054e-05,
"loss": 0.1174,
"step": 4670
},
{
"epoch": 1.52,
"learning_rate": 1.0297521695369974e-05,
"loss": 0.1161,
"step": 4680
},
{
"epoch": 1.52,
"learning_rate": 1.0262527722972185e-05,
"loss": 0.1004,
"step": 4690
},
{
"epoch": 1.52,
"learning_rate": 1.0227530533194508e-05,
"loss": 0.1155,
"step": 4700
},
{
"epoch": 1.53,
"learning_rate": 1.0192530554941177e-05,
"loss": 0.1261,
"step": 4710
},
{
"epoch": 1.53,
"learning_rate": 1.0157528217150624e-05,
"loss": 0.1201,
"step": 4720
},
{
"epoch": 1.53,
"learning_rate": 1.0122523948790174e-05,
"loss": 0.1192,
"step": 4730
},
{
"epoch": 1.54,
"learning_rate": 1.0087518178850824e-05,
"loss": 0.1115,
"step": 4740
},
{
"epoch": 1.54,
"learning_rate": 1.005251133634198e-05,
"loss": 0.1127,
"step": 4750
},
{
"epoch": 1.54,
"learning_rate": 1.0017503850286167e-05,
"loss": 0.1117,
"step": 4760
},
{
"epoch": 1.55,
"learning_rate": 9.982496149713835e-06,
"loss": 0.1112,
"step": 4770
},
{
"epoch": 1.55,
"learning_rate": 9.947488663658027e-06,
"loss": 0.1084,
"step": 4780
},
{
"epoch": 1.55,
"learning_rate": 9.912481821149176e-06,
"loss": 0.1109,
"step": 4790
},
{
"epoch": 1.56,
"learning_rate": 9.877476051209827e-06,
"loss": 0.1051,
"step": 4800
},
{
"epoch": 1.56,
"learning_rate": 9.842471782849381e-06,
"loss": 0.1187,
"step": 4810
},
{
"epoch": 1.56,
"learning_rate": 9.807469445058824e-06,
"loss": 0.1246,
"step": 4820
},
{
"epoch": 1.57,
"learning_rate": 9.772469466805499e-06,
"loss": 0.1111,
"step": 4830
},
{
"epoch": 1.57,
"learning_rate": 9.737472277027817e-06,
"loss": 0.112,
"step": 4840
},
{
"epoch": 1.57,
"learning_rate": 9.702478304630028e-06,
"loss": 0.112,
"step": 4850
},
{
"epoch": 1.58,
"learning_rate": 9.66748797847695e-06,
"loss": 0.115,
"step": 4860
},
{
"epoch": 1.58,
"learning_rate": 9.6325017273887e-06,
"loss": 0.1166,
"step": 4870
},
{
"epoch": 1.58,
"learning_rate": 9.597519980135472e-06,
"loss": 0.1186,
"step": 4880
},
{
"epoch": 1.59,
"learning_rate": 9.562543165432255e-06,
"loss": 0.1185,
"step": 4890
},
{
"epoch": 1.59,
"learning_rate": 9.52757171193359e-06,
"loss": 0.1133,
"step": 4900
},
{
"epoch": 1.59,
"learning_rate": 9.49260604822831e-06,
"loss": 0.1193,
"step": 4910
},
{
"epoch": 1.6,
"learning_rate": 9.457646602834295e-06,
"loss": 0.1076,
"step": 4920
},
{
"epoch": 1.6,
"learning_rate": 9.42269380419322e-06,
"loss": 0.1147,
"step": 4930
},
{
"epoch": 1.6,
"learning_rate": 9.387748080665298e-06,
"loss": 0.1067,
"step": 4940
},
{
"epoch": 1.61,
"learning_rate": 9.352809860524037e-06,
"loss": 0.1146,
"step": 4950
},
{
"epoch": 1.61,
"learning_rate": 9.31787957195098e-06,
"loss": 0.1094,
"step": 4960
},
{
"epoch": 1.61,
"learning_rate": 9.28295764303047e-06,
"loss": 0.1011,
"step": 4970
},
{
"epoch": 1.61,
"learning_rate": 9.248044501744409e-06,
"loss": 0.1108,
"step": 4980
},
{
"epoch": 1.62,
"learning_rate": 9.21314057596699e-06,
"loss": 0.1108,
"step": 4990
},
{
"epoch": 1.62,
"learning_rate": 9.178246293459466e-06,
"loss": 0.1078,
"step": 5000
},
{
"epoch": 1.62,
"learning_rate": 9.143362081864917e-06,
"loss": 0.1123,
"step": 5010
},
{
"epoch": 1.63,
"learning_rate": 9.108488368702991e-06,
"loss": 0.1079,
"step": 5020
},
{
"epoch": 1.63,
"learning_rate": 9.073625581364686e-06,
"loss": 0.1053,
"step": 5030
},
{
"epoch": 1.63,
"learning_rate": 9.03877414710709e-06,
"loss": 0.1116,
"step": 5040
},
{
"epoch": 1.64,
"learning_rate": 9.00393449304815e-06,
"loss": 0.1056,
"step": 5050
},
{
"epoch": 1.64,
"learning_rate": 8.969107046161452e-06,
"loss": 0.1082,
"step": 5060
},
{
"epoch": 1.64,
"learning_rate": 8.93429223327098e-06,
"loss": 0.1106,
"step": 5070
},
{
"epoch": 1.65,
"learning_rate": 8.899490481045873e-06,
"loss": 0.1157,
"step": 5080
},
{
"epoch": 1.65,
"learning_rate": 8.864702215995213e-06,
"loss": 0.1134,
"step": 5090
},
{
"epoch": 1.65,
"learning_rate": 8.82992786446279e-06,
"loss": 0.111,
"step": 5100
},
{
"epoch": 1.66,
"learning_rate": 8.795167852621877e-06,
"loss": 0.1267,
"step": 5110
},
{
"epoch": 1.66,
"learning_rate": 8.760422606470015e-06,
"loss": 0.1096,
"step": 5120
},
{
"epoch": 1.66,
"learning_rate": 8.725692551823776e-06,
"loss": 0.111,
"step": 5130
},
{
"epoch": 1.67,
"learning_rate": 8.69097811431356e-06,
"loss": 0.1127,
"step": 5140
},
{
"epoch": 1.67,
"learning_rate": 8.65627971937837e-06,
"loss": 0.1062,
"step": 5150
},
{
"epoch": 1.67,
"learning_rate": 8.621597792260608e-06,
"loss": 0.1128,
"step": 5160
},
{
"epoch": 1.68,
"learning_rate": 8.58693275800085e-06,
"loss": 0.1089,
"step": 5170
},
{
"epoch": 1.68,
"learning_rate": 8.55228504143264e-06,
"loss": 0.1109,
"step": 5180
},
{
"epoch": 1.68,
"learning_rate": 8.517655067177295e-06,
"loss": 0.1125,
"step": 5190
},
{
"epoch": 1.69,
"learning_rate": 8.48304325963869e-06,
"loss": 0.1176,
"step": 5200
},
{
"epoch": 1.69,
"learning_rate": 8.44845004299806e-06,
"loss": 0.1101,
"step": 5210
},
{
"epoch": 1.69,
"learning_rate": 8.413875841208797e-06,
"loss": 0.1122,
"step": 5220
},
{
"epoch": 1.7,
"learning_rate": 8.379321077991265e-06,
"loss": 0.1115,
"step": 5230
},
{
"epoch": 1.7,
"learning_rate": 8.344786176827594e-06,
"loss": 0.1139,
"step": 5240
},
{
"epoch": 1.7,
"learning_rate": 8.310271560956509e-06,
"loss": 0.1117,
"step": 5250
},
{
"epoch": 1.71,
"learning_rate": 8.275777653368119e-06,
"loss": 0.1073,
"step": 5260
},
{
"epoch": 1.71,
"learning_rate": 8.241304876798742e-06,
"loss": 0.1193,
"step": 5270
},
{
"epoch": 1.71,
"learning_rate": 8.20685365372574e-06,
"loss": 0.1171,
"step": 5280
},
{
"epoch": 1.72,
"learning_rate": 8.172424406362319e-06,
"loss": 0.1189,
"step": 5290
},
{
"epoch": 1.72,
"learning_rate": 8.13801755665237e-06,
"loss": 0.1183,
"step": 5300
},
{
"epoch": 1.72,
"learning_rate": 8.103633526265289e-06,
"loss": 0.1169,
"step": 5310
},
{
"epoch": 1.73,
"learning_rate": 8.069272736590809e-06,
"loss": 0.1044,
"step": 5320
},
{
"epoch": 1.73,
"learning_rate": 8.034935608733843e-06,
"loss": 0.1128,
"step": 5330
},
{
"epoch": 1.73,
"learning_rate": 8.00062256350932e-06,
"loss": 0.1086,
"step": 5340
},
{
"epoch": 1.73,
"learning_rate": 7.966334021437028e-06,
"loss": 0.1181,
"step": 5350
},
{
"epoch": 1.74,
"learning_rate": 7.932070402736451e-06,
"loss": 0.1153,
"step": 5360
},
{
"epoch": 1.74,
"learning_rate": 7.897832127321639e-06,
"loss": 0.1158,
"step": 5370
},
{
"epoch": 1.74,
"learning_rate": 7.863619614796035e-06,
"loss": 0.1068,
"step": 5380
},
{
"epoch": 1.75,
"learning_rate": 7.829433284447367e-06,
"loss": 0.1138,
"step": 5390
},
{
"epoch": 1.75,
"learning_rate": 7.795273555242476e-06,
"loss": 0.1123,
"step": 5400
},
{
"epoch": 1.75,
"learning_rate": 7.761140845822199e-06,
"loss": 0.1093,
"step": 5410
},
{
"epoch": 1.76,
"learning_rate": 7.727035574496234e-06,
"loss": 0.1094,
"step": 5420
},
{
"epoch": 1.76,
"learning_rate": 7.69295815923802e-06,
"loss": 0.113,
"step": 5430
},
{
"epoch": 1.76,
"learning_rate": 7.658909017679604e-06,
"loss": 0.1124,
"step": 5440
},
{
"epoch": 1.77,
"learning_rate": 7.6248885671065264e-06,
"loss": 0.1058,
"step": 5450
},
{
"epoch": 1.77,
"learning_rate": 7.590897224452716e-06,
"loss": 0.1107,
"step": 5460
},
{
"epoch": 1.77,
"learning_rate": 7.556935406295356e-06,
"loss": 0.106,
"step": 5470
},
{
"epoch": 1.78,
"learning_rate": 7.5230035288498204e-06,
"loss": 0.115,
"step": 5480
},
{
"epoch": 1.78,
"learning_rate": 7.4891020079645285e-06,
"loss": 0.1082,
"step": 5490
},
{
"epoch": 1.78,
"learning_rate": 7.455231259115872e-06,
"loss": 0.1146,
"step": 5500
},
{
"epoch": 1.79,
"learning_rate": 7.421391697403122e-06,
"loss": 0.1126,
"step": 5510
},
{
"epoch": 1.79,
"learning_rate": 7.3875837375433445e-06,
"loss": 0.1119,
"step": 5520
},
{
"epoch": 1.79,
"learning_rate": 7.353807793866299e-06,
"loss": 0.1081,
"step": 5530
},
{
"epoch": 1.8,
"learning_rate": 7.3200642803093835e-06,
"loss": 0.1127,
"step": 5540
},
{
"epoch": 1.8,
"learning_rate": 7.286353610412553e-06,
"loss": 0.1146,
"step": 5550
},
{
"epoch": 1.8,
"learning_rate": 7.2526761973132395e-06,
"loss": 0.1079,
"step": 5560
},
{
"epoch": 1.81,
"learning_rate": 7.2190324537413196e-06,
"loss": 0.1059,
"step": 5570
},
{
"epoch": 1.81,
"learning_rate": 7.185422792014019e-06,
"loss": 0.1072,
"step": 5580
},
{
"epoch": 1.81,
"learning_rate": 7.151847624030882e-06,
"loss": 0.1123,
"step": 5590
},
{
"epoch": 1.82,
"learning_rate": 7.118307361268721e-06,
"loss": 0.108,
"step": 5600
},
{
"epoch": 1.82,
"learning_rate": 7.084802414776575e-06,
"loss": 0.1056,
"step": 5610
},
{
"epoch": 1.82,
"learning_rate": 7.051333195170658e-06,
"loss": 0.099,
"step": 5620
},
{
"epoch": 1.83,
"learning_rate": 7.0179001126293435e-06,
"loss": 0.1123,
"step": 5630
},
{
"epoch": 1.83,
"learning_rate": 6.9845035768881285e-06,
"loss": 0.1089,
"step": 5640
},
{
"epoch": 1.83,
"learning_rate": 6.951143997234622e-06,
"loss": 0.1123,
"step": 5650
},
{
"epoch": 1.84,
"learning_rate": 6.917821782503513e-06,
"loss": 0.1081,
"step": 5660
},
{
"epoch": 1.84,
"learning_rate": 6.884537341071571e-06,
"loss": 0.1112,
"step": 5670
},
{
"epoch": 1.84,
"learning_rate": 6.85129108085264e-06,
"loss": 0.1064,
"step": 5680
},
{
"epoch": 1.85,
"learning_rate": 6.818083409292634e-06,
"loss": 0.1145,
"step": 5690
},
{
"epoch": 1.85,
"learning_rate": 6.784914733364563e-06,
"loss": 0.1083,
"step": 5700
},
{
"epoch": 1.85,
"learning_rate": 6.751785459563509e-06,
"loss": 0.119,
"step": 5710
},
{
"epoch": 1.85,
"learning_rate": 6.718695993901678e-06,
"loss": 0.1134,
"step": 5720
},
{
"epoch": 1.86,
"learning_rate": 6.685646741903411e-06,
"loss": 0.1154,
"step": 5730
},
{
"epoch": 1.86,
"learning_rate": 6.652638108600215e-06,
"loss": 0.1128,
"step": 5740
},
{
"epoch": 1.86,
"learning_rate": 6.619670498525796e-06,
"loss": 0.1043,
"step": 5750
},
{
"epoch": 1.87,
"learning_rate": 6.586744315711102e-06,
"loss": 0.1103,
"step": 5760
},
{
"epoch": 1.87,
"learning_rate": 6.5538599636793846e-06,
"loss": 0.1063,
"step": 5770
},
{
"epoch": 1.87,
"learning_rate": 6.521017845441225e-06,
"loss": 0.1125,
"step": 5780
},
{
"epoch": 1.88,
"learning_rate": 6.488218363489633e-06,
"loss": 0.105,
"step": 5790
},
{
"epoch": 1.88,
"learning_rate": 6.455461919795079e-06,
"loss": 0.1096,
"step": 5800
},
{
"epoch": 1.88,
"learning_rate": 6.422748915800592e-06,
"loss": 0.1126,
"step": 5810
},
{
"epoch": 1.89,
"learning_rate": 6.39007975241682e-06,
"loss": 0.1078,
"step": 5820
},
{
"epoch": 1.89,
"learning_rate": 6.357454830017143e-06,
"loss": 0.1161,
"step": 5830
},
{
"epoch": 1.89,
"learning_rate": 6.324874548432734e-06,
"loss": 0.1121,
"step": 5840
},
{
"epoch": 1.9,
"learning_rate": 6.292339306947685e-06,
"loss": 0.1067,
"step": 5850
},
{
"epoch": 1.9,
"learning_rate": 6.259849504294102e-06,
"loss": 0.1119,
"step": 5860
},
{
"epoch": 1.9,
"learning_rate": 6.227405538647213e-06,
"loss": 0.1046,
"step": 5870
},
{
"epoch": 1.91,
"learning_rate": 6.195007807620514e-06,
"loss": 0.1049,
"step": 5880
},
{
"epoch": 1.91,
"learning_rate": 6.16265670826086e-06,
"loss": 0.1111,
"step": 5890
},
{
"epoch": 1.91,
"learning_rate": 6.130352637043622e-06,
"loss": 0.0993,
"step": 5900
},
{
"epoch": 1.92,
"learning_rate": 6.098095989867822e-06,
"loss": 0.1073,
"step": 5910
},
{
"epoch": 1.92,
"learning_rate": 6.065887162051291e-06,
"loss": 0.1219,
"step": 5920
},
{
"epoch": 1.92,
"learning_rate": 6.033726548325798e-06,
"loss": 0.1139,
"step": 5930
},
{
"epoch": 1.93,
"learning_rate": 6.0016145428322445e-06,
"loss": 0.1108,
"step": 5940
},
{
"epoch": 1.93,
"learning_rate": 5.969551539115814e-06,
"loss": 0.1118,
"step": 5950
},
{
"epoch": 1.93,
"learning_rate": 5.937537930121145e-06,
"loss": 0.1002,
"step": 5960
},
{
"epoch": 1.94,
"learning_rate": 5.905574108187544e-06,
"loss": 0.1038,
"step": 5970
},
{
"epoch": 1.94,
"learning_rate": 5.873660465044141e-06,
"loss": 0.1023,
"step": 5980
},
{
"epoch": 1.94,
"learning_rate": 5.841797391805113e-06,
"loss": 0.1099,
"step": 5990
},
{
"epoch": 1.95,
"learning_rate": 5.809985278964875e-06,
"loss": 0.104,
"step": 6000
},
{
"epoch": 1.95,
"learning_rate": 5.778224516393312e-06,
"loss": 0.1036,
"step": 6010
},
{
"epoch": 1.95,
"learning_rate": 5.746515493330992e-06,
"loss": 0.1053,
"step": 6020
},
{
"epoch": 1.96,
"learning_rate": 5.714858598384387e-06,
"loss": 0.1099,
"step": 6030
},
{
"epoch": 1.96,
"learning_rate": 5.683254219521117e-06,
"loss": 0.1014,
"step": 6040
},
{
"epoch": 1.96,
"learning_rate": 5.651702744065207e-06,
"loss": 0.1054,
"step": 6050
},
{
"epoch": 1.96,
"learning_rate": 5.620204558692331e-06,
"loss": 0.102,
"step": 6060
},
{
"epoch": 1.97,
"learning_rate": 5.588760049425057e-06,
"loss": 0.1084,
"step": 6070
},
{
"epoch": 1.97,
"learning_rate": 5.557369601628142e-06,
"loss": 0.1095,
"step": 6080
},
{
"epoch": 1.97,
"learning_rate": 5.5260336000038e-06,
"loss": 0.1104,
"step": 6090
},
{
"epoch": 1.98,
"learning_rate": 5.494752428586985e-06,
"loss": 0.1011,
"step": 6100
},
{
"epoch": 1.98,
"learning_rate": 5.46352647074068e-06,
"loss": 0.1108,
"step": 6110
},
{
"epoch": 1.98,
"learning_rate": 5.4323561091512045e-06,
"loss": 0.1034,
"step": 6120
},
{
"epoch": 1.99,
"learning_rate": 5.401241725823536e-06,
"loss": 0.1085,
"step": 6130
},
{
"epoch": 1.99,
"learning_rate": 5.370183702076599e-06,
"loss": 0.116,
"step": 6140
},
{
"epoch": 1.99,
"learning_rate": 5.33918241853862e-06,
"loss": 0.1138,
"step": 6150
},
{
"epoch": 2.0,
"learning_rate": 5.308238255142457e-06,
"loss": 0.112,
"step": 6160
},
{
"epoch": 2.0,
"eval_loss": 0.1484375,
"eval_runtime": 6.4637,
"eval_samples_per_second": 19.803,
"eval_steps_per_second": 0.155,
"step": 6168
},
{
"epoch": 2.0,
"learning_rate": 5.277351591120926e-06,
"loss": 0.0985,
"step": 6170
},
{
"epoch": 2.0,
"learning_rate": 5.246522805002168e-06,
"loss": 0.0714,
"step": 6180
},
{
"epoch": 2.01,
"learning_rate": 5.215752274605012e-06,
"loss": 0.0702,
"step": 6190
},
{
"epoch": 2.01,
"learning_rate": 5.185040377034347e-06,
"loss": 0.0655,
"step": 6200
},
{
"epoch": 2.01,
"learning_rate": 5.1543874886764774e-06,
"loss": 0.0648,
"step": 6210
},
{
"epoch": 2.02,
"learning_rate": 5.123793985194536e-06,
"loss": 0.0666,
"step": 6220
},
{
"epoch": 2.02,
"learning_rate": 5.093260241523872e-06,
"loss": 0.0688,
"step": 6230
},
{
"epoch": 2.02,
"learning_rate": 5.0627866318674544e-06,
"loss": 0.0657,
"step": 6240
},
{
"epoch": 2.03,
"learning_rate": 5.032373529691283e-06,
"loss": 0.0696,
"step": 6250
},
{
"epoch": 2.03,
"learning_rate": 5.002021307719811e-06,
"loss": 0.0691,
"step": 6260
},
{
"epoch": 2.03,
"learning_rate": 4.971730337931391e-06,
"loss": 0.065,
"step": 6270
},
{
"epoch": 2.04,
"learning_rate": 4.9415009915537045e-06,
"loss": 0.0648,
"step": 6280
},
{
"epoch": 2.04,
"learning_rate": 4.911333639059208e-06,
"loss": 0.0624,
"step": 6290
},
{
"epoch": 2.04,
"learning_rate": 4.881228650160598e-06,
"loss": 0.0708,
"step": 6300
},
{
"epoch": 2.05,
"learning_rate": 4.85118639380629e-06,
"loss": 0.0691,
"step": 6310
},
{
"epoch": 2.05,
"learning_rate": 4.8212072381758744e-06,
"loss": 0.0708,
"step": 6320
},
{
"epoch": 2.05,
"learning_rate": 4.791291550675635e-06,
"loss": 0.0716,
"step": 6330
},
{
"epoch": 2.06,
"learning_rate": 4.761439697934009e-06,
"loss": 0.0712,
"step": 6340
},
{
"epoch": 2.06,
"learning_rate": 4.731652045797134e-06,
"loss": 0.0689,
"step": 6350
},
{
"epoch": 2.06,
"learning_rate": 4.701928959324323e-06,
"loss": 0.0662,
"step": 6360
},
{
"epoch": 2.07,
"learning_rate": 4.672270802783628e-06,
"loss": 0.0718,
"step": 6370
},
{
"epoch": 2.07,
"learning_rate": 4.642677939647356e-06,
"loss": 0.0733,
"step": 6380
},
{
"epoch": 2.07,
"learning_rate": 4.6131507325876144e-06,
"loss": 0.0686,
"step": 6390
},
{
"epoch": 2.08,
"learning_rate": 4.583689543471863e-06,
"loss": 0.0706,
"step": 6400
},
{
"epoch": 2.08,
"learning_rate": 4.5542947333585e-06,
"loss": 0.0649,
"step": 6410
},
{
"epoch": 2.08,
"learning_rate": 4.5249666624924195e-06,
"loss": 0.0677,
"step": 6420
},
{
"epoch": 2.08,
"learning_rate": 4.495705690300593e-06,
"loss": 0.0675,
"step": 6430
},
{
"epoch": 2.09,
"learning_rate": 4.466512175387672e-06,
"loss": 0.0642,
"step": 6440
},
{
"epoch": 2.09,
"learning_rate": 4.437386475531601e-06,
"loss": 0.0714,
"step": 6450
},
{
"epoch": 2.09,
"learning_rate": 4.408328947679221e-06,
"loss": 0.0693,
"step": 6460
},
{
"epoch": 2.1,
"learning_rate": 4.379339947941896e-06,
"loss": 0.0676,
"step": 6470
},
{
"epoch": 2.1,
"learning_rate": 4.350419831591147e-06,
"loss": 0.068,
"step": 6480
},
{
"epoch": 2.1,
"learning_rate": 4.321568953054316e-06,
"loss": 0.0696,
"step": 6490
},
{
"epoch": 2.11,
"learning_rate": 4.2927876659101905e-06,
"loss": 0.0699,
"step": 6500
},
{
"epoch": 2.11,
"learning_rate": 4.264076322884708e-06,
"loss": 0.0683,
"step": 6510
},
{
"epoch": 2.11,
"learning_rate": 4.2354352758465945e-06,
"loss": 0.0673,
"step": 6520
},
{
"epoch": 2.12,
"learning_rate": 4.206864875803086e-06,
"loss": 0.0702,
"step": 6530
},
{
"epoch": 2.12,
"learning_rate": 4.178365472895602e-06,
"loss": 0.0692,
"step": 6540
},
{
"epoch": 2.12,
"learning_rate": 4.149937416395468e-06,
"loss": 0.0699,
"step": 6550
},
{
"epoch": 2.13,
"learning_rate": 4.121581054699636e-06,
"loss": 0.0651,
"step": 6560
},
{
"epoch": 2.13,
"learning_rate": 4.093296735326404e-06,
"loss": 0.07,
"step": 6570
},
{
"epoch": 2.13,
"learning_rate": 4.065084804911165e-06,
"loss": 0.0689,
"step": 6580
},
{
"epoch": 2.14,
"learning_rate": 4.036945609202146e-06,
"loss": 0.0759,
"step": 6590
},
{
"epoch": 2.14,
"learning_rate": 4.008879493056212e-06,
"loss": 0.0721,
"step": 6600
},
{
"epoch": 2.14,
"learning_rate": 3.98088680043458e-06,
"loss": 0.0662,
"step": 6610
},
{
"epoch": 2.15,
"learning_rate": 3.95296787439864e-06,
"loss": 0.0758,
"step": 6620
},
{
"epoch": 2.15,
"learning_rate": 3.9251230571057495e-06,
"loss": 0.0663,
"step": 6630
},
{
"epoch": 2.15,
"learning_rate": 3.897352689805036e-06,
"loss": 0.069,
"step": 6640
},
{
"epoch": 2.16,
"learning_rate": 3.869657112833206e-06,
"loss": 0.069,
"step": 6650
},
{
"epoch": 2.16,
"learning_rate": 3.842036665610379e-06,
"loss": 0.0711,
"step": 6660
},
{
"epoch": 2.16,
"learning_rate": 3.814491686635943e-06,
"loss": 0.0671,
"step": 6670
},
{
"epoch": 2.17,
"learning_rate": 3.7870225134843776e-06,
"loss": 0.0706,
"step": 6680
},
{
"epoch": 2.17,
"learning_rate": 3.7596294828011483e-06,
"loss": 0.0685,
"step": 6690
},
{
"epoch": 2.17,
"learning_rate": 3.7323129302985485e-06,
"loss": 0.0659,
"step": 6700
},
{
"epoch": 2.18,
"learning_rate": 3.705073190751617e-06,
"loss": 0.0664,
"step": 6710
},
{
"epoch": 2.18,
"learning_rate": 3.6779105979940056e-06,
"loss": 0.0702,
"step": 6720
},
{
"epoch": 2.18,
"learning_rate": 3.650825484913916e-06,
"loss": 0.0657,
"step": 6730
},
{
"epoch": 2.19,
"learning_rate": 3.623818183449992e-06,
"loss": 0.0666,
"step": 6740
},
{
"epoch": 2.19,
"learning_rate": 3.59688902458728e-06,
"loss": 0.0668,
"step": 6750
},
{
"epoch": 2.19,
"learning_rate": 3.5700383383531467e-06,
"loss": 0.0643,
"step": 6760
},
{
"epoch": 2.2,
"learning_rate": 3.5432664538132446e-06,
"loss": 0.0618,
"step": 6770
},
{
"epoch": 2.2,
"learning_rate": 3.516573699067499e-06,
"loss": 0.0685,
"step": 6780
},
{
"epoch": 2.2,
"learning_rate": 3.48996040124605e-06,
"loss": 0.0664,
"step": 6790
},
{
"epoch": 2.2,
"learning_rate": 3.463426886505268e-06,
"loss": 0.0704,
"step": 6800
},
{
"epoch": 2.21,
"learning_rate": 3.436973480023743e-06,
"loss": 0.0671,
"step": 6810
},
{
"epoch": 2.21,
"learning_rate": 3.4106005059983283e-06,
"loss": 0.068,
"step": 6820
},
{
"epoch": 2.21,
"learning_rate": 3.3843082876401265e-06,
"loss": 0.0685,
"step": 6830
},
{
"epoch": 2.22,
"learning_rate": 3.3580971471705492e-06,
"loss": 0.0677,
"step": 6840
},
{
"epoch": 2.22,
"learning_rate": 3.331967405817379e-06,
"loss": 0.07,
"step": 6850
},
{
"epoch": 2.22,
"learning_rate": 3.3059193838108037e-06,
"loss": 0.0684,
"step": 6860
},
{
"epoch": 2.23,
"learning_rate": 3.2799534003795274e-06,
"loss": 0.0677,
"step": 6870
},
{
"epoch": 2.23,
"learning_rate": 3.254069773746822e-06,
"loss": 0.0719,
"step": 6880
},
{
"epoch": 2.23,
"learning_rate": 3.2282688211266568e-06,
"loss": 0.0706,
"step": 6890
},
{
"epoch": 2.24,
"learning_rate": 3.2025508587197907e-06,
"loss": 0.067,
"step": 6900
},
{
"epoch": 2.24,
"learning_rate": 3.176916201709912e-06,
"loss": 0.069,
"step": 6910
},
{
"epoch": 2.24,
"learning_rate": 3.1513651642597607e-06,
"loss": 0.065,
"step": 6920
},
{
"epoch": 2.25,
"learning_rate": 3.1258980595072976e-06,
"loss": 0.0708,
"step": 6930
},
{
"epoch": 2.25,
"learning_rate": 3.1005151995618454e-06,
"loss": 0.0684,
"step": 6940
},
{
"epoch": 2.25,
"learning_rate": 3.0752168955002735e-06,
"loss": 0.068,
"step": 6950
},
{
"epoch": 2.26,
"learning_rate": 3.0500034573631943e-06,
"loss": 0.0661,
"step": 6960
},
{
"epoch": 2.26,
"learning_rate": 3.024875194151151e-06,
"loss": 0.0669,
"step": 6970
},
{
"epoch": 2.26,
"learning_rate": 2.9998324138208336e-06,
"loss": 0.0672,
"step": 6980
},
{
"epoch": 2.27,
"learning_rate": 2.974875423281299e-06,
"loss": 0.0702,
"step": 6990
},
{
"epoch": 2.27,
"learning_rate": 2.950004528390238e-06,
"loss": 0.0674,
"step": 7000
},
{
"epoch": 2.27,
"learning_rate": 2.9252200339501847e-06,
"loss": 0.0687,
"step": 7010
},
{
"epoch": 2.28,
"learning_rate": 2.9005222437048054e-06,
"loss": 0.0689,
"step": 7020
},
{
"epoch": 2.28,
"learning_rate": 2.8759114603351836e-06,
"loss": 0.0695,
"step": 7030
},
{
"epoch": 2.28,
"learning_rate": 2.8513879854560856e-06,
"loss": 0.0667,
"step": 7040
},
{
"epoch": 2.29,
"learning_rate": 2.8269521196122907e-06,
"loss": 0.072,
"step": 7050
},
{
"epoch": 2.29,
"learning_rate": 2.8026041622748822e-06,
"loss": 0.0665,
"step": 7060
},
{
"epoch": 2.29,
"learning_rate": 2.7783444118376046e-06,
"loss": 0.0633,
"step": 7070
},
{
"epoch": 2.3,
"learning_rate": 2.754173165613179e-06,
"loss": 0.0663,
"step": 7080
},
{
"epoch": 2.3,
"learning_rate": 2.730090719829682e-06,
"loss": 0.0682,
"step": 7090
},
{
"epoch": 2.3,
"learning_rate": 2.7060973696269e-06,
"loss": 0.0688,
"step": 7100
},
{
"epoch": 2.31,
"learning_rate": 2.6821934090527245e-06,
"loss": 0.0683,
"step": 7110
},
{
"epoch": 2.31,
"learning_rate": 2.6583791310595376e-06,
"loss": 0.0666,
"step": 7120
},
{
"epoch": 2.31,
"learning_rate": 2.6346548275006232e-06,
"loss": 0.0664,
"step": 7130
},
{
"epoch": 2.32,
"learning_rate": 2.6110207891266013e-06,
"loss": 0.0667,
"step": 7140
},
{
"epoch": 2.32,
"learning_rate": 2.5874773055818557e-06,
"loss": 0.0663,
"step": 7150
},
{
"epoch": 2.32,
"learning_rate": 2.564024665400978e-06,
"loss": 0.0671,
"step": 7160
},
{
"epoch": 2.32,
"learning_rate": 2.5406631560052396e-06,
"loss": 0.0703,
"step": 7170
},
{
"epoch": 2.33,
"learning_rate": 2.517393063699084e-06,
"loss": 0.0665,
"step": 7180
},
{
"epoch": 2.33,
"learning_rate": 2.4942146736665827e-06,
"loss": 0.0667,
"step": 7190
},
{
"epoch": 2.33,
"learning_rate": 2.4711282699679718e-06,
"loss": 0.0665,
"step": 7200
},
{
"epoch": 2.34,
"learning_rate": 2.4481341355361487e-06,
"loss": 0.0656,
"step": 7210
},
{
"epoch": 2.34,
"learning_rate": 2.4252325521732267e-06,
"loss": 0.0676,
"step": 7220
},
{
"epoch": 2.34,
"learning_rate": 2.402423800547067e-06,
"loss": 0.0703,
"step": 7230
},
{
"epoch": 2.35,
"learning_rate": 2.3797081601878315e-06,
"loss": 0.0702,
"step": 7240
},
{
"epoch": 2.35,
"learning_rate": 2.3570859094845823e-06,
"loss": 0.0661,
"step": 7250
},
{
"epoch": 2.35,
"learning_rate": 2.33455732568184e-06,
"loss": 0.0714,
"step": 7260
},
{
"epoch": 2.36,
"learning_rate": 2.3121226848762124e-06,
"loss": 0.0687,
"step": 7270
},
{
"epoch": 2.36,
"learning_rate": 2.2897822620129904e-06,
"loss": 0.0669,
"step": 7280
},
{
"epoch": 2.36,
"learning_rate": 2.267536330882797e-06,
"loss": 0.0683,
"step": 7290
},
{
"epoch": 2.37,
"learning_rate": 2.2453851641182124e-06,
"loss": 0.0663,
"step": 7300
},
{
"epoch": 2.37,
"learning_rate": 2.2233290331904432e-06,
"loss": 0.0669,
"step": 7310
},
{
"epoch": 2.37,
"learning_rate": 2.2013682084060008e-06,
"loss": 0.0673,
"step": 7320
},
{
"epoch": 2.38,
"learning_rate": 2.1795029589033835e-06,
"loss": 0.061,
"step": 7330
},
{
"epoch": 2.38,
"learning_rate": 2.1577335526497677e-06,
"loss": 0.065,
"step": 7340
},
{
"epoch": 2.38,
"learning_rate": 2.1360602564377386e-06,
"loss": 0.0653,
"step": 7350
},
{
"epoch": 2.39,
"learning_rate": 2.114483335882017e-06,
"loss": 0.0653,
"step": 7360
},
{
"epoch": 2.39,
"learning_rate": 2.093003055416204e-06,
"loss": 0.0671,
"step": 7370
},
{
"epoch": 2.39,
"learning_rate": 2.0716196782895326e-06,
"loss": 0.0707,
"step": 7380
},
{
"epoch": 2.4,
"learning_rate": 2.050333466563643e-06,
"loss": 0.0639,
"step": 7390
},
{
"epoch": 2.4,
"learning_rate": 2.0291446811093964e-06,
"loss": 0.0665,
"step": 7400
},
{
"epoch": 2.4,
"learning_rate": 2.0080535816036363e-06,
"loss": 0.0657,
"step": 7410
},
{
"epoch": 2.41,
"learning_rate": 1.987060426526033e-06,
"loss": 0.0671,
"step": 7420
},
{
"epoch": 2.41,
"learning_rate": 1.9661654731559086e-06,
"loss": 0.0618,
"step": 7430
},
{
"epoch": 2.41,
"learning_rate": 1.945368977569089e-06,
"loss": 0.0697,
"step": 7440
},
{
"epoch": 2.42,
"learning_rate": 1.924671194634761e-06,
"loss": 0.0662,
"step": 7450
},
{
"epoch": 2.42,
"learning_rate": 1.9040723780123416e-06,
"loss": 0.0629,
"step": 7460
},
{
"epoch": 2.42,
"learning_rate": 1.8835727801483894e-06,
"loss": 0.0706,
"step": 7470
},
{
"epoch": 2.43,
"learning_rate": 1.863172652273485e-06,
"loss": 0.065,
"step": 7480
},
{
"epoch": 2.43,
"learning_rate": 1.8428722443991764e-06,
"loss": 0.0634,
"step": 7490
},
{
"epoch": 2.43,
"learning_rate": 1.8226718053148951e-06,
"loss": 0.0637,
"step": 7500
},
{
"epoch": 2.44,
"learning_rate": 1.8025715825849266e-06,
"loss": 0.0683,
"step": 7510
},
{
"epoch": 2.44,
"learning_rate": 1.7825718225453547e-06,
"loss": 0.0688,
"step": 7520
},
{
"epoch": 2.44,
"learning_rate": 1.762672770301057e-06,
"loss": 0.0666,
"step": 7530
},
{
"epoch": 2.44,
"learning_rate": 1.742874669722703e-06,
"loss": 0.0679,
"step": 7540
},
{
"epoch": 2.45,
"learning_rate": 1.7231777634437563e-06,
"loss": 0.0645,
"step": 7550
},
{
"epoch": 2.45,
"learning_rate": 1.703582292857503e-06,
"loss": 0.0667,
"step": 7560
},
{
"epoch": 2.45,
"learning_rate": 1.6840884981140948e-06,
"loss": 0.0674,
"step": 7570
},
{
"epoch": 2.46,
"learning_rate": 1.6646966181176117e-06,
"loss": 0.0671,
"step": 7580
},
{
"epoch": 2.46,
"learning_rate": 1.6454068905231258e-06,
"loss": 0.0687,
"step": 7590
},
{
"epoch": 2.46,
"learning_rate": 1.6262195517337887e-06,
"loss": 0.0637,
"step": 7600
},
{
"epoch": 2.47,
"learning_rate": 1.6071348368979377e-06,
"loss": 0.0636,
"step": 7610
},
{
"epoch": 2.47,
"learning_rate": 1.5881529799062167e-06,
"loss": 0.0675,
"step": 7620
},
{
"epoch": 2.47,
"learning_rate": 1.5692742133887095e-06,
"loss": 0.0637,
"step": 7630
},
{
"epoch": 2.48,
"learning_rate": 1.550498768712073e-06,
"loss": 0.065,
"step": 7640
},
{
"epoch": 2.48,
"learning_rate": 1.5318268759767307e-06,
"loss": 0.0623,
"step": 7650
},
{
"epoch": 2.48,
"learning_rate": 1.5132587640140227e-06,
"loss": 0.0681,
"step": 7660
},
{
"epoch": 2.49,
"learning_rate": 1.494794660383425e-06,
"loss": 0.0692,
"step": 7670
},
{
"epoch": 2.49,
"learning_rate": 1.4764347913697441e-06,
"loss": 0.0678,
"step": 7680
},
{
"epoch": 2.49,
"learning_rate": 1.4581793819803559e-06,
"loss": 0.0617,
"step": 7690
},
{
"epoch": 2.5,
"learning_rate": 1.4400286559424392e-06,
"loss": 0.0675,
"step": 7700
},
{
"epoch": 2.5,
"learning_rate": 1.4219828357002351e-06,
"loss": 0.0681,
"step": 7710
},
{
"epoch": 2.5,
"learning_rate": 1.4040421424123308e-06,
"loss": 0.0651,
"step": 7720
},
{
"epoch": 2.51,
"learning_rate": 1.3862067959489377e-06,
"loss": 0.0666,
"step": 7730
},
{
"epoch": 2.51,
"learning_rate": 1.368477014889199e-06,
"loss": 0.0637,
"step": 7740
},
{
"epoch": 2.51,
"learning_rate": 1.3508530165185096e-06,
"loss": 0.0677,
"step": 7750
},
{
"epoch": 2.52,
"learning_rate": 1.3333350168258651e-06,
"loss": 0.0639,
"step": 7760
},
{
"epoch": 2.52,
"learning_rate": 1.3159232305012027e-06,
"loss": 0.064,
"step": 7770
},
{
"epoch": 2.52,
"learning_rate": 1.298617870932769e-06,
"loss": 0.0643,
"step": 7780
},
{
"epoch": 2.53,
"learning_rate": 1.2814191502045093e-06,
"loss": 0.0655,
"step": 7790
},
{
"epoch": 2.53,
"learning_rate": 1.2643272790934735e-06,
"loss": 0.0666,
"step": 7800
},
{
"epoch": 2.53,
"learning_rate": 1.2473424670672264e-06,
"loss": 0.0645,
"step": 7810
},
{
"epoch": 2.54,
"learning_rate": 1.2304649222812792e-06,
"loss": 0.0662,
"step": 7820
},
{
"epoch": 2.54,
"learning_rate": 1.2136948515765402e-06,
"loss": 0.0688,
"step": 7830
},
{
"epoch": 2.54,
"learning_rate": 1.1970324604767836e-06,
"loss": 0.0655,
"step": 7840
},
{
"epoch": 2.55,
"learning_rate": 1.180477953186131e-06,
"loss": 0.0646,
"step": 7850
},
{
"epoch": 2.55,
"learning_rate": 1.1640315325865358e-06,
"loss": 0.069,
"step": 7860
},
{
"epoch": 2.55,
"learning_rate": 1.1476934002353191e-06,
"loss": 0.0635,
"step": 7870
},
{
"epoch": 2.56,
"learning_rate": 1.1314637563626774e-06,
"loss": 0.0638,
"step": 7880
},
{
"epoch": 2.56,
"learning_rate": 1.1153427998692401e-06,
"loss": 0.0656,
"step": 7890
},
{
"epoch": 2.56,
"learning_rate": 1.0993307283236355e-06,
"loss": 0.0647,
"step": 7900
},
{
"epoch": 2.56,
"learning_rate": 1.083427737960062e-06,
"loss": 0.0664,
"step": 7910
},
{
"epoch": 2.57,
"learning_rate": 1.067634023675882e-06,
"loss": 0.0653,
"step": 7920
},
{
"epoch": 2.57,
"learning_rate": 1.0519497790292388e-06,
"loss": 0.0662,
"step": 7930
},
{
"epoch": 2.57,
"learning_rate": 1.036375196236684e-06,
"loss": 0.0632,
"step": 7940
},
{
"epoch": 2.58,
"learning_rate": 1.0209104661708225e-06,
"loss": 0.0627,
"step": 7950
},
{
"epoch": 2.58,
"learning_rate": 1.0055557783579627e-06,
"loss": 0.0646,
"step": 7960
},
{
"epoch": 2.58,
"learning_rate": 9.903113209758098e-07,
"loss": 0.0689,
"step": 7970
},
{
"epoch": 2.59,
"learning_rate": 9.751772808511474e-07,
"loss": 0.0667,
"step": 7980
},
{
"epoch": 2.59,
"learning_rate": 9.601538434575586e-07,
"loss": 0.0589,
"step": 7990
},
{
"epoch": 2.59,
"learning_rate": 9.452411929131411e-07,
"loss": 0.0668,
"step": 8000
},
{
"epoch": 2.6,
"learning_rate": 9.30439511978255e-07,
"loss": 0.0633,
"step": 8010
},
{
"epoch": 2.6,
"learning_rate": 9.157489820532905e-07,
"loss": 0.0669,
"step": 8020
},
{
"epoch": 2.6,
"learning_rate": 9.011697831764366e-07,
"loss": 0.0614,
"step": 8030
},
{
"epoch": 2.61,
"learning_rate": 8.867020940214743e-07,
"loss": 0.0641,
"step": 8040
},
{
"epoch": 2.61,
"learning_rate": 8.723460918955895e-07,
"loss": 0.0676,
"step": 8050
},
{
"epoch": 2.61,
"learning_rate": 8.581019527372037e-07,
"loss": 0.0687,
"step": 8060
},
{
"epoch": 2.62,
"learning_rate": 8.439698511138106e-07,
"loss": 0.0665,
"step": 8070
},
{
"epoch": 2.62,
"learning_rate": 8.299499602198413e-07,
"loss": 0.0664,
"step": 8080
},
{
"epoch": 2.62,
"learning_rate": 8.160424518745425e-07,
"loss": 0.0693,
"step": 8090
},
{
"epoch": 2.63,
"learning_rate": 8.022474965198635e-07,
"loss": 0.0628,
"step": 8100
},
{
"epoch": 2.63,
"learning_rate": 7.885652632183771e-07,
"loss": 0.0626,
"step": 8110
},
{
"epoch": 2.63,
"learning_rate": 7.749959196512014e-07,
"loss": 0.0674,
"step": 8120
},
{
"epoch": 2.64,
"learning_rate": 7.615396321159496e-07,
"loss": 0.0685,
"step": 8130
},
{
"epoch": 2.64,
"learning_rate": 7.481965655246859e-07,
"loss": 0.0663,
"step": 8140
},
{
"epoch": 2.64,
"learning_rate": 7.349668834019063e-07,
"loss": 0.0663,
"step": 8150
},
{
"epoch": 2.65,
"learning_rate": 7.218507478825387e-07,
"loss": 0.0636,
"step": 8160
},
{
"epoch": 2.65,
"learning_rate": 7.088483197099561e-07,
"loss": 0.0643,
"step": 8170
},
{
"epoch": 2.65,
"learning_rate": 6.95959758233995e-07,
"loss": 0.0625,
"step": 8180
},
{
"epoch": 2.66,
"learning_rate": 6.831852214090163e-07,
"loss": 0.064,
"step": 8190
},
{
"epoch": 2.66,
"learning_rate": 6.705248657919638e-07,
"loss": 0.064,
"step": 8200
},
{
"epoch": 2.66,
"learning_rate": 6.579788465404491e-07,
"loss": 0.064,
"step": 8210
},
{
"epoch": 2.67,
"learning_rate": 6.455473174108396e-07,
"loss": 0.0641,
"step": 8220
},
{
"epoch": 2.67,
"learning_rate": 6.332304307563853e-07,
"loss": 0.0647,
"step": 8230
},
{
"epoch": 2.67,
"learning_rate": 6.210283375253512e-07,
"loss": 0.0653,
"step": 8240
},
{
"epoch": 2.68,
"learning_rate": 6.089411872591566e-07,
"loss": 0.0643,
"step": 8250
},
{
"epoch": 2.68,
"learning_rate": 5.969691280905565e-07,
"loss": 0.0635,
"step": 8260
},
{
"epoch": 2.68,
"learning_rate": 5.851123067418185e-07,
"loss": 0.0685,
"step": 8270
},
{
"epoch": 2.68,
"learning_rate": 5.733708685229222e-07,
"loss": 0.0644,
"step": 8280
},
{
"epoch": 2.69,
"learning_rate": 5.617449573297828e-07,
"loss": 0.0665,
"step": 8290
},
{
"epoch": 2.69,
"learning_rate": 5.502347156424881e-07,
"loss": 0.064,
"step": 8300
},
{
"epoch": 2.69,
"learning_rate": 5.388402845235541e-07,
"loss": 0.0673,
"step": 8310
},
{
"epoch": 2.7,
"learning_rate": 5.275618036161856e-07,
"loss": 0.0619,
"step": 8320
},
{
"epoch": 2.7,
"learning_rate": 5.163994111425752e-07,
"loss": 0.0654,
"step": 8330
},
{
"epoch": 2.7,
"learning_rate": 5.05353243902208e-07,
"loss": 0.0627,
"step": 8340
},
{
"epoch": 2.71,
"learning_rate": 4.944234372701851e-07,
"loss": 0.067,
"step": 8350
},
{
"epoch": 2.71,
"learning_rate": 4.836101251955583e-07,
"loss": 0.0639,
"step": 8360
},
{
"epoch": 2.71,
"learning_rate": 4.7291344019969374e-07,
"loss": 0.0671,
"step": 8370
},
{
"epoch": 2.72,
"learning_rate": 4.6233351337464984e-07,
"loss": 0.066,
"step": 8380
},
{
"epoch": 2.72,
"learning_rate": 4.518704743815672e-07,
"loss": 0.0688,
"step": 8390
},
{
"epoch": 2.72,
"learning_rate": 4.415244514490791e-07,
"loss": 0.0644,
"step": 8400
},
{
"epoch": 2.73,
"learning_rate": 4.312955713717404e-07,
"loss": 0.0632,
"step": 8410
},
{
"epoch": 2.73,
"learning_rate": 4.2118395950847767e-07,
"loss": 0.0641,
"step": 8420
},
{
"epoch": 2.73,
"learning_rate": 4.1118973978104603e-07,
"loss": 0.0665,
"step": 8430
},
{
"epoch": 2.74,
"learning_rate": 4.0131303467251804e-07,
"loss": 0.0643,
"step": 8440
},
{
"epoch": 2.74,
"learning_rate": 3.9155396522577496e-07,
"loss": 0.0628,
"step": 8450
},
{
"epoch": 2.74,
"learning_rate": 3.8191265104203014e-07,
"loss": 0.0638,
"step": 8460
},
{
"epoch": 2.75,
"learning_rate": 3.723892102793558e-07,
"loss": 0.0648,
"step": 8470
},
{
"epoch": 2.75,
"learning_rate": 3.629837596512453e-07,
"loss": 0.0665,
"step": 8480
},
{
"epoch": 2.75,
"learning_rate": 3.53696414425172e-07,
"loss": 0.0661,
"step": 8490
},
{
"epoch": 2.76,
"learning_rate": 3.445272884211837e-07,
"loss": 0.0691,
"step": 8500
},
{
"epoch": 2.76,
"learning_rate": 3.3547649401050265e-07,
"loss": 0.0628,
"step": 8510
},
{
"epoch": 2.76,
"learning_rate": 3.2654414211415463e-07,
"loss": 0.0624,
"step": 8520
},
{
"epoch": 2.77,
"learning_rate": 3.177303422016065e-07,
"loss": 0.0662,
"step": 8530
},
{
"epoch": 2.77,
"learning_rate": 3.0903520228941944e-07,
"loss": 0.0636,
"step": 8540
},
{
"epoch": 2.77,
"learning_rate": 3.004588289399324e-07,
"loss": 0.0657,
"step": 8550
},
{
"epoch": 2.78,
"learning_rate": 2.9200132725995644e-07,
"loss": 0.0639,
"step": 8560
},
{
"epoch": 2.78,
"learning_rate": 2.8366280089948126e-07,
"loss": 0.0648,
"step": 8570
},
{
"epoch": 2.78,
"learning_rate": 2.7544335205040626e-07,
"loss": 0.0648,
"step": 8580
},
{
"epoch": 2.79,
"learning_rate": 2.6734308144529154e-07,
"loss": 0.0644,
"step": 8590
},
{
"epoch": 2.79,
"learning_rate": 2.59362088356121e-07,
"loss": 0.0631,
"step": 8600
},
{
"epoch": 2.79,
"learning_rate": 2.515004705930835e-07,
"loss": 0.0628,
"step": 8610
},
{
"epoch": 2.8,
"learning_rate": 2.437583245033814e-07,
"loss": 0.0631,
"step": 8620
},
{
"epoch": 2.8,
"learning_rate": 2.3613574497003967e-07,
"loss": 0.0624,
"step": 8630
},
{
"epoch": 2.8,
"learning_rate": 2.2863282541075394e-07,
"loss": 0.0617,
"step": 8640
},
{
"epoch": 2.8,
"learning_rate": 2.2124965777673313e-07,
"loss": 0.067,
"step": 8650
},
{
"epoch": 2.81,
"learning_rate": 2.1398633255158675e-07,
"loss": 0.0618,
"step": 8660
},
{
"epoch": 2.81,
"learning_rate": 2.0684293875020245e-07,
"loss": 0.0644,
"step": 8670
},
{
"epoch": 2.81,
"learning_rate": 1.99819563917667e-07,
"loss": 0.0698,
"step": 8680
},
{
"epoch": 2.82,
"learning_rate": 1.9291629412818368e-07,
"loss": 0.0672,
"step": 8690
},
{
"epoch": 2.82,
"learning_rate": 1.8613321398402107e-07,
"loss": 0.0694,
"step": 8700
},
{
"epoch": 2.82,
"learning_rate": 1.7947040661448256e-07,
"loss": 0.0661,
"step": 8710
},
{
"epoch": 2.83,
"learning_rate": 1.7292795367487513e-07,
"loss": 0.0628,
"step": 8720
},
{
"epoch": 2.83,
"learning_rate": 1.6650593534551673e-07,
"loss": 0.0646,
"step": 8730
},
{
"epoch": 2.83,
"learning_rate": 1.6020443033075485e-07,
"loss": 0.0627,
"step": 8740
},
{
"epoch": 2.84,
"learning_rate": 1.5402351585799725e-07,
"loss": 0.0647,
"step": 8750
},
{
"epoch": 2.84,
"learning_rate": 1.4796326767676617e-07,
"loss": 0.0675,
"step": 8760
},
{
"epoch": 2.84,
"learning_rate": 1.420237600577734e-07,
"loss": 0.0627,
"step": 8770
},
{
"epoch": 2.85,
"learning_rate": 1.3620506579200777e-07,
"loss": 0.0614,
"step": 8780
},
{
"epoch": 2.85,
"learning_rate": 1.3050725618984017e-07,
"loss": 0.0644,
"step": 8790
},
{
"epoch": 2.85,
"learning_rate": 1.2493040108015774e-07,
"loss": 0.0634,
"step": 8800
},
{
"epoch": 2.86,
"learning_rate": 1.1947456880949893e-07,
"loss": 0.0649,
"step": 8810
},
{
"epoch": 2.86,
"learning_rate": 1.1413982624122189e-07,
"loss": 0.0632,
"step": 8820
},
{
"epoch": 2.86,
"learning_rate": 1.08926238754683e-07,
"loss": 0.0639,
"step": 8830
},
{
"epoch": 2.87,
"learning_rate": 1.0383387024443414e-07,
"loss": 0.0619,
"step": 8840
},
{
"epoch": 2.87,
"learning_rate": 9.88627831194433e-08,
"loss": 0.0627,
"step": 8850
},
{
"epoch": 2.87,
"learning_rate": 9.401303830232855e-08,
"loss": 0.0657,
"step": 8860
},
{
"epoch": 2.88,
"learning_rate": 8.928469522860527e-08,
"loss": 0.0646,
"step": 8870
},
{
"epoch": 2.88,
"learning_rate": 8.467781184596901e-08,
"loss": 0.0635,
"step": 8880
},
{
"epoch": 2.88,
"learning_rate": 8.0192444613576e-08,
"loss": 0.0671,
"step": 8890
},
{
"epoch": 2.89,
"learning_rate": 7.582864850135707e-08,
"loss": 0.0652,
"step": 8900
},
{
"epoch": 2.89,
"learning_rate": 7.158647698933707e-08,
"loss": 0.0676,
"step": 8910
},
{
"epoch": 2.89,
"learning_rate": 6.746598206698762e-08,
"loss": 0.0631,
"step": 8920
},
{
"epoch": 2.9,
"learning_rate": 6.34672142325865e-08,
"loss": 0.065,
"step": 8930
},
{
"epoch": 2.9,
"learning_rate": 5.959022249259594e-08,
"loss": 0.0669,
"step": 8940
},
{
"epoch": 2.9,
"learning_rate": 5.583505436106529e-08,
"loss": 0.067,
"step": 8950
},
{
"epoch": 2.91,
"learning_rate": 5.220175585904819e-08,
"loss": 0.061,
"step": 8960
},
{
"epoch": 2.91,
"learning_rate": 4.8690371514039656e-08,
"loss": 0.0661,
"step": 8970
},
{
"epoch": 2.91,
"learning_rate": 4.5300944359425446e-08,
"loss": 0.0626,
"step": 8980
},
{
"epoch": 2.92,
"learning_rate": 4.203351593396354e-08,
"loss": 0.0633,
"step": 8990
},
{
"epoch": 2.92,
"learning_rate": 3.8888126281264593e-08,
"loss": 0.0652,
"step": 9000
},
{
"epoch": 2.92,
"learning_rate": 3.586481394930896e-08,
"loss": 0.065,
"step": 9010
},
{
"epoch": 2.92,
"learning_rate": 3.2963615989971553e-08,
"loss": 0.0652,
"step": 9020
},
{
"epoch": 2.93,
"learning_rate": 3.0184567958567724e-08,
"loss": 0.0655,
"step": 9030
},
{
"epoch": 2.93,
"learning_rate": 2.752770391341919e-08,
"loss": 0.066,
"step": 9040
},
{
"epoch": 2.93,
"learning_rate": 2.499305641543104e-08,
"loss": 0.0654,
"step": 9050
},
{
"epoch": 2.94,
"learning_rate": 2.2580656527700916e-08,
"loss": 0.0673,
"step": 9060
},
{
"epoch": 2.94,
"learning_rate": 2.0290533815132683e-08,
"loss": 0.0658,
"step": 9070
},
{
"epoch": 2.94,
"learning_rate": 1.8122716344074476e-08,
"loss": 0.0628,
"step": 9080
},
{
"epoch": 2.95,
"learning_rate": 1.6077230681978972e-08,
"loss": 0.0635,
"step": 9090
},
{
"epoch": 2.95,
"learning_rate": 1.4154101897070338e-08,
"loss": 0.0635,
"step": 9100
},
{
"epoch": 2.95,
"learning_rate": 1.2353353558045566e-08,
"loss": 0.0679,
"step": 9110
},
{
"epoch": 2.96,
"learning_rate": 1.0675007733780273e-08,
"loss": 0.0656,
"step": 9120
},
{
"epoch": 2.96,
"learning_rate": 9.119084993055583e-09,
"loss": 0.0652,
"step": 9130
},
{
"epoch": 2.96,
"learning_rate": 7.685604404316094e-09,
"loss": 0.0594,
"step": 9140
},
{
"epoch": 2.97,
"learning_rate": 6.374583535426748e-09,
"loss": 0.0645,
"step": 9150
},
{
"epoch": 2.97,
"learning_rate": 5.186038453458553e-09,
"loss": 0.0657,
"step": 9160
},
{
"epoch": 2.97,
"learning_rate": 4.119983724497623e-09,
"loss": 0.0668,
"step": 9170
},
{
"epoch": 2.98,
"learning_rate": 3.1764324134631043e-09,
"loss": 0.0617,
"step": 9180
},
{
"epoch": 2.98,
"learning_rate": 2.355396083941752e-09,
"loss": 0.0632,
"step": 9190
},
{
"epoch": 2.98,
"learning_rate": 1.656884798058034e-09,
"loss": 0.0633,
"step": 9200
},
{
"epoch": 2.99,
"learning_rate": 1.0809071163386808e-09,
"loss": 0.0661,
"step": 9210
},
{
"epoch": 2.99,
"learning_rate": 6.274700976161008e-10,
"loss": 0.0656,
"step": 9220
},
{
"epoch": 2.99,
"learning_rate": 2.9657929893955886e-10,
"loss": 0.0652,
"step": 9230
},
{
"epoch": 3.0,
"learning_rate": 8.823877550301341e-11,
"loss": 0.0603,
"step": 9240
},
{
"epoch": 3.0,
"learning_rate": 2.4510806018174237e-12,
"loss": 0.06,
"step": 9250
},
{
"epoch": 3.0,
"eval_loss": 0.1728515625,
"eval_runtime": 6.4727,
"eval_samples_per_second": 19.775,
"eval_steps_per_second": 0.154,
"step": 9252
}
],
"max_steps": 9252,
"num_train_epochs": 3,
"total_flos": 9.615097431625354e+19,
"trial_name": null,
"trial_params": null
}